Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
174 changes: 174 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9261,6 +9261,180 @@ qwen3.5-fp8-h100-sglang-mtp:
search-space:
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }

# MiniMax-M2.5 NVFP4 on GB300: multi-node, disaggregated prefill/decode sweep
# served through the dynamo-vllm framework.
# Every search-space entry names, via CONFIG_FILE, the recipe file that
# describes the deployment used for that entry's concurrency list.
# NOTE(review): nesting below was reconstructed from key order (the extracted
# text carried no indentation) - confirm against the checked-in file.
minimaxm2.5-fp4-gb300-dynamo-vllm:
  image: vllm/vllm-openai:v0.20.1
  model: nvidia/MiniMax-M2.5-NVFP4
  model-prefix: minimaxm2.5
  runner: gb300-nv
  precision: fp4
  framework: dynamo-vllm
  multinode: true
  disagg: true
  scenarios:
    fixed-seq-len:
    # ---- 1k input / 1k output ----
    - isl: 1024
      osl: 1024
      search-space:
      # 1 prefill worker + 1 decode worker (decode tp=4, ep=1)
      - conc-list: [2, 4, 16]
        prefill:
          num-worker: 1
          tp: 1
          ep: 1
          dp-attn: false
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml"
        decode:
          num-worker: 1
          tp: 4
          ep: 1
          dp-attn: false
      # 1 prefill worker + 2 decode workers (decode tp=4, ep=1)
      - conc-list: [4, 8, 16, 64]
        prefill:
          num-worker: 1
          tp: 1
          ep: 1
          dp-attn: false
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml"
        decode:
          num-worker: 2
          tp: 4
          ep: 1
          dp-attn: false
      # 1 prefill worker + 1 decode worker (decode tp=4, ep=4)
      - conc-list: [32, 64, 128]
        prefill:
          num-worker: 1
          tp: 1
          ep: 1
          dp-attn: false
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml"
        decode:
          num-worker: 1
          tp: 4
          ep: 4
          dp-attn: false
      # 1 prefill worker + 3 decode workers (decode tp=4, ep=4)
      - conc-list: [64, 128, 256, 512, 1024]
        prefill:
          num-worker: 1
          tp: 1
          ep: 1
          dp-attn: false
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml"
        decode:
          num-worker: 3
          tp: 4
          ep: 4
          dp-attn: false
      # 2 prefill workers + 3 decode workers (decode tp=2, ep=2, dp-attn on)
      - conc-list: [2048]
        prefill:
          num-worker: 2
          tp: 1
          ep: 1
          dp-attn: false
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml"
        decode:
          num-worker: 3
          tp: 2
          ep: 2
          dp-attn: true
      # Same 2p/3d layout as the entry above, but pointing at the separate
      # "-c6144" recipe file for the two highest concurrencies.
      - conc-list: [6144, 8192]
        prefill:
          num-worker: 2
          tp: 1
          ep: 1
          dp-attn: false
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml"
        decode:
          num-worker: 3
          tp: 2
          ep: 2
          dp-attn: true
      # 2 prefill workers + 1 decode worker (decode tp=8, ep=8, dp-attn on)
      - conc-list: [1024, 2048, 4096]
        prefill:
          num-worker: 2
          tp: 1
          ep: 1
          dp-attn: false
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml"
        decode:
          num-worker: 1
          tp: 8
          ep: 8
          dp-attn: true
    # ---- 8k input / 1k output ----
    - isl: 8192
      osl: 1024
      search-space:
      # 1 prefill worker + 1 decode worker (decode tp=4, ep=1)
      - conc-list: [2, 4, 8, 16]
        prefill:
          num-worker: 1
          tp: 1
          ep: 1
          dp-attn: false
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml"
        decode:
          num-worker: 1
          tp: 4
          ep: 1
          dp-attn: false
      # 1 prefill worker + 1 decode worker (decode tp=4, ep=4)
      - conc-list: [32, 64, 128, 256]
        prefill:
          num-worker: 1
          tp: 1
          ep: 1
          dp-attn: false
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml"
        decode:
          num-worker: 1
          tp: 4
          ep: 4
          dp-attn: false
      # 2 prefill workers + 1 decode worker (decode tp=4, ep=4)
      - conc-list: [64, 128]
        prefill:
          num-worker: 2
          tp: 1
          ep: 1
          dp-attn: false
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml"
        decode:
          num-worker: 1
          tp: 4
          ep: 4
          dp-attn: false
      # 4 prefill workers + 1 decode worker (decode tp=4, ep=4, dp-attn on)
      - conc-list: [256]
        prefill:
          num-worker: 4
          tp: 1
          ep: 1
          dp-attn: false
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml"
        decode:
          num-worker: 1
          tp: 4
          ep: 4
          dp-attn: true
      # 4 prefill workers + 1 decode worker (decode tp=8, ep=8, dp-attn on)
      - conc-list: [1024, 2048]
        prefill:
          num-worker: 4
          tp: 1
          ep: 1
          dp-attn: false
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml"
        decode:
          num-worker: 1
          tp: 8
          ep: 8
          dp-attn: true

glm5-fp4-gb300-dynamo-sglang:
image: lmsysorg/sglang:v0.5.11-cu130
model: nvidia/GLM-5-NVFP4
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Disaggregated MiniMax-M2.5 (NVFP4) recipe for GB300:
# 2 prefill workers (1 GPU each) + 3 decode workers (2 GPUs each), benchmarked
# at the concurrencies listed under `benchmark.concurrencies`.
# NOTE(review): nesting was reconstructed from key order (the extracted text
# carried no indentation) - confirm against the recipe schema.
name: "minimax-m2.5-vllm-disagg-gb300-decode-2p3xdep2-c6144"

model:
  path: "minimax-m2.5-nvfp4"
  container: "vllm/vllm-openai:v0.20.1"
  precision: "fp4"

dynamo:
  install: true
  wheel: "1.2.0.dev20260526"

setup_script: install-deps.sh

resources:
  gpu_type: "gb300"
  gpus_per_node: 4
  prefill_nodes: 2
  decode_nodes: 3
  prefill_workers: 2
  decode_workers: 3
  gpus_per_prefill: 1
  gpus_per_decode: 2
  spread_workers: true

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null

  prefill_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"
    UCX_TLS: "cuda_copy,rc"

  decode_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"
    UCX_TLS: "cuda_copy,rc"

  # The prefill and decode sections repeat every shared flag, i.e. neither
  # section inherits from the other; keep shared flags in sync by hand.
  vllm_config:
    prefill:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      trust-remote-code: true
      no-enable-prefix-caching: true
      max-model-len: 2048
      max-cudagraph-capture-size: 2048
      max-num-batched-tokens: 2048
      stream-interval: 128

    decode:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      # Decode workers load the same model as prefill workers, so they need
      # the same trust-remote-code setting; it was previously set for
      # prefill only.
      trust-remote-code: true
      data-parallel-size: 2
      data-parallel-rpc-port: 13345
      enable-expert-parallel: true
      no-enable-prefix-caching: true
      max-model-len: 2048
      max-cudagraph-capture-size: 2048
      max-num-batched-tokens: 2048
      max-num-seqs: 864
      gpu-memory-utilization: 0.90
      stream-interval: 128

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  # presumably an "x"-separated list of concurrency levels - confirm
  concurrencies: "6144x8192"
  random_range_ratio: 0.8
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Disaggregated MiniMax-M2.5 (NVFP4) recipe for GB300:
# 2 prefill workers (1 GPU each) + 3 decode workers (2 GPUs each), benchmarked
# at the concurrency listed under `benchmark.concurrencies`.
# NOTE(review): nesting was reconstructed from key order (the extracted text
# carried no indentation) - confirm against the recipe schema.
name: "minimax-m2.5-vllm-disagg-gb300-decode-2p3xdep2"

model:
  path: "minimax-m2.5-nvfp4"
  container: "vllm/vllm-openai:v0.20.1"
  precision: "fp4"

dynamo:
  install: true
  wheel: "1.2.0.dev20260526"

setup_script: install-deps.sh

resources:
  gpu_type: "gb300"
  gpus_per_node: 4
  prefill_nodes: 2
  decode_nodes: 3
  prefill_workers: 2
  decode_workers: 3
  gpus_per_prefill: 1
  gpus_per_decode: 2
  spread_workers: true

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null

  prefill_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"
    UCX_TLS: "cuda_copy,rc"

  decode_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"
    UCX_TLS: "cuda_copy,rc"

  # The prefill and decode sections repeat every shared flag, i.e. neither
  # section inherits from the other; keep shared flags in sync by hand.
  vllm_config:
    prefill:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      trust-remote-code: true
      no-enable-prefix-caching: true
      max-model-len: 2048
      max-cudagraph-capture-size: 2048
      max-num-batched-tokens: 2048
      stream-interval: 32

    decode:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      # Decode workers load the same model as prefill workers, so they need
      # the same trust-remote-code setting; it was previously set for
      # prefill only.
      trust-remote-code: true
      data-parallel-size: 2
      data-parallel-rpc-port: 13345
      enable-expert-parallel: true
      no-enable-prefix-caching: true
      max-model-len: 2048
      max-cudagraph-capture-size: 2048
      max-num-batched-tokens: 2048
      max-num-seqs: 864
      gpu-memory-utilization: 0.90
      stream-interval: 32

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "2048"
  random_range_ratio: 0.8
Comment on lines +60 to +73
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 All 12 new MiniMax-M2.5 recipes set trust-remote-code: true only in the prefill vllm_config block — the decode block is missing it. Since MiniMax-M2.5 ships custom HuggingFace modeling code and decode workers independently load the model, every decode engine will fail at startup with the HF trust_remote_code=True required error, blocking all 12 recipes from reaching a ready state. Add trust-remote-code: true under each decode: block (the dsv4 reference recipes already do this — see benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml lines 75 and 94).

Extended reasoning...

What's wrong

Every one of the 12 new MiniMax-M2.5 recipes added under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/ sets trust-remote-code: true in the prefill block of vllm_config but not in the decode block. Verified by counting occurrences in each file — every recipe yields exactly 1 match, all in the prefill block:

1k1k/dep2-2p3d-c6144.yaml: 1
1k1k/dep2-2p3d.yaml: 1
1k1k/dep8-2p1d.yaml: 1
1k1k/tp4-1p1d.yaml: 1
1k1k/tp4-1p2d.yaml: 1
1k1k/tp4ep-1p1d.yaml: 1
1k1k/tp4ep-1p3d.yaml: 1
8k1k/dep4-4p1d.yaml: 1
8k1k/dep8-4p1d.yaml: 1
8k1k/tp4-1p1d.yaml: 1
8k1k/tp4ep-1p1d.yaml: 1
8k1k/tp4ep-2p1d.yaml: 1

The reference dsv4 disagg recipe benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml sets trust-remote-code: true in both prefill (line 75) and decode (line 94) — that is the deliberate, established convention in this directory tree for models that need it.

Why it manifests

The vllm_config.prefill and vllm_config.decode blocks are passed as independent CLI argument sets to two separate vLLM engine processes — one for each worker role. There is no inheritance: every other flag (kv-transfer-config, kv-cache-dtype, no-enable-prefix-caching, max-model-len, max-cudagraph-capture-size, max-num-batched-tokens, stream-interval) is explicitly repeated in both blocks in every minimax recipe, confirming the author knows the blocks don't inherit.

Decode workers each spin up their own vLLM engine that independently loads the HuggingFace configuration for nvidia/MiniMax-M2.5-NVFP4 (AutoConfig.from_pretrained(...)). MiniMax-M2.5 ships custom modeling code (modeling_minimax.py / configuration_minimax.py), so HF refuses to load it without trust_remote_code=True and raises:

ValueError: ... requires you to execute the configuration file ... in that repo on your local machine. ... pass trust_remote_code=True to remove this error.

This fires at engine startup, before any inference happens, so the decode engine never reaches a ready state and the disaggregated job stalls waiting for decode readiness until the engine-ready timeout expires.

Why this is provably a bug

The asymmetry cannot be intentional:

  • If MiniMax-M2.5 does not need trust_remote_code, then the prefill setting in all 12 recipes is dead config — and the author wouldn't have added it.
  • If it does need it (which the prefill setting itself asserts), then the decode workers will crash at HF model loading.

Both branches make the decode omission a bug. The existing in-repo launcher experimental/token_position_decode_slo/minimax-m2.5/serve_minimax_tep8_sbatch.sh (line 49) passes --trust-remote-code to vLLM, further confirming the model requires it. There is also no global default in runners/launch_gb300-nv.sh that injects --trust-remote-code for decode workers.

Impact

All 12 recipes — driving the entire new minimaxm2.5-fp4-gb300-dynamo-vllm config in .github/configs/nvidia-master.yaml (1k/1k and 8k/1k sweeps across TP4, TP4+EP, DEP2, DEP8, and multi-decode layouts) — will fail to start. No benchmark numbers will be produced for this entire config until the fix lands.

Step-by-step proof (worked example: 1k1k/dep2-2p3d.yaml)

  1. CI selects the conc-list: [2048] entry in nvidia-master.yaml whose additional-settings points to recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml.
  2. srtctl apply reads that file; the YAML's vllm_config.prefill includes trust-remote-code: true, but vllm_config.decode (lines 60–73 in the new file) does not.
  3. srt-slurm launches 2 prefill workers and 3 decode workers as separate SLURM steps. Each decode worker invokes vllm serve nvidia/MiniMax-M2.5-NVFP4 --kv-transfer-config ... --kv-cache-dtype fp8 --data-parallel-size 2 ... --no-enable-prefix-caching ... without --trust-remote-code.
  4. vLLM's engine startup calls AutoConfig.from_pretrained('nvidia/MiniMax-M2.5-NVFP4', trust_remote_code=False). Because the repo ships custom configuration_minimax.py, HF raises ValueError: The repository for nvidia/MiniMax-M2.5-NVFP4 contains custom code which must be executed to correctly load the model. ... pass trust_remote_code=True to remove this error.
  5. Decode worker exits non-zero before serving anything. The disaggregated frontend never sees decode become ready; VLLM_ENGINE_READY_TIMEOUT_S=3600 fires after an hour and the SLURM job is killed with no useful output.

Fix

Add trust-remote-code: true to the decode: block of every recipe under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/ (12 files total):

    decode:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      trust-remote-code: true   # <-- add this line
      ...

This mirrors the dsv4 pattern at benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml line 94.

Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Disaggregated MiniMax-M2.5 (NVFP4) recipe for GB300:
# 2 prefill workers (1 GPU each) + 1 decode worker spanning 8 GPUs, benchmarked
# at the concurrencies listed under `benchmark.concurrencies`.
# NOTE(review): nesting was reconstructed from key order (the extracted text
# carried no indentation) - confirm against the recipe schema.
name: "minimax-m2.5-vllm-disagg-gb300-decode-2p1xdep8"

model:
  path: "minimax-m2.5-nvfp4"
  container: "vllm/vllm-openai:v0.20.1"
  precision: "fp4"

dynamo:
  install: true
  wheel: "1.2.0.dev20260526"

setup_script: install-deps.sh

resources:
  gpu_type: "gb300"
  gpus_per_node: 4
  prefill_nodes: 1
  # 8 decode GPUs at 4 GPUs per node -> 2 nodes
  decode_nodes: 2
  prefill_workers: 2
  decode_workers: 1
  gpus_per_prefill: 1
  gpus_per_decode: 8

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null

  # NOTE(review): unlike the dep2 recipes, no UCX_TLS is set here - confirm
  # this is intentional.
  prefill_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"

  decode_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"

  # The prefill and decode sections repeat every shared flag, i.e. neither
  # section inherits from the other; keep shared flags in sync by hand.
  vllm_config:
    prefill:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      trust-remote-code: true
      no-enable-prefix-caching: true
      max-model-len: 2048
      max-cudagraph-capture-size: 2048
      max-num-batched-tokens: 2048
      stream-interval: 128

    decode:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      # Decode workers load the same model as prefill workers, so they need
      # the same trust-remote-code setting; it was previously set for
      # prefill only.
      trust-remote-code: true
      data-parallel-size: 8
      data-parallel-rpc-port: 13345
      enable-expert-parallel: true
      no-enable-prefix-caching: true
      max-model-len: 2048
      max-cudagraph-capture-size: 2048
      max-num-batched-tokens: 2048
      max-num-seqs: 864
      gpu-memory-utilization: 0.90
      stream-interval: 128

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  # presumably an "x"-separated list of concurrency levels - confirm
  concurrencies: "1024x2048x4096"
  random_range_ratio: 0.8
Loading