Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1801,7 +1801,7 @@ dsv4-fp4-b200-vllm-agentic:
- { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] }

dsv4-fp4-b200-trt:
image: ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
image: ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-2dd03e6
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b200-dsv4
Expand Down Expand Up @@ -3049,7 +3049,7 @@ dsv4-fp4-b300-vllm-agentic:
- { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [128, 256, 512] }

dsv4-fp4-b300-trt:
image: ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
image: ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-2dd03e6
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
Expand Down
7 changes: 7 additions & 0 deletions benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,13 @@ sanitize_slurm_mpi_env_for_trtllm
export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}"
echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE"

# Disable DSv4 SWA scratch reuse: with attention-DP the V2 scheduler grows ctx KV
# (incl. SWA scratch) before delay batching defers a request, and the resulting
# revert_allocate_context -> resize(shrink) can't release the scratch of a
# never-forwarded request (no finish_event), crashing every dpa=true job.
export TRTLLM_DSV4_ENABLE_SWA_SCRATCH_REUSE="${TRTLLM_DSV4_ENABLE_SWA_SCRATCH_REUSE:-0}"
echo "TRTLLM_DSV4_ENABLE_SWA_SCRATCH_REUSE: $TRTLLM_DSV4_ENABLE_SWA_SCRATCH_REUSE"

if [[ "$MODEL" != /* ]]; then
hf download "$MODEL"
fi
Expand Down
7 changes: 7 additions & 0 deletions benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,13 @@ sanitize_slurm_mpi_env_for_trtllm
export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}"
echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE"

# Disable DSv4 SWA scratch reuse: with attention-DP the V2 scheduler grows ctx KV
# (incl. SWA scratch) before delay batching defers a request, and the resulting
# revert_allocate_context -> resize(shrink) can't release the scratch of a
# never-forwarded request (no finish_event), crashing every dpa=true job.
export TRTLLM_DSV4_ENABLE_SWA_SCRATCH_REUSE="${TRTLLM_DSV4_ENABLE_SWA_SCRATCH_REUSE:-0}"
echo "TRTLLM_DSV4_ENABLE_SWA_SCRATCH_REUSE: $TRTLLM_DSV4_ENABLE_SWA_SCRATCH_REUSE"

nvidia-smi

SERVER_LOG="$PWD/server.log"
Expand Down
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3378,6 +3378,13 @@
- "Add MTP speculative-decoding sibling for dsv4-fp4-mi355x-vllm (model: deepseek-ai/DeepSeek-V4-Pro) on vllm/vllm-openai-rocm:v0.22.0, per vllm-project/vllm#43385"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1630

- config-keys:
- dsv4-fp4-b200-trt
- dsv4-fp4-b300-trt
description:
- "Revert the non-MTP TensorRT-LLM DeepSeek-V4-Pro image to ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-2dd03e6 and disable DSv4 SWA scratch reuse via TRTLLM_DSV4_ENABLE_SWA_SCRATCH_REUSE=0 in the launcher. Root cause of the prior attention-DP hangs/crashes: the V2 scheduler grows a context request's KV cache (incl. SWA scratch slots) before delay batching can defer it, so revert_allocate_context -> resize(shrink) must release scratch slots of a never-forwarded request, which has no finish_event -> crash on every dpa=true job. Disabling scratch reuse stops those slots from being allocated so the revert shrinks cleanly."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1636

- config-keys:
- dsv4-fp4-mi355x-sglang-mtp
description:
Expand Down