diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d3b1b6729..718f95196 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1801,7 +1801,7 @@ dsv4-fp4-b200-vllm-agentic: - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } dsv4-fp4-b200-trt: - image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 + image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-2dd03e6 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 @@ -3049,7 +3049,7 @@ dsv4-fp4-b300-vllm-agentic: - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [128, 256, 512] } dsv4-fp4-b300-trt: - image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 + image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-2dd03e6 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh index e4a24dea2..f2b19a8cb 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh @@ -47,6 +47,13 @@ sanitize_slurm_mpi_env_for_trtllm export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}" echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE" +# Disable DSv4 SWA scratch reuse: with attention-DP the V2 scheduler grows ctx KV +# (incl. SWA scratch) before delay batching defers a request, and the resulting +# revert_allocate_context -> resize(shrink) can't release the scratch of a +# never-forwarded request (no finish_event), crashing every dpa=true job. +export TRTLLM_DSV4_ENABLE_SWA_SCRATCH_REUSE="${TRTLLM_DSV4_ENABLE_SWA_SCRATCH_REUSE:-0}" +echo "TRTLLM_DSV4_ENABLE_SWA_SCRATCH_REUSE: $TRTLLM_DSV4_ENABLE_SWA_SCRATCH_REUSE" + if [[ "$MODEL" != /* ]]; then hf download "$MODEL" fi diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh index b0150e10d..3606c677f 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh @@ -59,6 +59,13 @@ sanitize_slurm_mpi_env_for_trtllm export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}" echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE" +# Disable DSv4 SWA scratch reuse: with attention-DP the V2 scheduler grows ctx KV +# (incl. SWA scratch) before delay batching defers a request, and the resulting +# revert_allocate_context -> resize(shrink) can't release the scratch of a +# never-forwarded request (no finish_event), crashing every dpa=true job. +export TRTLLM_DSV4_ENABLE_SWA_SCRATCH_REUSE="${TRTLLM_DSV4_ENABLE_SWA_SCRATCH_REUSE:-0}" +echo "TRTLLM_DSV4_ENABLE_SWA_SCRATCH_REUSE: $TRTLLM_DSV4_ENABLE_SWA_SCRATCH_REUSE" + nvidia-smi SERVER_LOG="$PWD/server.log" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d201e9f3b..17d162518 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3378,6 +3378,13 @@ - "Add MTP speculative-decoding sibling for dsv4-fp4-mi355x-vllm (model: deepseek-ai/DeepSeek-V4-Pro) on vllm/vllm-openai-rocm:v0.22.0, per vllm-project/vllm#43385" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1630 +- config-keys: + - dsv4-fp4-b200-trt + - dsv4-fp4-b300-trt + description: + - "Revert the non-MTP TensorRT-LLM DeepSeek-V4-Pro image to ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-2dd03e6 and disable DSv4 SWA scratch reuse via TRTLLM_DSV4_ENABLE_SWA_SCRATCH_REUSE=0 in the launcher. Root cause of the prior attention-DP hangs/crashes: the V2 scheduler grows a context request's KV cache (incl. SWA scratch slots) before delay batching can defer it, so revert_allocate_context -> resize(shrink) must release scratch slots of a never-forwarded request, which has no finish_event -> crash on every dpa=true job. Disabling scratch reuse stops those slots from being allocated so the revert shrinks cleanly." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1636 + - config-keys: - dsv4-fp4-mi355x-sglang-mtp description: