diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x.sh index 28677ae1e..806c59278 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x.sh @@ -25,6 +25,7 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then fi export VLLM_ROCM_USE_AITER=1 +export VLLM_USE_RUST_FRONTEND=1 EXTRA_VLLM_ARGS="" # if [ "$TP" -ge 4 ]; then # # AITER CK fused MoE kernels lack compiled tiles for N=intermediate_size/TP diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d201e9f3b..e134acb6c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3395,3 +3395,11 @@ description: - "Add DeepSeek-V4-Pro FP4 MI355X ATOM MTP3 benchmark; image rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1627 + +- config-keys: + - minimaxm2.5-fp4-mi355x-vllm + description: + - "Enable vLLM Rust request frontend by exporting VLLM_USE_RUST_FRONTEND=1 in benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x.sh (v0.22.0 ROCm image ships the vllm-rs binary, so the flag engages it). Environment-only change; serve flags, TP/EP, attention/kernel settings unchanged" + - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched" + - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634