SemiAnalysisAI · chunfangamd · Jun 1, 2026 · Jun 1, 2026 · Jun 2, 2026
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x.sh
@@ -25,6 +25,7 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
 fi
 
 export VLLM_ROCM_USE_AITER=1
+export VLLM_USE_RUST_FRONTEND=1
 EXTRA_VLLM_ARGS=""
 # if [ "$TP" -ge 4 ]; then
 #     # AITER CK fused MoE kernels lack compiled tiles for N=intermediate_size/TP

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3395,3 +3395,11 @@
   description:
     - "Add DeepSeek-V4-Pro FP4 MI355X ATOM MTP3 benchmark; image rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1627
+
+- config-keys:
+    - minimaxm2.5-fp4-mi355x-vllm
+  description:
+    - "Enable the vLLM Rust request frontend by exporting VLLM_USE_RUST_FRONTEND=1 in benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x.sh (the v0.22.0 ROCm image ships the vllm-rs binary, so the flag engages it). Environment-only change; serve flags, TP/EP, and attention/kernel settings are unchanged"
+    - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched"
+    - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput is Pareto-neutral (peak tok/s/GPU differs by <1.5%, frontiers coincident) and TPOT is flat (within +/-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (at every point), the expected signature of lower frontend CPU latency before the first token, scaling with input length"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634