SemiAnalysisAI · haic0 · Jun 1, 2026 · Jun 1, 2026
@@ -589,6 +589,98 @@ kimik2.5-int4-mi355x-vllm:
       search-space:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
+kimik2.5-mxfp4-mi355x-vllm-eagle3:  # Kimi-K2.5 MXFP4 checkpoint on mi355x with vLLM; every search point sets spec-decoding: eagle3
+  image: vllm/vllm-openai-rocm:v0.21.0
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x
+  precision: fp4  # NOTE(review): key says "mxfp4" but precision is "fp4"; sibling key kimik2.5-fp4-mi355x-vllm-fixed-ar-mtp uses "fp4" - confirm naming
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:  # no draft-model is configured in this scenario; presumably resolved by the benchmark script - TODO confirm
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }  # single tp=8 point, concurrency bounds 4..64
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }  # same search point as the 1024/1024 case
+
+kimik2.5-int4-mi355x-vllm-eagle3:  # Kimi-K2.5 int4 on mi355x with vLLM; every search point sets spec-decoding: eagle3
+  image: vllm/vllm-openai-rocm:v0.21.0
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  runner: mi355x
+  precision: int4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:  # no draft-model is configured in this scenario; presumably resolved by the benchmark script - TODO confirm
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }  # single tp=8 point, concurrency bounds 4..64
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }  # same search point as the 1024/1024 case
+
+kimik2.5-int4-mi355x-vllm-fixed-ar-mtp:  # Kimi-K2.5 int4 on mi355x with vLLM, using the fixed-ar-mtp scenario type
+  image: vllm/vllm-openai-rocm:v0.21.0
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  runner: mi355x
+  precision: int4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-ar-mtp:  # each entry carries its own draft model and synthetic acceptance settings
+    - isl: 1024
+      osl: 1024
+      draft-model: nvidia/Kimi-K2.5-Thinking-Eagle3  # NOTE(review): the fp4 sibling entry uses lightseekorg/kimi-k2.5-eagle3 - confirm the difference is intended
+      num-speculative-tokens: 3
+      rejection-sample-method: synthetic
+      synthetic-acceptance-rates: [0.778774, 0.57543, 0.412793]  # three rates for three speculative tokens; presumably one per draft position - TODO confirm
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }  # single tp=8 point, concurrency bounds 4..64
+    - isl: 8192
+      osl: 1024
+      draft-model: nvidia/Kimi-K2.5-Thinking-Eagle3  # same draft model as the 1024/1024 entry
+      num-speculative-tokens: 3
+      rejection-sample-method: synthetic
+      synthetic-acceptance-rates: [0.778774, 0.57543, 0.412793]  # identical rates to the 1024/1024 entry
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }  # same search point as the 1024/1024 case
+
+kimik2.5-fp4-mi355x-vllm-fixed-ar-mtp:  # Kimi-K2.5 MXFP4 checkpoint on mi355x with vLLM, using the fixed-ar-mtp scenario type
+  image: vllm/vllm-openai-rocm:v0.21.0
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-ar-mtp:  # each entry carries its own draft model and synthetic acceptance settings
+    - isl: 1024
+      osl: 1024
+      draft-model: lightseekorg/kimi-k2.5-eagle3  # NOTE(review): the int4 sibling entry uses nvidia/Kimi-K2.5-Thinking-Eagle3 - confirm the difference is intended
+      num-speculative-tokens: 3
+      rejection-sample-method: synthetic
+      synthetic-acceptance-rates: [0.778774, 0.57543, 0.412793]  # same values as the int4 sibling despite a different draft model - verify
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }  # single tp=8 point, concurrency bounds 4..64
+    - isl: 8192
+      osl: 1024
+      draft-model: lightseekorg/kimi-k2.5-eagle3  # same draft model as the 1024/1024 entry
+      num-speculative-tokens: 3
+      rejection-sample-method: synthetic
+      synthetic-acceptance-rates: [0.778774, 0.57543, 0.412793]  # identical rates to the 1024/1024 entry
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }  # same search point as the 1024/1024 case
+
 kimik2.5-int4-mi325x-vllm:
   image: vllm/vllm-openai-rocm:v0.21.0
   model: moonshotai/Kimi-K2.5
@@ -724,6 +816,25 @@ minimaxm2.5-fp8-mi355x-vllm:
       - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 }
       - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 }
 
+minimaxm2.5-fp8-mi355x-vllm-eagle3:  # MiniMax-M2.5 fp8 on mi355x with vLLM; every search point sets spec-decoding: eagle3
+  image: vllm/vllm-openai-rocm:v0.21.0
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi355x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:  # no draft-model is configured in this scenario; presumably resolved by the benchmark script - TODO confirm
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }  # tp=8 only, no ep key (unlike the non-eagle3 minimaxm2.5 points shown above)
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }  # same search point as the 1024/1024 case
+
 # Diverged from minimaxm2.5-fp8-mi355x-vllm (agentic-coding sibling). Reasons below;
 # the original minimaxm2.5-fp8-mi355x-vllm entry is left identical to origin/main so
 # its fixed-seq-len sweep is unaffected.

diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
@@ -53,7 +53,6 @@ on:
       run-eval:
         type: boolean
         required: true
-        default: false
       eval-only:
         description: "Run only evals (skip throughput benchmark)"
         type: boolean
@@ -68,10 +67,30 @@ on:
         required: false
         type: string
       scenario-type:
-        description: "Scenario type (fixed-seq-len or agentic-coding)"
+        description: "Scenario type (fixed-seq-len, agentic-coding, or fixed-ar-mtp)"  # fixed-ar-mtp added; the default below stays 'fixed-seq-len'
         required: false
         type: string
         default: 'fixed-seq-len'
+      draft-model:  # optional input; exported to the job environment as DRAFT_MODEL
+        description: "Draft model for fixed-AR MTP scenarios"
+        required: false
+        type: string
+        default: ''  # empty string when the caller does not pass a draft model
+      num-speculative-tokens:  # optional input; exported to the job environment as NUM_SPECULATIVE_TOKENS
+        description: "Number of speculative tokens for fixed-AR MTP scenarios"
+        required: false
+        type: string  # declared as a string even though it carries a count, so '' can mean "not set"
+        default: ''
+      rejection-sample-method:  # optional input; exported to the job environment as REJECTION_SAMPLE_METHOD
+        description: "Speculative rejection sampling method"
+        required: false
+        type: string
+        default: ''  # empty string when the caller does not pass a method
+      synthetic-acceptance-rates:  # optional input; exported to the job environment as SYNTHETIC_ACCEPTANCE_RATES
+        description: "JSON array of synthetic acceptance rates for fixed-AR MTP scenarios"
+        required: false
+        type: string  # the array travels as JSON text; callers must serialize it (e.g. with toJson)
+        default: ''
       offloading:
         description: "KV offload backend for agentic scenarios (none/cpu/ssd)"
         required: false
@@ -111,6 +130,10 @@ env:
   SCENARIO_TYPE: ${{ inputs.scenario-type }}
   SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || '' }}
   IS_AGENTIC: ${{ inputs.scenario-type == 'agentic-coding' && '1' || '0' }}
+  DRAFT_MODEL: ${{ inputs.draft-model }}
+  NUM_SPECULATIVE_TOKENS: ${{ inputs.num-speculative-tokens }}
+  REJECTION_SAMPLE_METHOD: ${{ inputs.rejection-sample-method }}
+  SYNTHETIC_ACCEPTANCE_RATES: ${{ inputs.synthetic-acceptance-rates }}
   OFFLOADING: ${{ inputs.offloading }}
   TOTAL_CPU_DRAM_GB: ${{ inputs.total-cpu-dram-gb }}
   DURATION: ${{ inputs.duration }}

diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml
@@ -51,6 +51,7 @@
             multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }}
             agentic-config: ${{ steps.get-jobs.outputs.agentic-config }}
             multi-node-agentic-config: ${{ steps.get-jobs.outputs.multi-node-agentic-config }}
+            fixed-ar-mtp-config: ${{ steps.get-jobs.outputs.fixed-ar-mtp-config }}
         steps:
             - name: Checkout code (ref)
               if: ${{ inputs.ref && inputs.ref != '' }}
@@ -71,12 +72,14 @@
                     ${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }})
                   AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x]))")
                   MULTI_AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' in x]))")
-                  SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
+                  FIXED_AR_MTP=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'fixed-ar-mtp']))")
+                  SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') not in ('agentic-coding', 'fixed-ar-mtp') and not x.get('eval-only', False)]))")
                   MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
                   EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and x.get('run-eval', False)]))")
                   MULTI_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('run-eval', False)]))")
                   echo "agentic-config=$AGENTIC" >> $GITHUB_OUTPUT
                   echo "multi-node-agentic-config=$MULTI_AGENTIC" >> $GITHUB_OUTPUT
+                  echo "fixed-ar-mtp-config=$FIXED_AR_MTP" >> $GITHUB_OUTPUT
                   echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT
                   echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT
                   echo "eval-config=$EVALS" >> $GITHUB_OUTPUT
@@ -190,7 +193,7 @@
             osl: '0'
             max-model-len: '0'
             spec-decoding: 'none'
-            disagg: 'false'
+            disagg: ${{ 'false' }}
             run-eval: false
             scenario-type: agentic-coding
             ref: ${{ inputs.ref }}
@@ -235,7 +238,42 @@
             scenario-type: agentic-coding
             ref: ${{ inputs.ref }}
 
+    test-sweep-fixed-ar-mtp:  # runs benchmark-tmpl.yml once per entry of the fixed-ar-mtp config list from get-jobs
+        needs: get-jobs
+        if: ${{ needs.get-jobs.outputs.fixed-ar-mtp-config != '[]' }}  # skip the job when get-jobs produced an empty list
+        uses: ./.github/workflows/benchmark-tmpl.yml
+        name: Fixed-AR-MTP throughput /
+        strategy:
+            fail-fast: false  # one failing matrix entry does not cancel the others
+            matrix:
+                config: ${{ fromJson(needs.get-jobs.outputs.fixed-ar-mtp-config) }}  # JSON array string -> one matrix entry per config object
+        secrets: inherit
+        with:
+            exp-name: ${{ matrix.config.exp-name }}
+            isl: ${{ matrix.config.isl }}
+            osl: ${{ matrix.config.osl }}
+            max-model-len: ${{ matrix.config.max-model-len }}
+            runner: ${{ matrix.config.runner }}
+            image: ${{ matrix.config.image }}
+            model: ${{ matrix.config.model }}
+            model-prefix: ${{ matrix.config.model-prefix }}
+            framework: ${{ matrix.config.framework }}
+            precision: ${{ matrix.config.precision }}
+            tp: ${{ matrix.config.tp }}
+            ep: ${{ matrix.config.ep }}
+            dp-attn: ${{ matrix.config.dp-attn }}
+            conc: ${{ matrix.config.conc }}
+            spec-decoding: ${{ matrix.config.spec-decoding }}
+            disagg: ${{ matrix.config.disagg }}
+            run-eval: false  # hard-coded: this job never requests evals, whatever the config entry says
+            scenario-type: fixed-ar-mtp  # constant; the matrix is already filtered to this scenario type by get-jobs
+            draft-model: ${{ matrix.config.draft-model }}
+            num-speculative-tokens: ${{ matrix.config.num-speculative-tokens }}
+            rejection-sample-method: ${{ matrix.config.rejection-sample-method }}
+            synthetic-acceptance-rates: ${{ toJson(matrix.config.synthetic-acceptance-rates) }}  # array re-serialized to JSON text because the template input is a string
+            ref: ${{ inputs.ref }}
+
     test-sweep-single-node:
         needs: get-jobs
         if: ${{ needs.get-jobs.outputs.single-node-config != '[]' }}
        uses: ./.github/workflows/benchmark-tmpl.yml
@@ -297,14 +335,14 @@
             ref: ${{ inputs.ref }}
 
     collect-results:
-        needs: [test-sweep-multi-node, test-sweep-single-node, test-sweep-agentic, test-sweep-multi-node-agentic]
-        if: ${{ always() && (needs.test-sweep-multi-node.result != 'skipped' || needs.test-sweep-single-node.result != 'skipped' || needs.test-sweep-agentic.result != 'skipped' || needs.test-sweep-multi-node-agentic.result != 'skipped') }}
+        needs: [test-sweep-multi-node, test-sweep-single-node, test-sweep-fixed-ar-mtp, test-sweep-agentic, test-sweep-multi-node-agentic]  # now also waits on the fixed-AR-MTP sweep
+        if: ${{ always() && (needs.test-sweep-multi-node.result != 'skipped' || needs.test-sweep-single-node.result != 'skipped' || needs.test-sweep-fixed-ar-mtp.result != 'skipped' || needs.test-sweep-agentic.result != 'skipped' || needs.test-sweep-multi-node-agentic.result != 'skipped') }}  # run if at least one sweep was not skipped, even when sweeps failed
         uses: ./.github/workflows/collect-results.yml
         secrets: inherit
         with:
            result-prefix: "bmk"

    collect-evals:
        needs: [test-sweep-evals, test-sweep-multi-node-evals]
        if: ${{ always() && (needs.test-sweep-evals.result != 'skipped' || needs.test-sweep-multi-node-evals.result != 'skipped') }}
        uses: ./.github/workflows/collect-evals.yml