Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 111 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -589,6 +589,98 @@ kimik2.5-int4-mi355x-vllm:
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }

# Kimi-K2.5 (amd/Kimi-K2.5-MXFP4 checkpoint, precision fp4) on the mi355x
# runner with vLLM, single node, with spec-decoding set to eagle3.
# Both fixed-seq-len shapes use the same search space:
# tp 8, conc-start 4, conc-end 64.
kimik2.5-mxfp4-mi355x-vllm-eagle3:
  image: 'vllm/vllm-openai-rocm:v0.21.0'
  model: 'amd/Kimi-K2.5-MXFP4'
  model-prefix: kimik2.5
  runner: mi355x
  precision: fp4
  framework: vllm
  multinode: false
  scenarios:
    fixed-seq-len:
      # Shape 1: isl 1024 / osl 1024.
      - isl: 1024
        osl: 1024
        search-space:
          - tp: 8
            conc-start: 4
            conc-end: 64
            spec-decoding: eagle3
      # Shape 2: isl 8192 / osl 1024.
      - isl: 8192
        osl: 1024
        search-space:
          - tp: 8
            conc-start: 4
            conc-end: 64
            spec-decoding: eagle3

# Kimi-K2.5 (moonshotai/Kimi-K2.5 checkpoint, precision int4) on the mi355x
# runner with vLLM, single node, with spec-decoding set to eagle3.
# Both fixed-seq-len shapes use the same search space:
# tp 8, conc-start 4, conc-end 64.
kimik2.5-int4-mi355x-vllm-eagle3:
  image: 'vllm/vllm-openai-rocm:v0.21.0'
  model: 'moonshotai/Kimi-K2.5'
  model-prefix: kimik2.5
  runner: mi355x
  precision: int4
  framework: vllm
  multinode: false
  scenarios:
    fixed-seq-len:
      # Shape 1: isl 1024 / osl 1024.
      - isl: 1024
        osl: 1024
        search-space:
          - tp: 8
            conc-start: 4
            conc-end: 64
            spec-decoding: eagle3
      # Shape 2: isl 8192 / osl 1024.
      - isl: 8192
        osl: 1024
        search-space:
          - tp: 8
            conc-start: 4
            conc-end: 64
            spec-decoding: eagle3

# Kimi-K2.5 (moonshotai/Kimi-K2.5 checkpoint, precision int4) on the mi355x
# runner with vLLM, single node, under the fixed-ar-mtp scenario type.
# Each shape pins the draft model, 3 speculative tokens, the "synthetic"
# rejection-sample method and an explicit list of acceptance rates; the
# search space is tp 8, conc-start 4, conc-end 64 with spec-decoding eagle3.
# NOTE(review): three rates are given for three speculative tokens —
# presumably one rate per draft position; confirm against the consumer.
# NOTE(review): the fp4 fixed-ar-mtp entry names a different draft-model
# (lightseekorg/kimi-k2.5-eagle3); confirm the difference is intentional.
kimik2.5-int4-mi355x-vllm-fixed-ar-mtp:
  image: 'vllm/vllm-openai-rocm:v0.21.0'
  model: 'moonshotai/Kimi-K2.5'
  model-prefix: kimik2.5
  runner: mi355x
  precision: int4
  framework: vllm
  multinode: false
  scenarios:
    fixed-ar-mtp:
      # Shape 1: isl 1024 / osl 1024.
      - isl: 1024
        osl: 1024
        draft-model: 'nvidia/Kimi-K2.5-Thinking-Eagle3'
        num-speculative-tokens: 3
        rejection-sample-method: synthetic
        synthetic-acceptance-rates: [0.778774, 0.57543, 0.412793]
        search-space:
          - tp: 8
            conc-start: 4
            conc-end: 64
            spec-decoding: eagle3
      # Shape 2: isl 8192 / osl 1024, same speculative settings as shape 1.
      - isl: 8192
        osl: 1024
        draft-model: 'nvidia/Kimi-K2.5-Thinking-Eagle3'
        num-speculative-tokens: 3
        rejection-sample-method: synthetic
        synthetic-acceptance-rates: [0.778774, 0.57543, 0.412793]
        search-space:
          - tp: 8
            conc-start: 4
            conc-end: 64
            spec-decoding: eagle3

# Kimi-K2.5 (amd/Kimi-K2.5-MXFP4 checkpoint, precision fp4) on the mi355x
# runner with vLLM, single node, under the fixed-ar-mtp scenario type.
# Each shape pins the draft model, 3 speculative tokens, the "synthetic"
# rejection-sample method and an explicit list of acceptance rates; the
# search space is tp 8, conc-start 4, conc-end 64 with spec-decoding eagle3.
# NOTE(review): three rates are given for three speculative tokens —
# presumably one rate per draft position; confirm against the consumer.
# NOTE(review): the int4 fixed-ar-mtp entry names a different draft-model
# (nvidia/Kimi-K2.5-Thinking-Eagle3); confirm the difference is intentional.
kimik2.5-fp4-mi355x-vllm-fixed-ar-mtp:
  image: 'vllm/vllm-openai-rocm:v0.21.0'
  model: 'amd/Kimi-K2.5-MXFP4'
  model-prefix: kimik2.5
  runner: mi355x
  precision: fp4
  framework: vllm
  multinode: false
  scenarios:
    fixed-ar-mtp:
      # Shape 1: isl 1024 / osl 1024.
      - isl: 1024
        osl: 1024
        draft-model: 'lightseekorg/kimi-k2.5-eagle3'
        num-speculative-tokens: 3
        rejection-sample-method: synthetic
        synthetic-acceptance-rates: [0.778774, 0.57543, 0.412793]
        search-space:
          - tp: 8
            conc-start: 4
            conc-end: 64
            spec-decoding: eagle3
      # Shape 2: isl 8192 / osl 1024, same speculative settings as shape 1.
      - isl: 8192
        osl: 1024
        draft-model: 'lightseekorg/kimi-k2.5-eagle3'
        num-speculative-tokens: 3
        rejection-sample-method: synthetic
        synthetic-acceptance-rates: [0.778774, 0.57543, 0.412793]
        search-space:
          - tp: 8
            conc-start: 4
            conc-end: 64
            spec-decoding: eagle3

kimik2.5-int4-mi325x-vllm:
image: vllm/vllm-openai-rocm:v0.21.0
model: moonshotai/Kimi-K2.5
Expand Down Expand Up @@ -724,6 +816,25 @@ minimaxm2.5-fp8-mi355x-vllm:
- { tp: 4, ep: 4, conc-start: 4, conc-end: 512 }
- { tp: 8, ep: 8, conc-start: 2, conc-end: 2 }

# MiniMax-M2.5 (MiniMaxAI/MiniMax-M2.5 checkpoint, precision fp8) on the
# mi355x runner with vLLM, single node, with spec-decoding set to eagle3.
# Both fixed-seq-len shapes use the same search space:
# tp 8, conc-start 4, conc-end 64.
minimaxm2.5-fp8-mi355x-vllm-eagle3:
  image: 'vllm/vllm-openai-rocm:v0.21.0'
  model: 'MiniMaxAI/MiniMax-M2.5'
  model-prefix: minimaxm2.5
  runner: mi355x
  precision: fp8
  framework: vllm
  multinode: false
  scenarios:
    fixed-seq-len:
      # Shape 1: isl 1024 / osl 1024.
      - isl: 1024
        osl: 1024
        search-space:
          - tp: 8
            conc-start: 4
            conc-end: 64
            spec-decoding: eagle3
      # Shape 2: isl 8192 / osl 1024.
      - isl: 8192
        osl: 1024
        search-space:
          - tp: 8
            conc-start: 4
            conc-end: 64
            spec-decoding: eagle3

# Diverged from minimaxm2.5-fp8-mi355x-vllm (agentic-coding sibling). Reasons below;
# the original minimaxm2.5-fp8-mi355x-vllm entry is left identical to origin/main so
# its fixed-seq-len sweep is unaffected.
Expand Down
27 changes: 25 additions & 2 deletions .github/workflows/benchmark-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ on:
run-eval:
type: boolean
required: true
default: false
eval-only:
description: "Run only evals (skip throughput benchmark)"
type: boolean
Expand All @@ -68,10 +67,30 @@ on:
required: false
type: string
scenario-type:
description: "Scenario type (fixed-seq-len or agentic-coding)"
description: "Scenario type (fixed-seq-len, agentic-coding, or fixed-ar-mtp)"
required: false
type: string
default: 'fixed-seq-len'
# Optional string input; empty by default. Exposed to the workflow's
# environment as DRAFT_MODEL.
draft-model:
  type: string
  required: false
  default: ""
  description: 'Draft model for fixed-AR MTP scenarios'
# Optional input, declared as a string (not a number) with an empty default.
# Exposed to the workflow's environment as NUM_SPECULATIVE_TOKENS.
num-speculative-tokens:
  type: string
  required: false
  default: ""
  description: 'Number of speculative tokens for fixed-AR MTP scenarios'
# Optional string input; empty by default. Exposed to the workflow's
# environment as REJECTION_SAMPLE_METHOD.
rejection-sample-method:
  type: string
  required: false
  default: ""
  description: 'Speculative rejection sampling method'
# Optional string input carrying a JSON array (see description); empty by
# default. Exposed to the workflow's environment as
# SYNTHETIC_ACCEPTANCE_RATES.
synthetic-acceptance-rates:
  type: string
  required: false
  default: ""
  description: 'JSON array of synthetic acceptance rates for fixed-AR MTP scenarios'
offloading:
description: "KV offload backend for agentic scenarios (none/cpu/ssd)"
required: false
Expand Down Expand Up @@ -111,6 +130,10 @@ env:
SCENARIO_TYPE: ${{ inputs.scenario-type }}
SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || '' }}
IS_AGENTIC: ${{ inputs.scenario-type == 'agentic-coding' && '1' || '0' }}
# Fixed-AR MTP inputs, passed through unchanged as environment variables.
DRAFT_MODEL: '${{ inputs.draft-model }}'
NUM_SPECULATIVE_TOKENS: '${{ inputs.num-speculative-tokens }}'
REJECTION_SAMPLE_METHOD: '${{ inputs.rejection-sample-method }}'
SYNTHETIC_ACCEPTANCE_RATES: '${{ inputs.synthetic-acceptance-rates }}'
OFFLOADING: ${{ inputs.offloading }}
TOTAL_CPU_DRAM_GB: ${{ inputs.total-cpu-dram-gb }}
DURATION: ${{ inputs.duration }}
Expand Down
46 changes: 42 additions & 4 deletions .github/workflows/e2e-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }}
agentic-config: ${{ steps.get-jobs.outputs.agentic-config }}
multi-node-agentic-config: ${{ steps.get-jobs.outputs.multi-node-agentic-config }}
fixed-ar-mtp-config: ${{ steps.get-jobs.outputs.fixed-ar-mtp-config }}
steps:
- name: Checkout code (ref)
if: ${{ inputs.ref && inputs.ref != '' }}
Expand All @@ -71,12 +72,14 @@
${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }})
AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x]))")
MULTI_AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' in x]))")
SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
FIXED_AR_MTP=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'fixed-ar-mtp']))")
SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') not in ('agentic-coding', 'fixed-ar-mtp') and not x.get('eval-only', False)]))")
MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and x.get('run-eval', False)]))")
MULTI_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('run-eval', False)]))")
echo "agentic-config=$AGENTIC" >> $GITHUB_OUTPUT
echo "multi-node-agentic-config=$MULTI_AGENTIC" >> $GITHUB_OUTPUT
echo "fixed-ar-mtp-config=$FIXED_AR_MTP" >> $GITHUB_OUTPUT
echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT
echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT
echo "eval-config=$EVALS" >> $GITHUB_OUTPUT
Expand Down Expand Up @@ -190,7 +193,7 @@
osl: '0'
max-model-len: '0'
spec-decoding: 'none'
disagg: 'false'
disagg: ${{ 'false' }}
run-eval: false
scenario-type: agentic-coding
ref: ${{ inputs.ref }}
Expand Down Expand Up @@ -235,7 +238,42 @@
scenario-type: agentic-coding
ref: ${{ inputs.ref }}

# Runs the reusable benchmark-tmpl.yml workflow once per element of the
# get-jobs output `fixed-ar-mtp-config` (a JSON array decoded into the
# matrix). The job is skipped when that output is the literal string '[]'.
# fail-fast is off, so one failing matrix entry does not cancel the others.
test-sweep-fixed-ar-mtp:
  name: 'Fixed-AR-MTP throughput /'
  needs: get-jobs
  # `if:` is always evaluated as an expression, so the ${{ }} wrapper is
  # omitted here.
  if: needs.get-jobs.outputs.fixed-ar-mtp-config != '[]'
  uses: ./.github/workflows/benchmark-tmpl.yml
  secrets: inherit
  strategy:
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.get-jobs.outputs.fixed-ar-mtp-config) }}
  with:
    # Experiment identity and checkout ref.
    exp-name: ${{ matrix.config.exp-name }}
    ref: ${{ inputs.ref }}
    # Runner, container image and model.
    runner: ${{ matrix.config.runner }}
    image: ${{ matrix.config.image }}
    model: ${{ matrix.config.model }}
    model-prefix: ${{ matrix.config.model-prefix }}
    framework: ${{ matrix.config.framework }}
    precision: ${{ matrix.config.precision }}
    # Sequence shape.
    isl: ${{ matrix.config.isl }}
    osl: ${{ matrix.config.osl }}
    max-model-len: ${{ matrix.config.max-model-len }}
    # Parallelism and load.
    tp: ${{ matrix.config.tp }}
    ep: ${{ matrix.config.ep }}
    dp-attn: ${{ matrix.config.dp-attn }}
    conc: ${{ matrix.config.conc }}
    disagg: ${{ matrix.config.disagg }}
    # Evals are never run from this job; the scenario type is fixed.
    run-eval: false
    scenario-type: fixed-ar-mtp
    # Speculative-decoding settings taken from the matrix entry.
    spec-decoding: ${{ matrix.config.spec-decoding }}
    draft-model: ${{ matrix.config.draft-model }}
    num-speculative-tokens: ${{ matrix.config.num-speculative-tokens }}
    rejection-sample-method: ${{ matrix.config.rejection-sample-method }}
    # The rates are a list in the matrix entry; toJson serializes them into
    # the string input that benchmark-tmpl.yml declares.
    synthetic-acceptance-rates: ${{ toJson(matrix.config.synthetic-acceptance-rates) }}

test-sweep-single-node:

Check warning

Code scanning / CodeQL

Workflow does not contain permissions Medium

Actions job or workflow does not limit the permissions of the GITHUB_TOKEN. Consider setting an explicit permissions block, using the following as a minimal starting point: {}
needs: get-jobs
if: ${{ needs.get-jobs.outputs.single-node-config != '[]' }}
uses: ./.github/workflows/benchmark-tmpl.yml
Expand Down Expand Up @@ -297,14 +335,14 @@
ref: ${{ inputs.ref }}

collect-results:
needs: [test-sweep-multi-node, test-sweep-single-node, test-sweep-agentic, test-sweep-multi-node-agentic]
if: ${{ always() && (needs.test-sweep-multi-node.result != 'skipped' || needs.test-sweep-single-node.result != 'skipped' || needs.test-sweep-agentic.result != 'skipped' || needs.test-sweep-multi-node-agentic.result != 'skipped') }}
needs: [test-sweep-multi-node, test-sweep-single-node, test-sweep-fixed-ar-mtp, test-sweep-agentic, test-sweep-multi-node-agentic]
if: ${{ always() && (needs.test-sweep-multi-node.result != 'skipped' || needs.test-sweep-single-node.result != 'skipped' || needs.test-sweep-fixed-ar-mtp.result != 'skipped' || needs.test-sweep-agentic.result != 'skipped' || needs.test-sweep-multi-node-agentic.result != 'skipped') }}
uses: ./.github/workflows/collect-results.yml
secrets: inherit
with:
result-prefix: "bmk"

collect-evals:

Check warning

Code scanning / CodeQL

Workflow does not contain permissions Medium

Actions job or workflow does not limit the permissions of the GITHUB_TOKEN. Consider setting an explicit permissions block, using the following as a minimal starting point: {}
needs: [test-sweep-evals, test-sweep-multi-node-evals]
if: ${{ always() && (needs.test-sweep-evals.result != 'skipped' || needs.test-sweep-multi-node-evals.result != 'skipped') }}
uses: ./.github/workflows/collect-evals.yml
Expand Down
Loading