From 3a8a68fae01fd8ea397651b2eb2b8e1a7bc3e1a4 Mon Sep 17 00:00:00 2001
From: Albert Cheng <albertching0112@gmail.com>
Date: Tue, 2 Jun 2026 14:15:57 -0700
Subject: [PATCH] Add GitHub Action to collect SPEED-Bench AL matrix

Push-button (workflow_dispatch) collection of the DeepSeek-V4-Pro
SPEED-Bench acceptance-length matrix (thinking on/off x MTP 1-8) on
self-hosted B300 runners, optionally opening a PR that updates
benchmarks/speedbench-reference-al.yaml.

- benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh:
  per (thinking, MTP) cell, serve vLLM, run SPEED-Bench, derive AL from
  /metrics, and emit the YAML matrix. Serves from MODEL_PATH (the local
  pre-staged weights resolved by the launcher), falling back to MODEL for
  a standalone local run. Carries a temporary --chat-template-kwargs shim
  until vllm-project/vllm#44244 lands in the benchmark image (idempotent,
  applied only for thinking-on cells).
- runners/launch_b300-nv.sh: add opt-in BENCH_SCRIPT_OVERRIDE and
  SALLOC_TIME_LIMIT hooks; both default to the prior behavior.
- .github/workflows/speedbench-al.yml: workflow_dispatch entry point;
  MODEL is the HF id so the launcher resolves the staged MODEL_PATH.
---
 .github/workflows/speedbench-al.yml           | 200 +++++++++++
 .../dsv4_fp4_b300_vllm_speedbench_matrix.sh   | 337 ++++++++++++++++++
 runners/launch_b300-nv.sh                     |  10 +-
 3 files changed, 546 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/speedbench-al.yml
 create mode 100755 benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh

diff --git a/.github/workflows/speedbench-al.yml b/.github/workflows/speedbench-al.yml
new file mode 100644
index 000000000..771e53e6c
--- /dev/null
+++ b/.github/workflows/speedbench-al.yml
@@ -0,0 +1,200 @@
+name: SpeedBench AL Collection
+
+# Push-button (workflow_dispatch) collection of the DeepSeek-V4-Pro SPEED-Bench
+# acceptance-length (AL) matrix: thinking_on/off x MTP levels. Produces the
+# golden reference consumed by the synthetic-acceptance framework and (optionally)
+# opens a PR updating benchmarks/speedbench-reference-al.yaml.
+
+on:
+  workflow_dispatch:
+    inputs:
+      runner:
+        description: "Self-hosted GPU runner label (B300)"
+        required: false
+        type: string
+        default: 'b300'
+      image:
+        description: "vLLM container image"
+        required: false
+        type: string
+        default: 'vllm/vllm-openai:v0.21.0'
+      mtp-list:
+        description: "Space-separated MTP levels (num_speculative_tokens)"
+        required: false
+        type: string
+        default: '1 2 3 4 5 6 7 8'
+      thinking-modes:
+        description: "Space-separated thinking modes to collect"
+        required: false
+        type: string
+        default: 'off on'
+      category:
+        description: "SPEED-Bench category"
+        required: false
+        type: string
+        default: 'coding'
+      output-len:
+        description: "Per-request output length"
+        required: false
+        type: string
+        default: '4096'
+      thinking-kwargs:
+        description: "chat_template_kwargs JSON for thinking-on cells (match golden config)"
+        required: false
+        type: string
+        default: '{"thinking": true, "reasoning_effort": "high"}'
+      salloc-time:
+        description: "Slurm allocation minutes (16 server starts ~ several hours)"
+        required: false
+        type: string
+        default: '480'
+      open-pr:
+        description: "Open a PR updating benchmarks/speedbench-reference-al.yaml"
+        required: false
+        type: boolean
+        default: true
+      ref:
+        description: "Git ref (branch/sha) to checkout"
+        required: false
+        type: string
+
+permissions:
+  contents: read
+
+env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
+  HF_HUB_CACHE: '/mnt/hf_hub_cache/'
+  # Drive the dsv4 single-node path in runners/launch_b300-nv.sh. MODEL is the
+  # HF id; its basename (DeepSeek-V4-Pro) is in the launcher's STAGED_MODELS, so
+  # the launcher resolves MODEL_PATH to the pre-staged local weights and mounts
+  # them. The collector serves from MODEL_PATH (see SERVE_MODEL), so no download.
+  MODEL: deepseek-ai/DeepSeek-V4-Pro
+  MODEL_PREFIX: dsv4
+  PRECISION: fp4
+  FRAMEWORK: vllm
+  EXP_NAME: dsv4_speedbench
+  IMAGE: ${{ inputs.image }}
+  TP: '8'
+  EP_SIZE: '1'
+  DP_ATTENTION: 'false'
+  SPEC_DECODING: mtp
+  # Run the AL-matrix collector instead of the auto-selected throughput script.
+  BENCH_SCRIPT_OVERRIDE: benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh
+  SALLOC_TIME_LIMIT: ${{ inputs.salloc-time }}
+  # Matrix-collector tunables (propagated into the container via srun --export=ALL).
+  MTP_LIST: ${{ inputs.mtp-list }}
+  THINKING_MODES: ${{ inputs.thinking-modes }}
+  CATEGORY: ${{ inputs.category }}
+  SPEEDBENCH_OUTPUT_LEN: ${{ inputs.output-len }}
+  CHAT_TEMPLATE_KWARGS_ON: ${{ inputs.thinking-kwargs }}
+  OUT_YAML: /workspace/speedbench-reference-al.yaml
+  PYTHONDONTWRITEBYTECODE: '1'
+  PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache
+
+jobs:
+  collect-al:
+    runs-on: ${{ inputs.runner }}
+    timeout-minutes: 600
+    name: "SpeedBench AL matrix | ${{ inputs.category }} | mtp=[${{ inputs.mtp-list }}] | thinking=[${{ inputs.thinking-modes }}]"
+    steps:
+      - name: Resource cleanup (pre-run)
+        run: &resource-cleanup |
+          # Cleanup Docker resources
+          if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
+            echo "[Docker] Cleaning up resources ..."
+            docker ps -aq | xargs -r docker rm -f
+            docker network prune -f
+            while [ -n "$(docker ps -aq)" ]; do
+              docker ps -a
+              sleep 5
+            done
+          fi
+
+          # Cleanup SLURM resources
+          if command -v squeue >/dev/null 2>&1; then
+            echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..."
+            scancel --name="${{ runner.name }}" || true
+            while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do
+              squeue --name="${{ runner.name }}"
+              sleep 5
+            done
+          fi
+
+          # Cleanup AL-matrix outputs from a prior job on this runner so a stale
+          # matrix from a previous run is never picked up as this job's output.
+          rm -rf "${{ github.workspace }}/speedbench_results" 2>/dev/null || true
+
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          token: ${{ secrets.REPO_PAT }}
+          fetch-depth: 0
+          ref: ${{ inputs.ref || github.sha }}
+          clean: true
+          submodules: true
+
+      - name: Cleanup stale outputs (pre-run)
+        run: |
+          rm -f speedbench-reference-al.yaml || true
+          rm -f gpu_metrics.csv || true
+          rm -rf speed_bench_data || true
+
+      - name: Collect AL matrix
+        env:
+          RUNNER_NAME: ${{ runner.name }}
+        run: |
+          set -euo pipefail
+          bash ./runners/launch_${RUNNER_NAME%%_*}.sh
+
+          if [ ! -f "speedbench-reference-al.yaml" ]; then
+            echo "AL collection failed: speedbench-reference-al.yaml not produced." >&2
+            exit 1
+          fi
+          echo "### SpeedBench AL matrix" >> "$GITHUB_STEP_SUMMARY"
+          echo '```yaml' >> "$GITHUB_STEP_SUMMARY"
+          cat speedbench-reference-al.yaml >> "$GITHUB_STEP_SUMMARY"
+          echo '```' >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Upload AL matrix artifact
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: speedbench-reference-al
+          path: speedbench-reference-al.yaml
+          if-no-files-found: warn
+
+      - name: Open PR updating reference yaml
+        if: ${{ inputs.open-pr && success() }}
+        env:
+          GH_TOKEN: ${{ secrets.REPO_PAT }}
+        run: |
+          set -euo pipefail
+          cp speedbench-reference-al.yaml benchmarks/speedbench-reference-al.yaml
+
+          BRANCH="speedbench-al/auto-${{ github.run_id }}"
+          git config user.name "github-actions"
+          git config user.email "github-actions@github.com"
+          git checkout -b "$BRANCH"
+          git add benchmarks/speedbench-reference-al.yaml
+          if git diff --cached --quiet; then
+            echo "No change in reference yaml; skipping PR."
+            exit 0
+          fi
+          git commit -m "Update SpeedBench AL reference matrix (auto, run ${{ github.run_id }})"
+          git push -u origin "$BRANCH"
+          # Inputs come from the workflow-level env (CATEGORY, MTP_LIST, ...) instead of inline expressions, so their text is never parsed as shell.
+          gh pr create \
+            --title "Update SpeedBench AL reference matrix (auto)" \
+            --body "Auto-generated by the SpeedBench AL Collection workflow (run ${{ github.run_id }}). Category: \`${CATEGORY:-}\`, MTP: \`${MTP_LIST:-}\`, thinking: \`${THINKING_MODES:-}\`, output_len: \`${SPEEDBENCH_OUTPUT_LEN:-}\`. Please review the measured values before merging." \
+            --base main --head "$BRANCH"
+
+      - name: Upload server logs
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: speedbench_server_logs
+          path: speedbench_results/server_*.log
+          if-no-files-found: ignore
+
+      - name: Resource cleanup (post-run)
+        if: always()
+        run: *resource-cleanup
\ No newline at end of file
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh
new file mode 100755
index 000000000..572801b2c
--- /dev/null
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh
@@ -0,0 +1,337 @@
+#!/usr/bin/env bash
+
+# DSV4-Pro B300 vLLM SPEED-Bench AL matrix collector.
+#
+# Produces the golden acceptance-length (AL) reference matrix consumed by the
+# synthetic-acceptance framework: for each thinking mode (on/off) and each MTP
+# level (num_speculative_tokens), measure the AL on a single SPEED-Bench
+# category (default: coding) and emit a YAML matrix identical in shape to
+# benchmarks/speedbench-reference-al.yaml.
+#
+# This is the "AL distribution collection" script wired into the
+# speedbench-al.yml GitHub Action (workflow_dispatch / push-button).
+#
+# Usage (inside the vLLM container, on a B300 node):
+#   export MODEL=/data/models/dsv4-pro
+#   bash benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh
+#
+# Tunables (env):
+#   MTP_LIST          space-separated MTP levels   (default "1 2 3 4 5 6 7 8")
+#   THINKING_MODES    space-separated: off|on       (default "off on")
+#   CATEGORY          SPEED-Bench category          (default coding)
+#   SPEEDBENCH_OUTPUT_LEN  per-request output len   (default 4096)
+#   OUT_YAML          output matrix path            (default $RESULTS_DIR/speedbench-reference-al.yaml)
+
+set -uo pipefail
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+MODEL="${MODEL:?MODEL env var required (e.g. /data/models/dsv4-pro)}"
+# Serve from the local weights dir resolved by the launcher (MODEL_PATH points
+# at the pre-staged copy, e.g. /scratch/models/DeepSeek-V4-Pro). Falls back to
+# MODEL for a standalone local run where MODEL is itself a path. A leading "/"
+# makes the download guard below a no-op.
+SERVE_MODEL="${MODEL_PATH:-$MODEL}"
+TP="${TP:-8}"
+DP_ATTENTION="${DP_ATTENTION:-false}"
+EP_SIZE="${EP_SIZE:-1}"
+PORT="${PORT:-8888}"
+
+MTP_LIST="${MTP_LIST:-1 2 3 4 5 6 7 8}"
+THINKING_MODES="${THINKING_MODES:-off on}"
+CATEGORY="${CATEGORY:-coding}"
+SPEEDBENCH_OUTPUT_LEN="${SPEEDBENCH_OUTPUT_LEN:-4096}"
+CONCURRENCY="${CONCURRENCY:-1}"
+TEMPERATURE="${TEMPERATURE:-1.0}"
+# thinking-on chat_template_kwargs; MUST match the production/golden config (the reference
+# matrix was measured with reasoning_effort=high). Not written as "${VAR:-{...}}": bash ends
+# that expansion at the first "}", so a stray "}" would be appended to an already-set value.
+[[ -n "${CHAT_TEMPLATE_KWARGS_ON:-}" ]] || CHAT_TEMPLATE_KWARGS_ON='{"thinking": true, "reasoning_effort": "high"}'
+
+SPEEDBENCH_DIR="${SPEEDBENCH_DIR:-/workspace/speed_bench_data}"
+RESULTS_DIR="${RESULTS_DIR:-/workspace/speedbench_results}"
+OUT_YAML="${OUT_YAML:-$RESULTS_DIR/speedbench-reference-al.yaml}"
+
+export VLLM_ENGINE_READY_TIMEOUT_S=3600
+
+mkdir -p "$RESULTS_DIR"
+nvidia-smi
+if [[ "$SERVE_MODEL" != /* ]]; then hf download "$SERVE_MODEL"; fi
+
+# ---- Download SPEED-Bench dataset ----
+echo "=== Downloading SPEED-Bench dataset ==="
+pip install -q datasets tiktoken
+curl -LsSf https://raw.githubusercontent.com/NVIDIA-NeMo/Skills/refs/heads/main/nemo_skills/dataset/speed-bench/prepare.py \
+  | python3 - --config qualitative --output_dir "$SPEEDBENCH_DIR"
+
+if [[ ! -f "$SPEEDBENCH_DIR/qualitative.jsonl" ]]; then
+    echo "CRITICAL: SPEED-Bench download failed — $SPEEDBENCH_DIR/qualitative.jsonl not found"
+    exit 1
+fi
+
+# ---- Temporary shim: add a real --chat-template-kwargs CLI option ----
+# Upstream gap (until vllm-project/vllm#44244 lands): speed_bench/CustomDataset
+# pre-renders the chat template client-side WITHOUT chat_template_kwargs and
+# posts to /v1/completions, so thinking mode cannot be enabled via --extra-body
+# or --default-chat-template-kwargs. This wires a proper --chat-template-kwargs
+# option through get_samples into CustomDataset.sample's apply_chat_template.
+# TODO: delete this whole block once #44244 is released in the benchmark image;
+# the patch is idempotent (marker check) so it is safe to leave until then.
+apply_chat_template_kwargs_shim() {
+    echo "=== Patching vLLM benchmark to add --chat-template-kwargs (temporary shim) ==="
+    python3 - <<'PYEOF'
+import vllm.benchmarks.serve as S
+import vllm.benchmarks.datasets.datasets as D
+
+def patch(mod, edits, marker):
+    f = mod.__file__
+    src = open(f).read()
+    if marker in src:
+        print("already patched:", f)
+        return
+    for old, new in edits:
+        n = src.count(old)
+        assert n == 1, f"anchor matched {n} times in {f}, aborting:\n{old[:80]}..."
+        src = src.replace(old, new, 1)
+    open(f, "w").write(src)
+    print("patched OK ->", f)
+
+# Edit 1: serve.py -- declare the --chat-template-kwargs argument before --extra-body
+serve_old = '''    parser.add_argument(
+        "--extra-body",'''
+serve_new = '''    parser.add_argument(
+        "--chat-template-kwargs",
+        type=json.loads,
+        default=None,
+        help="JSON dict forwarded to apply_chat_template during "
+        "client-side prompt rendering, e.g. to enable reasoning mode.",
+    )
+    parser.add_argument(
+        "--extra-body",'''
+patch(S, [(serve_old, serve_new)], marker='"--chat-template-kwargs"')
+
+# Edit 2: datasets.py -- forward args.chat_template_kwargs into the speed_bench .sample() call
+disp_old = '''                output_len=args.speed_bench_output_len,
+                enable_multimodal_chat=args.enable_multimodal_chat,'''
+disp_new = '''                output_len=args.speed_bench_output_len,
+                chat_template_kwargs=args.chat_template_kwargs,
+                enable_multimodal_chat=args.enable_multimodal_chat,'''
+
+# Edit 3: datasets.py -- forward chat_template_kwargs into CustomDataset.sample's template call
+samp_old = '''                # apply template
+                if not skip_chat_template:
+                    prompt = tokenizer.apply_chat_template(
+                        [{"role": "user", "content": prompt}],
+                        add_generation_prompt=True,
+                        tokenize=False,
+                    )
+
+                prompt_len = len(tokenizer(prompt).input_ids)'''
+samp_new = '''                # apply template
+                if not skip_chat_template:
+                    _ctk = kwargs.get("chat_template_kwargs") or {}
+                    prompt = tokenizer.apply_chat_template(
+                        [{"role": "user", "content": prompt}],
+                        add_generation_prompt=True,
+                        tokenize=False,
+                        **_ctk,
+                    )
+
+                prompt_len = len(tokenizer(prompt).input_ids)'''
+patch(D, [(disp_old, disp_new), (samp_old, samp_new)],
+      marker="chat_template_kwargs=args.chat_template_kwargs")
+PYEOF
+}
+
+# Apply the shim once if any thinking-on cell is requested.
+if [[ " $THINKING_MODES " == *" on "* ]]; then
+    if ! apply_chat_template_kwargs_shim; then
+        echo "CRITICAL: --chat-template-kwargs shim failed — aborting"
+        exit 1
+    fi
+fi
+
+PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
+if [ "${DP_ATTENTION}" = "true" ]; then
+    PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
+fi
+EP_ARGS=()
+if [ "${EP_SIZE:-1}" -gt 1 ]; then
+    EP_ARGS=(--enable-expert-parallel)
+fi
+MOE_ARGS=()
+if [ "${DP_ATTENTION}" = "true" ]; then
+    MOE_ARGS=(--moe-backend deep_gemm_mega_moe)
+fi
+
+fetch_metric() {  # $1=port $2=counter name; prints the counter summed over all label sets ("0" if absent)
+    # Exponent characters are matched too: prometheus_client prints values >= 1e6 as e.g. 1.234567e+06.
+    curl -s "http://localhost:${1}/metrics" | grep -oP "${2}\\{[^}]*\\} \\K[0-9.eE+-]+" \
+      | awk '{ total += $1 } END { printf "%.0f\n", total }'
+}
+
+SERVER_PID=""
+# List all descendant PIDs of $1 recursively, matched by PARENT pid. This can
+# never include this script (the script is an ancestor of the server, not a
+# descendant), so it avoids the self-kill a name-based `pkill -f vllm` caused
+# (the script filename contains "vllm").
+_descendants() {
+    local pid="$1" child
+    for child in $(pgrep -P "$pid" 2>/dev/null || true); do
+        echo "$child"
+        _descendants "$child"
+    done
+}
+cleanup_server() {
+    if [[ -n "$SERVER_PID" ]]; then
+        # Snapshot the server's worker/EngineCore subprocesses BEFORE killing the
+        # parent: once the parent dies the children reparent to init and the tree
+        # link is lost. Killing the captured PIDs guarantees no orphaned worker
+        # survives to hold GPU memory and OOM the next server start.
+        local descendants
+        descendants=$(_descendants "$SERVER_PID")
+        kill "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+        local pid
+        for pid in $descendants; do
+            kill -9 "$pid" 2>/dev/null || true
+        done
+        # Wait for GPU memory to actually free before the next server starts.
+        local waited=0
+        while [[ $waited -lt 120 ]]; do
+            local used
+            used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null | sort -rn | head -1)
+            if [[ -z "$used" || "$used" -lt 2000 ]]; then break; fi
+            sleep 3; waited=$((waited + 3))
+        done
+        SERVER_PID=""
+    fi
+}
+trap 'cleanup_server' EXIT
+
+start_gpu_monitor
+
+# Per-cell AL is collected into one associative array keyed by "<mode>_<mtp>".
+declare -A AL_RESULT
+
+run_cell() {  # run_cell MODE MTP: serve with MTP=$2, run SPEED-Bench, store AL (or N/A) in AL_RESULT["MODE_MTP"]
+    local mode="$1" mtp="$2"
+    local think_args=()
+    if [[ "$mode" == "on" ]]; then
+        think_args=(--chat-template-kwargs "$CHAT_TEMPLATE_KWARGS_ON")
+    fi
+
+    echo ""
+    echo "=========================================="
+    echo "  Cell: thinking=$mode  MTP=$mtp  category=$CATEGORY"
+    echo "=========================================="
+
+    local serve_args=(
+        --host 0.0.0.0 --port "$PORT"
+        "${PARALLEL_ARGS[@]}"
+        --pipeline-parallel-size 1
+        --kv-cache-dtype fp8
+        --trust-remote-code
+        --block-size 256
+        --no-enable-prefix-caching
+        "${EP_ARGS[@]}"
+        "${MOE_ARGS[@]}"
+        --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
+        --attention_config.use_fp4_indexer_cache True
+        --tokenizer-mode deepseek_v4
+        --tool-call-parser deepseek_v4
+        --enable-auto-tool-choice
+        --reasoning-parser deepseek_v4
+        --max-cudagraph-capture-size 2048
+        --max-model-len 16384
+        --speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $mtp}"
+    )
+
+    local server_log="$RESULTS_DIR/server_${mode}_mtp${mtp}.log"
+    vllm serve "$SERVE_MODEL" "${serve_args[@]}" > "$server_log" 2>&1 &
+    SERVER_PID=$!
+
+    if ! wait_for_server_ready --port "$PORT" --server-log "$server_log" --server-pid "$SERVER_PID"; then
+        echo "  -> server failed to start (thinking=$mode mtp=$mtp), recording N/A"
+        AL_RESULT["${mode}_${mtp}"]="N/A"
+        cleanup_server
+        return
+    fi
+    # Snapshot the spec-decode counters so AL is computed from this run's deltas only.
+    local acc_before drf_before acc_after drf_after bench_rc=0
+    acc_before=$(fetch_metric "$PORT" "vllm:spec_decode_num_accepted_tokens_total")
+    drf_before=$(fetch_metric "$PORT" "vllm:spec_decode_num_drafts_total")
+    # Keep the benchmark's exit status: a failed/aborted run must record N/A, not an AL from partial counters.
+    vllm bench serve \
+        --model "$SERVE_MODEL" \
+        --port "$PORT" \
+        --dataset-name speed_bench \
+        --dataset-path "$SPEEDBENCH_DIR" \
+        --speed-bench-category "$CATEGORY" \
+        --speed-bench-output-len "$SPEEDBENCH_OUTPUT_LEN" \
+        --num-prompts -1 \
+        --max-concurrency "$CONCURRENCY" \
+        --save-result \
+        --result-dir "$RESULTS_DIR" \
+        --result-filename "speedbench_${mode}_mtp${mtp}" \
+        --trust-remote-code \
+        --tokenizer-mode deepseek_v4 \
+        --temperature "$TEMPERATURE" \
+        "${think_args[@]}" || bench_rc=$?
+
+    acc_after=$(fetch_metric "$PORT" "vllm:spec_decode_num_accepted_tokens_total")
+    drf_after=$(fetch_metric "$PORT" "vllm:spec_decode_num_drafts_total")
+
+    local delta_acc delta_drf al
+    delta_acc=$(awk "BEGIN {printf \"%d\", $acc_after - $acc_before}")
+    delta_drf=$(awk "BEGIN {printf \"%d\", $drf_after - $drf_before}")
+    if [[ "$bench_rc" -eq 0 && "$delta_drf" -gt 0 ]]; then
+        al=$(awk "BEGIN {printf \"%.2f\", 1 + ($delta_acc / $delta_drf)}")
+    else
+        al="N/A"
+    fi
+    echo "  -> thinking=$mode MTP=$mtp AL=$al (accepted=$delta_acc drafts=$delta_drf bench_exit=$bench_rc)"
+    AL_RESULT["${mode}_${mtp}"]="$al"
+
+    cleanup_server
+}
+
+for mode in $THINKING_MODES; do
+    for mtp in $MTP_LIST; do
+        run_cell "$mode" "$mtp"
+    done
+done
+
+stop_gpu_monitor
+
+# ---- Emit the YAML matrix ----
+emit_mode_block() {
+    local mode="$1"
+    for mtp in $MTP_LIST; do
+        echo "    $mtp: ${AL_RESULT[${mode}_${mtp}]:-N/A}"
+    done
+}
+
+{
+    echo "# Acceptance Length (AL) reference values measured with SPEED-Bench."
+    echo "# dataset: $CATEGORY | temperature: $TEMPERATURE | output_len: $SPEEDBENCH_OUTPUT_LEN"
+    echo "# thinking_on chat_template_kwargs: $CHAT_TEMPLATE_KWARGS_ON"
+    echo "# Measured on DeepSeek-V4-Pro (B300, vLLM MTP), per num_speculative_tokens."
+    echo "# Auto-generated by dsv4_fp4_b300_vllm_speedbench_matrix.sh (speedbench-al.yml)."
+    echo "#"
+    echo "# key = num_speculative_tokens (MTP level); value = golden AL"
+    echo "deepseek-v4-pro:"
+    if [[ " $THINKING_MODES " == *" on "* ]]; then
+        echo "  thinking_on:"
+        emit_mode_block on
+    fi
+    if [[ " $THINKING_MODES " == *" off "* ]]; then
+        echo "  thinking_off:"
+        emit_mode_block off
+    fi
+} > "$OUT_YAML"
+
+echo ""
+echo "=========================================="
+echo "  SPEED-Bench AL matrix written to: $OUT_YAML"
+echo "=========================================="
+cat "$OUT_YAML"
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index 67e8b48cc..e6bdf1a0d 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -334,6 +334,12 @@ else
         BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh"
     fi
 
+    # Allow callers (e.g. the speedbench-al.yml AL-collection workflow) to run a
+    # specific script instead of the auto-selected throughput benchmark.
+    if [[ -n "${BENCH_SCRIPT_OVERRIDE:-}" ]]; then
+        BENCH_SCRIPT="$BENCH_SCRIPT_OVERRIDE"
+    fi
+
     LOCK_FILE="${SQUASH_FILE}.lock"
 
     # TODO(Cam): the deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell
@@ -379,7 +385,9 @@ else
         fi
     )
 
-    salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
+    # Default 180 min; AL-matrix collection (16 server starts) needs longer and
+    # overrides via SALLOC_TIME_LIMIT.
+    salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive --time="${SALLOC_TIME_LIMIT:-180}" --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
     srun --jobid=$JOB_ID \