From 3a8a68fae01fd8ea397651b2eb2b8e1a7bc3e1a4 Mon Sep 17 00:00:00 2001 From: Albert Cheng Date: Tue, 2 Jun 2026 14:15:57 -0700 Subject: [PATCH] Add GitHub Action to collect SPEED-Bench AL matrix Push-button (workflow_dispatch) collection of the DeepSeek-V4-Pro SPEED-Bench acceptance-length matrix (thinking on/off x MTP 1-8) on self-hosted B300 runners, optionally opening a PR that updates benchmarks/speedbench-reference-al.yaml. - benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh: per (thinking, MTP) cell, serve vLLM, run SPEED-Bench, derive AL from /metrics, and emit the YAML matrix. Serves from MODEL_PATH (the local pre-staged weights resolved by the launcher), falling back to MODEL for a standalone local run. Carries a temporary --chat-template-kwargs shim until vllm-project/vllm#44244 lands in the benchmark image (idempotent, applied only for thinking-on cells). - runners/launch_b300-nv.sh: add opt-in BENCH_SCRIPT_OVERRIDE and SALLOC_TIME_LIMIT hooks; both default to the prior behavior. - .github/workflows/speedbench-al.yml: workflow_dispatch entry point; MODEL is the HF id so the launcher resolves the staged MODEL_PATH. --- .github/workflows/speedbench-al.yml | 200 +++++++++++ .../dsv4_fp4_b300_vllm_speedbench_matrix.sh | 337 ++++++++++++++++++ runners/launch_b300-nv.sh | 10 +- 3 files changed, 546 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/speedbench-al.yml create mode 100755 benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh diff --git a/.github/workflows/speedbench-al.yml b/.github/workflows/speedbench-al.yml new file mode 100644 index 000000000..771e53e6c --- /dev/null +++ b/.github/workflows/speedbench-al.yml @@ -0,0 +1,200 @@ +name: SpeedBench AL Collection + +# Push-button (workflow_dispatch) collection of the DeepSeek-V4-Pro SPEED-Bench +# acceptance-length (AL) matrix: thinking_on/off x MTP levels. 
Produces the +# golden reference consumed by the synthetic-acceptance framework and (optionally) +# opens a PR updating benchmarks/speedbench-reference-al.yaml. + +on: + workflow_dispatch: + inputs: + runner: + description: "Self-hosted GPU runner label (B300)" + required: false + type: string + default: 'b300' + image: + description: "vLLM container image" + required: false + type: string + default: 'vllm/vllm-openai:v0.21.0' + mtp-list: + description: "Space-separated MTP levels (num_speculative_tokens)" + required: false + type: string + default: '1 2 3 4 5 6 7 8' + thinking-modes: + description: "Space-separated thinking modes to collect" + required: false + type: string + default: 'off on' + category: + description: "SPEED-Bench category" + required: false + type: string + default: 'coding' + output-len: + description: "Per-request output length" + required: false + type: string + default: '4096' + thinking-kwargs: + description: "chat_template_kwargs JSON for thinking-on cells (match golden config)" + required: false + type: string + default: '{"thinking": true, "reasoning_effort": "high"}' + salloc-time: + description: "Slurm allocation minutes (16 server starts ~ several hours)" + required: false + type: string + default: '480' + open-pr: + description: "Open a PR updating benchmarks/speedbench-reference-al.yaml" + required: false + type: boolean + default: true + ref: + description: "Git ref (branch/sha) to checkout" + required: false + type: string + +permissions: + contents: read + +env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_HUB_CACHE: '/mnt/hf_hub_cache/' + # Drive the dsv4 single-node path in runners/launch_b300-nv.sh. MODEL is the + # HF id; its basename (DeepSeek-V4-Pro) is in the launcher's STAGED_MODELS, so + # the launcher resolves MODEL_PATH to the pre-staged local weights and mounts + # them. The collector serves from MODEL_PATH (see SERVE_MODEL), so no download. 
+ MODEL: deepseek-ai/DeepSeek-V4-Pro + MODEL_PREFIX: dsv4 + PRECISION: fp4 + FRAMEWORK: vllm + EXP_NAME: dsv4_speedbench + IMAGE: ${{ inputs.image }} + TP: '8' + EP_SIZE: '1' + DP_ATTENTION: 'false' + SPEC_DECODING: mtp + # Run the AL-matrix collector instead of the auto-selected throughput script. + BENCH_SCRIPT_OVERRIDE: benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh + SALLOC_TIME_LIMIT: ${{ inputs.salloc-time }} + # Matrix-collector tunables (propagated into the container via srun --export=ALL). + MTP_LIST: ${{ inputs.mtp-list }} + THINKING_MODES: ${{ inputs.thinking-modes }} + CATEGORY: ${{ inputs.category }} + SPEEDBENCH_OUTPUT_LEN: ${{ inputs.output-len }} + CHAT_TEMPLATE_KWARGS_ON: ${{ inputs.thinking-kwargs }} + OUT_YAML: /workspace/speedbench-reference-al.yaml + PYTHONDONTWRITEBYTECODE: '1' + PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache + +jobs: + collect-al: + runs-on: ${{ inputs.runner }} + timeout-minutes: 600 + name: "SpeedBench AL matrix | ${{ inputs.category }} | mtp=[${{ inputs.mtp-list }}] | thinking=[${{ inputs.thinking-modes }}]" + steps: + - name: Resource cleanup (pre-run) + run: &resource-cleanup | + # Cleanup Docker resources + if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then + echo "[Docker] Cleaning up resources ..." + docker ps -aq | xargs -r docker rm -f + docker network prune -f + while [ -n "$(docker ps -aq)" ]; do + docker ps -a + sleep 5 + done + fi + + # Cleanup SLURM resources + if command -v squeue >/dev/null 2>&1; then + echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..." + scancel --name="${{ runner.name }}" || true + while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do + squeue --name="${{ runner.name }}" + sleep 5 + done + fi + + # Cleanup AL-matrix outputs from a prior job on this runner so a stale + # matrix from a previous run is never picked up as this job's output. 
+          rm -rf "${{ github.workspace }}/speedbench_results" 2>/dev/null || true
+
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          token: ${{ secrets.REPO_PAT }}
+          fetch-depth: 0
+          ref: ${{ inputs.ref || github.sha }}
+          clean: true
+          submodules: true
+
+      - name: Cleanup stale outputs (pre-run)
+        run: |
+          rm -f speedbench-reference-al.yaml || true
+          rm -f gpu_metrics.csv || true
+          rm -rf speed_bench_data || true
+
+      - name: Collect AL matrix
+        env:
+          RUNNER_NAME: ${{ runner.name }}
+        run: |
+          set -euo pipefail
+          bash ./runners/launch_${RUNNER_NAME%%_*}.sh
+
+          if [ ! -f "speedbench-reference-al.yaml" ]; then
+            echo "AL collection failed: speedbench-reference-al.yaml not produced." >&2
+            exit 1
+          fi
+          echo "### SpeedBench AL matrix" >> "$GITHUB_STEP_SUMMARY"
+          echo '```yaml' >> "$GITHUB_STEP_SUMMARY"
+          cat speedbench-reference-al.yaml >> "$GITHUB_STEP_SUMMARY"
+          echo '```' >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Upload AL matrix artifact
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: speedbench-reference-al
+          path: speedbench-reference-al.yaml
+          if-no-files-found: warn
+
+      - name: Open PR updating reference yaml
+        if: ${{ inputs.open-pr && success() }}
+        env:
+          GH_TOKEN: ${{ secrets.REPO_PAT }}
+        run: |
+          set -euo pipefail
+          cp speedbench-reference-al.yaml benchmarks/speedbench-reference-al.yaml
+          BRANCH="speedbench-al/auto-${{ github.run_id }}"
+          git config user.name "github-actions"
+          git config user.email "github-actions@github.com"
+          git checkout -b "$BRANCH"
+          git add benchmarks/speedbench-reference-al.yaml
+          if git diff --cached --quiet; then
+            echo "No change in reference yaml; skipping PR."
+            exit 0
+          fi
+          git commit -m "Update SpeedBench AL reference matrix (auto, run ${{ github.run_id }})"
+          git push -u origin "$BRANCH"
+          # User inputs are read from the workflow-level env vars, not expanded inline, so they cannot inject shell code.
+          gh pr create \
+            --title "Update SpeedBench AL reference matrix (auto)" \
+            --body "Auto-generated by the SpeedBench AL Collection workflow (run ${{ github.run_id }}). Category: \`${CATEGORY}\`, MTP: \`${MTP_LIST}\`, thinking: \`${THINKING_MODES}\`, output_len: \`${SPEEDBENCH_OUTPUT_LEN}\`. Please review the measured values before merging." \
+            --base main \
+            --head "$BRANCH"
+
+      - name: Upload server logs
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: speedbench_server_logs
+          path: speedbench_results/server_*.log
+          if-no-files-found: ignore
+
+      - name: Resource cleanup (post-run)
+        if: always()
+        run: *resource-cleanup
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh
new file mode 100755
index 000000000..572801b2c
--- /dev/null
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh
@@ -0,0 +1,337 @@
+#!/usr/bin/env bash
+
+# DSV4-Pro B300 vLLM SPEED-Bench AL matrix collector.
+#
+# Produces the golden acceptance-length (AL) reference matrix consumed by the
+# synthetic-acceptance framework: for each thinking mode (on/off) and each MTP
+# level (num_speculative_tokens), measure the AL on a single SPEED-Bench
+# category (default: coding) and emit a YAML matrix identical in shape to
+# benchmarks/speedbench-reference-al.yaml.
+#
+# This is the "AL distribution collection" script wired into the
+# speedbench-al.yml GitHub Action (workflow_dispatch / push-button).
+#
+# Usage (inside the vLLM container, on a B300 node):
+#   export MODEL=/data/models/dsv4-pro
+#   bash benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh
+#
+# Tunables (env):
+#   MTP_LIST               space-separated MTP levels (default "1 2 3 4 5 6 7 8")
+#   THINKING_MODES         space-separated: off|on (default "off on")
+#   CATEGORY               SPEED-Bench category (default coding)
+#   SPEEDBENCH_OUTPUT_LEN  per-request output len (default 4096)
+#   OUT_YAML               output matrix path (default $RESULTS_DIR/speedbench-reference-al.yaml)
+
+set -uo pipefail
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+MODEL="${MODEL:?MODEL env var required (e.g. /data/models/dsv4-pro)}"
+# Serve from the local weights dir resolved by the launcher (MODEL_PATH points
+# at the pre-staged copy, e.g. /scratch/models/DeepSeek-V4-Pro). Falls back to
+# MODEL for a standalone local run where MODEL is itself a path. A leading "/"
+# makes the download guard below a no-op.
+SERVE_MODEL="${MODEL_PATH:-$MODEL}"
+TP="${TP:-8}"
+DP_ATTENTION="${DP_ATTENTION:-false}"
+EP_SIZE="${EP_SIZE:-1}"
+PORT="${PORT:-8888}"
+
+MTP_LIST="${MTP_LIST:-1 2 3 4 5 6 7 8}"
+THINKING_MODES="${THINKING_MODES:-off on}"
+CATEGORY="${CATEGORY:-coding}"
+SPEEDBENCH_OUTPUT_LEN="${SPEEDBENCH_OUTPUT_LEN:-4096}"
+CONCURRENCY="${CONCURRENCY:-1}"
+TEMPERATURE="${TEMPERATURE:-1.0}"
+# thinking-on chat_template_kwargs; MUST match the golden config (the reference
+# matrix was measured with reasoning_effort=high). Not written as ${VAR:-{...}}:
+# the first "}" would end the expansion and append a stray "}" to a set value.
+[[ -n "${CHAT_TEMPLATE_KWARGS_ON:-}" ]] || CHAT_TEMPLATE_KWARGS_ON='{"thinking": true, "reasoning_effort": "high"}'
+
+SPEEDBENCH_DIR="${SPEEDBENCH_DIR:-/workspace/speed_bench_data}"
+RESULTS_DIR="${RESULTS_DIR:-/workspace/speedbench_results}"
+OUT_YAML="${OUT_YAML:-$RESULTS_DIR/speedbench-reference-al.yaml}"
+
+export VLLM_ENGINE_READY_TIMEOUT_S=3600
+
+mkdir -p "$RESULTS_DIR"
+nvidia-smi
+if [[ "$SERVE_MODEL" != /* ]]; then hf download "$SERVE_MODEL"; fi
+
+# ---- Download SPEED-Bench dataset ----
+echo "=== Downloading SPEED-Bench dataset ==="
+pip install -q datasets tiktoken
+curl -LsSf https://raw.githubusercontent.com/NVIDIA-NeMo/Skills/refs/heads/main/nemo_skills/dataset/speed-bench/prepare.py \
+    | python3 - --config qualitative --output_dir "$SPEEDBENCH_DIR"
+
+if [[ ! -f "$SPEEDBENCH_DIR/qualitative.jsonl" ]]; then
+    echo "CRITICAL: SPEED-Bench download failed — $SPEEDBENCH_DIR/qualitative.jsonl not found"
+    exit 1
+fi
+
+# ---- Temporary shim: add a real --chat-template-kwargs CLI option ----
+# Upstream gap (until vllm-project/vllm#44244 lands): speed_bench/CustomDataset
+# pre-renders the chat template client-side WITHOUT chat_template_kwargs and
+# posts to /v1/completions, so thinking mode cannot be enabled via --extra-body
+# or --default-chat-template-kwargs. This wires a proper --chat-template-kwargs
+# option through get_samples into CustomDataset.sample's apply_chat_template.
+# TODO: delete this whole block once #44244 is released in the benchmark image;
+# the patch is idempotent (marker check) so it is safe to leave until then.
+apply_chat_template_kwargs_shim() { + echo "=== Patching vLLM benchmark to add --chat-template-kwargs (temporary shim) ===" + python3 - <<'PYEOF' +import vllm.benchmarks.serve as S +import vllm.benchmarks.datasets.datasets as D + +def patch(mod, edits, marker): + f = mod.__file__ + src = open(f).read() + if marker in src: + print("already patched:", f) + return + for old, new in edits: + n = src.count(old) + assert n == 1, f"anchor matched {n} times in {f}, aborting:\n{old[:80]}..." + src = src.replace(old, new, 1) + open(f, "w").write(src) + print("patched OK ->", f) + +# Edit 1: serve.py -- declare the --chat-template-kwargs argument before --extra-body +serve_old = ''' parser.add_argument( + "--extra-body",''' +serve_new = ''' parser.add_argument( + "--chat-template-kwargs", + type=json.loads, + default=None, + help="JSON dict forwarded to apply_chat_template during " + "client-side prompt rendering, e.g. to enable reasoning mode.", + ) + parser.add_argument( + "--extra-body",''' +patch(S, [(serve_old, serve_new)], marker='"--chat-template-kwargs"') + +# Edit 2: datasets.py -- forward args.chat_template_kwargs into the speed_bench .sample() call +disp_old = ''' output_len=args.speed_bench_output_len, + enable_multimodal_chat=args.enable_multimodal_chat,''' +disp_new = ''' output_len=args.speed_bench_output_len, + chat_template_kwargs=args.chat_template_kwargs, + enable_multimodal_chat=args.enable_multimodal_chat,''' + +# Edit 3: datasets.py -- forward chat_template_kwargs into CustomDataset.sample's template call +samp_old = ''' # apply template + if not skip_chat_template: + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids)''' +samp_new = ''' # apply template + if not skip_chat_template: + _ctk = kwargs.get("chat_template_kwargs") or {} + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + 
add_generation_prompt=True, + tokenize=False, + **_ctk, + ) + + prompt_len = len(tokenizer(prompt).input_ids)''' +patch(D, [(disp_old, disp_new), (samp_old, samp_new)], + marker="chat_template_kwargs=args.chat_template_kwargs") +PYEOF +} + +# Apply the shim once if any thinking-on cell is requested. +if [[ " $THINKING_MODES " == *" on "* ]]; then + if ! apply_chat_template_kwargs_shim; then + echo "CRITICAL: --chat-template-kwargs shim failed — aborting" + exit 1 + fi +fi + +PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1) +if [ "${DP_ATTENTION}" = "true" ]; then + PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP") +fi +EP_ARGS=() +if [ "${EP_SIZE:-1}" -gt 1 ]; then + EP_ARGS=(--enable-expert-parallel) +fi +MOE_ARGS=() +if [ "${DP_ATTENTION}" = "true" ]; then + MOE_ARGS=(--moe-backend deep_gemm_mega_moe) +fi + +fetch_metric() { + local port="$1" name="$2" + curl -s "http://localhost:${port}/metrics" \ + | grep -oP "${name}\\{[^}]*\\} \\K[0-9.]+" || echo "0" +} + +SERVER_PID="" +# List all descendant PIDs of $1 recursively, matched by PARENT pid. This can +# never include this script (the script is an ancestor of the server, not a +# descendant), so it avoids the self-kill a name-based `pkill -f vllm` caused +# (the script filename contains "vllm"). +_descendants() { + local pid="$1" child + for child in $(pgrep -P "$pid" 2>/dev/null || true); do + echo "$child" + _descendants "$child" + done +} +cleanup_server() { + if [[ -n "$SERVER_PID" ]]; then + # Snapshot the server's worker/EngineCore subprocesses BEFORE killing the + # parent: once the parent dies the children reparent to init and the tree + # link is lost. Killing the captured PIDs guarantees no orphaned worker + # survives to hold GPU memory and OOM the next server start. 
+        local descendants
+        descendants=$(_descendants "$SERVER_PID")
+        kill "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+        local pid
+        for pid in $descendants; do
+            kill -9 "$pid" 2>/dev/null || true
+        done
+        # Wait for GPU memory to actually free before the next server starts.
+        local waited=0
+        while [[ $waited -lt 120 ]]; do
+            local used
+            used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null | sort -rn | head -1)
+            if [[ -z "$used" || "$used" -lt 2000 ]]; then break; fi
+            sleep 3; waited=$((waited + 3))
+        done
+        SERVER_PID=""
+    fi
+}
+trap 'cleanup_server' EXIT
+
+start_gpu_monitor
+
+# Per-cell AL is collected into one associative array keyed by "mode_mtp".
+declare -A AL_RESULT
+
+run_cell() {
+    local mode="$1" mtp="$2" think_args=()
+    if [[ "$mode" == "on" ]]; then
+        think_args=(--chat-template-kwargs "$CHAT_TEMPLATE_KWARGS_ON")
+    fi
+
+    echo ""
+    echo "=========================================="
+    echo " Cell: thinking=$mode MTP=$mtp category=$CATEGORY"
+    echo "=========================================="
+
+    local serve_args=(
+        --host 0.0.0.0 --port "$PORT"
+        "${PARALLEL_ARGS[@]}"
+        --pipeline-parallel-size 1
+        --kv-cache-dtype fp8
+        --trust-remote-code
+        --block-size 256
+        --no-enable-prefix-caching
+        "${EP_ARGS[@]}"
+        "${MOE_ARGS[@]}"
+        --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
+        --attention_config.use_fp4_indexer_cache True
+        --tokenizer-mode deepseek_v4
+        --tool-call-parser deepseek_v4
+        --enable-auto-tool-choice
+        --reasoning-parser deepseek_v4
+        --max-cudagraph-capture-size 2048
+        --max-model-len 16384
+        --speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $mtp}"
+    )
+
+    local server_log="$RESULTS_DIR/server_${mode}_mtp${mtp}.log"
+    vllm serve "$SERVE_MODEL" "${serve_args[@]}" > "$server_log" 2>&1 &
+    SERVER_PID=$!
+
+    if ! wait_for_server_ready --port "$PORT" --server-log "$server_log" --server-pid "$SERVER_PID"; then
+        echo " -> server failed to start (thinking=$mode mtp=$mtp), recording N/A"
+        AL_RESULT["${mode}_${mtp}"]="N/A"
+        cleanup_server
+        return
+    fi
+
+    local acc_before drf_before acc_after drf_after bench_rc=0 delta_acc delta_drf al
+    acc_before=$(fetch_metric "$PORT" "vllm:spec_decode_num_accepted_tokens_total")
+    drf_before=$(fetch_metric "$PORT" "vllm:spec_decode_num_drafts_total")
+
+    vllm bench serve \
+        --model "$SERVE_MODEL" \
+        --port "$PORT" \
+        --dataset-name speed_bench \
+        --dataset-path "$SPEEDBENCH_DIR" \
+        --speed-bench-category "$CATEGORY" \
+        --speed-bench-output-len "$SPEEDBENCH_OUTPUT_LEN" \
+        --num-prompts -1 \
+        --max-concurrency "$CONCURRENCY" \
+        --save-result \
+        --result-dir "$RESULTS_DIR" \
+        --result-filename "speedbench_${mode}_mtp${mtp}" \
+        --trust-remote-code \
+        --tokenizer-mode deepseek_v4 \
+        --temperature "$TEMPERATURE" \
+        "${think_args[@]}" || bench_rc=$?
+
+    acc_after=$(fetch_metric "$PORT" "vllm:spec_decode_num_accepted_tokens_total")
+    drf_after=$(fetch_metric "$PORT" "vllm:spec_decode_num_drafts_total")
+
+    # AL = 1 + accepted/drafts over this cell only (counter deltas). A failed bench
+    # run records N/A: a partial run must never be published as a golden value.
+    delta_acc=$(awk "BEGIN {printf \"%d\", $acc_after - $acc_before}")
+    delta_drf=$(awk "BEGIN {printf \"%d\", $drf_after - $drf_before}")
+    if [[ "$bench_rc" -eq 0 && "$delta_drf" -gt 0 ]]; then
+        al=$(awk "BEGIN {printf \"%.2f\", 1 + ($delta_acc / $delta_drf)}")
+    else
+        al="N/A"
+    fi
+    echo " -> thinking=$mode MTP=$mtp AL=$al (accepted=$delta_acc drafts=$delta_drf bench_rc=$bench_rc)"
+    AL_RESULT["${mode}_${mtp}"]="$al"
+
+    cleanup_server
+}
+
+for mode in $THINKING_MODES; do
+    for mtp in $MTP_LIST; do
+        run_cell "$mode" "$mtp"
+    done
+done
+
+stop_gpu_monitor
+
+# ---- Emit the YAML matrix ----
+emit_mode_block() {
+    local mode="$1" mtp
+    for mtp in $MTP_LIST; do
+        echo "    $mtp: ${AL_RESULT[${mode}_${mtp}]:-N/A}"
+    done
+}
+
+{
+    echo "# Acceptance Length (AL) reference values measured with SPEED-Bench."
+ echo "# dataset: $CATEGORY | temperature: $TEMPERATURE | output_len: $SPEEDBENCH_OUTPUT_LEN" + echo "# thinking_on chat_template_kwargs: $CHAT_TEMPLATE_KWARGS_ON" + echo "# Measured on DeepSeek-V4-Pro (B300, vLLM MTP), per num_speculative_tokens." + echo "# Auto-generated by dsv4_fp4_b300_vllm_speedbench_matrix.sh (speedbench-al.yml)." + echo "#" + echo "# key = num_speculative_tokens (MTP level); value = golden AL" + echo "deepseek-v4-pro:" + if [[ " $THINKING_MODES " == *" on "* ]]; then + echo " thinking_on:" + emit_mode_block on + fi + if [[ " $THINKING_MODES " == *" off "* ]]; then + echo " thinking_off:" + emit_mode_block off + fi +} > "$OUT_YAML" + +echo "" +echo "==========================================" +echo " SPEED-Bench AL matrix written to: $OUT_YAML" +echo "==========================================" +cat "$OUT_YAML" diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 67e8b48cc..e6bdf1a0d 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -334,6 +334,12 @@ else BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh" fi + # Allow callers (e.g. the speedbench-al.yml AL-collection workflow) to run a + # specific script instead of the auto-selected throughput benchmark. + if [[ -n "${BENCH_SCRIPT_OVERRIDE:-}" ]]; then + BENCH_SCRIPT="$BENCH_SCRIPT_OVERRIDE" + fi + LOCK_FILE="${SQUASH_FILE}.lock" # TODO(Cam): the deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell @@ -379,7 +385,9 @@ else fi ) - salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" + # Default 180 min; AL-matrix collection (16 server starts) needs longer and + # overrides via SALLOC_TIME_LIMIT. 
+ salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive --time="${SALLOC_TIME_LIMIT:-180}" --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) srun --jobid=$JOB_ID \