From 158278c25aae372d6b7e7c835c108206e599b311 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Fri, 26 Jun 2026 16:07:44 -0300 Subject: [PATCH 01/17] ci: add gpu benchmarks --- .github/workflows/benchmark-gpu.yml | 337 ++++++++++++++++++++++++++++ .github/workflows/benchmark-pr.yml | 1 + infra/gpu_bench.sh | 95 ++++++++ 3 files changed, 433 insertions(+) create mode 100644 .github/workflows/benchmark-gpu.yml create mode 100755 infra/gpu_bench.sh diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml new file mode 100644 index 000000000..abdea98a5 --- /dev/null +++ b/.github/workflows/benchmark-gpu.yml @@ -0,0 +1,337 @@ +name: Benchmark GPU (PR) + +# Rent an RTX 5090 on Vast.ai (hourly), run the SAME headline ethrex prover +# benchmark as benchmark-pr.yml but with the CUDA path enabled, post the absolute +# GPU numbers back to the PR, then always destroy the instance. +# +# Triggered by a "/bench-gpu [N]" comment on a PR (N = prove iterations, default 3), +# or via workflow_dispatch. The orchestration runs on a GitHub-hosted runner; all the +# GPU work happens on the rented Vast box (provisioned by the template onstart). +# +# Requires repo secrets: +# VAST_API_KEY — https://cloud.vast.ai/manage-keys/ +# VAST_TEMPLATE_HASH — hash of the "NVIDIA CUDA Lambda VM 64GB" template + +on: + workflow_dispatch: + inputs: + runs: + description: "Number of prove iterations" + default: "3" + issue_comment: + types: [created] + +permissions: + contents: read + pull-requests: write + +concurrency: + group: benchmark-gpu-${{ github.event.issue.number || github.run_id }} + cancel-in-progress: true + +env: + # Vast offer search: datacenter RTX 5090, >=16 cores, >=32GB RAM, >=64GB disk, + # verified + rentable, Blackwell-capable driver, under the price cap ($/hr). 
+ GPU_NAME: RTX_5090 + PRICE_CAP: "3" + VAST_IMAGE_DISK: "64" + +jobs: + benchmark-gpu: + runs-on: ubuntu-latest + # Skip unless: workflow_dispatch, or a "/bench-gpu" comment from a privileged author. + if: >- + github.event_name == 'workflow_dispatch' || + (github.event_name == 'issue_comment' && + github.event.issue.pull_request && + startsWith(github.event.comment.body, '/bench-gpu') && + contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association)) + steps: + - name: React to comment + if: github.event_name == 'issue_comment' + uses: actions/github-script@v7 + with: + script: | + await github.rest.reactions.createForIssueComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: context.payload.comment.id, + content: 'eyes' + }); + + - name: Resolve PR ref + run count + id: config + env: + GH_TOKEN: ${{ github.token }} + EVENT_NAME: ${{ github.event_name }} + COMMENT_BODY: ${{ github.event.comment.body }} + PR_NUM: ${{ github.event.issue.number }} + DISPATCH_RUNS: ${{ github.event.inputs.runs }} + DISPATCH_REF: ${{ github.ref_name }} + run: | + if [ "$EVENT_NAME" = "issue_comment" ]; then + SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid) + echo "pr_num=$PR_NUM" >> "$GITHUB_OUTPUT" + echo "sha=$SHA" >> "$GITHUB_OUTPUT" + # "/bench-gpu 5" -> 5 iterations; otherwise default. + N=$(echo "$COMMENT_BODY" | sed -n 's|^/bench-gpu[[:space:]]*\([0-9]\+\).*|\1|p') + RUNS=${N:-3} + else + echo "pr_num=" >> "$GITHUB_OUTPUT" + echo "sha=$DISPATCH_REF" >> "$GITHUB_OUTPUT" + RUNS=${DISPATCH_RUNS:-3} + fi + # Clamp to [1,10]. 
+ if [ "$RUNS" -lt 1 ] 2>/dev/null || [ "$RUNS" -gt 10 ] 2>/dev/null; then + echo "::warning::run count out of range, defaulting to 3" + RUNS=3 + fi + echo "runs=$RUNS" >> "$GITHUB_OUTPUT" + echo "Using $RUNS prove iteration(s)" + + - name: Install Vast CLI + env: + VAST_API_KEY: ${{ secrets.VAST_API_KEY }} + run: | + pip install --quiet --upgrade vastai + vastai set api-key "$VAST_API_KEY" + + - name: Register ephemeral SSH key + id: sshkey + run: | + mkdir -p "$HOME/.ssh" + KEY="$HOME/.ssh/vast_bench" + COMMENT="gh-actions-bench-${GITHUB_RUN_ID}" + ssh-keygen -t ed25519 -N "" -f "$KEY" -C "$COMMENT" >/dev/null + vastai create ssh-key "$(cat "$KEY.pub")" + echo "key_comment=$COMMENT" >> "$GITHUB_OUTPUT" + echo "key_path=$KEY" >> "$GITHUB_OUTPUT" + + - name: Pick a Vast offer + id: offer + run: | + # NB: cpu_ram is in MB (32 GB = 32 * 1024 = 32768); disk_space/cuda_max_good as named. + QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=32768 disk_space>=64 datacenter=true verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}" + echo "Query: $QUERY" + vastai search offers "$QUERY" --raw -o dph_total > offers.json + OFFER_ID=$(jq -r 'sort_by(.dph_total) | .[0].id // empty' offers.json) + OFFER_PRICE=$(jq -r 'sort_by(.dph_total) | .[0].dph_total // empty' offers.json) + if [ -z "$OFFER_ID" ]; then + echo "::error::No datacenter RTX 5090 offer matched (>=16 cores, >=32GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)" + exit 1 + fi + echo "Selected offer $OFFER_ID at \$${OFFER_PRICE}/hr" + echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT" + echo "price=$OFFER_PRICE" >> "$GITHUB_OUTPUT" + + - name: Create instance + id: instance + env: + VAST_TEMPLATE_HASH: ${{ secrets.VAST_TEMPLATE_HASH }} + OFFER_ID: ${{ steps.offer.outputs.id }} + run: | + vastai create instance "$OFFER_ID" \ + --template_hash "$VAST_TEMPLATE_HASH" \ + --disk "$VAST_IMAGE_DISK" \ + --ssh --direct --raw > create.json + cat create.json + IID=$(jq -r '.new_contract // 
.instances.new_contract // empty' create.json) + if [ -z "$IID" ]; then + echo "::error::Failed to create Vast instance" + exit 1 + fi + # Persist immediately so teardown runs even if later steps fail. + echo "$IID" > "$RUNNER_TEMP/vast_instance_id" + echo "id=$IID" >> "$GITHUB_OUTPUT" + echo "Created instance $IID" + + - name: Wait for SSH + id: ssh + env: + IID: ${{ steps.instance.outputs.id }} + run: | + echo "Waiting for instance $IID to reach 'running' with SSH endpoint..." + HOST=""; PORT="" + for _ in $(seq 1 60); do # ~10 min + vastai show instance "$IID" --raw > inst.json || true + STATUS=$(jq -r '.actual_status // empty' inst.json) + # We create with --direct, so SSH straight to the public IP + the host port + # mapped to container port 22. The .ssh_host/.ssh_port proxy fields are + # unreliable (observed off-by-one vs the real proxy port), so use the direct + # mapping — same endpoint `vastai ssh-url` reports. + HOST=$(jq -r '.public_ipaddr // empty' inst.json) + PORT=$(jq -r '.ports["22/tcp"][0].HostPort // empty' inst.json) + echo " status=$STATUS ssh=$HOST:$PORT" + if [ "$STATUS" = "running" ] && [ -n "$HOST" ] && [ -n "$PORT" ]; then + break + fi + sleep 10 + done + if [ "$STATUS" != "running" ] || [ -z "$HOST" ] || [ -z "$PORT" ]; then + echo "::error::Instance never became reachable (status=$STATUS host=$HOST port=$PORT)" + exit 1 + fi + echo "host=$HOST" >> "$GITHUB_OUTPUT" + echo "port=$PORT" >> "$GITHUB_OUTPUT" + + # Wait for sshd to accept our key. 
+ for _ in $(seq 1 30); do + if ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \ + -i "${{ steps.sshkey.outputs.key_path }}" -p "$PORT" "root@$HOST" true 2>/dev/null; then + echo "sshd reachable"; exit 0 + fi + sleep 10 + done + echo "::error::sshd did not accept connections in time" + exit 1 + + - name: Wait for onstart provisioning + env: + HOST: ${{ steps.ssh.outputs.host }} + PORT: ${{ steps.ssh.outputs.port }} + KEY: ${{ steps.sshkey.outputs.key_path }} + run: | + SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST" + echo "Waiting for the template onstart script to finish (Rust + LLVM + sysroot + clone)..." + # The bootstrap's final stdout line is "=== done ===". Vast captures onstart + # output to /var/log/onstart.log; fall back to checking the artifacts it leaves. + for _ in $(seq 1 120); do # ~20 min + if $SSH 'grep -q "=== done ===" /var/log/onstart.log 2>/dev/null'; then + echo "onstart reported done"; exit 0 + fi + # shellcheck disable=SC2016 # $HOME/$(...) must expand on the remote box, not the runner + if $SSH 'test -x "$HOME/.cargo/bin/cargo" \ + && test -f /opt/lambda-vm-sysroot/include/stdlib.h \ + && test -d /workspace/lambda_vm/.git \ + && "$HOME/.cargo/bin/rustup" toolchain list 2>/dev/null | grep -q nightly-2026-02-01'; then + echo "provisioning artifacts present"; exit 0 + fi + sleep 10 + done + echo "::error::onstart provisioning did not complete in time" + exit 1 + + - name: Check out PR source on the box + env: + HOST: ${{ steps.ssh.outputs.host }} + PORT: ${{ steps.ssh.outputs.port }} + KEY: ${{ steps.sshkey.outputs.key_path }} + PR_NUM: ${{ steps.config.outputs.pr_num }} + SHA: ${{ steps.config.outputs.sha }} + run: | + SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST" + if [ -n "$PR_NUM" ]; then + # Fetch the PR head via the base repo's pull ref (works for fork PRs too). 
+ $SSH "cd /workspace/lambda_vm && git fetch --force origin 'refs/pull/${PR_NUM}/head' && git checkout --force '$SHA'" + else + # workflow_dispatch: check out the requested branch. + $SSH "cd /workspace/lambda_vm && git fetch --force origin '$SHA' && git checkout --force FETCH_HEAD" + fi + $SSH "cd /workspace/lambda_vm && git --no-pager log -1 --oneline" + + - name: Run GPU benchmark + id: bench + env: + HOST: ${{ steps.ssh.outputs.host }} + PORT: ${{ steps.ssh.outputs.port }} + KEY: ${{ steps.sshkey.outputs.key_path }} + RUNS: ${{ steps.config.outputs.runs }} + run: | + SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST" + # bash -l so ~/.bashrc (cargo env) is sourced; the script also sources it itself. + $SSH "bash -lc 'cd /workspace/lambda_vm && bash infra/gpu_bench.sh $RUNS'" | tee bench.log + + # Parse per-run metrics (same format as the CPU bench: "Proving time:" / "Peak heap:"). + mapfile -t TIMES < <(grep -o 'Proving time: [0-9.]*' bench.log | awk '{print $3}') + mapfile -t HEAPS < <(grep -o 'Peak heap: [0-9]*' bench.log | awk '{print $3}') + if [ "${#TIMES[@]}" -eq 0 ] || [ "${#HEAPS[@]}" -eq 0 ]; then + echo "::error::Failed to parse any GPU metrics from the bench output" + exit 1 + fi + MED_POS=$(( (${#TIMES[@]} + 1) / 2 )) + TIME_MED=$(printf '%s\n' "${TIMES[@]}" | sort -n | awk "NR==$MED_POS") + HEAP_MED=$(printf '%s\n' "${HEAPS[@]}" | sort -n | awk "NR==$MED_POS") + ALL_TIMES=$(printf '%s\n' "${TIMES[@]}" | paste -sd '/' -) + ALL_HEAPS=$(printf '%s\n' "${HEAPS[@]}" | paste -sd '/' -) + { + echo "time_s=$TIME_MED" + echo "peak_mb=$HEAP_MED" + echo "all_times=$ALL_TIMES" + echo "all_heaps=$ALL_HEAPS" + } >> "$GITHUB_OUTPUT" + + - name: Comment on PR + if: github.event_name == 'issue_comment' + uses: actions/github-script@v7 + env: + TIME_S: ${{ steps.bench.outputs.time_s }} + PEAK_MB: ${{ steps.bench.outputs.peak_mb }} + ALL_TIMES: ${{ steps.bench.outputs.all_times }} + ALL_HEAPS: ${{ 
steps.bench.outputs.all_heaps }} + RUNS: ${{ steps.config.outputs.runs }} + GPU_NAME: ${{ env.GPU_NAME }} + OFFER_PRICE: ${{ steps.offer.outputs.price }} + COMMIT_SHA: ${{ steps.config.outputs.sha }} + with: + script: | + const time = process.env.TIME_S; + const peak = process.env.PEAK_MB; + const runs = parseInt(process.env.RUNS || '1'); + const allTimes = (process.env.ALL_TIMES || '').split('/').map(t => `${t}s`).join(' / '); + const allHeaps = (process.env.ALL_HEAPS || '').split('/').map(h => `${h} MB`).join(' / '); + const nLabel = runs > 1 ? ` (median of ${runs})` : ''; + const sha = (process.env.COMMIT_SHA || '').substring(0, 8); + + let body = `## Benchmark (GPU) — ethrex 20 transfers${nLabel}\n\n`; + body += `GPU: ${process.env.GPU_NAME.replace('_', ' ')} · Vast.ai datacenter @ \$${process.env.OFFER_PRICE}/hr · \`prover/cuda\`\n\n`; + body += `| Metric | GPU |\n`; + body += `|--------|-----|\n`; + body += `| **Prove time** | ${time}s |\n`; + body += `| **Peak heap** | ${peak} MB |\n`; + if (runs > 1) { + body += `\nRuns — time: ${allTimes} · heap: ${allHeaps}\n`; + } + body += `\nCommit: ${sha} · Runner: Vast.ai RTX 5090\n`; + + const { data: comments } = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + }); + const marker = 'Benchmark (GPU) — ethrex'; + const existing = comments.find(c => c.user.type === 'Bot' && c.body.includes(marker)); + if (existing) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, repo: context.repo.repo, + comment_id: existing.id, body, + }); + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, repo: context.repo.repo, + issue_number: context.issue.number, body, + }); + } + + # --- Teardown: ALWAYS destroy the instance (cost guardrail) --- + - name: Destroy instance + if: always() + run: | + if [ -f "$RUNNER_TEMP/vast_instance_id" ]; then + IID=$(cat "$RUNNER_TEMP/vast_instance_id") + echo 
"Destroying instance $IID" + vastai destroy instance "$IID" || echo "::warning::destroy instance $IID failed — check the Vast console" + else + echo "No instance id recorded; nothing to destroy." + fi + + - name: Remove ephemeral SSH key + if: always() + env: + KEY_COMMENT: ${{ steps.sshkey.outputs.key_comment }} + run: | + [ -z "$KEY_COMMENT" ] && exit 0 + vastai show ssh-keys --raw > keys.json 2>/dev/null || exit 0 + for kid in $(jq -r --arg c "$KEY_COMMENT" '.[] | select(.public_key | contains($c)) | .id' keys.json); do + echo "Deleting ssh-key $kid" + vastai delete ssh-key "$kid" || true + done diff --git a/.github/workflows/benchmark-pr.yml b/.github/workflows/benchmark-pr.yml index ca66bf9a7..2eaebc213 100644 --- a/.github/workflows/benchmark-pr.yml +++ b/.github/workflows/benchmark-pr.yml @@ -55,6 +55,7 @@ jobs: (github.event_name == 'issue_comment' && github.event.issue.pull_request && startsWith(github.event.comment.body, '/bench') && + !startsWith(github.event.comment.body, '/bench-gpu') && contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association)) steps: - name: React to comment diff --git a/infra/gpu_bench.sh b/infra/gpu_bench.sh new file mode 100755 index 000000000..1557e1e02 --- /dev/null +++ b/infra/gpu_bench.sh @@ -0,0 +1,95 @@ +#!/bin/bash +# Run the headline ethrex prover benchmark on a GPU box, with the CUDA path enabled. +# +# Usage: infra/gpu_bench.sh [runs] +# runs number of prove iterations (default 3) +# +# Assumes the box was provisioned by the Vast template onstart +# (yetanotherco/scripts/bootstrap-onstart.sh): Rust 1.94.0 + nightly-2026-02-01, +# LLVM/clang, and the rv64 sysroot at /opt/lambda-vm-sysroot are already in place; +# CUDA/nvcc come from the base image. This script does NOT provision — it only +# builds with `prover/cuda`, generates the bench fixture, and runs the prove loop. 
+# +# It proves the SAME workload as the CPU benchmark (.github/workflows/benchmark-pr.yml): +# the ethrex guest ELF against a generated 20-transfer (distinct sender->recipient) +# block. Each run prints the CLI's "Proving time:" / "Peak heap:" lines, which the +# orchestrating workflow parses. + +set -euo pipefail + +RUNS="${1:-3}" + +# Headline program (keep in sync with benchmark-pr.yml ELF/INPUT). +ELF="executor/program_artifacts/rust/ethrex.elf" +INPUT="executor/tests/ethrex_bench_20.bin" +TRANSFERS=20 + +log() { printf '\n=== %s ===\n' "$*"; } + +# --- 0. Locate cargo + sysroot (provisioned by the template onstart) --------- +if [ -f "$HOME/.cargo/env" ]; then + # shellcheck disable=SC1091 + . "$HOME/.cargo/env" +fi +export PATH="$HOME/.cargo/bin:$PATH" +export SYSROOT_DIR="${SYSROOT_DIR:-/opt/lambda-vm-sysroot}" + +# --- 1. Sanity-check the GPU toolchain --------------------------------------- +log "GPU + toolchain check" +if ! command -v nvidia-smi >/dev/null 2>&1; then + echo "::error::nvidia-smi not found — no GPU driver on this box" >&2 + exit 1 +fi +nvidia-smi --query-gpu=name,compute_cap,driver_version --format=csv,noheader || true + +# nvcc may live under /usr/local/cuda/bin without being on PATH. +if ! command -v nvcc >/dev/null 2>&1; then + for d in /usr/local/cuda/bin /usr/local/cuda-*/bin; do + if [ -x "$d/nvcc" ]; then + export PATH="$d:$PATH" + export CUDA_HOME="${CUDA_HOME:-$(dirname "$d")}" + break + fi + done +fi +if ! command -v nvcc >/dev/null 2>&1; then + echo "::error::nvcc not found — CUDA toolkit missing (math-cuda needs it to compile kernels)" >&2 + exit 1 +fi +nvcc --version | tail -n 2 + +if ! command -v cargo >/dev/null 2>&1; then + echo "::error::cargo not found — template onstart provisioning incomplete" >&2 + exit 1 +fi +if [ ! -f "$SYSROOT_DIR/include/stdlib.h" ]; then + echo "::error::rv64 sysroot missing at $SYSROOT_DIR — onstart provisioning incomplete" >&2 + exit 1 +fi + +# --- 2. 
Build the ethrex guest ELF (same target as the CPU bench) ------------ +log "building ethrex guest ELF" +make "$ELF" + +# --- 3. Generate the 20-transfer fixture ------------------------------------- +log "generating $INPUT ($TRANSFERS distinct transfers)" +( cd tooling/ethrex-fixtures && cargo build --release ) +GEN=tooling/ethrex-fixtures/target/release/ethrex-fixtures +"$GEN" "$TRANSFERS" "$INPUT" distinct + +# --- 4. Build the CLI with the GPU (cuda) path ------------------------------- +# jemalloc-stats gives the deterministic "Peak heap:" line; prover/cuda routes +# the LDE (and friends) through crypto/math-cuda. math-cuda/build.rs auto-detects +# the RTX 5090 arch (compute_120) via nvidia-smi, so no arch pin is needed. +log "building CLI with --features jemalloc-stats,prover/cuda" +cargo build --release -p cli --features jemalloc-stats,prover/cuda + +# --- 5. Prove loop ----------------------------------------------------------- +log "proving $ELF x$RUNS (GPU)" +for i in $(seq 1 "$RUNS"); do + echo "--- Run $i/$RUNS ---" + ./target/release/cli prove "$ELF" --private-input "$INPUT" -o /tmp/proof.bin --time + rm -f /tmp/proof.bin +done + +log "done" From a9d848a5ea22261340e33e3c5d66660b8e4671a4 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Fri, 26 Jun 2026 16:39:04 -0300 Subject: [PATCH 02/17] add retries --- .github/workflows/benchmark-gpu.yml | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml index abdea98a5..e1750ddde 100644 --- a/.github/workflows/benchmark-gpu.yml +++ b/.github/workflows/benchmark-gpu.yml @@ -109,18 +109,33 @@ jobs: - name: Pick a Vast offer id: offer + env: + # Retry the same query to ride out transient scarcity (datacenter RTX 5090s + # are a small, fast-churning pool). Total wait ~= ATTEMPTS * INTERVAL. 
+ OFFER_ATTEMPTS: "10" + OFFER_INTERVAL: "30" run: | - # NB: cpu_ram is in MB (32 GB = 32 * 1024 = 32768); disk_space/cuda_max_good as named. - QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=32768 disk_space>=64 datacenter=true verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}" + # cpu_ram is the machine's TOTAL RAM in MB; a 1-GPU rental usually gets most of it. + # The ethrex_bench_20 GPU prove peaks at ~78 GB heap, so require >=96 GB + # (96 * 1024 = 98304) for headroom — a 32 GB box would OOM. + QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=98304 disk_space>=64 datacenter=true verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}" echo "Query: $QUERY" - vastai search offers "$QUERY" --raw -o dph_total > offers.json - OFFER_ID=$(jq -r 'sort_by(.dph_total) | .[0].id // empty' offers.json) - OFFER_PRICE=$(jq -r 'sort_by(.dph_total) | .[0].dph_total // empty' offers.json) + OFFER_ID="" + for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do + vastai search offers "$QUERY" --raw -o dph_total > offers.json || true + OFFER_ID=$(jq -r 'sort_by(.dph_total) | .[0].id // empty' offers.json) + OFFER_PRICE=$(jq -r 'sort_by(.dph_total) | .[0].dph_total // empty' offers.json) + if [ -n "$OFFER_ID" ]; then + echo "Selected offer $OFFER_ID at \$${OFFER_PRICE}/hr (attempt $attempt)" + break + fi + echo "No matching offer (attempt $attempt/$OFFER_ATTEMPTS); retrying in ${OFFER_INTERVAL}s..." 
+ sleep "$OFFER_INTERVAL" + done if [ -z "$OFFER_ID" ]; then - echo "::error::No datacenter RTX 5090 offer matched (>=16 cores, >=32GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)" + echo "::error::No datacenter RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)" exit 1 fi - echo "Selected offer $OFFER_ID at \$${OFFER_PRICE}/hr" echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT" echo "price=$OFFER_PRICE" >> "$GITHUB_OUTPUT" From 5ef0fe2b6608ff317b9a4142f0c04134a229f3e1 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Fri, 26 Jun 2026 17:00:54 -0300 Subject: [PATCH 03/17] ci: use ABBA method to run the benchmark --- .github/workflows/benchmark-gpu.yml | 211 ++++++++++++++-------------- infra/gpu_bench.sh | 95 ------------- scripts/bench_abba.sh | 9 +- 3 files changed, 116 insertions(+), 199 deletions(-) delete mode 100755 infra/gpu_bench.sh diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml index e1750ddde..51a7ed63b 100644 --- a/.github/workflows/benchmark-gpu.yml +++ b/.github/workflows/benchmark-gpu.yml @@ -1,12 +1,14 @@ name: Benchmark GPU (PR) -# Rent an RTX 5090 on Vast.ai (hourly), run the SAME headline ethrex prover -# benchmark as benchmark-pr.yml but with the CUDA path enabled, post the absolute -# GPU numbers back to the PR, then always destroy the instance. +# Rent an RTX 5090 on Vast.ai (hourly) and run the drift-free A/B/B/A (ABBA) paired +# prover benchmark — the same method as the CPU `/bench-abba` (scripts/bench_abba.sh) — +# but with the CUDA prover path enabled (BENCH_FEATURES=jemalloc-stats,prover/cuda). +# It builds the cli at the PR head and at main, runs N interleaved pairs on the GPU, +# posts the paired-t + Wilcoxon verdict back to the PR, then ALWAYS destroys the box. # -# Triggered by a "/bench-gpu [N]" comment on a PR (N = prove iterations, default 3), -# or via workflow_dispatch. 
The orchestration runs on a GitHub-hosted runner; all the -# GPU work happens on the rented Vast box (provisioned by the template onstart). +# Triggered by a "/bench-gpu [N]" comment on a PR (N = pair count, default 10) or via +# workflow_dispatch. Orchestration runs on a GitHub-hosted runner; all GPU work happens +# on the rented Vast box (provisioned by the template onstart). # # Requires repo secrets: # VAST_API_KEY — https://cloud.vast.ai/manage-keys/ @@ -15,79 +17,102 @@ name: Benchmark GPU (PR) on: workflow_dispatch: inputs: - runs: - description: "Number of prove iterations" - default: "3" + pairs: + description: "Number of A/B/B/A pairs" + default: "10" issue_comment: types: [created] + # TEMP(testing): lets the workflow run from this branch before it's on the default + # branch (push uses the branch's own definition; issue_comment/workflow_dispatch do + # not). REMOVE this push trigger before merging. + push: + branches: [gpu_benchmarks] permissions: contents: read pull-requests: write + issues: write concurrency: group: benchmark-gpu-${{ github.event.issue.number || github.run_id }} cancel-in-progress: true env: - # Vast offer search: datacenter RTX 5090, >=16 cores, >=32GB RAM, >=64GB disk, - # verified + rentable, Blackwell-capable driver, under the price cap ($/hr). + # Vast offer search: datacenter RTX 5090, >=16 cores, >=96GB RAM (the ABBA prove + # peaks ~78 GB), >=64GB disk, verified + rentable, Blackwell-capable driver, <= cap. GPU_NAME: RTX_5090 PRICE_CAP: "3" VAST_IMAGE_DISK: "64" + # cli features for the ABBA build — the GPU (cuda) prover path plus jemalloc heap stats. + BENCH_FEATURES: "jemalloc-stats,prover/cuda" jobs: benchmark-gpu: runs-on: ubuntu-latest # Skip unless: workflow_dispatch, or a "/bench-gpu" comment from a privileged author. + # TEMP(testing): `github.event_name == 'push'` lets branch pushes run it pre-merge. + # REMOVE the push clause before merging. 
if: >- + github.event_name == 'push' || github.event_name == 'workflow_dispatch' || (github.event_name == 'issue_comment' && github.event.issue.pull_request && startsWith(github.event.comment.body, '/bench-gpu') && contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association)) + # ABBA on the GPU: dual cuda build (~15-30 min) + 2*pairs proves (~77s each). + timeout-minutes: 180 steps: - - name: React to comment - if: github.event_name == 'issue_comment' - uses: actions/github-script@v7 - with: - script: | - await github.rest.reactions.createForIssueComment({ - owner: context.repo.owner, - repo: context.repo.repo, - comment_id: context.payload.comment.id, - content: 'eyes' - }); - - - name: Resolve PR ref + run count + - name: Resolve PR ref + pair count id: config env: GH_TOKEN: ${{ github.token }} EVENT_NAME: ${{ github.event_name }} COMMENT_BODY: ${{ github.event.comment.body }} PR_NUM: ${{ github.event.issue.number }} - DISPATCH_RUNS: ${{ github.event.inputs.runs }} + DISPATCH_PAIRS: ${{ github.event.inputs.pairs }} DISPATCH_REF: ${{ github.ref_name }} run: | if [ "$EVENT_NAME" = "issue_comment" ]; then - SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid) - echo "pr_num=$PR_NUM" >> "$GITHUB_OUTPUT" - echo "sha=$SHA" >> "$GITHUB_OUTPUT" - # "/bench-gpu 5" -> 5 iterations; otherwise default. + # Pin the head SHA (works for fork PRs; avoids a force-push race mid-run). + HEAD_SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid) + OUT_PR_NUM="$PR_NUM"; OUT_HEAD_SHA="$HEAD_SHA"; OUT_BRANCH="" + # "/bench-gpu 20" -> 20 pairs; otherwise default. N=$(echo "$COMMENT_BODY" | sed -n 's|^/bench-gpu[[:space:]]*\([0-9]\+\).*|\1|p') - RUNS=${N:-3} + PAIRS=${N:-10} else - echo "pr_num=" >> "$GITHUB_OUTPUT" - echo "sha=$DISPATCH_REF" >> "$GITHUB_OUTPUT" - RUNS=${DISPATCH_RUNS:-3} + # workflow_dispatch / push: compare this branch vs main. 
+ OUT_PR_NUM=""; OUT_HEAD_SHA=""; OUT_BRANCH="$DISPATCH_REF" + PAIRS=${DISPATCH_PAIRS:-10} fi - # Clamp to [1,10]. - if [ "$RUNS" -lt 1 ] 2>/dev/null || [ "$RUNS" -gt 10 ] 2>/dev/null; then - echo "::warning::run count out of range, defaulting to 3" - RUNS=3 + # Clamp to [2,40] (even is ideal so AB/BA orders balance). + if [ "$PAIRS" -lt 2 ] 2>/dev/null || [ "$PAIRS" -gt 40 ] 2>/dev/null; then + echo "::warning::pair count out of range [2,40], defaulting to 10" + PAIRS=10 fi - echo "runs=$RUNS" >> "$GITHUB_OUTPUT" - echo "Using $RUNS prove iteration(s)" + { + echo "pr_num=$OUT_PR_NUM" + echo "head_sha=$OUT_HEAD_SHA" + echo "branch=$OUT_BRANCH" + echo "pairs=$PAIRS" + } >> "$GITHUB_OUTPUT" + echo "Using $PAIRS A/B/B/A pairs" + + - name: Acknowledge (react + occupancy notice) + if: github.event_name == 'issue_comment' + uses: actions/github-script@v7 + env: + PAIRS: ${{ steps.config.outputs.pairs }} + with: + script: | + await github.rest.reactions.createForIssueComment({ + owner: context.repo.owner, repo: context.repo.repo, + comment_id: context.payload.comment.id, content: 'eyes' + }); + await github.rest.issues.createComment({ + owner: context.repo.owner, repo: context.repo.repo, + issue_number: context.issue.number, + body: `⏳ **GPU ABBA started** — renting an RTX 5090 on Vast.ai and running ${process.env.PAIRS} interleaved pairs (PR vs main) on the CUDA prover path. 
This takes ~1 hr; results will be posted here.` + }); - name: Install Vast CLI env: @@ -226,94 +251,76 @@ jobs: echo "::error::onstart provisioning did not complete in time" exit 1 - - name: Check out PR source on the box + - name: Run GPU ABBA benchmark + id: bench env: HOST: ${{ steps.ssh.outputs.host }} PORT: ${{ steps.ssh.outputs.port }} KEY: ${{ steps.sshkey.outputs.key_path }} PR_NUM: ${{ steps.config.outputs.pr_num }} - SHA: ${{ steps.config.outputs.sha }} + HEAD_SHA: ${{ steps.config.outputs.head_sha }} + BRANCH: ${{ steps.config.outputs.branch }} + PAIRS: ${{ steps.config.outputs.pairs }} run: | SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST" + + # Resolve the PR side (REF_A) and the fetch needed to make it resolvable on the box. if [ -n "$PR_NUM" ]; then - # Fetch the PR head via the base repo's pull ref (works for fork PRs too). - $SSH "cd /workspace/lambda_vm && git fetch --force origin 'refs/pull/${PR_NUM}/head' && git checkout --force '$SHA'" + FETCH="git fetch --force origin refs/pull/$PR_NUM/head" + REF_A="$HEAD_SHA" else - # workflow_dispatch: check out the requested branch. - $SSH "cd /workspace/lambda_vm && git fetch --force origin '$SHA' && git checkout --force FETCH_HEAD" + FETCH="git fetch --force origin $BRANCH" + REF_A="origin/$BRANCH" fi - $SSH "cd /workspace/lambda_vm && git --no-pager log -1 --oneline" - - name: Run GPU benchmark - id: bench - env: - HOST: ${{ steps.ssh.outputs.host }} - PORT: ${{ steps.ssh.outputs.port }} - KEY: ${{ steps.sshkey.outputs.key_path }} - RUNS: ${{ steps.config.outputs.runs }} - run: | - SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST" - # bash -l so ~/.bashrc (cargo env) is sourced; the script also sources it itself. 
- $SSH "bash -lc 'cd /workspace/lambda_vm && bash infra/gpu_bench.sh $RUNS'" | tee bench.log + # bench_abba.sh builds the cli at REF_A and origin/main (isolated worktree), + # runs PAIRS interleaved A/B/B/A proves, and prints the paired-t CI + Wilcoxon + # verdict. BENCH_FEATURES routes the build through the CUDA prover path. + REMOTE="set -e; cd /workspace/lambda_vm; \ + command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \ + git fetch --force origin main; $FETCH; \ + SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \ + scripts/bench_abba.sh $REF_A origin/main $PAIRS" - # Parse per-run metrics (same format as the CPU bench: "Proving time:" / "Peak heap:"). - mapfile -t TIMES < <(grep -o 'Proving time: [0-9.]*' bench.log | awk '{print $3}') - mapfile -t HEAPS < <(grep -o 'Peak heap: [0-9]*' bench.log | awk '{print $3}') - if [ "${#TIMES[@]}" -eq 0 ] || [ "${#HEAPS[@]}" -eq 0 ]; then - echo "::error::Failed to parse any GPU metrics from the bench output" - exit 1 - fi - MED_POS=$(( (${#TIMES[@]} + 1) / 2 )) - TIME_MED=$(printf '%s\n' "${TIMES[@]}" | sort -n | awk "NR==$MED_POS") - HEAP_MED=$(printf '%s\n' "${HEAPS[@]}" | sort -n | awk "NR==$MED_POS") - ALL_TIMES=$(printf '%s\n' "${TIMES[@]}" | paste -sd '/' -) - ALL_HEAPS=$(printf '%s\n' "${HEAPS[@]}" | paste -sd '/' -) - { - echo "time_s=$TIME_MED" - echo "peak_mb=$HEAP_MED" - echo "all_times=$ALL_TIMES" - echo "all_heaps=$ALL_HEAPS" - } >> "$GITHUB_OUTPUT" + $SSH "bash -lc \"$REMOTE\"" | tee "$RUNNER_TEMP/abba_out.txt" + # Extract the result section for the PR comment (same marker bench-abba.yml uses). 
+ sed -n '/=== ABBA paired result/,$p' "$RUNNER_TEMP/abba_out.txt" > "$RUNNER_TEMP/abba_result.txt" - - name: Comment on PR - if: github.event_name == 'issue_comment' + - name: Comment ABBA result on PR + if: always() && github.event_name == 'issue_comment' uses: actions/github-script@v7 env: - TIME_S: ${{ steps.bench.outputs.time_s }} - PEAK_MB: ${{ steps.bench.outputs.peak_mb }} - ALL_TIMES: ${{ steps.bench.outputs.all_times }} - ALL_HEAPS: ${{ steps.bench.outputs.all_heaps }} - RUNS: ${{ steps.config.outputs.runs }} + HEAD_SHA: ${{ steps.config.outputs.head_sha }} + PAIRS: ${{ steps.config.outputs.pairs }} + OUTCOME: ${{ steps.bench.outcome }} GPU_NAME: ${{ env.GPU_NAME }} OFFER_PRICE: ${{ steps.offer.outputs.price }} - COMMIT_SHA: ${{ steps.config.outputs.sha }} with: script: | - const time = process.env.TIME_S; - const peak = process.env.PEAK_MB; - const runs = parseInt(process.env.RUNS || '1'); - const allTimes = (process.env.ALL_TIMES || '').split('/').map(t => `${t}s`).join(' / '); - const allHeaps = (process.env.ALL_HEAPS || '').split('/').map(h => `${h} MB`).join(' / '); - const nLabel = runs > 1 ? 
` (median of ${runs})` : ''; - const sha = (process.env.COMMIT_SHA || '').substring(0, 8); + const fs = require('fs'); + const tmp = process.env.RUNNER_TEMP; + const read = (p) => { try { return fs.readFileSync(p, 'utf8').trim(); } catch { return ''; } }; + const head = (process.env.HEAD_SHA || '').slice(0, 10); + const pairs = process.env.PAIRS; + const gpu = (process.env.GPU_NAME || '').replace('_', ' '); + const price = process.env.OFFER_PRICE; - let body = `## Benchmark (GPU) — ethrex 20 transfers${nLabel}\n\n`; - body += `GPU: ${process.env.GPU_NAME.replace('_', ' ')} · Vast.ai datacenter @ \$${process.env.OFFER_PRICE}/hr · \`prover/cuda\`\n\n`; - body += `| Metric | GPU |\n`; - body += `|--------|-----|\n`; - body += `| **Prove time** | ${time}s |\n`; - body += `| **Peak heap** | ${peak} MB |\n`; - if (runs > 1) { - body += `\nRuns — time: ${allTimes} · heap: ${allHeaps}\n`; + let body = `## GPU Benchmark (ABBA) — \`${head}\` vs \`main\` (${pairs} pairs)\n\n`; + body += `${gpu} · Vast.ai datacenter${price ? ` @ \$${price}/hr` : ''} · \`prover/cuda\` · drift-free A/B/B/A\n\n`; + if (process.env.OUTCOME === 'success') { + const res = read(`${tmp}/abba_result.txt`) || read(`${tmp}/abba_out.txt`); + body += '```\n' + res + '\n```\n'; + body += '\n+ = PR faster. Trust the verdict when paired-t and Wilcoxon agree.\n'; + } else { + const tail = read(`${tmp}/abba_out.txt`).split('\n').slice(-30).join('\n'); + body += `❌ Run failed. 
Last log lines:\n\n` + '```\n' + tail + '\n```\n'; } - body += `\nCommit: ${sha} · Runner: Vast.ai RTX 5090\n`; const { data: comments } = await github.rest.issues.listComments({ - owner: context.repo.owner, - repo: context.repo.repo, + owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, }); - const marker = 'Benchmark (GPU) — ethrex'; + const marker = 'GPU Benchmark (ABBA)'; const existing = comments.find(c => c.user.type === 'Bot' && c.body.includes(marker)); if (existing) { await github.rest.issues.updateComment({ diff --git a/infra/gpu_bench.sh b/infra/gpu_bench.sh deleted file mode 100755 index 1557e1e02..000000000 --- a/infra/gpu_bench.sh +++ /dev/null @@ -1,95 +0,0 @@ -#!/bin/bash -# Run the headline ethrex prover benchmark on a GPU box, with the CUDA path enabled. -# -# Usage: infra/gpu_bench.sh [runs] -# runs number of prove iterations (default 3) -# -# Assumes the box was provisioned by the Vast template onstart -# (yetanotherco/scripts/bootstrap-onstart.sh): Rust 1.94.0 + nightly-2026-02-01, -# LLVM/clang, and the rv64 sysroot at /opt/lambda-vm-sysroot are already in place; -# CUDA/nvcc come from the base image. This script does NOT provision — it only -# builds with `prover/cuda`, generates the bench fixture, and runs the prove loop. -# -# It proves the SAME workload as the CPU benchmark (.github/workflows/benchmark-pr.yml): -# the ethrex guest ELF against a generated 20-transfer (distinct sender->recipient) -# block. Each run prints the CLI's "Proving time:" / "Peak heap:" lines, which the -# orchestrating workflow parses. - -set -euo pipefail - -RUNS="${1:-3}" - -# Headline program (keep in sync with benchmark-pr.yml ELF/INPUT). -ELF="executor/program_artifacts/rust/ethrex.elf" -INPUT="executor/tests/ethrex_bench_20.bin" -TRANSFERS=20 - -log() { printf '\n=== %s ===\n' "$*"; } - -# --- 0. 
Locate cargo + sysroot (provisioned by the template onstart) --------- -if [ -f "$HOME/.cargo/env" ]; then - # shellcheck disable=SC1091 - . "$HOME/.cargo/env" -fi -export PATH="$HOME/.cargo/bin:$PATH" -export SYSROOT_DIR="${SYSROOT_DIR:-/opt/lambda-vm-sysroot}" - -# --- 1. Sanity-check the GPU toolchain --------------------------------------- -log "GPU + toolchain check" -if ! command -v nvidia-smi >/dev/null 2>&1; then - echo "::error::nvidia-smi not found — no GPU driver on this box" >&2 - exit 1 -fi -nvidia-smi --query-gpu=name,compute_cap,driver_version --format=csv,noheader || true - -# nvcc may live under /usr/local/cuda/bin without being on PATH. -if ! command -v nvcc >/dev/null 2>&1; then - for d in /usr/local/cuda/bin /usr/local/cuda-*/bin; do - if [ -x "$d/nvcc" ]; then - export PATH="$d:$PATH" - export CUDA_HOME="${CUDA_HOME:-$(dirname "$d")}" - break - fi - done -fi -if ! command -v nvcc >/dev/null 2>&1; then - echo "::error::nvcc not found — CUDA toolkit missing (math-cuda needs it to compile kernels)" >&2 - exit 1 -fi -nvcc --version | tail -n 2 - -if ! command -v cargo >/dev/null 2>&1; then - echo "::error::cargo not found — template onstart provisioning incomplete" >&2 - exit 1 -fi -if [ ! -f "$SYSROOT_DIR/include/stdlib.h" ]; then - echo "::error::rv64 sysroot missing at $SYSROOT_DIR — onstart provisioning incomplete" >&2 - exit 1 -fi - -# --- 2. Build the ethrex guest ELF (same target as the CPU bench) ------------ -log "building ethrex guest ELF" -make "$ELF" - -# --- 3. Generate the 20-transfer fixture ------------------------------------- -log "generating $INPUT ($TRANSFERS distinct transfers)" -( cd tooling/ethrex-fixtures && cargo build --release ) -GEN=tooling/ethrex-fixtures/target/release/ethrex-fixtures -"$GEN" "$TRANSFERS" "$INPUT" distinct - -# --- 4. 
Build the CLI with the GPU (cuda) path ------------------------------- -# jemalloc-stats gives the deterministic "Peak heap:" line; prover/cuda routes -# the LDE (and friends) through crypto/math-cuda. math-cuda/build.rs auto-detects -# the RTX 5090 arch (compute_120) via nvidia-smi, so no arch pin is needed. -log "building CLI with --features jemalloc-stats,prover/cuda" -cargo build --release -p cli --features jemalloc-stats,prover/cuda - -# --- 5. Prove loop ----------------------------------------------------------- -log "proving $ELF x$RUNS (GPU)" -for i in $(seq 1 "$RUNS"); do - echo "--- Run $i/$RUNS ---" - ./target/release/cli prove "$ELF" --private-input "$INPUT" -o /tmp/proof.bin --time - rm -f /tmp/proof.bin -done - -log "done" diff --git a/scripts/bench_abba.sh b/scripts/bench_abba.sh index 79bfddf27..57fab5e28 100755 --- a/scripts/bench_abba.sh +++ b/scripts/bench_abba.sh @@ -27,6 +27,8 @@ # REF_B baseline (default: origin/main) # N_PAIRS pairs (default: 20 -> 40 runs, ~33 min on ethrex) # Env: REBUILD=1 forces a rebuild even if cached binaries exist. +# BENCH_FEATURES= cargo features for the cli build (default: jemalloc-stats). +# The GPU ABBA workflow passes "jemalloc-stats,prover/cuda" to bench the GPU path. # # Sizing (ethrex pair-noise sd ~1.2%, 80% power): ~12 pairs for a 1% effect, # ~18 for 0.8%, ~32 for 0.6%. Default 20 -> solid on 0.8-1%, ~60% power at 0.6% @@ -45,6 +47,9 @@ fi REF_A="$1" REF_B="${2:-origin/main}" N_PAIRS="${3:-20}" +# cli build features. Default matches the CPU bench; the GPU ABBA workflow overrides +# with "jemalloc-stats,prover/cuda" to exercise the CUDA prover path. 
+BENCH_FEATURES="${BENCH_FEATURES:-jemalloc-stats}" ELF_REL="executor/program_artifacts/rust/ethrex.elf" INPUT_REL="executor/tests/ethrex_bench_20.bin" @@ -102,9 +107,9 @@ if [ "$need_build" = "1" ]; then echo "==> Building both prover binaries in isolated worktree $WT" git worktree add --detach "$WT" "$SHA_B" >/dev/null build_cli() { # $1=sha $2=out (shared target dir -> 2nd build is incremental) - echo "==> Building cli @ ${1:0:10} -> $2" + echo "==> Building cli @ ${1:0:10} -> $2 (features: $BENCH_FEATURES)" git -C "$WT" checkout --quiet "$1" - if ! ( cd "$WT" && cargo build --release -p cli --features jemalloc-stats >"$WORK/build_$2.log" 2>&1 ); then + if ! ( cd "$WT" && cargo build --release -p cli --features "$BENCH_FEATURES" >"$WORK/build_$2.log" 2>&1 ); then echo "ERROR: cargo build failed for $2 (@ ${1:0:10}). Tail of $WORK/build_$2.log:" >&2 tail -40 "$WORK/build_$2.log" >&2 exit 1 From 337bce05248c8ec4e7ad07d1cee4142c9ddc15e8 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Fri, 26 Jun 2026 17:04:29 -0300 Subject: [PATCH 04/17] ci: use 64gb ram --- .github/workflows/benchmark-gpu.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml index 51a7ed63b..c885cc03b 100644 --- a/.github/workflows/benchmark-gpu.yml +++ b/.github/workflows/benchmark-gpu.yml @@ -38,8 +38,8 @@ concurrency: cancel-in-progress: true env: - # Vast offer search: datacenter RTX 5090, >=16 cores, >=96GB RAM (the ABBA prove - # peaks ~78 GB), >=64GB disk, verified + rentable, Blackwell-capable driver, <= cap. + # Vast offer search: datacenter RTX 5090, >=16 cores, >=64GB RAM, >=64GB disk, + # verified + rentable, Blackwell-capable driver, <= cap. 
GPU_NAME: RTX_5090 PRICE_CAP: "3" VAST_IMAGE_DISK: "64" @@ -140,10 +140,10 @@ jobs: OFFER_ATTEMPTS: "10" OFFER_INTERVAL: "30" run: | - # cpu_ram is the machine's TOTAL RAM in MB; a 1-GPU rental usually gets most of it. - # The ethrex_bench_20 GPU prove peaks at ~78 GB heap, so require >=96 GB - # (96 * 1024 = 98304) for headroom — a 32 GB box would OOM. - QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=98304 disk_space>=64 datacenter=true verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}" + # cpu_ram is the per-instance allocated RAM in MB. Require ~64 GB; use 64000 + # (not 65536) because the "64 GB" datacenter boxes report ~64467 MB. Peak heap + # scales with table parallelism (~cores/3), so a 32-core/64 GB box fits the prove. + QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64000 disk_space>=64 datacenter=true verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}" echo "Query: $QUERY" OFFER_ID="" for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do @@ -158,7 +158,7 @@ jobs: sleep "$OFFER_INTERVAL" done if [ -z "$OFFER_ID" ]; then - echo "::error::No datacenter RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)" + echo "::error::No datacenter RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=64GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)" exit 1 fi echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT" From 3fce3499a6de1ae0fd4d5209ae0946862dbbc501 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Fri, 26 Jun 2026 17:24:17 -0300 Subject: [PATCH 05/17] ci: remove datacenter flag --- .github/workflows/benchmark-gpu.yml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml index c885cc03b..d9d0b203d 100644 --- a/.github/workflows/benchmark-gpu.yml +++ 
b/.github/workflows/benchmark-gpu.yml @@ -38,8 +38,8 @@ concurrency: cancel-in-progress: true env: - # Vast offer search: datacenter RTX 5090, >=16 cores, >=64GB RAM, >=64GB disk, - # verified + rentable, Blackwell-capable driver, <= cap. + # Vast offer search: RTX 5090, >=16 cores, >=64GB RAM, >=64GB disk, verified + + # rentable, Blackwell-capable driver, <= cap. GPU_NAME: RTX_5090 PRICE_CAP: "3" VAST_IMAGE_DISK: "64" @@ -140,10 +140,9 @@ jobs: OFFER_ATTEMPTS: "10" OFFER_INTERVAL: "30" run: | - # cpu_ram is the per-instance allocated RAM in MB. Require ~64 GB; use 64000 - # (not 65536) because the "64 GB" datacenter boxes report ~64467 MB. Peak heap - # scales with table parallelism (~cores/3), so a 32-core/64 GB box fits the prove. - QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64000 disk_space>=64 datacenter=true verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}" + # cpu_ram is per-instance allocated RAM in MB; use 64000 (not 65536) because the + # "64 GB" boxes report ~64467 MB. 
+ QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64000 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}" echo "Query: $QUERY" OFFER_ID="" for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do @@ -158,7 +157,7 @@ jobs: sleep "$OFFER_INTERVAL" done if [ -z "$OFFER_ID" ]; then - echo "::error::No datacenter RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=64GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)" + echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=64GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)" exit 1 fi echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT" From 76a137a5542dbc3ea15e4e050e6a03440731c981 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Fri, 26 Jun 2026 17:29:49 -0300 Subject: [PATCH 06/17] fix: units for RAM --- .github/workflows/benchmark-gpu.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml index d9d0b203d..d88f5eb92 100644 --- a/.github/workflows/benchmark-gpu.yml +++ b/.github/workflows/benchmark-gpu.yml @@ -38,7 +38,7 @@ concurrency: cancel-in-progress: true env: - # Vast offer search: RTX 5090, >=16 cores, >=64GB RAM, >=64GB disk, verified + + # Vast offer search: RTX 5090, >=16 cores, >=96GB RAM, >=64GB disk, verified + # rentable, Blackwell-capable driver, <= cap. GPU_NAME: RTX_5090 PRICE_CAP: "3" @@ -140,9 +140,9 @@ jobs: OFFER_ATTEMPTS: "10" OFFER_INTERVAL: "30" run: | - # cpu_ram is per-instance allocated RAM in MB; use 64000 (not 65536) because the - # "64 GB" boxes report ~64467 MB. - QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64000 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}" + # cpu_ram in the search filter is GB (the returned .cpu_ram field is MB — different + # units), so >=96 means 96 GB. 
>=96000 would mean 96000 GB and match nothing. + QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}" echo "Query: $QUERY" OFFER_ID="" for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do @@ -157,7 +157,7 @@ jobs: sleep "$OFFER_INTERVAL" done if [ -z "$OFFER_ID" ]; then - echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=64GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)" + echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)" exit 1 fi echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT" From 09cf1e5044a19ef6008a7f278b966f78e389a57d Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Fri, 26 Jun 2026 18:18:46 -0300 Subject: [PATCH 07/17] fix: min driver and ssh key --- .github/workflows/benchmark-gpu.yml | 79 ++++++++++++++++++----------- 1 file changed, 50 insertions(+), 29 deletions(-) diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml index d88f5eb92..69b290ec6 100644 --- a/.github/workflows/benchmark-gpu.yml +++ b/.github/workflows/benchmark-gpu.yml @@ -19,7 +19,7 @@ on: inputs: pairs: description: "Number of A/B/B/A pairs" - default: "10" + default: "1" # TEMP(testing): fast runs; restore to "10" before merge issue_comment: types: [created] # TEMP(testing): lets the workflow run from this branch before it's on the default @@ -78,16 +78,16 @@ jobs: OUT_PR_NUM="$PR_NUM"; OUT_HEAD_SHA="$HEAD_SHA"; OUT_BRANCH="" # "/bench-gpu 20" -> 20 pairs; otherwise default. N=$(echo "$COMMENT_BODY" | sed -n 's|^/bench-gpu[[:space:]]*\([0-9]\+\).*|\1|p') - PAIRS=${N:-10} + PAIRS=${N:-1} # TEMP(testing): default 1; restore to 10 before merge else # workflow_dispatch / push: compare this branch vs main. 
OUT_PR_NUM=""; OUT_HEAD_SHA=""; OUT_BRANCH="$DISPATCH_REF" - PAIRS=${DISPATCH_PAIRS:-10} + PAIRS=${DISPATCH_PAIRS:-1} # TEMP(testing): default 1; restore to 10 before merge fi - # Clamp to [2,40] (even is ideal so AB/BA orders balance). - if [ "$PAIRS" -lt 2 ] 2>/dev/null || [ "$PAIRS" -gt 40 ] 2>/dev/null; then - echo "::warning::pair count out of range [2,40], defaulting to 10" - PAIRS=10 + # TEMP(testing): clamp floor lowered to 1 for fast runs; restore to [2,40] before merge. + if [ "$PAIRS" -lt 1 ] 2>/dev/null || [ "$PAIRS" -gt 40 ] 2>/dev/null; then + echo "::warning::pair count out of range [1,40], defaulting to 1" + PAIRS=1 fi { echo "pr_num=$OUT_PR_NUM" @@ -121,15 +121,12 @@ jobs: pip install --quiet --upgrade vastai vastai set api-key "$VAST_API_KEY" - - name: Register ephemeral SSH key + - name: Generate ephemeral SSH key id: sshkey run: | mkdir -p "$HOME/.ssh" KEY="$HOME/.ssh/vast_bench" - COMMENT="gh-actions-bench-${GITHUB_RUN_ID}" - ssh-keygen -t ed25519 -N "" -f "$KEY" -C "$COMMENT" >/dev/null - vastai create ssh-key "$(cat "$KEY.pub")" - echo "key_comment=$COMMENT" >> "$GITHUB_OUTPUT" + ssh-keygen -t ed25519 -N "" -f "$KEY" -C "gh-actions-bench-${GITHUB_RUN_ID}" >/dev/null echo "key_path=$KEY" >> "$GITHUB_OUTPUT" - name: Pick a Vast offer @@ -139,16 +136,23 @@ jobs: # are a small, fast-churning pool). Total wait ~= ATTEMPTS * INTERVAL. OFFER_ATTEMPTS: "10" OFFER_INTERVAL: "30" + # Require driver >= this major so cudarc (default cuda-version-from-build-system) + # matches the runtime driver. Older drivers (e.g. 575) lack newer symbols like + # cuCtxGetDevice_v2 and the GPU path falls back to CPU. Filtered client-side in jq + # because vast can't numerically compare the driver_version string server-side. + MIN_DRIVER: "580" run: | # cpu_ram in the search filter is GB (the returned .cpu_ram field is MB — different # units), so >=96 means 96 GB. >=96000 would mean 96000 GB and match nothing. 
QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}" - echo "Query: $QUERY" + echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)" + # Keep only offers whose driver major >= MIN_DRIVER, then cheapest first. + SELECT="map(select((.driver_version|split(\".\")[0]|tonumber) >= ${MIN_DRIVER})) | sort_by(.dph_total)" OFFER_ID="" for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do vastai search offers "$QUERY" --raw -o dph_total > offers.json || true - OFFER_ID=$(jq -r 'sort_by(.dph_total) | .[0].id // empty' offers.json) - OFFER_PRICE=$(jq -r 'sort_by(.dph_total) | .[0].dph_total // empty' offers.json) + OFFER_ID=$(jq -r "$SELECT | .[0].id // empty" offers.json) + OFFER_PRICE=$(jq -r "$SELECT | .[0].dph_total // empty" offers.json) if [ -n "$OFFER_ID" ]; then echo "Selected offer $OFFER_ID at \$${OFFER_PRICE}/hr (attempt $attempt)" break @@ -157,7 +161,7 @@ jobs: sleep "$OFFER_INTERVAL" done if [ -z "$OFFER_ID" ]; then - echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)" + echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, driver>=${MIN_DRIVER}, <= \$${PRICE_CAP}/hr)" exit 1 fi echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT" @@ -184,6 +188,25 @@ jobs: echo "id=$IID" >> "$GITHUB_OUTPUT" echo "Created instance $IID" + - name: Attach SSH key to instance + env: + IID: ${{ steps.instance.outputs.id }} + KEY: ${{ steps.sshkey.outputs.key_path }} + run: | + # Attach the ephemeral pubkey to THIS instance only (added to its authorized_keys). + # It's removed when the instance is destroyed, so no account-level key to clean up. + # Retry: the instance may not accept the attach immediately after create. 
+ PUB="$(cat "$KEY.pub")" + for attempt in $(seq 1 12); do + if vastai attach ssh "$IID" "$PUB"; then + echo "Attached ssh key (attempt $attempt)"; exit 0 + fi + echo "attach failed (attempt $attempt/12); retrying in 10s..." + sleep 10 + done + echo "::error::Failed to attach ssh key to instance $IID" + exit 1 + - name: Wait for SSH id: ssh env: @@ -285,6 +308,15 @@ jobs: # Extract the result section for the PR comment (same marker bench-abba.yml uses). sed -n '/=== ABBA paired result/,$p' "$RUNNER_TEMP/abba_out.txt" > "$RUNNER_TEMP/abba_result.txt" + # Surface the result in the Actions run summary too (push/workflow_dispatch + # runs have no PR to comment on). + { + echo "## GPU ABBA — ethrex 20 transfers (vs main)" + echo '```' + cat "$RUNNER_TEMP/abba_result.txt" + echo '```' + } >> "$GITHUB_STEP_SUMMARY" + - name: Comment ABBA result on PR if: always() && github.event_name == 'issue_comment' uses: actions/github-script@v7 @@ -340,19 +372,8 @@ jobs: if [ -f "$RUNNER_TEMP/vast_instance_id" ]; then IID=$(cat "$RUNNER_TEMP/vast_instance_id") echo "Destroying instance $IID" - vastai destroy instance "$IID" || echo "::warning::destroy instance $IID failed — check the Vast console" + # --yes: skip the interactive [y/N] confirm (CI has no tty). + vastai destroy instance "$IID" --yes || echo "::warning::destroy instance $IID failed — check the Vast console" else echo "No instance id recorded; nothing to destroy." 
fi - - - name: Remove ephemeral SSH key - if: always() - env: - KEY_COMMENT: ${{ steps.sshkey.outputs.key_comment }} - run: | - [ -z "$KEY_COMMENT" ] && exit 0 - vastai show ssh-keys --raw > keys.json 2>/dev/null || exit 0 - for kid in $(jq -r --arg c "$KEY_COMMENT" '.[] | select(.public_key | contains($c)) | .id' keys.json); do - echo "Deleting ssh-key $kid" - vastai delete ssh-key "$kid" || true - done From f645901afff23317b78a73cd760a8bba739094a8 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Fri, 26 Jun 2026 19:18:59 -0300 Subject: [PATCH 08/17] fix: rebuild binaries --- .github/workflows/benchmark-gpu.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml index 69b290ec6..531000938 100644 --- a/.github/workflows/benchmark-gpu.yml +++ b/.github/workflows/benchmark-gpu.yml @@ -298,10 +298,12 @@ jobs: # bench_abba.sh builds the cli at REF_A and origin/main (isolated worktree), # runs PAIRS interleaved A/B/B/A proves, and prints the paired-t CI + Wilcoxon # verdict. BENCH_FEATURES routes the build through the CUDA prover path. + # REBUILD=1: each Vast box is fresh, GPU-specific hardware — always rebuild both + # binaries (PTX is compiled for the detected arch); never trust a cached binary. 
REMOTE="set -e; cd /workspace/lambda_vm; \ command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \ git fetch --force origin main; $FETCH; \ - SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \ + REBUILD=1 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \ scripts/bench_abba.sh $REF_A origin/main $PAIRS" $SSH "bash -lc \"$REMOTE\"" | tee "$RUNNER_TEMP/abba_out.txt" From 963966c0376eaa10c98ad2a002ef9672b4e98649 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Fri, 26 Jun 2026 19:25:54 -0300 Subject: [PATCH 09/17] fix: use correct sh --- .github/workflows/benchmark-gpu.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml index 531000938..e81c65f64 100644 --- a/.github/workflows/benchmark-gpu.yml +++ b/.github/workflows/benchmark-gpu.yml @@ -295,14 +295,17 @@ jobs: REF_A="origin/$BRANCH" fi - # bench_abba.sh builds the cli at REF_A and origin/main (isolated worktree), - # runs PAIRS interleaved A/B/B/A proves, and prints the paired-t CI + Wilcoxon - # verdict. BENCH_FEATURES routes the build through the CUDA prover path. + # The template clones the repo at the DEFAULT branch (main), so check out the PR + # ref first — otherwise we'd run main's bench_abba.sh (no BENCH_FEATURES => CPU + # build). bench_abba.sh then builds the cli at REF_A and origin/main (isolated + # worktree), runs PAIRS interleaved A/B/B/A proves, and prints the paired-t CI + + # Wilcoxon verdict. BENCH_FEATURES routes the build through the CUDA prover path. # REBUILD=1: each Vast box is fresh, GPU-specific hardware — always rebuild both # binaries (PTX is compiled for the detected arch); never trust a cached binary. 
REMOTE="set -e; cd /workspace/lambda_vm; \ command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \ git fetch --force origin main; $FETCH; \ + git checkout -f $REF_A; \ REBUILD=1 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \ scripts/bench_abba.sh $REF_A origin/main $PAIRS" From f09578ee4d604be371be03a941ca851fc4f28446 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Mon, 29 Jun 2026 11:07:36 -0300 Subject: [PATCH 10/17] fix: use 64gb ram --- .github/workflows/benchmark-gpu.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml index e81c65f64..7a90b0e0f 100644 --- a/.github/workflows/benchmark-gpu.yml +++ b/.github/workflows/benchmark-gpu.yml @@ -38,7 +38,7 @@ concurrency: cancel-in-progress: true env: - # Vast offer search: RTX 5090, >=16 cores, >=96GB RAM, >=64GB disk, verified + + # Vast offer search: RTX 5090, >=16 cores, >=64GB RAM, >=64GB disk, verified + # rentable, Blackwell-capable driver, <= cap. GPU_NAME: RTX_5090 PRICE_CAP: "3" @@ -143,8 +143,8 @@ jobs: MIN_DRIVER: "580" run: | # cpu_ram in the search filter is GB (the returned .cpu_ram field is MB — different - # units), so >=96 means 96 GB. >=96000 would mean 96000 GB and match nothing. - QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}" + # units), so >=64 means 64 GB. >=64000 would mean 64000 GB and match nothing. + QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}" echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)" # Keep only offers whose driver major >= MIN_DRIVER, then cheapest first. 
SELECT="map(select((.driver_version|split(\".\")[0]|tonumber) >= ${MIN_DRIVER})) | sort_by(.dph_total)" @@ -161,7 +161,7 @@ jobs: sleep "$OFFER_INTERVAL" done if [ -z "$OFFER_ID" ]; then - echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, driver>=${MIN_DRIVER}, <= \$${PRICE_CAP}/hr)" + echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=64GB RAM, >=64GB disk, driver>=${MIN_DRIVER}, <= \$${PRICE_CAP}/hr)" exit 1 fi echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT" From defc4da1caa09a132e772b2c2261517c6c7625b7 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Mon, 29 Jun 2026 11:48:11 -0300 Subject: [PATCH 11/17] fix: use expensive machine with $1 cap --- .github/workflows/benchmark-gpu.yml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml index 7a90b0e0f..a52c4704f 100644 --- a/.github/workflows/benchmark-gpu.yml +++ b/.github/workflows/benchmark-gpu.yml @@ -41,7 +41,7 @@ env: # Vast offer search: RTX 5090, >=16 cores, >=64GB RAM, >=64GB disk, verified + # rentable, Blackwell-capable driver, <= cap. GPU_NAME: RTX_5090 - PRICE_CAP: "3" + PRICE_CAP: "1" VAST_IMAGE_DISK: "64" # cli features for the ABBA build — the GPU (cuda) prover path plus jemalloc heap stats. BENCH_FEATURES: "jemalloc-stats,prover/cuda" @@ -142,12 +142,13 @@ jobs: # because vast can't numerically compare the driver_version string server-side. MIN_DRIVER: "580" run: | - # cpu_ram in the search filter is GB (the returned .cpu_ram field is MB — different - # units), so >=64 means 64 GB. >=64000 would mean 64000 GB and match nothing. + # cpu_ram filter is in GB. 
QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}" echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)" - # Keep only offers whose driver major >= MIN_DRIVER, then cheapest first. - SELECT="map(select((.driver_version|split(\".\")[0]|tonumber) >= ${MIN_DRIVER})) | sort_by(.dph_total)" + # Keep only offers whose driver major >= MIN_DRIVER, then most expensive first + # (within the price cap) — premium hosts have faster disks/network (quicker image + # pulls) and better reliability; the cheapest boxes were flaky. + SELECT="map(select((.driver_version|split(\".\")[0]|tonumber) >= ${MIN_DRIVER})) | sort_by(.dph_total) | reverse" OFFER_ID="" for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do vastai search offers "$QUERY" --raw -o dph_total > offers.json || true @@ -214,7 +215,8 @@ jobs: run: | echo "Waiting for instance $IID to reach 'running' with SSH endpoint..." HOST=""; PORT="" - for _ in $(seq 1 60); do # ~10 min + # The base CUDA image is large; some hosts sit in 'loading' (image pull) a while. 
+ for _ in $(seq 1 180); do # ~30 min vastai show instance "$IID" --raw > inst.json || true STATUS=$(jq -r '.actual_status // empty' inst.json) # We create with --direct, so SSH straight to the public IP + the host port From 1e0eb3952557203ed4a1164d5f772ba7065c8d45 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Mon, 29 Jun 2026 12:09:29 -0300 Subject: [PATCH 12/17] fix: remove temporary code --- .github/workflows/benchmark-gpu.yml | 43 +++++++++++++---------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml index a52c4704f..35948bb7b 100644 --- a/.github/workflows/benchmark-gpu.yml +++ b/.github/workflows/benchmark-gpu.yml @@ -19,14 +19,9 @@ on: inputs: pairs: description: "Number of A/B/B/A pairs" - default: "1" # TEMP(testing): fast runs; restore to "10" before merge + default: "14" issue_comment: types: [created] - # TEMP(testing): lets the workflow run from this branch before it's on the default - # branch (push uses the branch's own definition; issue_comment/workflow_dispatch do - # not). REMOVE this push trigger before merging. - push: - branches: [gpu_benchmarks] permissions: contents: read @@ -50,10 +45,7 @@ jobs: benchmark-gpu: runs-on: ubuntu-latest # Skip unless: workflow_dispatch, or a "/bench-gpu" comment from a privileged author. - # TEMP(testing): `github.event_name == 'push'` lets branch pushes run it pre-merge. - # REMOVE the push clause before merging. if: >- - github.event_name == 'push' || github.event_name == 'workflow_dispatch' || (github.event_name == 'issue_comment' && github.event.issue.pull_request && @@ -78,16 +70,16 @@ jobs: OUT_PR_NUM="$PR_NUM"; OUT_HEAD_SHA="$HEAD_SHA"; OUT_BRANCH="" # "/bench-gpu 20" -> 20 pairs; otherwise default. 
N=$(echo "$COMMENT_BODY" | sed -n 's|^/bench-gpu[[:space:]]*\([0-9]\+\).*|\1|p') - PAIRS=${N:-1} # TEMP(testing): default 1; restore to 10 before merge + PAIRS=${N:-14} else - # workflow_dispatch / push: compare this branch vs main. + # workflow_dispatch: compare this branch vs main. OUT_PR_NUM=""; OUT_HEAD_SHA=""; OUT_BRANCH="$DISPATCH_REF" - PAIRS=${DISPATCH_PAIRS:-1} # TEMP(testing): default 1; restore to 10 before merge + PAIRS=${DISPATCH_PAIRS:-14} fi - # TEMP(testing): clamp floor lowered to 1 for fast runs; restore to [2,40] before merge. - if [ "$PAIRS" -lt 1 ] 2>/dev/null || [ "$PAIRS" -gt 40 ] 2>/dev/null; then - echo "::warning::pair count out of range [1,40], defaulting to 1" - PAIRS=1 + # Clamp to [2,40] (even is ideal so AB/BA orders balance). 14 ~ resolves a 2% delta. + if [ "$PAIRS" -lt 2 ] 2>/dev/null || [ "$PAIRS" -gt 40 ] 2>/dev/null; then + echo "::warning::pair count out of range [2,40], defaulting to 14" + PAIRS=14 fi { echo "pr_num=$OUT_PR_NUM" @@ -297,17 +289,20 @@ jobs: REF_A="origin/$BRANCH" fi - # The template clones the repo at the DEFAULT branch (main), so check out the PR - # ref first — otherwise we'd run main's bench_abba.sh (no BENCH_FEATURES => CPU - # build). bench_abba.sh then builds the cli at REF_A and origin/main (isolated - # worktree), runs PAIRS interleaved A/B/B/A proves, and prints the paired-t CI + - # Wilcoxon verdict. BENCH_FEATURES routes the build through the CUDA prover path. + # Run main's bench_abba.sh — the harness is the pinned measurement methodology, so a + # PR can't alter how its own benchmark is computed. (The template clones the default + # branch, so checking out origin/main is also what's already there; this makes it + # explicit and robust to the template default changing.) The harness still builds the + # cli at REF_A (the PR) and origin/main in isolated worktrees, runs PAIRS interleaved + # A/B/B/A proves, and prints the paired-t CI + Wilcoxon verdict. 
BENCH_FEATURES routes + # the build through the CUDA prover path. NOTE: requires this PR's bench_abba.sh change + # (the BENCH_FEATURES env) to be on main — i.e. it only takes effect after merge. # REBUILD=1: each Vast box is fresh, GPU-specific hardware — always rebuild both # binaries (PTX is compiled for the detected arch); never trust a cached binary. REMOTE="set -e; cd /workspace/lambda_vm; \ command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \ git fetch --force origin main; $FETCH; \ - git checkout -f $REF_A; \ + git checkout -f origin/main; \ REBUILD=1 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \ scripts/bench_abba.sh $REF_A origin/main $PAIRS" @@ -315,8 +310,8 @@ jobs: # Extract the result section for the PR comment (same marker bench-abba.yml uses). sed -n '/=== ABBA paired result/,$p' "$RUNNER_TEMP/abba_out.txt" > "$RUNNER_TEMP/abba_result.txt" - # Surface the result in the Actions run summary too (push/workflow_dispatch - # runs have no PR to comment on). + # Surface the result in the Actions run summary too (workflow_dispatch runs + # have no PR to comment on). 
{ echo "## GPU ABBA — ethrex 20 transfers (vs main)" echo '```' From b075c2ac32ce2175c08d16f506ef9053cbc32118 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Mon, 29 Jun 2026 12:42:06 -0300 Subject: [PATCH 13/17] fix: apply code review --- .github/workflows/benchmark-gpu.yml | 70 ++++++++++++++++++++++------- scripts/bench_abba.sh | 17 ++++--- 2 files changed, 65 insertions(+), 22 deletions(-) diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml index 35948bb7b..564c3c87b 100644 --- a/.github/workflows/benchmark-gpu.yml +++ b/.github/workflows/benchmark-gpu.yml @@ -6,7 +6,7 @@ name: Benchmark GPU (PR) # It builds the cli at the PR head and at main, runs N interleaved pairs on the GPU, # posts the paired-t + Wilcoxon verdict back to the PR, then ALWAYS destroys the box. # -# Triggered by a "/bench-gpu [N]" comment on a PR (N = pair count, default 10) or via +# Triggered by a "/bench-gpu [N]" comment on a PR (N = pair count, default 14) or via # workflow_dispatch. Orchestration runs on a GitHub-hosted runner; all GPU work happens # on the rented Vast box (provisioned by the template onstart). # @@ -40,6 +40,11 @@ env: VAST_IMAGE_DISK: "64" # cli features for the ABBA build — the GPU (cuda) prover path plus jemalloc heap stats. BENCH_FEATURES: "jemalloc-stats,prover/cuda" + # Unique per-run label set on the instance, for easy identification in the Vast console. + RUN_LABEL: "gpu-bench-${{ github.run_id }}-${{ github.run_attempt }}" + # Pin the Vast CLI to an immutable commit (a PyPI version can be re-published; a commit + # hash can't) — avoids pulling untrusted code at run time. 
+ VAST_CLI_COMMIT: "28494d92c6c03d887f8375085243c22eb68c5874" jobs: benchmark-gpu: @@ -100,18 +105,38 @@ jobs: owner: context.repo.owner, repo: context.repo.repo, comment_id: context.payload.comment.id, content: 'eyes' }); - await github.rest.issues.createComment({ + // Post the "started" notice under the SAME marker the result step uses, so the + // result updates this comment in place (and re-runs reuse it rather than stacking). + const marker = 'GPU Benchmark (ABBA)'; + const body = `## GPU Benchmark (ABBA) — running…\n\n⏳ Renting an RTX 5090 on Vast.ai and running ${process.env.PAIRS} interleaved pairs (PR vs main) on the CUDA prover path. This takes ~1 hr; the result will replace this comment.`; + const comments = await github.paginate(github.rest.issues.listComments, { owner: context.repo.owner, repo: context.repo.repo, - issue_number: context.issue.number, - body: `⏳ **GPU ABBA started** — renting an RTX 5090 on Vast.ai and running ${process.env.PAIRS} interleaved pairs (PR vs main) on the CUDA prover path. This takes ~1 hr; results will be posted here.` + issue_number: context.issue.number, per_page: 100, }); + const existing = comments.find(c => c.user.type === 'Bot' && c.body.includes(marker)); + if (existing) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, repo: context.repo.repo, + comment_id: existing.id, body, + }); + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, repo: context.repo.repo, + issue_number: context.issue.number, body, + }); + } - name: Install Vast CLI + # No secrets in this step's env: install-time code can't read the API key during pip + # install. Pinned to an immutable commit (see VAST_CLI_COMMIT) for the same reason. + # --break-system-packages: the ephemeral runner's Python may be PEP-668 "externally + # managed"; safe to override on a disposable runner. 
+ run: pip install --quiet --break-system-packages "git+https://github.com/vast-ai/vast-cli.git@${VAST_CLI_COMMIT}" + + - name: Authenticate Vast CLI env: VAST_API_KEY: ${{ secrets.VAST_API_KEY }} - run: | - pip install --quiet --upgrade vastai - vastai set api-key "$VAST_API_KEY" + run: vastai set api-key "$VAST_API_KEY" - name: Generate ephemeral SSH key id: sshkey @@ -140,7 +165,9 @@ jobs: # Keep only offers whose driver major >= MIN_DRIVER, then most expensive first # (within the price cap) — premium hosts have faster disks/network (quicker image # pulls) and better reliability; the cheapest boxes were flaky. - SELECT="map(select((.driver_version|split(\".\")[0]|tonumber) >= ${MIN_DRIVER})) | sort_by(.dph_total) | reverse" + # `try ... catch 0` so a malformed/null driver_version on one offer is treated as 0 + # (filtered out) rather than erroring the whole jq and wasting the attempt. + SELECT="map(select((try (.driver_version|split(\".\")[0]|tonumber) catch 0) >= ${MIN_DRIVER})) | sort_by(.dph_total) | reverse" OFFER_ID="" for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do vastai search offers "$QUERY" --raw -o dph_total > offers.json || true @@ -169,6 +196,7 @@ jobs: vastai create instance "$OFFER_ID" \ --template_hash "$VAST_TEMPLATE_HASH" \ --disk "$VAST_IMAGE_DISK" \ + --label "$RUN_LABEL" \ --ssh --direct --raw > create.json cat create.json IID=$(jq -r '.new_contract // .instances.new_contract // empty' create.json) @@ -179,7 +207,7 @@ jobs: # Persist immediately so teardown runs even if later steps fail. echo "$IID" > "$RUNNER_TEMP/vast_instance_id" echo "id=$IID" >> "$GITHUB_OUTPUT" - echo "Created instance $IID" + echo "Created instance $IID (label $RUN_LABEL)" - name: Attach SSH key to instance env: @@ -255,11 +283,13 @@ jobs: if $SSH 'grep -q "=== done ===" /var/log/onstart.log 2>/dev/null'; then echo "onstart reported done"; exit 0 fi - # shellcheck disable=SC2016 # $HOME/$(...) 
must expand on the remote box, not the runner + # Fallback if the log marker isn't found: the late-stage artifacts (cargo + the + # sysroot + the cloned repo) imply the earlier Rust/LLVM/toolchain install finished. + # Deliberately no toolchain-date check — it would go stale when the repo bumps nightly. + # shellcheck disable=SC2016 # $HOME must expand on the remote box, not the runner if $SSH 'test -x "$HOME/.cargo/bin/cargo" \ && test -f /opt/lambda-vm-sysroot/include/stdlib.h \ - && test -d /workspace/lambda_vm/.git \ - && "$HOME/.cargo/bin/rustup" toolchain list 2>/dev/null | grep -q nightly-2026-02-01'; then + && test -d /workspace/lambda_vm/.git'; then echo "provisioning artifacts present"; exit 0 fi sleep 10 @@ -306,6 +336,9 @@ jobs: REBUILD=1 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \ scripts/bench_abba.sh $REF_A origin/main $PAIRS" + # pipefail so a failed remote bench (e.g. a prove that dies) propagates through the + # tee pipe and fails this step, instead of being masked by tee's exit 0. + set -o pipefail $SSH "bash -lc \"$REMOTE\"" | tee "$RUNNER_TEMP/abba_out.txt" # Extract the result section for the PR comment (same marker bench-abba.yml uses). sed -n '/=== ABBA paired result/,$p' "$RUNNER_TEMP/abba_out.txt" > "$RUNNER_TEMP/abba_result.txt" @@ -349,9 +382,9 @@ jobs: body += `❌ Run failed. 
Last log lines:\n\n` + '```\n' + tail + '\n```\n'; } - const { data: comments } = await github.rest.issues.listComments({ + const comments = await github.paginate(github.rest.issues.listComments, { owner: context.repo.owner, repo: context.repo.repo, - issue_number: context.issue.number, + issue_number: context.issue.number, per_page: 100, }); const marker = 'GPU Benchmark (ABBA)'; const existing = comments.find(c => c.user.type === 'Bot' && c.body.includes(marker)); @@ -374,8 +407,15 @@ jobs: if [ -f "$RUNNER_TEMP/vast_instance_id" ]; then IID=$(cat "$RUNNER_TEMP/vast_instance_id") echo "Destroying instance $IID" + # Retry transient failures (network/auth) so a paid box isn't stranded. # --yes: skip the interactive [y/N] confirm (CI has no tty). - vastai destroy instance "$IID" --yes || echo "::warning::destroy instance $IID failed — check the Vast console" + destroyed="" + for attempt in 1 2 3; do + if vastai destroy instance "$IID" --yes; then destroyed=1; break; fi + echo "destroy attempt $attempt failed; retrying in 10s..." + sleep 10 + done + [ -n "$destroyed" ] || echo "::warning::Failed to destroy instance $IID after 3 attempts — check the Vast console (label $RUN_LABEL)" else echo "No instance id recorded; nothing to destroy." fi diff --git a/scripts/bench_abba.sh b/scripts/bench_abba.sh index 57fab5e28..9bcbb39cc 100755 --- a/scripts/bench_abba.sh +++ b/scripts/bench_abba.sh @@ -94,10 +94,12 @@ INPUT="$(cd "$(dirname "$INPUT_REL")" && pwd)/$(basename "$INPUT_REL")" need_build=0 if [ "${REBUILD:-0}" = "1" ] || [ ! -x "$WORK/cli_A" ] || [ ! -x "$WORK/cli_B" ]; then need_build=1 -elif [ "$(cat "$WORK/cli_A.sha" 2>/dev/null)" != "$SHA_A" ] || [ "$(cat "$WORK/cli_B.sha" 2>/dev/null)" != "$SHA_B" ]; then - # Cache persists on the self-hosted runner; rebuild if it's for different refs - # (a different PR, or main advanced) so we never benchmark stale binaries. - echo "==> Cached binaries are for different refs; rebuilding." 
+elif [ "$(cat "$WORK/cli_A.sha" 2>/dev/null)" != "$SHA_A $BENCH_FEATURES" ] || \ + [ "$(cat "$WORK/cli_B.sha" 2>/dev/null)" != "$SHA_B $BENCH_FEATURES" ]; then + # Cache persists on the self-hosted runner; rebuild if it's for different refs (a + # different PR, or main advanced) OR a different feature set (e.g. CPU vs prover/cuda), + # so we never benchmark stale binaries. The marker stores "<sha> <features>". + echo "==> Cached binaries are for different refs/features; rebuilding." need_build=1 fi if [ "$need_build" = "1" ]; then @@ -115,15 +117,16 @@ if [ "$need_build" = "1" ]; then exit 1 fi cp "$WT/target/release/cli" "$WORK/$2" - echo "$1" > "$WORK/$2.sha" + # Marker = "<sha> <features>" so the cache invalidates on either changing. + echo "$1 $BENCH_FEATURES" > "$WORK/$2.sha" } build_cli "$SHA_B" cli_B build_cli "$SHA_A" cli_A cleanup trap - EXIT else - echo "==> Reusing cached binaries (SHAs match requested refs; REBUILD=1 to force):" - echo " cli_A=${SHA_A:0:10} cli_B=${SHA_B:0:10}" + echo "==> Reusing cached binaries (refs + features match; REBUILD=1 to force):" + echo " cli_A=${SHA_A:0:10} cli_B=${SHA_B:0:10} features=$BENCH_FEATURES" fi # --- 3. Interleaved A/B/B/A measurement (fresh CSV -- pre-committed batch) --- From d85fb3778bb0e740d613176810f91f1be38fdaaf Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Mon, 29 Jun 2026 12:46:17 -0300 Subject: [PATCH 14/17] test: run on push --- .github/workflows/benchmark-gpu.yml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml index 564c3c87b..40985bdf5 100644 --- a/.github/workflows/benchmark-gpu.yml +++ b/.github/workflows/benchmark-gpu.yml @@ -22,6 +22,9 @@ on: default: "14" issue_comment: types: [created] + # TEMP(testing): run from this branch pre-merge. REMOVE before merging.
+ push: + branches: [gpu_benchmarks] permissions: contents: read @@ -50,7 +53,9 @@ jobs: benchmark-gpu: runs-on: ubuntu-latest # Skip unless: workflow_dispatch, or a "/bench-gpu" comment from a privileged author. + # TEMP(testing): the `push` clause lets branch pushes run it pre-merge. REMOVE before merging. if: >- + github.event_name == 'push' || github.event_name == 'workflow_dispatch' || (github.event_name == 'issue_comment' && github.event.issue.pull_request && @@ -79,7 +84,7 @@ jobs: else # workflow_dispatch: compare this branch vs main. OUT_PR_NUM=""; OUT_HEAD_SHA=""; OUT_BRANCH="$DISPATCH_REF" - PAIRS=${DISPATCH_PAIRS:-14} + PAIRS=${DISPATCH_PAIRS:-2} # TEMP(testing): 2 pairs for a fast push test; restore to 14 before merging fi # Clamp to [2,40] (even is ideal so AB/BA orders balance). 14 ~ resolves a 2% delta. if [ "$PAIRS" -lt 2 ] 2>/dev/null || [ "$PAIRS" -gt 40 ] 2>/dev/null; then @@ -329,10 +334,12 @@ jobs: # (the BENCH_FEATURES env) to be on main — i.e. it only takes effect after merge. # REBUILD=1: each Vast box is fresh, GPU-specific hardware — always rebuild both # binaries (PTX is compiled for the detected arch); never trust a cached binary. + # TEMP(testing): checks out $REF_A (the branch harness) so the GPU path is exercised + # pre-merge; main's harness lacks BENCH_FEATURES. RESTORE to `origin/main` before merging. 
REMOTE="set -e; cd /workspace/lambda_vm; \ command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \ git fetch --force origin main; $FETCH; \ - git checkout -f origin/main; \ + git checkout -f $REF_A; \ REBUILD=1 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \ scripts/bench_abba.sh $REF_A origin/main $PAIRS" From ecf615d3dd126f0cc8c3424417c7a111becfa0e5 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Mon, 29 Jun 2026 13:06:19 -0300 Subject: [PATCH 15/17] fix: cuda --- .github/workflows/benchmark-gpu.yml | 6 +++++- scripts/bench_abba.sh | 12 +++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml index 40985bdf5..2bf175e22 100644 --- a/.github/workflows/benchmark-gpu.yml +++ b/.github/workflows/benchmark-gpu.yml @@ -336,11 +336,15 @@ jobs: # binaries (PTX is compiled for the detected arch); never trust a cached binary. # TEMP(testing): checks out $REF_A (the branch harness) so the GPU path is exercised # pre-merge; main's harness lacks BENCH_FEATURES. RESTORE to `origin/main` before merging. + # CUDARC_PIN: pin cudarc to a fixed CUDA version so it doesn't bind symbols newer + # than the box's driver (the boxes are CUDA 13.0; fallback-latest requested a symbol + # they don't export). nvidia-smi is logged for diagnosing driver issues. REMOTE="set -e; cd /workspace/lambda_vm; \ command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \ + nvidia-smi || true; \ git fetch --force origin main; $FETCH; \ git checkout -f $REF_A; \ - REBUILD=1 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \ + REBUILD=1 CUDARC_PIN=cuda-12080 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \ scripts/bench_abba.sh $REF_A origin/main $PAIRS" # pipefail so a failed remote bench (e.g. 
a prove that dies) propagates through the diff --git a/scripts/bench_abba.sh b/scripts/bench_abba.sh index 9bcbb39cc..950b11ffa 100755 --- a/scripts/bench_abba.sh +++ b/scripts/bench_abba.sh @@ -110,7 +110,17 @@ if [ "$need_build" = "1" ]; then git worktree add --detach "$WT" "$SHA_B" >/dev/null build_cli() { # $1=sha $2=out (shared target dir -> 2nd build is incremental) echo "==> Building cli @ ${1:0:10} -> $2 (features: $BENCH_FEATURES)" - git -C "$WT" checkout --quiet "$1" + # -f: discard any prior worktree edit (e.g. the CUDARC_PIN sed below) before switching + # refs, so the checkout can't conflict. + git -C "$WT" checkout --quiet -f "$1" + # CUDARC_PIN: pin math-cuda's cudarc to a fixed CUDA version and drop fallback-latest, so + # cudarc binds a known driver-symbol set instead of its newest (which can request symbols + # the rented box's driver doesn't export, e.g. cuDevSmResourceSplit -> runtime panic). + if [ -n "${CUDARC_PIN:-}" ]; then + sed -i "s/\"cuda-version-from-build-system\"/\"${CUDARC_PIN}\"/; /\"fallback-latest\"/d" \ + "$WT/crypto/math-cuda/Cargo.toml" + echo " cudarc pinned to ${CUDARC_PIN}" + fi if ! ( cd "$WT" && cargo build --release -p cli --features "$BENCH_FEATURES" >"$WORK/build_$2.log" 2>&1 ); then echo "ERROR: cargo build failed for $2 (@ ${1:0:10}). 
Tail of $WORK/build_$2.log:" >&2 tail -40 "$WORK/build_$2.log" >&2 From 6e6a6105c6a866ad6b6e5714aed0827f8f49bae7 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Mon, 29 Jun 2026 13:25:22 -0300 Subject: [PATCH 16/17] remove test setup --- .github/workflows/benchmark-gpu.yml | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml index 2bf175e22..661685f67 100644 --- a/.github/workflows/benchmark-gpu.yml +++ b/.github/workflows/benchmark-gpu.yml @@ -22,9 +22,6 @@ on: default: "14" issue_comment: types: [created] - # TEMP(testing): run from this branch pre-merge. REMOVE before merging. - push: - branches: [gpu_benchmarks] permissions: contents: read @@ -53,9 +50,7 @@ jobs: benchmark-gpu: runs-on: ubuntu-latest # Skip unless: workflow_dispatch, or a "/bench-gpu" comment from a privileged author. - # TEMP(testing): the `push` clause lets branch pushes run it pre-merge. REMOVE before merging. if: >- - github.event_name == 'push' || github.event_name == 'workflow_dispatch' || (github.event_name == 'issue_comment' && github.event.issue.pull_request && @@ -84,7 +79,7 @@ jobs: else # workflow_dispatch: compare this branch vs main. OUT_PR_NUM=""; OUT_HEAD_SHA=""; OUT_BRANCH="$DISPATCH_REF" - PAIRS=${DISPATCH_PAIRS:-2} # TEMP(testing): 2 pairs for a fast push test; restore to 14 before merging + PAIRS=${DISPATCH_PAIRS:-14} fi # Clamp to [2,40] (even is ideal so AB/BA orders balance). 14 ~ resolves a 2% delta. if [ "$PAIRS" -lt 2 ] 2>/dev/null || [ "$PAIRS" -gt 40 ] 2>/dev/null; then @@ -334,8 +329,6 @@ jobs: # (the BENCH_FEATURES env) to be on main — i.e. it only takes effect after merge. # REBUILD=1: each Vast box is fresh, GPU-specific hardware — always rebuild both # binaries (PTX is compiled for the detected arch); never trust a cached binary. 
- # TEMP(testing): checks out $REF_A (the branch harness) so the GPU path is exercised - # pre-merge; main's harness lacks BENCH_FEATURES. RESTORE to `origin/main` before merging. # CUDARC_PIN: pin cudarc to a fixed CUDA version so it doesn't bind symbols newer # than the box's driver (the boxes are CUDA 13.0; fallback-latest requested a symbol # they don't export). nvidia-smi is logged for diagnosing driver issues. @@ -343,7 +336,7 @@ jobs: command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \ nvidia-smi || true; \ git fetch --force origin main; $FETCH; \ - git checkout -f $REF_A; \ + git checkout -f origin/main; \ REBUILD=1 CUDARC_PIN=cuda-12080 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \ scripts/bench_abba.sh $REF_A origin/main $PAIRS" From 80b6963450a96dbc930d4667446e5e9cc8c135eb Mon Sep 17 00:00:00 2001 From: Mauro Toscano <12560266+MauroToscano@users.noreply.github.com> Date: Mon, 29 Jun 2026 16:23:53 -0300 Subject: [PATCH 17/17] ci(bench-gpu): harden teardown, cap pairs at 32, fix CUDA comment (#736) Review follow-ups on the GPU benchmark workflow: - Teardown: fall back to destroying by the unique RUN_LABEL when no instance id was recorded. The id file is written only after `create` succeeds and its JSON parses, so a box created in that window (concurrency cancel, or a parse failure) could otherwise leak and bill indefinitely. - Cap pairs at 32 (was 40) and round odd requests up to even (the AB/BA design wants even N); raise the job timeout to 210 min so a worst-case 32-pair run (64 proves + slow provisioning + dual CUDA build) fits without timing out after the expensive build. - Fix the CUDARC_PIN comment: the boxes are ~CUDA 12.8 (matching cuda-12080 and the cuda_max_good>=12.8 offer floor), not 13.0; tie it to the MIN_DRIVER guard as the opposite end of the same compatibility window. 
- Log only the needed fields of create.json instead of the full --raw response, so an unexpected sensitive field can't land in the run log. - Validate the workflow_dispatch branch name before it is interpolated into the remote `bash -lc` command. - Move the run-summary write into an always() step so workflow_dispatch failures are visible in the Actions summary rather than only the raw step log. --- .github/workflows/benchmark-gpu.yml | 99 ++++++++++++++++++++++------- 1 file changed, 76 insertions(+), 23 deletions(-) diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml index 661685f67..1e2ef01b1 100644 --- a/.github/workflows/benchmark-gpu.yml +++ b/.github/workflows/benchmark-gpu.yml @@ -56,8 +56,10 @@ jobs: github.event.issue.pull_request && startsWith(github.event.comment.body, '/bench-gpu') && contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association)) - # ABBA on the GPU: dual cuda build (~15-30 min) + 2*pairs proves (~77s each). - timeout-minutes: 180 + # ABBA on the GPU: provisioning + dual cuda build (~30 min) + 2*pairs proves + # (~95s each). At the max 32 pairs (64 proves) a slow-provision box runs ~3 hr, + # so allow headroom over that; teardown still always destroys the box. + timeout-minutes: 210 steps: - name: Resolve PR ref + pair count id: config @@ -81,11 +83,18 @@ jobs: OUT_PR_NUM=""; OUT_HEAD_SHA=""; OUT_BRANCH="$DISPATCH_REF" PAIRS=${DISPATCH_PAIRS:-14} fi - # Clamp to [2,40] (even is ideal so AB/BA orders balance). 14 ~ resolves a 2% delta. - if [ "$PAIRS" -lt 2 ] 2>/dev/null || [ "$PAIRS" -gt 40 ] 2>/dev/null; then - echo "::warning::pair count out of range [2,40], defaulting to 14" + # Clamp to [2,32]; out-of-range -> default. 14 ~ resolves a 2% delta. The ceiling + # keeps the worst-case run (64 proves + provisioning + dual build) under the job + # timeout above. 
+ if [ "$PAIRS" -lt 2 ] 2>/dev/null || [ "$PAIRS" -gt 32 ] 2>/dev/null; then + echo "::warning::pair count out of range [2,32], defaulting to 14" PAIRS=14 fi + # Even is ideal so the AB/BA orders balance; round an odd request up by one. + if [ "$((PAIRS % 2))" -ne 0 ]; then + PAIRS=$((PAIRS + 1)) + echo "::notice::rounded odd pair count up to $PAIRS so AB/BA orders balance" + fi { echo "pr_num=$OUT_PR_NUM" echo "head_sha=$OUT_HEAD_SHA" @@ -198,7 +207,9 @@ jobs: --disk "$VAST_IMAGE_DISK" \ --label "$RUN_LABEL" \ --ssh --direct --raw > create.json - cat create.json + # Log only the fields we need rather than the full --raw response, which could carry + # an unexpected sensitive field into the (collaborator-/world-readable) run log. + jq '{success, new_contract: (.new_contract // .instances.new_contract)}' create.json IID=$(jq -r '.new_contract // .instances.new_contract // empty' create.json) if [ -z "$IID" ]; then echo "::error::Failed to create Vast instance" @@ -315,6 +326,12 @@ jobs: FETCH="git fetch --force origin refs/pull/$PR_NUM/head" REF_A="$HEAD_SHA" else + # Reject anything outside the git-ref-safe charset before it reaches the remote + # `bash -lc` (defense-in-depth; workflow_dispatch is write-access only, but never + # interpolate an unvalidated ref into a remote shell command). + case "$BRANCH" in + ''|*[!A-Za-z0-9._/-]*) echo "::error::invalid branch name: '$BRANCH'"; exit 1 ;; + esac FETCH="git fetch --force origin $BRANCH" REF_A="origin/$BRANCH" fi @@ -329,9 +346,13 @@ jobs: # (the BENCH_FEATURES env) to be on main — i.e. it only takes effect after merge. # REBUILD=1: each Vast box is fresh, GPU-specific hardware — always rebuild both # binaries (PTX is compiled for the detected arch); never trust a cached binary. - # CUDARC_PIN: pin cudarc to a fixed CUDA version so it doesn't bind symbols newer - # than the box's driver (the boxes are CUDA 13.0; fallback-latest requested a symbol - # they don't export). 
nvidia-smi is logged for diagnosing driver issues. + # CUDARC_PIN: pin cudarc to a fixed CUDA version (cuda-12080 = CUDA 12.8, matching the + # cuda_max_good>=12.8 offer floor) and drop fallback-latest, so cudarc binds a known + # symbol set instead of its newest. With fallback-latest cudarc requested a symbol the + # box's driver doesn't export (e.g. cuDevSmResourceSplit) -> runtime panic. This is the + # too-new end of the same compatibility window that MIN_DRIVER>=580 guards at the + # too-old end (older drivers lack cuCtxGetDevice_v2 and the GPU path falls back to CPU). + # nvidia-smi is logged for diagnosing driver issues. REMOTE="set -e; cd /workspace/lambda_vm; \ command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \ nvidia-smi || true; \ @@ -347,13 +368,25 @@ jobs: # Extract the result section for the PR comment (same marker bench-abba.yml uses). sed -n '/=== ABBA paired result/,$p' "$RUNNER_TEMP/abba_out.txt" > "$RUNNER_TEMP/abba_result.txt" - # Surface the result in the Actions run summary too (workflow_dispatch runs - # have no PR to comment on). + - name: Write run summary + # Always run so a failure (incl. workflow_dispatch, which has no PR comment step) is + # visible in the Actions run summary instead of only the raw step log. + if: always() && (steps.bench.outcome == 'success' || steps.bench.outcome == 'failure') + env: + OUTCOME: ${{ steps.bench.outcome }} + run: | { echo "## GPU ABBA — ethrex 20 transfers (vs main)" - echo '```' - cat "$RUNNER_TEMP/abba_result.txt" - echo '```' + if [ "$OUTCOME" = "success" ] && [ -s "$RUNNER_TEMP/abba_result.txt" ]; then + echo '```' + cat "$RUNNER_TEMP/abba_result.txt" + echo '```' + else + echo "❌ Run outcome: ${OUTCOME:-unknown}. 
Last log lines:" + echo '```' + tail -n 30 "$RUNNER_TEMP/abba_out.txt" 2>/dev/null || echo "(no output captured)" + echo '```' + fi } >> "$GITHUB_STEP_SUMMARY" - name: Comment ABBA result on PR @@ -408,18 +441,38 @@ jobs: - name: Destroy instance if: always() run: | - if [ -f "$RUNNER_TEMP/vast_instance_id" ]; then - IID=$(cat "$RUNNER_TEMP/vast_instance_id") - echo "Destroying instance $IID" - # Retry transient failures (network/auth) so a paid box isn't stranded. - # --yes: skip the interactive [y/N] confirm (CI has no tty). - destroyed="" + # Retry transient failures (network/auth) so a paid box isn't stranded. + # --yes: skip the interactive [y/N] confirm (CI has no tty). + destroy() { + iid="$1"; destroyed="" for attempt in 1 2 3; do - if vastai destroy instance "$IID" --yes; then destroyed=1; break; fi + if vastai destroy instance "$iid" --yes; then destroyed=1; break; fi echo "destroy attempt $attempt failed; retrying in 10s..." sleep 10 done - [ -n "$destroyed" ] || echo "::warning::Failed to destroy instance $IID after 3 attempts — check the Vast console (label $RUN_LABEL)" + [ -n "$destroyed" ] || echo "::warning::Failed to destroy instance $iid after 3 attempts — check the Vast console (label $RUN_LABEL)" + } + if [ -f "$RUNNER_TEMP/vast_instance_id" ]; then + IID=$(cat "$RUNNER_TEMP/vast_instance_id") + echo "Destroying instance $IID" + destroy "$IID" else - echo "No instance id recorded; nothing to destroy." + # The id file is written only AFTER create succeeds AND its JSON parses, so a box can + # exist unrecorded if the run was cancelled in that window (concurrency cancel) or the + # parse failed. Fall back to destroying by our unique RUN_LABEL so the box can't leak + # (bill indefinitely). RUN_LABEL is unique per run, so this never touches another run's box. + echo "No instance id recorded; searching Vast for any box labelled $RUN_LABEL..." 
+ vastai show instances --raw > all_inst.json 2>/dev/null || echo '[]' > all_inst.json + # Tolerate either a bare array or {instances:[...]}; match our exact label. + LEAKED=$(jq -r --arg L "$RUN_LABEL" \ + '(if type=="array" then . else (.instances // []) end) | .[] | select(.label == $L) | .id' \ + all_inst.json 2>/dev/null || true) + if [ -z "$LEAKED" ]; then + echo "No instance labelled $RUN_LABEL found; nothing to destroy." + else + for IID in $LEAKED; do + echo "Destroying leaked instance $IID (label $RUN_LABEL)" + destroy "$IID" + done + fi fi