From 158278c25aae372d6b7e7c835c108206e599b311 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Fri, 26 Jun 2026 16:07:44 -0300
Subject: [PATCH 01/17] ci: add gpu benchmarks

---
 .github/workflows/benchmark-gpu.yml | 337 ++++++++++++++++++++++++++++
 .github/workflows/benchmark-pr.yml  |   1 +
 infra/gpu_bench.sh                  |  95 ++++++++
 3 files changed, 433 insertions(+)
 create mode 100644 .github/workflows/benchmark-gpu.yml
 create mode 100755 infra/gpu_bench.sh

diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
new file mode 100644
index 000000000..abdea98a5
--- /dev/null
+++ b/.github/workflows/benchmark-gpu.yml
@@ -0,0 +1,337 @@
+name: Benchmark GPU (PR)
+
+# Rent an RTX 5090 on Vast.ai (hourly), run the SAME headline ethrex prover
+# benchmark as benchmark-pr.yml but with the CUDA path enabled, post the absolute
+# GPU numbers back to the PR, then always destroy the instance.
+#
+# Triggered by a "/bench-gpu [N]" comment on a PR (N = prove iterations, default 3),
+# or via workflow_dispatch. The orchestration runs on a GitHub-hosted runner; all the
+# GPU work happens on the rented Vast box (provisioned by the template onstart).
+#
+# Requires repo secrets:
+#   VAST_API_KEY        — https://cloud.vast.ai/manage-keys/
+#   VAST_TEMPLATE_HASH  — hash of the "NVIDIA CUDA Lambda VM 64GB" template
+
+on:
+  workflow_dispatch:
+    inputs:
+      runs:
+        description: "Number of prove iterations"
+        default: "3"
+  issue_comment:
+    types: [created]
+
+permissions:
+  contents: read
+  pull-requests: write
+
+concurrency:
+  group: benchmark-gpu-${{ github.event.issue.number || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  # Vast offer search: datacenter RTX 5090, >=16 cores, >=32GB RAM, >=64GB disk,
+  # verified + rentable, Blackwell-capable driver, under the price cap ($/hr).
+  GPU_NAME: RTX_5090
+  PRICE_CAP: "3"
+  VAST_IMAGE_DISK: "64"
+
+jobs:
+  benchmark-gpu:
+    runs-on: ubuntu-latest
+    # Skip unless: workflow_dispatch, or a "/bench-gpu" comment from a privileged author.
+    if: >-
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'issue_comment' &&
+       github.event.issue.pull_request &&
+       startsWith(github.event.comment.body, '/bench-gpu') &&
+       contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association))
+    steps:
+      - name: React to comment
+        if: github.event_name == 'issue_comment'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            await github.rest.reactions.createForIssueComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              comment_id: context.payload.comment.id,
+              content: 'eyes'
+            });
+
+      - name: Resolve PR ref + run count
+        id: config
+        env:
+          GH_TOKEN: ${{ github.token }}
+          EVENT_NAME: ${{ github.event_name }}
+          COMMENT_BODY: ${{ github.event.comment.body }}
+          PR_NUM: ${{ github.event.issue.number }}
+          DISPATCH_RUNS: ${{ github.event.inputs.runs }}
+          DISPATCH_REF: ${{ github.ref_name }}
+        run: |
+          if [ "$EVENT_NAME" = "issue_comment" ]; then
+            SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
+            echo "pr_num=$PR_NUM" >> "$GITHUB_OUTPUT"
+            echo "sha=$SHA" >> "$GITHUB_OUTPUT"
+            # "/bench-gpu 5" -> 5 iterations; otherwise default.
+            N=$(echo "$COMMENT_BODY" | sed -n 's|^/bench-gpu[[:space:]]*\([0-9]\+\).*|\1|p')
+            RUNS=${N:-3}
+          else
+            echo "pr_num=" >> "$GITHUB_OUTPUT"
+            echo "sha=$DISPATCH_REF" >> "$GITHUB_OUTPUT"
+            RUNS=${DISPATCH_RUNS:-3}
+          fi
+          # Clamp to [1,10].
+          if [ "$RUNS" -lt 1 ] 2>/dev/null || [ "$RUNS" -gt 10 ] 2>/dev/null; then
+            echo "::warning::run count out of range, defaulting to 3"
+            RUNS=3
+          fi
+          echo "runs=$RUNS" >> "$GITHUB_OUTPUT"
+          echo "Using $RUNS prove iteration(s)"
+
+      - name: Install Vast CLI
+        env:
+          VAST_API_KEY: ${{ secrets.VAST_API_KEY }}
+        run: |
+          pip install --quiet --upgrade vastai
+          vastai set api-key "$VAST_API_KEY"
+
+      - name: Register ephemeral SSH key
+        id: sshkey
+        run: |
+          mkdir -p "$HOME/.ssh"
+          KEY="$HOME/.ssh/vast_bench"
+          COMMENT="gh-actions-bench-${GITHUB_RUN_ID}"
+          ssh-keygen -t ed25519 -N "" -f "$KEY" -C "$COMMENT" >/dev/null
+          vastai create ssh-key "$(cat "$KEY.pub")"
+          echo "key_comment=$COMMENT" >> "$GITHUB_OUTPUT"
+          echo "key_path=$KEY" >> "$GITHUB_OUTPUT"
+
+      - name: Pick a Vast offer
+        id: offer
+        run: |
+          # NB: cpu_ram is in MB (32 GB = 32 * 1024 = 32768); disk_space/cuda_max_good as named.
+          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=32768 disk_space>=64 datacenter=true verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
+          echo "Query: $QUERY"
+          vastai search offers "$QUERY" --raw -o dph_total > offers.json
+          OFFER_ID=$(jq -r 'sort_by(.dph_total) | .[0].id // empty' offers.json)
+          OFFER_PRICE=$(jq -r 'sort_by(.dph_total) | .[0].dph_total // empty' offers.json)
+          if [ -z "$OFFER_ID" ]; then
+            echo "::error::No datacenter RTX 5090 offer matched (>=16 cores, >=32GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
+            exit 1
+          fi
+          echo "Selected offer $OFFER_ID at \$${OFFER_PRICE}/hr"
+          echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"
+          echo "price=$OFFER_PRICE" >> "$GITHUB_OUTPUT"
+
+      - name: Create instance
+        id: instance
+        env:
+          VAST_TEMPLATE_HASH: ${{ secrets.VAST_TEMPLATE_HASH }}
+          OFFER_ID: ${{ steps.offer.outputs.id }}
+        run: |
+          vastai create instance "$OFFER_ID" \
+            --template_hash "$VAST_TEMPLATE_HASH" \
+            --disk "$VAST_IMAGE_DISK" \
+            --ssh --direct --raw > create.json
+          cat create.json
+          IID=$(jq -r '.new_contract // .instances.new_contract // empty' create.json)
+          if [ -z "$IID" ]; then
+            echo "::error::Failed to create Vast instance"
+            exit 1
+          fi
+          # Persist immediately so teardown runs even if later steps fail.
+          echo "$IID" > "$RUNNER_TEMP/vast_instance_id"
+          echo "id=$IID" >> "$GITHUB_OUTPUT"
+          echo "Created instance $IID"
+
+      - name: Wait for SSH
+        id: ssh
+        env:
+          IID: ${{ steps.instance.outputs.id }}
+        run: |
+          echo "Waiting for instance $IID to reach 'running' with SSH endpoint..."
+          HOST=""; PORT=""
+          for _ in $(seq 1 60); do   # ~10 min
+            # jq gets '|| true' too: a non-JSON reply must mean "poll again", not abort under bash -e.
+            vastai show instance "$IID" --raw > inst.json || true
+            STATUS=$(jq -r '.actual_status // empty' inst.json || true)
+            # Created with --direct, so SSH straight to the public IP + the host port mapped to
+            # container port 22. The .ssh_host/.ssh_port proxy fields are unreliable (observed
+            # off-by-one vs the real proxy port); the direct mapping is what `vastai ssh-url` reports.
+            HOST=$(jq -r '.public_ipaddr // empty' inst.json || true)
+            PORT=$(jq -r '.ports["22/tcp"][0].HostPort // empty' inst.json || true)
+            echo "  status=$STATUS ssh=$HOST:$PORT"
+            if [ "$STATUS" = "running" ] && [ -n "$HOST" ] && [ -n "$PORT" ]; then
+              break
+            fi
+            sleep 10
+          done
+          if [ "$STATUS" != "running" ] || [ -z "$HOST" ] || [ -z "$PORT" ]; then
+            echo "::error::Instance never became reachable (status=$STATUS host=$HOST port=$PORT)"
+            exit 1
+          fi
+          echo "host=$HOST" >> "$GITHUB_OUTPUT"
+          echo "port=$PORT" >> "$GITHUB_OUTPUT"
+
+          # Wait for sshd to accept our key.
+          for _ in $(seq 1 30); do
+            if ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \
+                 -i "${{ steps.sshkey.outputs.key_path }}" -p "$PORT" "root@$HOST" true 2>/dev/null; then
+              echo "sshd reachable"; exit 0
+            fi
+            sleep 10
+          done
+          echo "::error::sshd did not accept connections in time"
+          exit 1
+
+      - name: Wait for onstart provisioning
+        env:
+          HOST: ${{ steps.ssh.outputs.host }}
+          PORT: ${{ steps.ssh.outputs.port }}
+          KEY: ${{ steps.sshkey.outputs.key_path }}
+        run: |
+          SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
+          echo "Waiting for the template onstart script to finish (Rust + LLVM + sysroot + clone)..."
+          # The bootstrap's final stdout line is "=== done ===". Vast captures onstart
+          # output to /var/log/onstart.log; fall back to checking the artifacts it leaves.
+          for _ in $(seq 1 120); do   # ~20 min
+            if $SSH 'grep -q "=== done ===" /var/log/onstart.log 2>/dev/null'; then
+              echo "onstart reported done"; exit 0
+            fi
+            # shellcheck disable=SC2016  # $HOME/$(...) must expand on the remote box, not the runner
+            if $SSH 'test -x "$HOME/.cargo/bin/cargo" \
+                  && test -f /opt/lambda-vm-sysroot/include/stdlib.h \
+                  && test -d /workspace/lambda_vm/.git \
+                  && "$HOME/.cargo/bin/rustup" toolchain list 2>/dev/null | grep -q nightly-2026-02-01'; then
+              echo "provisioning artifacts present"; exit 0
+            fi
+            sleep 10
+          done
+          echo "::error::onstart provisioning did not complete in time"
+          exit 1
+
+      - name: Check out PR source on the box
+        env:
+          HOST: ${{ steps.ssh.outputs.host }}
+          PORT: ${{ steps.ssh.outputs.port }}
+          KEY: ${{ steps.sshkey.outputs.key_path }}
+          PR_NUM: ${{ steps.config.outputs.pr_num }}
+          SHA: ${{ steps.config.outputs.sha }}
+        run: |
+          SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
+          if [ -n "$PR_NUM" ]; then
+            # Fetch the PR head via the base repo's pull ref (works for fork PRs too).
+            $SSH "cd /workspace/lambda_vm && git fetch --force origin 'refs/pull/${PR_NUM}/head' && git checkout --force '$SHA'"
+          else
+            # workflow_dispatch: check out the requested branch.
+            $SSH "cd /workspace/lambda_vm && git fetch --force origin '$SHA' && git checkout --force FETCH_HEAD"
+          fi
+          $SSH "cd /workspace/lambda_vm && git --no-pager log -1 --oneline"
+
+      - name: Run GPU benchmark
+        id: bench
+        env:
+          HOST: ${{ steps.ssh.outputs.host }}
+          PORT: ${{ steps.ssh.outputs.port }}
+          KEY: ${{ steps.sshkey.outputs.key_path }}
+          RUNS: ${{ steps.config.outputs.runs }}
+        run: |
+          SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
+          # bash -l so ~/.bashrc (cargo env) is sourced; the script also sources it itself.
+          $SSH "bash -lc 'cd /workspace/lambda_vm && bash infra/gpu_bench.sh $RUNS'" | tee bench.log
+
+          # Parse per-run metrics (same format as the CPU bench: "Proving time:" / "Peak heap:").
+          mapfile -t TIMES < <(grep -o 'Proving time: [0-9.]*' bench.log | awk '{print $3}')
+          mapfile -t HEAPS < <(grep -o 'Peak heap: [0-9]*'   bench.log | awk '{print $3}')
+          if [ "${#TIMES[@]}" -eq 0 ] || [ "${#HEAPS[@]}" -eq 0 ]; then
+            echo "::error::Failed to parse any GPU metrics from the bench output"
+            exit 1
+          fi
+          MED_POS=$(( (${#TIMES[@]} + 1) / 2 ))
+          TIME_MED=$(printf '%s\n' "${TIMES[@]}" | sort -n | awk "NR==$MED_POS")
+          HEAP_MED=$(printf '%s\n' "${HEAPS[@]}" | sort -n | awk "NR==$MED_POS")
+          ALL_TIMES=$(printf '%s\n' "${TIMES[@]}" | paste -sd '/' -)
+          ALL_HEAPS=$(printf '%s\n' "${HEAPS[@]}" | paste -sd '/' -)
+          {
+            echo "time_s=$TIME_MED"
+            echo "peak_mb=$HEAP_MED"
+            echo "all_times=$ALL_TIMES"
+            echo "all_heaps=$ALL_HEAPS"
+          } >> "$GITHUB_OUTPUT"
+
+      - name: Comment on PR
+        if: github.event_name == 'issue_comment'
+        uses: actions/github-script@v7
+        env:
+          TIME_S: ${{ steps.bench.outputs.time_s }}
+          PEAK_MB: ${{ steps.bench.outputs.peak_mb }}
+          ALL_TIMES: ${{ steps.bench.outputs.all_times }}
+          ALL_HEAPS: ${{ steps.bench.outputs.all_heaps }}
+          RUNS: ${{ steps.config.outputs.runs }}
+          GPU_NAME: ${{ env.GPU_NAME }}
+          OFFER_PRICE: ${{ steps.offer.outputs.price }}
+          COMMIT_SHA: ${{ steps.config.outputs.sha }}
+        with:
+          script: |
+            const time = process.env.TIME_S;
+            const peak = process.env.PEAK_MB;
+            const runs = parseInt(process.env.RUNS || '1');
+            const allTimes = (process.env.ALL_TIMES || '').split('/').map(t => `${t}s`).join(' / ');
+            const allHeaps = (process.env.ALL_HEAPS || '').split('/').map(h => `${h} MB`).join(' / ');
+            const nLabel = runs > 1 ? ` (median of ${runs})` : '';
+            const sha = (process.env.COMMIT_SHA || '').substring(0, 8);
+
+            let body = `## Benchmark (GPU) — ethrex 20 transfers${nLabel}\n\n`;
+            body += `<sub>GPU: ${process.env.GPU_NAME.replace('_', ' ')} · Vast.ai datacenter @ \$${process.env.OFFER_PRICE}/hr · \`prover/cuda\`</sub>\n\n`;
+            body += `| Metric | GPU |\n`;
+            body += `|--------|-----|\n`;
+            body += `| **Prove time** | ${time}s |\n`;
+            body += `| **Peak heap** | ${peak} MB |\n`;
+            if (runs > 1) {
+              body += `\n<sub>Runs — time: ${allTimes} · heap: ${allHeaps}</sub>\n`;
+            }
+            body += `\n<sub>Commit: ${sha} · Runner: Vast.ai RTX 5090</sub>\n`;
+
+            const { data: comments } = await github.rest.issues.listComments({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+            });
+            const marker = 'Benchmark (GPU) — ethrex';
+            const existing = comments.find(c => c.user.type === 'Bot' && c.body.includes(marker));
+            if (existing) {
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner, repo: context.repo.repo,
+                comment_id: existing.id, body,
+              });
+            } else {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner, repo: context.repo.repo,
+                issue_number: context.issue.number, body,
+              });
+            }
+
+      # --- Teardown: ALWAYS destroy the instance (cost guardrail) ---
+      - name: Destroy instance
+        if: always()
+        run: |
+          if [ -f "$RUNNER_TEMP/vast_instance_id" ]; then
+            IID=$(cat "$RUNNER_TEMP/vast_instance_id")
+            echo "Destroying instance $IID"
+            vastai destroy instance "$IID" || echo "::warning::destroy instance $IID failed — check the Vast console"
+          else
+            echo "No instance id recorded; nothing to destroy."
+          fi
+
+      - name: Remove ephemeral SSH key
+        if: always()
+        env:
+          KEY_COMMENT: ${{ steps.sshkey.outputs.key_comment }}
+        run: |
+          [ -z "$KEY_COMMENT" ] && exit 0
+          vastai show ssh-keys --raw > keys.json 2>/dev/null || exit 0
+          for kid in $(jq -r --arg c "$KEY_COMMENT" '.[] | select(.public_key | contains($c)) | .id' keys.json); do
+            echo "Deleting ssh-key $kid"
+            vastai delete ssh-key "$kid" || true
+          done
diff --git a/.github/workflows/benchmark-pr.yml b/.github/workflows/benchmark-pr.yml
index ca66bf9a7..2eaebc213 100644
--- a/.github/workflows/benchmark-pr.yml
+++ b/.github/workflows/benchmark-pr.yml
@@ -55,6 +55,7 @@ jobs:
       (github.event_name == 'issue_comment' &&
        github.event.issue.pull_request &&
        startsWith(github.event.comment.body, '/bench') &&
+       !startsWith(github.event.comment.body, '/bench-gpu') &&
        contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association))
     steps:
       - name: React to comment
diff --git a/infra/gpu_bench.sh b/infra/gpu_bench.sh
new file mode 100755
index 000000000..1557e1e02
--- /dev/null
+++ b/infra/gpu_bench.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+# Run the headline ethrex prover benchmark on a GPU box, with the CUDA path enabled.
+#
+# Usage: infra/gpu_bench.sh [runs]
+#   runs  number of prove iterations (default 3)
+#
+# Assumes the box was provisioned by the Vast template onstart
+# (yetanotherco/scripts/bootstrap-onstart.sh): Rust 1.94.0 + nightly-2026-02-01,
+# LLVM/clang, and the rv64 sysroot at /opt/lambda-vm-sysroot are already in place;
+# CUDA/nvcc come from the base image. This script does NOT provision — it only
+# builds with `prover/cuda`, generates the bench fixture, and runs the prove loop.
+#
+# It proves the SAME workload as the CPU benchmark (.github/workflows/benchmark-pr.yml):
+# the ethrex guest ELF against a generated 20-transfer (distinct sender->recipient)
+# block. Each run prints the CLI's "Proving time:" / "Peak heap:" lines, which the
+# orchestrating workflow parses.
+
+set -euo pipefail
+
+RUNS="${1:-3}"
+
+# Headline program (keep in sync with benchmark-pr.yml ELF/INPUT).
+ELF="executor/program_artifacts/rust/ethrex.elf"
+INPUT="executor/tests/ethrex_bench_20.bin"
+TRANSFERS=20
+
+log() { printf '\n=== %s ===\n' "$*"; }
+
+# --- 0. Locate cargo + sysroot (provisioned by the template onstart) ---------
+if [ -f "$HOME/.cargo/env" ]; then
+    # shellcheck disable=SC1091
+    . "$HOME/.cargo/env"
+fi
+export PATH="$HOME/.cargo/bin:$PATH"
+export SYSROOT_DIR="${SYSROOT_DIR:-/opt/lambda-vm-sysroot}"
+
+# --- 1. Sanity-check the GPU toolchain ---------------------------------------
+log "GPU + toolchain check"
+if ! command -v nvidia-smi >/dev/null 2>&1; then
+    echo "::error::nvidia-smi not found — no GPU driver on this box" >&2
+    exit 1
+fi
+nvidia-smi --query-gpu=name,compute_cap,driver_version --format=csv,noheader || true
+
+# nvcc may live under /usr/local/cuda/bin without being on PATH.
+if ! command -v nvcc >/dev/null 2>&1; then
+    for d in /usr/local/cuda/bin /usr/local/cuda-*/bin; do
+        if [ -x "$d/nvcc" ]; then
+            export PATH="$d:$PATH"
+            export CUDA_HOME="${CUDA_HOME:-$(dirname "$d")}"
+            break
+        fi
+    done
+fi
+if ! command -v nvcc >/dev/null 2>&1; then
+    echo "::error::nvcc not found — CUDA toolkit missing (math-cuda needs it to compile kernels)" >&2
+    exit 1
+fi
+nvcc --version | tail -n 2
+
+if ! command -v cargo >/dev/null 2>&1; then
+    echo "::error::cargo not found — template onstart provisioning incomplete" >&2
+    exit 1
+fi
+if [ ! -f "$SYSROOT_DIR/include/stdlib.h" ]; then
+    echo "::error::rv64 sysroot missing at $SYSROOT_DIR — onstart provisioning incomplete" >&2
+    exit 1
+fi
+
+# --- 2. Build the ethrex guest ELF (same target as the CPU bench) ------------
+log "building ethrex guest ELF"
+make "$ELF"
+
+# --- 3. Generate the 20-transfer fixture -------------------------------------
+log "generating $INPUT ($TRANSFERS distinct transfers)"
+( cd tooling/ethrex-fixtures && cargo build --release )
+GEN=tooling/ethrex-fixtures/target/release/ethrex-fixtures
+"$GEN" "$TRANSFERS" "$INPUT" distinct
+
+# --- 4. Build the CLI with the GPU (cuda) path -------------------------------
+# jemalloc-stats gives the deterministic "Peak heap:" line; prover/cuda routes
+# the LDE (and friends) through crypto/math-cuda. math-cuda/build.rs auto-detects
+# the RTX 5090 arch (compute_120) via nvidia-smi, so no arch pin is needed.
+log "building CLI with --features jemalloc-stats,prover/cuda"
+cargo build --release -p cli --features jemalloc-stats,prover/cuda
+
+# --- 5. Prove loop -----------------------------------------------------------
+log "proving $ELF x$RUNS (GPU)"
+for i in $(seq 1 "$RUNS"); do
+    echo "--- Run $i/$RUNS ---"
+    ./target/release/cli prove "$ELF" --private-input "$INPUT" -o /tmp/proof.bin --time
+    rm -f /tmp/proof.bin
+done
+
+log "done"

From a9d848a5ea22261340e33e3c5d66660b8e4671a4 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Fri, 26 Jun 2026 16:39:04 -0300
Subject: [PATCH 02/17] ci: retry the Vast offer search and require >=96 GB RAM

---
 .github/workflows/benchmark-gpu.yml | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index abdea98a5..e1750ddde 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -109,18 +109,33 @@ jobs:
 
       - name: Pick a Vast offer
         id: offer
+        env:
+          # Retry the same query to ride out transient scarcity (datacenter RTX 5090s
+          # are a small, fast-churning pool). Total wait ~= ATTEMPTS * INTERVAL.
+          OFFER_ATTEMPTS: "10"
+          OFFER_INTERVAL: "30"
         run: |
-          # NB: cpu_ram is in MB (32 GB = 32 * 1024 = 32768); disk_space/cuda_max_good as named.
-          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=32768 disk_space>=64 datacenter=true verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
+          # cpu_ram is the machine's TOTAL RAM in MB; a 1-GPU rental usually gets most of it.
+          # The ethrex_bench_20 GPU prove peaks at ~78 GB heap, so require >=96 GB
+          # (96 * 1024 = 98304) for headroom — a 32 GB box would OOM.
+          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=98304 disk_space>=64 datacenter=true verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
           echo "Query: $QUERY"
-          vastai search offers "$QUERY" --raw -o dph_total > offers.json
-          OFFER_ID=$(jq -r 'sort_by(.dph_total) | .[0].id // empty' offers.json)
-          OFFER_PRICE=$(jq -r 'sort_by(.dph_total) | .[0].dph_total // empty' offers.json)
+          OFFER_ID=""
+          for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do
+            # jq gets '|| true' as well: an unparsable reply must mean "retry", not abort under bash -e.
+            vastai search offers "$QUERY" --raw -o dph_total > offers.json || true
+            OFFER_ID=$(jq -r 'sort_by(.dph_total) | .[0].id // empty' offers.json || true)
+            OFFER_PRICE=$(jq -r 'sort_by(.dph_total) | .[0].dph_total // empty' offers.json || true)
+            if [ -n "$OFFER_ID" ]; then
+              echo "Selected offer $OFFER_ID at \$${OFFER_PRICE}/hr (attempt $attempt)"; break
+            fi
+            echo "No matching offer (attempt $attempt/$OFFER_ATTEMPTS); retrying in ${OFFER_INTERVAL}s..."
+            sleep "$OFFER_INTERVAL"
+          done
           if [ -z "$OFFER_ID" ]; then
-            echo "::error::No datacenter RTX 5090 offer matched (>=16 cores, >=32GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
+            echo "::error::No datacenter RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
             exit 1
           fi
-          echo "Selected offer $OFFER_ID at \$${OFFER_PRICE}/hr"
           echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"
           echo "price=$OFFER_PRICE" >> "$GITHUB_OUTPUT"
 

From 5ef0fe2b6608ff317b9a4142f0c04134a229f3e1 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Fri, 26 Jun 2026 17:00:54 -0300
Subject: [PATCH 03/17] ci: use ABBA method to run the benchmark

---
 .github/workflows/benchmark-gpu.yml | 211 ++++++++++++++--------------
 infra/gpu_bench.sh                  |  95 -------------
 scripts/bench_abba.sh               |   9 +-
 3 files changed, 116 insertions(+), 199 deletions(-)
 delete mode 100755 infra/gpu_bench.sh

diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index e1750ddde..51a7ed63b 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -1,12 +1,14 @@
 name: Benchmark GPU (PR)
 
-# Rent an RTX 5090 on Vast.ai (hourly), run the SAME headline ethrex prover
-# benchmark as benchmark-pr.yml but with the CUDA path enabled, post the absolute
-# GPU numbers back to the PR, then always destroy the instance.
+# Rent an RTX 5090 on Vast.ai (hourly) and run the drift-free A/B/B/A (ABBA) paired
+# prover benchmark — the same method as the CPU `/bench-abba` (scripts/bench_abba.sh) —
+# but with the CUDA prover path enabled (BENCH_FEATURES=jemalloc-stats,prover/cuda).
+# It builds the cli at the PR head and at main, runs N interleaved pairs on the GPU,
+# posts the paired-t + Wilcoxon verdict back to the PR, then ALWAYS destroys the box.
 #
-# Triggered by a "/bench-gpu [N]" comment on a PR (N = prove iterations, default 3),
-# or via workflow_dispatch. The orchestration runs on a GitHub-hosted runner; all the
-# GPU work happens on the rented Vast box (provisioned by the template onstart).
+# Triggered by a "/bench-gpu [N]" comment on a PR (N = pair count, default 10) or via
+# workflow_dispatch. Orchestration runs on a GitHub-hosted runner; all GPU work happens
+# on the rented Vast box (provisioned by the template onstart).
 #
 # Requires repo secrets:
 #   VAST_API_KEY        — https://cloud.vast.ai/manage-keys/
@@ -15,79 +17,102 @@ name: Benchmark GPU (PR)
 on:
   workflow_dispatch:
     inputs:
-      runs:
-        description: "Number of prove iterations"
-        default: "3"
+      pairs:
+        description: "Number of A/B/B/A pairs"
+        default: "10"
   issue_comment:
     types: [created]
+  # TEMP(testing): lets the workflow run from this branch before it's on the default
+  # branch (push uses the branch's own definition; issue_comment/workflow_dispatch do
+  # not). REMOVE this push trigger before merging.
+  push:
+    branches: [gpu_benchmarks]
 
 permissions:
   contents: read
   pull-requests: write
+  issues: write
 
 concurrency:
-  group: benchmark-gpu-${{ github.event.issue.number || github.run_id }}
-  cancel-in-progress: true
+  group: benchmark-gpu-${{ (startsWith(github.event.comment.body, '/bench-gpu') && contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association) && github.event.issue.number) || github.run_id }}
+  cancel-in-progress: true  # safe: only an accepted /bench-gpu comment on the same PR shares this group; any other event gets a unique run_id group
 
 env:
-  # Vast offer search: datacenter RTX 5090, >=16 cores, >=32GB RAM, >=64GB disk,
-  # verified + rentable, Blackwell-capable driver, under the price cap ($/hr).
+  # Vast offer search: datacenter RTX 5090, >=16 cores, >=96GB RAM (the ABBA prove
+  # peaks ~78 GB), >=64GB disk, verified + rentable, Blackwell-capable driver, <= cap.
   GPU_NAME: RTX_5090
   PRICE_CAP: "3"
   VAST_IMAGE_DISK: "64"
+  # cli features for the ABBA build — the GPU (cuda) prover path plus jemalloc heap stats.
+  BENCH_FEATURES: "jemalloc-stats,prover/cuda"
 
 jobs:
   benchmark-gpu:
     runs-on: ubuntu-latest
     # Skip unless: workflow_dispatch, or a "/bench-gpu" comment from a privileged author.
+    # TEMP(testing): `github.event_name == 'push'` lets branch pushes run it pre-merge.
+    # REMOVE the push clause before merging.
     if: >-
+      github.event_name == 'push' ||
       github.event_name == 'workflow_dispatch' ||
       (github.event_name == 'issue_comment' &&
        github.event.issue.pull_request &&
        startsWith(github.event.comment.body, '/bench-gpu') &&
        contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association))
+    # ABBA on the GPU: dual cuda build (~15-30 min) + 2*pairs proves (~77s each).
+    timeout-minutes: 180
     steps:
-      - name: React to comment
-        if: github.event_name == 'issue_comment'
-        uses: actions/github-script@v7
-        with:
-          script: |
-            await github.rest.reactions.createForIssueComment({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              comment_id: context.payload.comment.id,
-              content: 'eyes'
-            });
-
-      - name: Resolve PR ref + run count
+      - name: Resolve PR ref + pair count
         id: config
         env:
           GH_TOKEN: ${{ github.token }}
           EVENT_NAME: ${{ github.event_name }}
           COMMENT_BODY: ${{ github.event.comment.body }}
           PR_NUM: ${{ github.event.issue.number }}
-          DISPATCH_RUNS: ${{ github.event.inputs.runs }}
+          DISPATCH_PAIRS: ${{ github.event.inputs.pairs }}
           DISPATCH_REF: ${{ github.ref_name }}
         run: |
           if [ "$EVENT_NAME" = "issue_comment" ]; then
-            SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
-            echo "pr_num=$PR_NUM" >> "$GITHUB_OUTPUT"
-            echo "sha=$SHA" >> "$GITHUB_OUTPUT"
-            # "/bench-gpu 5" -> 5 iterations; otherwise default.
+            # Pin the head SHA (works for fork PRs; avoids a force-push race mid-run).
+            HEAD_SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
+            OUT_PR_NUM="$PR_NUM"; OUT_HEAD_SHA="$HEAD_SHA"; OUT_BRANCH=""
+            # "/bench-gpu 20" -> 20 pairs; otherwise default.
             N=$(echo "$COMMENT_BODY" | sed -n 's|^/bench-gpu[[:space:]]*\([0-9]\+\).*|\1|p')
-            RUNS=${N:-3}
+            PAIRS=${N:-10}
           else
-            echo "pr_num=" >> "$GITHUB_OUTPUT"
-            echo "sha=$DISPATCH_REF" >> "$GITHUB_OUTPUT"
-            RUNS=${DISPATCH_RUNS:-3}
+            # workflow_dispatch / push: compare this branch vs main.
+            OUT_PR_NUM=""; OUT_HEAD_SHA=""; OUT_BRANCH="$DISPATCH_REF"
+            PAIRS=${DISPATCH_PAIRS:-10}
           fi
-          # Clamp to [1,10].
-          if [ "$RUNS" -lt 1 ] 2>/dev/null || [ "$RUNS" -gt 10 ] 2>/dev/null; then
-            echo "::warning::run count out of range, defaulting to 3"
-            RUNS=3
+          # Clamp to [2,40] (even is ideal so AB/BA orders balance). The regex guard is needed:
+          # `[ x -lt 2 ]` only errors on non-integers, so on its own it would let them through unclamped.
+          if ! [[ "$PAIRS" =~ ^[1-9][0-9]?$ ]] || [ "$PAIRS" -lt 2 ] || [ "$PAIRS" -gt 40 ]; then
+            echo "::warning::pair count must be an integer in [2,40], defaulting to 10"; PAIRS=10
+          fi
-          echo "runs=$RUNS" >> "$GITHUB_OUTPUT"
-          echo "Using $RUNS prove iteration(s)"
+          {
+            echo "pr_num=$OUT_PR_NUM"
+            echo "head_sha=$OUT_HEAD_SHA"
+            echo "branch=$OUT_BRANCH"
+            echo "pairs=$PAIRS"
+          } >> "$GITHUB_OUTPUT"
+          echo "Using $PAIRS A/B/B/A pairs"
+
+      - name: Acknowledge (react + occupancy notice)
+        if: github.event_name == 'issue_comment'
+        uses: actions/github-script@v7
+        env:
+          PAIRS: ${{ steps.config.outputs.pairs }}
+        with:
+          script: |
+            await github.rest.reactions.createForIssueComment({
+              owner: context.repo.owner, repo: context.repo.repo,
+              comment_id: context.payload.comment.id, content: 'eyes'
+            });
+            await github.rest.issues.createComment({
+              owner: context.repo.owner, repo: context.repo.repo,
+              issue_number: context.issue.number,
+              body: `⏳ **GPU ABBA started** — renting an RTX 5090 on Vast.ai and running ${process.env.PAIRS} interleaved pairs (PR vs main) on the CUDA prover path. This takes ~1 hr; results will be posted here.`
+            });
 
       - name: Install Vast CLI
         env:
@@ -226,94 +251,76 @@ jobs:
           echo "::error::onstart provisioning did not complete in time"
           exit 1
 
-      - name: Check out PR source on the box
+      - name: Run GPU ABBA benchmark
+        id: bench
         env:
           HOST: ${{ steps.ssh.outputs.host }}
           PORT: ${{ steps.ssh.outputs.port }}
           KEY: ${{ steps.sshkey.outputs.key_path }}
           PR_NUM: ${{ steps.config.outputs.pr_num }}
-          SHA: ${{ steps.config.outputs.sha }}
+          HEAD_SHA: ${{ steps.config.outputs.head_sha }}
+          BRANCH: ${{ steps.config.outputs.branch }}
+          PAIRS: ${{ steps.config.outputs.pairs }}
         run: |
           SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
+
+          # Resolve the PR side (REF_A) and the fetch needed to make it resolvable on the box.
           if [ -n "$PR_NUM" ]; then
-            # Fetch the PR head via the base repo's pull ref (works for fork PRs too).
-            $SSH "cd /workspace/lambda_vm && git fetch --force origin 'refs/pull/${PR_NUM}/head' && git checkout --force '$SHA'"
+            FETCH="git fetch --force origin refs/pull/$PR_NUM/head"
+            REF_A="$HEAD_SHA"
           else
-            # workflow_dispatch: check out the requested branch.
-            $SSH "cd /workspace/lambda_vm && git fetch --force origin '$SHA' && git checkout --force FETCH_HEAD"
+            FETCH="git fetch --force origin $BRANCH"
+            REF_A="origin/$BRANCH"
           fi
-          $SSH "cd /workspace/lambda_vm && git --no-pager log -1 --oneline"
 
-      - name: Run GPU benchmark
-        id: bench
-        env:
-          HOST: ${{ steps.ssh.outputs.host }}
-          PORT: ${{ steps.ssh.outputs.port }}
-          KEY: ${{ steps.sshkey.outputs.key_path }}
-          RUNS: ${{ steps.config.outputs.runs }}
-        run: |
-          SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
-          # bash -l so ~/.bashrc (cargo env) is sourced; the script also sources it itself.
-          $SSH "bash -lc 'cd /workspace/lambda_vm && bash infra/gpu_bench.sh $RUNS'" | tee bench.log
+          # bench_abba.sh builds the cli at REF_A and origin/main (isolated worktree),
+          # runs PAIRS interleaved A/B/B/A proves, and prints the paired-t CI + Wilcoxon
+          # verdict. BENCH_FEATURES routes the build through the CUDA prover path.
+          REMOTE="set -e; cd /workspace/lambda_vm; \
+            command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \
+            git fetch --force origin main; $FETCH; \
+            SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \
+            scripts/bench_abba.sh $REF_A origin/main $PAIRS"
 
-          # Parse per-run metrics (same format as the CPU bench: "Proving time:" / "Peak heap:").
-          mapfile -t TIMES < <(grep -o 'Proving time: [0-9.]*' bench.log | awk '{print $3}')
-          mapfile -t HEAPS < <(grep -o 'Peak heap: [0-9]*'   bench.log | awk '{print $3}')
-          if [ "${#TIMES[@]}" -eq 0 ] || [ "${#HEAPS[@]}" -eq 0 ]; then
-            echo "::error::Failed to parse any GPU metrics from the bench output"
-            exit 1
-          fi
-          MED_POS=$(( (${#TIMES[@]} + 1) / 2 ))
-          TIME_MED=$(printf '%s\n' "${TIMES[@]}" | sort -n | awk "NR==$MED_POS")
-          HEAP_MED=$(printf '%s\n' "${HEAPS[@]}" | sort -n | awk "NR==$MED_POS")
-          ALL_TIMES=$(printf '%s\n' "${TIMES[@]}" | paste -sd '/' -)
-          ALL_HEAPS=$(printf '%s\n' "${HEAPS[@]}" | paste -sd '/' -)
-          {
-            echo "time_s=$TIME_MED"
-            echo "peak_mb=$HEAP_MED"
-            echo "all_times=$ALL_TIMES"
-            echo "all_heaps=$ALL_HEAPS"
-          } >> "$GITHUB_OUTPUT"
+          $SSH "bash -lc \"$REMOTE\"" | tee "$RUNNER_TEMP/abba_out.txt"
+          # Extract the result section for the PR comment (same marker bench-abba.yml uses).
+          sed -n '/=== ABBA paired result/,$p' "$RUNNER_TEMP/abba_out.txt" > "$RUNNER_TEMP/abba_result.txt"
 
-      - name: Comment on PR
-        if: github.event_name == 'issue_comment'
+      - name: Comment ABBA result on PR
+        if: always() && github.event_name == 'issue_comment'
         uses: actions/github-script@v7
         env:
-          TIME_S: ${{ steps.bench.outputs.time_s }}
-          PEAK_MB: ${{ steps.bench.outputs.peak_mb }}
-          ALL_TIMES: ${{ steps.bench.outputs.all_times }}
-          ALL_HEAPS: ${{ steps.bench.outputs.all_heaps }}
-          RUNS: ${{ steps.config.outputs.runs }}
+          HEAD_SHA: ${{ steps.config.outputs.head_sha }}
+          PAIRS: ${{ steps.config.outputs.pairs }}
+          OUTCOME: ${{ steps.bench.outcome }}
           GPU_NAME: ${{ env.GPU_NAME }}
           OFFER_PRICE: ${{ steps.offer.outputs.price }}
-          COMMIT_SHA: ${{ steps.config.outputs.sha }}
         with:
           script: |
-            const time = process.env.TIME_S;
-            const peak = process.env.PEAK_MB;
-            const runs = parseInt(process.env.RUNS || '1');
-            const allTimes = (process.env.ALL_TIMES || '').split('/').map(t => `${t}s`).join(' / ');
-            const allHeaps = (process.env.ALL_HEAPS || '').split('/').map(h => `${h} MB`).join(' / ');
-            const nLabel = runs > 1 ? ` (median of ${runs})` : '';
-            const sha = (process.env.COMMIT_SHA || '').substring(0, 8);
+            const fs = require('fs');
+            const tmp = process.env.RUNNER_TEMP;
+            const read = (p) => { try { return fs.readFileSync(p, 'utf8').trim(); } catch { return ''; } };
+            const head = (process.env.HEAD_SHA || '').slice(0, 10);
+            const pairs = process.env.PAIRS;
+            const gpu = (process.env.GPU_NAME || '').replace('_', ' ');
+            const price = process.env.OFFER_PRICE;
 
-            let body = `## Benchmark (GPU) — ethrex 20 transfers${nLabel}\n\n`;
-            body += `<sub>GPU: ${process.env.GPU_NAME.replace('_', ' ')} · Vast.ai datacenter @ \$${process.env.OFFER_PRICE}/hr · \`prover/cuda\`</sub>\n\n`;
-            body += `| Metric | GPU |\n`;
-            body += `|--------|-----|\n`;
-            body += `| **Prove time** | ${time}s |\n`;
-            body += `| **Peak heap** | ${peak} MB |\n`;
-            if (runs > 1) {
-              body += `\n<sub>Runs — time: ${allTimes} · heap: ${allHeaps}</sub>\n`;
+            let body = `## GPU Benchmark (ABBA) — \`${head}\` vs \`main\` (${pairs} pairs)\n\n`;
+            body += `<sub>${gpu} · Vast.ai datacenter${price ? ` @ \$${price}/hr` : ''} · \`prover/cuda\` · drift-free A/B/B/A</sub>\n\n`;
+            if (process.env.OUTCOME === 'success') {
+              const res = read(`${tmp}/abba_result.txt`) || read(`${tmp}/abba_out.txt`);
+              body += '```\n' + res + '\n```\n';
+              body += '\n<sub>+ = PR faster. Trust the verdict when paired-t and Wilcoxon agree.</sub>\n';
+            } else {
+              const tail = read(`${tmp}/abba_out.txt`).split('\n').slice(-30).join('\n');
+              body += `❌ Run failed. Last log lines:\n\n` + '```\n' + tail + '\n```\n';
             }
-            body += `\n<sub>Commit: ${sha} · Runner: Vast.ai RTX 5090</sub>\n`;
 
             const { data: comments } = await github.rest.issues.listComments({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
+              owner: context.repo.owner, repo: context.repo.repo,
               issue_number: context.issue.number,
             });
-            const marker = 'Benchmark (GPU) — ethrex';
+            const marker = 'GPU Benchmark (ABBA)';
             const existing = comments.find(c => c.user.type === 'Bot' && c.body.includes(marker));
             if (existing) {
               await github.rest.issues.updateComment({
diff --git a/infra/gpu_bench.sh b/infra/gpu_bench.sh
deleted file mode 100755
index 1557e1e02..000000000
--- a/infra/gpu_bench.sh
+++ /dev/null
@@ -1,95 +0,0 @@
-#!/bin/bash
-# Run the headline ethrex prover benchmark on a GPU box, with the CUDA path enabled.
-#
-# Usage: infra/gpu_bench.sh [runs]
-#   runs  number of prove iterations (default 3)
-#
-# Assumes the box was provisioned by the Vast template onstart
-# (yetanotherco/scripts/bootstrap-onstart.sh): Rust 1.94.0 + nightly-2026-02-01,
-# LLVM/clang, and the rv64 sysroot at /opt/lambda-vm-sysroot are already in place;
-# CUDA/nvcc come from the base image. This script does NOT provision — it only
-# builds with `prover/cuda`, generates the bench fixture, and runs the prove loop.
-#
-# It proves the SAME workload as the CPU benchmark (.github/workflows/benchmark-pr.yml):
-# the ethrex guest ELF against a generated 20-transfer (distinct sender->recipient)
-# block. Each run prints the CLI's "Proving time:" / "Peak heap:" lines, which the
-# orchestrating workflow parses.
-
-set -euo pipefail
-
-RUNS="${1:-3}"
-
-# Headline program (keep in sync with benchmark-pr.yml ELF/INPUT).
-ELF="executor/program_artifacts/rust/ethrex.elf"
-INPUT="executor/tests/ethrex_bench_20.bin"
-TRANSFERS=20
-
-log() { printf '\n=== %s ===\n' "$*"; }
-
-# --- 0. Locate cargo + sysroot (provisioned by the template onstart) ---------
-if [ -f "$HOME/.cargo/env" ]; then
-    # shellcheck disable=SC1091
-    . "$HOME/.cargo/env"
-fi
-export PATH="$HOME/.cargo/bin:$PATH"
-export SYSROOT_DIR="${SYSROOT_DIR:-/opt/lambda-vm-sysroot}"
-
-# --- 1. Sanity-check the GPU toolchain ---------------------------------------
-log "GPU + toolchain check"
-if ! command -v nvidia-smi >/dev/null 2>&1; then
-    echo "::error::nvidia-smi not found — no GPU driver on this box" >&2
-    exit 1
-fi
-nvidia-smi --query-gpu=name,compute_cap,driver_version --format=csv,noheader || true
-
-# nvcc may live under /usr/local/cuda/bin without being on PATH.
-if ! command -v nvcc >/dev/null 2>&1; then
-    for d in /usr/local/cuda/bin /usr/local/cuda-*/bin; do
-        if [ -x "$d/nvcc" ]; then
-            export PATH="$d:$PATH"
-            export CUDA_HOME="${CUDA_HOME:-$(dirname "$d")}"
-            break
-        fi
-    done
-fi
-if ! command -v nvcc >/dev/null 2>&1; then
-    echo "::error::nvcc not found — CUDA toolkit missing (math-cuda needs it to compile kernels)" >&2
-    exit 1
-fi
-nvcc --version | tail -n 2
-
-if ! command -v cargo >/dev/null 2>&1; then
-    echo "::error::cargo not found — template onstart provisioning incomplete" >&2
-    exit 1
-fi
-if [ ! -f "$SYSROOT_DIR/include/stdlib.h" ]; then
-    echo "::error::rv64 sysroot missing at $SYSROOT_DIR — onstart provisioning incomplete" >&2
-    exit 1
-fi
-
-# --- 2. Build the ethrex guest ELF (same target as the CPU bench) ------------
-log "building ethrex guest ELF"
-make "$ELF"
-
-# --- 3. Generate the 20-transfer fixture -------------------------------------
-log "generating $INPUT ($TRANSFERS distinct transfers)"
-( cd tooling/ethrex-fixtures && cargo build --release )
-GEN=tooling/ethrex-fixtures/target/release/ethrex-fixtures
-"$GEN" "$TRANSFERS" "$INPUT" distinct
-
-# --- 4. Build the CLI with the GPU (cuda) path -------------------------------
-# jemalloc-stats gives the deterministic "Peak heap:" line; prover/cuda routes
-# the LDE (and friends) through crypto/math-cuda. math-cuda/build.rs auto-detects
-# the RTX 5090 arch (compute_120) via nvidia-smi, so no arch pin is needed.
-log "building CLI with --features jemalloc-stats,prover/cuda"
-cargo build --release -p cli --features jemalloc-stats,prover/cuda
-
-# --- 5. Prove loop -----------------------------------------------------------
-log "proving $ELF x$RUNS (GPU)"
-for i in $(seq 1 "$RUNS"); do
-    echo "--- Run $i/$RUNS ---"
-    ./target/release/cli prove "$ELF" --private-input "$INPUT" -o /tmp/proof.bin --time
-    rm -f /tmp/proof.bin
-done
-
-log "done"
diff --git a/scripts/bench_abba.sh b/scripts/bench_abba.sh
index 79bfddf27..57fab5e28 100755
--- a/scripts/bench_abba.sh
+++ b/scripts/bench_abba.sh
@@ -27,6 +27,8 @@
 #     REF_B    baseline   (default: origin/main)
 #     N_PAIRS  pairs      (default: 20 -> 40 runs, ~33 min on ethrex)
 #   Env: REBUILD=1 forces a rebuild even if cached binaries exist.
+#        BENCH_FEATURES=<list> cargo features for the cli build (default: jemalloc-stats).
+#          The GPU ABBA workflow passes "jemalloc-stats,prover/cuda" to bench the GPU path.
 #
 #   Sizing (ethrex pair-noise sd ~1.2%, 80% power): ~12 pairs for a 1% effect,
 #   ~18 for 0.8%, ~32 for 0.6%. Default 20 -> solid on 0.8-1%, ~60% power at 0.6%
@@ -45,6 +47,9 @@ fi
 REF_A="$1"
 REF_B="${2:-origin/main}"
 N_PAIRS="${3:-20}"
+# cli build features. Default matches the CPU bench; the GPU ABBA workflow overrides
+# with "jemalloc-stats,prover/cuda" to exercise the CUDA prover path.
+BENCH_FEATURES="${BENCH_FEATURES:-jemalloc-stats}"
 
 ELF_REL="executor/program_artifacts/rust/ethrex.elf"
 INPUT_REL="executor/tests/ethrex_bench_20.bin"
@@ -102,9 +107,9 @@ if [ "$need_build" = "1" ]; then
   echo "==> Building both prover binaries in isolated worktree $WT"
   git worktree add --detach "$WT" "$SHA_B" >/dev/null
   build_cli() {  # $1=sha $2=out (shared target dir -> 2nd build is incremental)
-    echo "==> Building cli @ ${1:0:10} -> $2"
+    echo "==> Building cli @ ${1:0:10} -> $2  (features: $BENCH_FEATURES)"
     git -C "$WT" checkout --quiet "$1"
-    if ! ( cd "$WT" && cargo build --release -p cli --features jemalloc-stats >"$WORK/build_$2.log" 2>&1 ); then
+    if ! ( cd "$WT" && cargo build --release -p cli --features "$BENCH_FEATURES" >"$WORK/build_$2.log" 2>&1 ); then
       echo "ERROR: cargo build failed for $2 (@ ${1:0:10}). Tail of $WORK/build_$2.log:" >&2
       tail -40 "$WORK/build_$2.log" >&2
       exit 1

From 337bce05248c8ec4e7ad07d1cee4142c9ddc15e8 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Fri, 26 Jun 2026 17:04:29 -0300
Subject: [PATCH 04/17] ci: use 64gb ram

---
 .github/workflows/benchmark-gpu.yml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index 51a7ed63b..c885cc03b 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -38,8 +38,8 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  # Vast offer search: datacenter RTX 5090, >=16 cores, >=96GB RAM (the ABBA prove
-  # peaks ~78 GB), >=64GB disk, verified + rentable, Blackwell-capable driver, <= cap.
+  # Vast offer search: datacenter RTX 5090, >=16 cores, >=64GB RAM, >=64GB disk,
+  # verified + rentable, Blackwell-capable driver, <= cap.
   GPU_NAME: RTX_5090
   PRICE_CAP: "3"
   VAST_IMAGE_DISK: "64"
@@ -140,10 +140,10 @@ jobs:
           OFFER_ATTEMPTS: "10"
           OFFER_INTERVAL: "30"
         run: |
-          # cpu_ram is the machine's TOTAL RAM in MB; a 1-GPU rental usually gets most of it.
-          # The ethrex_bench_20 GPU prove peaks at ~78 GB heap, so require >=96 GB
-          # (96 * 1024 = 98304) for headroom — a 32 GB box would OOM.
-          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=98304 disk_space>=64 datacenter=true verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
+          # cpu_ram is the per-instance allocated RAM in MB. Require ~64 GB; use 64000
+          # (not 65536) because the "64 GB" datacenter boxes report ~64467 MB. Peak heap
+          # scales with table parallelism (~cores/3), so a 32-core/64 GB box fits the prove.
+          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64000 disk_space>=64 datacenter=true verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
           echo "Query: $QUERY"
           OFFER_ID=""
           for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do
@@ -158,7 +158,7 @@ jobs:
             sleep "$OFFER_INTERVAL"
           done
           if [ -z "$OFFER_ID" ]; then
-            echo "::error::No datacenter RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
+            echo "::error::No datacenter RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=64GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
             exit 1
           fi
           echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"

From 3fce3499a6de1ae0fd4d5209ae0946862dbbc501 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Fri, 26 Jun 2026 17:24:17 -0300
Subject: [PATCH 05/17] ci: remove datacenter flag

---
 .github/workflows/benchmark-gpu.yml | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index c885cc03b..d9d0b203d 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -38,8 +38,8 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  # Vast offer search: datacenter RTX 5090, >=16 cores, >=64GB RAM, >=64GB disk,
-  # verified + rentable, Blackwell-capable driver, <= cap.
+  # Vast offer search: RTX 5090, >=16 cores, >=64GB RAM, >=64GB disk, verified +
+  # rentable, Blackwell-capable driver, <= cap.
   GPU_NAME: RTX_5090
   PRICE_CAP: "3"
   VAST_IMAGE_DISK: "64"
@@ -140,10 +140,9 @@ jobs:
           OFFER_ATTEMPTS: "10"
           OFFER_INTERVAL: "30"
         run: |
-          # cpu_ram is the per-instance allocated RAM in MB. Require ~64 GB; use 64000
-          # (not 65536) because the "64 GB" datacenter boxes report ~64467 MB. Peak heap
-          # scales with table parallelism (~cores/3), so a 32-core/64 GB box fits the prove.
-          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64000 disk_space>=64 datacenter=true verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
+          # cpu_ram is per-instance allocated RAM in MB; use 64000 (not 65536) because the
+          # "64 GB" boxes report ~64467 MB.
+          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64000 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
           echo "Query: $QUERY"
           OFFER_ID=""
           for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do
@@ -158,7 +157,7 @@ jobs:
             sleep "$OFFER_INTERVAL"
           done
           if [ -z "$OFFER_ID" ]; then
-            echo "::error::No datacenter RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=64GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
+            echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=64GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
             exit 1
           fi
           echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"

From 76a137a5542dbc3ea15e4e050e6a03440731c981 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Fri, 26 Jun 2026 17:29:49 -0300
Subject: [PATCH 06/17] fix: cpu_ram search filter is in GB; require >=96

---
 .github/workflows/benchmark-gpu.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index d9d0b203d..d88f5eb92 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -38,7 +38,7 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  # Vast offer search: RTX 5090, >=16 cores, >=64GB RAM, >=64GB disk, verified +
+  # Vast offer search: RTX 5090, >=16 cores, >=96GB RAM, >=64GB disk, verified +
   # rentable, Blackwell-capable driver, <= cap.
   GPU_NAME: RTX_5090
   PRICE_CAP: "3"
@@ -140,9 +140,9 @@ jobs:
           OFFER_ATTEMPTS: "10"
           OFFER_INTERVAL: "30"
         run: |
-          # cpu_ram is per-instance allocated RAM in MB; use 64000 (not 65536) because the
-          # "64 GB" boxes report ~64467 MB.
-          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64000 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
+          # cpu_ram in the search filter is GB (the returned .cpu_ram field is MB — different
+          # units), so >=96 means 96 GB. >=96000 would mean 96000 GB and match nothing.
+          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
           echo "Query: $QUERY"
           OFFER_ID=""
           for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do
@@ -157,7 +157,7 @@ jobs:
             sleep "$OFFER_INTERVAL"
           done
           if [ -z "$OFFER_ID" ]; then
-            echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=64GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
+            echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
             exit 1
           fi
           echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"

From 09cf1e5044a19ef6008a7f278b966f78e389a57d Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Fri, 26 Jun 2026 18:18:46 -0300
Subject: [PATCH 07/17] fix: require driver >= 580, attach SSH key per instance

---
 .github/workflows/benchmark-gpu.yml | 79 ++++++++++++++++++-----------
 1 file changed, 50 insertions(+), 29 deletions(-)

diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index d88f5eb92..69b290ec6 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -19,7 +19,7 @@ on:
     inputs:
       pairs:
         description: "Number of A/B/B/A pairs"
-        default: "10"
+        default: "1"   # TEMP(testing): fast runs; restore to "10" before merge
   issue_comment:
     types: [created]
   # TEMP(testing): lets the workflow run from this branch before it's on the default
@@ -78,16 +78,16 @@ jobs:
             OUT_PR_NUM="$PR_NUM"; OUT_HEAD_SHA="$HEAD_SHA"; OUT_BRANCH=""
             # "/bench-gpu 20" -> 20 pairs; otherwise default.
             N=$(echo "$COMMENT_BODY" | sed -n 's|^/bench-gpu[[:space:]]*\([0-9]\+\).*|\1|p')
-            PAIRS=${N:-10}
+            PAIRS=${N:-1}   # TEMP(testing): default 1; restore to 10 before merge
           else
             # workflow_dispatch / push: compare this branch vs main.
             OUT_PR_NUM=""; OUT_HEAD_SHA=""; OUT_BRANCH="$DISPATCH_REF"
-            PAIRS=${DISPATCH_PAIRS:-10}
+            PAIRS=${DISPATCH_PAIRS:-1}   # TEMP(testing): default 1; restore to 10 before merge
           fi
-          # Clamp to [2,40] (even is ideal so AB/BA orders balance).
-          if [ "$PAIRS" -lt 2 ] 2>/dev/null || [ "$PAIRS" -gt 40 ] 2>/dev/null; then
-            echo "::warning::pair count out of range [2,40], defaulting to 10"
-            PAIRS=10
+          # TEMP(testing): clamp floor lowered to 1 for fast runs; restore to [2,40] before merge.
+          if [ "$PAIRS" -lt 1 ] 2>/dev/null || [ "$PAIRS" -gt 40 ] 2>/dev/null; then
+            echo "::warning::pair count out of range [1,40], defaulting to 1"
+            PAIRS=1
           fi
           {
             echo "pr_num=$OUT_PR_NUM"
@@ -121,15 +121,12 @@ jobs:
           pip install --quiet --upgrade vastai
           vastai set api-key "$VAST_API_KEY"
 
-      - name: Register ephemeral SSH key
+      - name: Generate ephemeral SSH key
         id: sshkey
         run: |
           mkdir -p "$HOME/.ssh"
           KEY="$HOME/.ssh/vast_bench"
-          COMMENT="gh-actions-bench-${GITHUB_RUN_ID}"
-          ssh-keygen -t ed25519 -N "" -f "$KEY" -C "$COMMENT" >/dev/null
-          vastai create ssh-key "$(cat "$KEY.pub")"
-          echo "key_comment=$COMMENT" >> "$GITHUB_OUTPUT"
+          ssh-keygen -t ed25519 -N "" -f "$KEY" -C "gh-actions-bench-${GITHUB_RUN_ID}" >/dev/null
           echo "key_path=$KEY" >> "$GITHUB_OUTPUT"
 
       - name: Pick a Vast offer
@@ -139,16 +136,23 @@ jobs:
           # are a small, fast-churning pool). Total wait ~= ATTEMPTS * INTERVAL.
           OFFER_ATTEMPTS: "10"
           OFFER_INTERVAL: "30"
+          # Require driver >= this major so cudarc (default cuda-version-from-build-system)
+          # matches the runtime driver. Older drivers (e.g. 575) lack newer symbols like
+          # cuCtxGetDevice_v2 and the GPU path falls back to CPU. Filtered client-side in jq
+          # because vast can't numerically compare the driver_version string server-side.
+          MIN_DRIVER: "580"
         run: |
           # cpu_ram in the search filter is GB (the returned .cpu_ram field is MB — different
           # units), so >=96 means 96 GB. >=96000 would mean 96000 GB and match nothing.
           QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
-          echo "Query: $QUERY"
+          echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)"
+          # Keep only offers whose driver major >= MIN_DRIVER, then cheapest first.
+          SELECT="map(select((.driver_version|split(\".\")[0]|tonumber) >= ${MIN_DRIVER})) | sort_by(.dph_total)"
           OFFER_ID=""
           for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do
             vastai search offers "$QUERY" --raw -o dph_total > offers.json || true
-            OFFER_ID=$(jq -r 'sort_by(.dph_total) | .[0].id // empty' offers.json)
-            OFFER_PRICE=$(jq -r 'sort_by(.dph_total) | .[0].dph_total // empty' offers.json)
+            OFFER_ID=$(jq -r "$SELECT | .[0].id // empty" offers.json)
+            OFFER_PRICE=$(jq -r "$SELECT | .[0].dph_total // empty" offers.json)
             if [ -n "$OFFER_ID" ]; then
               echo "Selected offer $OFFER_ID at \$${OFFER_PRICE}/hr (attempt $attempt)"
               break
@@ -157,7 +161,7 @@ jobs:
             sleep "$OFFER_INTERVAL"
           done
           if [ -z "$OFFER_ID" ]; then
-            echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
+            echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, driver>=${MIN_DRIVER}, <= \$${PRICE_CAP}/hr)"
             exit 1
           fi
           echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"
@@ -184,6 +188,25 @@ jobs:
           echo "id=$IID" >> "$GITHUB_OUTPUT"
           echo "Created instance $IID"
 
+      - name: Attach SSH key to instance
+        env:
+          IID: ${{ steps.instance.outputs.id }}
+          KEY: ${{ steps.sshkey.outputs.key_path }}
+        run: |
+          # Attach the ephemeral pubkey to THIS instance only (added to its authorized_keys).
+          # It's removed when the instance is destroyed, so no account-level key to clean up.
+          # Retry: the instance may not accept the attach immediately after create.
+          PUB="$(cat "$KEY.pub")"
+          for attempt in $(seq 1 12); do
+            if vastai attach ssh "$IID" "$PUB"; then
+              echo "Attached ssh key (attempt $attempt)"; exit 0
+            fi
+            echo "attach failed (attempt $attempt/12); retrying in 10s..."
+            sleep 10
+          done
+          echo "::error::Failed to attach ssh key to instance $IID"
+          exit 1
+
       - name: Wait for SSH
         id: ssh
         env:
@@ -285,6 +308,15 @@ jobs:
           # Extract the result section for the PR comment (same marker bench-abba.yml uses).
           sed -n '/=== ABBA paired result/,$p' "$RUNNER_TEMP/abba_out.txt" > "$RUNNER_TEMP/abba_result.txt"
 
+          # Surface the result in the Actions run summary too (push/workflow_dispatch
+          # runs have no PR to comment on).
+          {
+            echo "## GPU ABBA — ethrex 20 transfers (vs main)"
+            echo '```'
+            cat "$RUNNER_TEMP/abba_result.txt"
+            echo '```'
+          } >> "$GITHUB_STEP_SUMMARY"
+
       - name: Comment ABBA result on PR
         if: always() && github.event_name == 'issue_comment'
         uses: actions/github-script@v7
@@ -340,19 +372,8 @@ jobs:
           if [ -f "$RUNNER_TEMP/vast_instance_id" ]; then
             IID=$(cat "$RUNNER_TEMP/vast_instance_id")
             echo "Destroying instance $IID"
-            vastai destroy instance "$IID" || echo "::warning::destroy instance $IID failed — check the Vast console"
+            # --yes: skip the interactive [y/N] confirm (CI has no tty).
+            vastai destroy instance "$IID" --yes || echo "::warning::destroy instance $IID failed — check the Vast console"
           else
             echo "No instance id recorded; nothing to destroy."
           fi
-
-      - name: Remove ephemeral SSH key
-        if: always()
-        env:
-          KEY_COMMENT: ${{ steps.sshkey.outputs.key_comment }}
-        run: |
-          [ -z "$KEY_COMMENT" ] && exit 0
-          vastai show ssh-keys --raw > keys.json 2>/dev/null || exit 0
-          for kid in $(jq -r --arg c "$KEY_COMMENT" '.[] | select(.public_key | contains($c)) | .id' keys.json); do
-            echo "Deleting ssh-key $kid"
-            vastai delete ssh-key "$kid" || true
-          done

From f645901afff23317b78a73cd760a8bba739094a8 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Fri, 26 Jun 2026 19:18:59 -0300
Subject: [PATCH 08/17] fix: pass REBUILD=1 so both binaries are always rebuilt

---
 .github/workflows/benchmark-gpu.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index 69b290ec6..531000938 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -298,10 +298,12 @@ jobs:
           # bench_abba.sh builds the cli at REF_A and origin/main (isolated worktree),
           # runs PAIRS interleaved A/B/B/A proves, and prints the paired-t CI + Wilcoxon
           # verdict. BENCH_FEATURES routes the build through the CUDA prover path.
+          # REBUILD=1: each Vast box is fresh, GPU-specific hardware — always rebuild both
+          # binaries (PTX is compiled for the detected arch); never trust a cached binary.
           REMOTE="set -e; cd /workspace/lambda_vm; \
             command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \
             git fetch --force origin main; $FETCH; \
-            SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \
+            REBUILD=1 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \
             scripts/bench_abba.sh $REF_A origin/main $PAIRS"
 
           $SSH "bash -lc \"$REMOTE\"" | tee "$RUNNER_TEMP/abba_out.txt"

From 963966c0376eaa10c98ad2a002ef9672b4e98649 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Fri, 26 Jun 2026 19:25:54 -0300
Subject: [PATCH 09/17] fix: check out REF_A before running bench_abba.sh

---
 .github/workflows/benchmark-gpu.yml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index 531000938..e81c65f64 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -295,14 +295,17 @@ jobs:
             REF_A="origin/$BRANCH"
           fi
 
-          # bench_abba.sh builds the cli at REF_A and origin/main (isolated worktree),
-          # runs PAIRS interleaved A/B/B/A proves, and prints the paired-t CI + Wilcoxon
-          # verdict. BENCH_FEATURES routes the build through the CUDA prover path.
+          # The template clones the repo at the DEFAULT branch (main), so check out the PR
+          # ref first — otherwise we'd run main's bench_abba.sh (no BENCH_FEATURES => CPU
+          # build). bench_abba.sh then builds the cli at REF_A and origin/main (isolated
+          # worktree), runs PAIRS interleaved A/B/B/A proves, and prints the paired-t CI +
+          # Wilcoxon verdict. BENCH_FEATURES routes the build through the CUDA prover path.
           # REBUILD=1: each Vast box is fresh, GPU-specific hardware — always rebuild both
           # binaries (PTX is compiled for the detected arch); never trust a cached binary.
           REMOTE="set -e; cd /workspace/lambda_vm; \
             command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \
             git fetch --force origin main; $FETCH; \
+            git checkout -f $REF_A; \
             REBUILD=1 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \
             scripts/bench_abba.sh $REF_A origin/main $PAIRS"
 

From f09578ee4d604be371be03a941ca851fc4f28446 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Mon, 29 Jun 2026 11:07:36 -0300
Subject: [PATCH 10/17] fix: lower offer RAM floor from 96GB to 64GB

---
 .github/workflows/benchmark-gpu.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index e81c65f64..7a90b0e0f 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -38,7 +38,7 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  # Vast offer search: RTX 5090, >=16 cores, >=96GB RAM, >=64GB disk, verified +
+  # Vast offer search: RTX 5090, >=16 cores, >=64GB RAM, >=64GB disk, verified +
   # rentable, Blackwell-capable driver, <= cap.
   GPU_NAME: RTX_5090
   PRICE_CAP: "3"
@@ -143,8 +143,8 @@ jobs:
           MIN_DRIVER: "580"
         run: |
           # cpu_ram in the search filter is GB (the returned .cpu_ram field is MB — different
-          # units), so >=96 means 96 GB. >=96000 would mean 96000 GB and match nothing.
-          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
+          # units), so >=64 means 64 GB. >=64000 would mean 64000 GB and match nothing.
+          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
           echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)"
           # Keep only offers whose driver major >= MIN_DRIVER, then cheapest first.
           SELECT="map(select((.driver_version|split(\".\")[0]|tonumber) >= ${MIN_DRIVER})) | sort_by(.dph_total)"
@@ -161,7 +161,7 @@ jobs:
             sleep "$OFFER_INTERVAL"
           done
           if [ -z "$OFFER_ID" ]; then
-            echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, driver>=${MIN_DRIVER}, <= \$${PRICE_CAP}/hr)"
+            echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=64GB RAM, >=64GB disk, driver>=${MIN_DRIVER}, <= \$${PRICE_CAP}/hr)"
             exit 1
           fi
           echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"

From defc4da1caa09a132e772b2c2261517c6c7625b7 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Mon, 29 Jun 2026 11:48:11 -0300
Subject: [PATCH 11/17] fix: use expensive machine with $1 cap

---
 .github/workflows/benchmark-gpu.yml | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index 7a90b0e0f..a52c4704f 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -41,7 +41,7 @@ env:
   # Vast offer search: RTX 5090, >=16 cores, >=64GB RAM, >=64GB disk, verified +
   # rentable, Blackwell-capable driver, <= cap.
   GPU_NAME: RTX_5090
-  PRICE_CAP: "3"
+  PRICE_CAP: "1"
   VAST_IMAGE_DISK: "64"
   # cli features for the ABBA build — the GPU (cuda) prover path plus jemalloc heap stats.
   BENCH_FEATURES: "jemalloc-stats,prover/cuda"
@@ -142,12 +142,13 @@ jobs:
           # because vast can't numerically compare the driver_version string server-side.
           MIN_DRIVER: "580"
         run: |
-          # cpu_ram in the search filter is GB (the returned .cpu_ram field is MB — different
-          # units), so >=64 means 64 GB. >=64000 would mean 64000 GB and match nothing.
+          # cpu_ram filter is in GB.
           QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
           echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)"
-          # Keep only offers whose driver major >= MIN_DRIVER, then cheapest first.
-          SELECT="map(select((.driver_version|split(\".\")[0]|tonumber) >= ${MIN_DRIVER})) | sort_by(.dph_total)"
+          # Keep only offers whose driver major >= MIN_DRIVER, then most expensive first
+          # (within the price cap) — premium hosts have faster disks/network (quicker image
+          # pulls) and better reliability; the cheapest boxes were flaky.
+          SELECT="map(select((.driver_version|split(\".\")[0]|tonumber) >= ${MIN_DRIVER})) | sort_by(.dph_total) | reverse"
           OFFER_ID=""
           for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do
             vastai search offers "$QUERY" --raw -o dph_total > offers.json || true
@@ -214,7 +215,8 @@ jobs:
         run: |
           echo "Waiting for instance $IID to reach 'running' with SSH endpoint..."
           HOST=""; PORT=""
-          for _ in $(seq 1 60); do   # ~10 min
+          # The base CUDA image is large; some hosts sit in 'loading' (image pull) a while.
+          for _ in $(seq 1 180); do   # ~30 min
             vastai show instance "$IID" --raw > inst.json || true
             STATUS=$(jq -r '.actual_status // empty' inst.json)
             # We create with --direct, so SSH straight to the public IP + the host port

From 1e0eb3952557203ed4a1164d5f772ba7065c8d45 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Mon, 29 Jun 2026 12:09:29 -0300
Subject: [PATCH 12/17] fix: remove temporary code

---
 .github/workflows/benchmark-gpu.yml | 43 +++++++++++++----------------
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index a52c4704f..35948bb7b 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -19,14 +19,9 @@ on:
     inputs:
       pairs:
         description: "Number of A/B/B/A pairs"
-        default: "1"   # TEMP(testing): fast runs; restore to "10" before merge
+        default: "14"
   issue_comment:
     types: [created]
-  # TEMP(testing): lets the workflow run from this branch before it's on the default
-  # branch (push uses the branch's own definition; issue_comment/workflow_dispatch do
-  # not). REMOVE this push trigger before merging.
-  push:
-    branches: [gpu_benchmarks]
 
 permissions:
   contents: read
@@ -50,10 +45,7 @@ jobs:
   benchmark-gpu:
     runs-on: ubuntu-latest
     # Skip unless: workflow_dispatch, or a "/bench-gpu" comment from a privileged author.
-    # TEMP(testing): `github.event_name == 'push'` lets branch pushes run it pre-merge.
-    # REMOVE the push clause before merging.
     if: >-
-      github.event_name == 'push' ||
       github.event_name == 'workflow_dispatch' ||
       (github.event_name == 'issue_comment' &&
        github.event.issue.pull_request &&
@@ -78,16 +70,16 @@ jobs:
             OUT_PR_NUM="$PR_NUM"; OUT_HEAD_SHA="$HEAD_SHA"; OUT_BRANCH=""
             # "/bench-gpu 20" -> 20 pairs; otherwise default.
             N=$(echo "$COMMENT_BODY" | sed -n 's|^/bench-gpu[[:space:]]*\([0-9]\+\).*|\1|p')
-            PAIRS=${N:-1}   # TEMP(testing): default 1; restore to 10 before merge
+            PAIRS=${N:-14}
           else
-            # workflow_dispatch / push: compare this branch vs main.
+            # workflow_dispatch: compare this branch vs main.
             OUT_PR_NUM=""; OUT_HEAD_SHA=""; OUT_BRANCH="$DISPATCH_REF"
-            PAIRS=${DISPATCH_PAIRS:-1}   # TEMP(testing): default 1; restore to 10 before merge
+            PAIRS=${DISPATCH_PAIRS:-14}
           fi
-          # TEMP(testing): clamp floor lowered to 1 for fast runs; restore to [2,40] before merge.
-          if [ "$PAIRS" -lt 1 ] 2>/dev/null || [ "$PAIRS" -gt 40 ] 2>/dev/null; then
-            echo "::warning::pair count out of range [1,40], defaulting to 1"
-            PAIRS=1
+          # Clamp to [2,40] (even is ideal so AB/BA orders balance). 14 ~ resolves a 2% delta.
+          if [ "$PAIRS" -lt 2 ] 2>/dev/null || [ "$PAIRS" -gt 40 ] 2>/dev/null; then
+            echo "::warning::pair count out of range [2,40], defaulting to 14"
+            PAIRS=14
           fi
           {
             echo "pr_num=$OUT_PR_NUM"
@@ -297,17 +289,20 @@ jobs:
             REF_A="origin/$BRANCH"
           fi
 
-          # The template clones the repo at the DEFAULT branch (main), so check out the PR
-          # ref first — otherwise we'd run main's bench_abba.sh (no BENCH_FEATURES => CPU
-          # build). bench_abba.sh then builds the cli at REF_A and origin/main (isolated
-          # worktree), runs PAIRS interleaved A/B/B/A proves, and prints the paired-t CI +
-          # Wilcoxon verdict. BENCH_FEATURES routes the build through the CUDA prover path.
+          # Run main's bench_abba.sh — the harness is the pinned measurement methodology, so a
+          # PR can't alter how its own benchmark is computed. (The template clones the default
+          # branch, so checking out origin/main is also what's already there; this makes it
+          # explicit and robust to the template default changing.) The harness still builds the
+          # cli at REF_A (the PR) and origin/main in isolated worktrees, runs PAIRS interleaved
+          # A/B/B/A proves, and prints the paired-t CI + Wilcoxon verdict. BENCH_FEATURES routes
+          # the build through the CUDA prover path. NOTE: requires this PR's bench_abba.sh change
+          # (the BENCH_FEATURES env) to be on main — i.e. it only takes effect after merge.
           # REBUILD=1: each Vast box is fresh, GPU-specific hardware — always rebuild both
           # binaries (PTX is compiled for the detected arch); never trust a cached binary.
           REMOTE="set -e; cd /workspace/lambda_vm; \
             command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \
             git fetch --force origin main; $FETCH; \
-            git checkout -f $REF_A; \
+            git checkout -f origin/main; \
             REBUILD=1 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \
             scripts/bench_abba.sh $REF_A origin/main $PAIRS"
 
@@ -315,8 +310,8 @@ jobs:
           # Extract the result section for the PR comment (same marker bench-abba.yml uses).
           sed -n '/=== ABBA paired result/,$p' "$RUNNER_TEMP/abba_out.txt" > "$RUNNER_TEMP/abba_result.txt"
 
-          # Surface the result in the Actions run summary too (push/workflow_dispatch
-          # runs have no PR to comment on).
+          # Surface the result in the Actions run summary too (workflow_dispatch runs
+          # have no PR to comment on).
           {
             echo "## GPU ABBA — ethrex 20 transfers (vs main)"
             echo '```'

From b075c2ac32ce2175c08d16f506ef9053cbc32118 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Mon, 29 Jun 2026 12:42:06 -0300
Subject: [PATCH 13/17] fix: apply code review

---
 .github/workflows/benchmark-gpu.yml | 70 ++++++++++++++++++++++-------
 scripts/bench_abba.sh               | 17 ++++---
 2 files changed, 65 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index 35948bb7b..564c3c87b 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -6,7 +6,7 @@ name: Benchmark GPU (PR)
 # It builds the cli at the PR head and at main, runs N interleaved pairs on the GPU,
 # posts the paired-t + Wilcoxon verdict back to the PR, then ALWAYS destroys the box.
 #
-# Triggered by a "/bench-gpu [N]" comment on a PR (N = pair count, default 10) or via
+# Triggered by a "/bench-gpu [N]" comment on a PR (N = pair count, default 14) or via
 # workflow_dispatch. Orchestration runs on a GitHub-hosted runner; all GPU work happens
 # on the rented Vast box (provisioned by the template onstart).
 #
@@ -40,6 +40,11 @@ env:
   VAST_IMAGE_DISK: "64"
   # cli features for the ABBA build — the GPU (cuda) prover path plus jemalloc heap stats.
   BENCH_FEATURES: "jemalloc-stats,prover/cuda"
+  # Unique per-run label set on the instance, for easy identification in the Vast console.
+  RUN_LABEL: "gpu-bench-${{ github.run_id }}-${{ github.run_attempt }}"
+  # Pin the Vast CLI to an immutable commit (a PyPI version can be re-published; a commit
+  # hash can't) — avoids pulling untrusted code at run time.
+  VAST_CLI_COMMIT: "28494d92c6c03d887f8375085243c22eb68c5874"
 
 jobs:
   benchmark-gpu:
@@ -100,18 +105,38 @@ jobs:
               owner: context.repo.owner, repo: context.repo.repo,
               comment_id: context.payload.comment.id, content: 'eyes'
             });
-            await github.rest.issues.createComment({
+            // Post the "started" notice under the SAME marker the result step uses, so the
+            // result updates this comment in place (and re-runs reuse it rather than stacking).
+            const marker = 'GPU Benchmark (ABBA)';
+            const body = `## GPU Benchmark (ABBA) — running…\n\n⏳ Renting an RTX 5090 on Vast.ai and running ${process.env.PAIRS} interleaved pairs (PR vs main) on the CUDA prover path. This takes ~1 hr; the result will replace this comment.`;
+            const comments = await github.paginate(github.rest.issues.listComments, {
               owner: context.repo.owner, repo: context.repo.repo,
-              issue_number: context.issue.number,
-              body: `⏳ **GPU ABBA started** — renting an RTX 5090 on Vast.ai and running ${process.env.PAIRS} interleaved pairs (PR vs main) on the CUDA prover path. This takes ~1 hr; results will be posted here.`
+              issue_number: context.issue.number, per_page: 100,
             });
+            const existing = comments.find(c => c.user.type === 'Bot' && c.body.includes(marker));
+            if (existing) {
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner, repo: context.repo.repo,
+                comment_id: existing.id, body,
+              });
+            } else {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner, repo: context.repo.repo,
+                issue_number: context.issue.number, body,
+              });
+            }
 
       - name: Install Vast CLI
+        # No secrets in this step's env: install-time code can't read the API key during pip
+        # install. Pinned to an immutable commit (see VAST_CLI_COMMIT) for the same reason.
+        # --break-system-packages: the ephemeral runner's Python may be PEP-668 "externally
+        # managed"; safe to override on a disposable runner.
+        run: pip install --quiet --break-system-packages "git+https://github.com/vast-ai/vast-cli.git@${VAST_CLI_COMMIT}"
+
+      - name: Authenticate Vast CLI
         env:
           VAST_API_KEY: ${{ secrets.VAST_API_KEY }}
-        run: |
-          pip install --quiet --upgrade vastai
-          vastai set api-key "$VAST_API_KEY"
+        run: vastai set api-key "$VAST_API_KEY"
 
       - name: Generate ephemeral SSH key
         id: sshkey
@@ -140,7 +165,9 @@ jobs:
           # Keep only offers whose driver major >= MIN_DRIVER, then most expensive first
           # (within the price cap) — premium hosts have faster disks/network (quicker image
           # pulls) and better reliability; the cheapest boxes were flaky.
-          SELECT="map(select((.driver_version|split(\".\")[0]|tonumber) >= ${MIN_DRIVER})) | sort_by(.dph_total) | reverse"
+          # `try ... catch 0` so a malformed/null driver_version on one offer is treated as 0
+          # (filtered out) rather than erroring the whole jq and wasting the attempt.
+          SELECT="map(select((try (.driver_version|split(\".\")[0]|tonumber) catch 0) >= ${MIN_DRIVER})) | sort_by(.dph_total) | reverse"
           OFFER_ID=""
           for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do
             vastai search offers "$QUERY" --raw -o dph_total > offers.json || true
@@ -169,6 +196,7 @@ jobs:
           vastai create instance "$OFFER_ID" \
             --template_hash "$VAST_TEMPLATE_HASH" \
             --disk "$VAST_IMAGE_DISK" \
+            --label "$RUN_LABEL" \
             --ssh --direct --raw > create.json
           cat create.json
           IID=$(jq -r '.new_contract // .instances.new_contract // empty' create.json)
@@ -179,7 +207,7 @@ jobs:
           # Persist immediately so teardown runs even if later steps fail.
           echo "$IID" > "$RUNNER_TEMP/vast_instance_id"
           echo "id=$IID" >> "$GITHUB_OUTPUT"
-          echo "Created instance $IID"
+          echo "Created instance $IID (label $RUN_LABEL)"
 
       - name: Attach SSH key to instance
         env:
@@ -255,11 +283,13 @@ jobs:
             if $SSH 'grep -q "=== done ===" /var/log/onstart.log 2>/dev/null'; then
               echo "onstart reported done"; exit 0
             fi
-            # shellcheck disable=SC2016  # $HOME/$(...) must expand on the remote box, not the runner
+            # Fallback if the log marker isn't found: the late-stage artifacts (cargo + the
+            # sysroot + the cloned repo) imply the earlier Rust/LLVM/toolchain install finished.
+            # Deliberately no toolchain-date check — it would go stale when the repo bumps nightly.
+            # shellcheck disable=SC2016  # $HOME must expand on the remote box, not the runner
             if $SSH 'test -x "$HOME/.cargo/bin/cargo" \
                   && test -f /opt/lambda-vm-sysroot/include/stdlib.h \
-                  && test -d /workspace/lambda_vm/.git \
-                  && "$HOME/.cargo/bin/rustup" toolchain list 2>/dev/null | grep -q nightly-2026-02-01'; then
+                  && test -d /workspace/lambda_vm/.git'; then
               echo "provisioning artifacts present"; exit 0
             fi
             sleep 10
@@ -306,6 +336,9 @@ jobs:
             REBUILD=1 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \
             scripts/bench_abba.sh $REF_A origin/main $PAIRS"
 
+          # pipefail so a failed remote bench (e.g. a prove that dies) propagates through the
+          # tee pipe and fails this step, instead of being masked by tee's exit 0.
+          set -o pipefail
           $SSH "bash -lc \"$REMOTE\"" | tee "$RUNNER_TEMP/abba_out.txt"
           # Extract the result section for the PR comment (same marker bench-abba.yml uses).
           sed -n '/=== ABBA paired result/,$p' "$RUNNER_TEMP/abba_out.txt" > "$RUNNER_TEMP/abba_result.txt"
@@ -349,9 +382,9 @@ jobs:
               body += `❌ Run failed. Last log lines:\n\n` + '```\n' + tail + '\n```\n';
             }
 
-            const { data: comments } = await github.rest.issues.listComments({
+            const comments = await github.paginate(github.rest.issues.listComments, {
               owner: context.repo.owner, repo: context.repo.repo,
-              issue_number: context.issue.number,
+              issue_number: context.issue.number, per_page: 100,
             });
             const marker = 'GPU Benchmark (ABBA)';
             const existing = comments.find(c => c.user.type === 'Bot' && c.body.includes(marker));
@@ -374,8 +407,15 @@ jobs:
           if [ -f "$RUNNER_TEMP/vast_instance_id" ]; then
             IID=$(cat "$RUNNER_TEMP/vast_instance_id")
             echo "Destroying instance $IID"
+            # Retry transient failures (network/auth) so a paid box isn't stranded.
             # --yes: skip the interactive [y/N] confirm (CI has no tty).
-            vastai destroy instance "$IID" --yes || echo "::warning::destroy instance $IID failed — check the Vast console"
+            destroyed=""
+            for attempt in 1 2 3; do
+              if vastai destroy instance "$IID" --yes; then destroyed=1; break; fi
+              echo "destroy attempt $attempt failed; retrying in 10s..."
+              sleep 10
+            done
+            [ -n "$destroyed" ] || echo "::warning::Failed to destroy instance $IID after 3 attempts — check the Vast console (label $RUN_LABEL)"
           else
             echo "No instance id recorded; nothing to destroy."
           fi
diff --git a/scripts/bench_abba.sh b/scripts/bench_abba.sh
index 57fab5e28..9bcbb39cc 100755
--- a/scripts/bench_abba.sh
+++ b/scripts/bench_abba.sh
@@ -94,10 +94,12 @@ INPUT="$(cd "$(dirname "$INPUT_REL")" && pwd)/$(basename "$INPUT_REL")"
 need_build=0
 if [ "${REBUILD:-0}" = "1" ] || [ ! -x "$WORK/cli_A" ] || [ ! -x "$WORK/cli_B" ]; then
   need_build=1
-elif [ "$(cat "$WORK/cli_A.sha" 2>/dev/null)" != "$SHA_A" ] || [ "$(cat "$WORK/cli_B.sha" 2>/dev/null)" != "$SHA_B" ]; then
-  # Cache persists on the self-hosted runner; rebuild if it's for different refs
-  # (a different PR, or main advanced) so we never benchmark stale binaries.
-  echo "==> Cached binaries are for different refs; rebuilding."
+elif [ "$(cat "$WORK/cli_A.sha" 2>/dev/null)" != "$SHA_A $BENCH_FEATURES" ] || \
+     [ "$(cat "$WORK/cli_B.sha" 2>/dev/null)" != "$SHA_B $BENCH_FEATURES" ]; then
+  # Cache persists on the self-hosted runner; rebuild if it's for different refs (a
+  # different PR, or main advanced) OR a different feature set (e.g. CPU vs prover/cuda),
+  # so we never benchmark stale binaries. The marker stores "<sha> <features>".
+  echo "==> Cached binaries are for different refs/features; rebuilding."
   need_build=1
 fi
 if [ "$need_build" = "1" ]; then
@@ -115,15 +117,16 @@ if [ "$need_build" = "1" ]; then
       exit 1
     fi
     cp "$WT/target/release/cli" "$WORK/$2"
-    echo "$1" > "$WORK/$2.sha"
+    # Marker = "<sha> <features>" so the cache invalidates on either changing.
+    echo "$1 $BENCH_FEATURES" > "$WORK/$2.sha"
   }
   build_cli "$SHA_B" cli_B
   build_cli "$SHA_A" cli_A
   cleanup
   trap - EXIT
 else
-  echo "==> Reusing cached binaries (SHAs match requested refs; REBUILD=1 to force):"
-  echo "     cli_A=${SHA_A:0:10}  cli_B=${SHA_B:0:10}"
+  echo "==> Reusing cached binaries (refs + features match; REBUILD=1 to force):"
+  echo "     cli_A=${SHA_A:0:10}  cli_B=${SHA_B:0:10}  features=$BENCH_FEATURES"
 fi
 
 # --- 3. Interleaved A/B/B/A measurement (fresh CSV -- pre-committed batch) ---

From d85fb3778bb0e740d613176810f91f1be38fdaaf Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Mon, 29 Jun 2026 12:46:17 -0300
Subject: [PATCH 14/17] test: run on push

---
 .github/workflows/benchmark-gpu.yml | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index 564c3c87b..40985bdf5 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -22,6 +22,9 @@ on:
         default: "14"
   issue_comment:
     types: [created]
+  # TEMP(testing): run from this branch pre-merge. REMOVE before merging.
+  push:
+    branches: [gpu_benchmarks]
 
 permissions:
   contents: read
@@ -50,7 +53,9 @@ jobs:
   benchmark-gpu:
     runs-on: ubuntu-latest
     # Skip unless: workflow_dispatch, or a "/bench-gpu" comment from a privileged author.
+    # TEMP(testing): the `push` clause lets branch pushes run it pre-merge. REMOVE before merging.
     if: >-
+      github.event_name == 'push' ||
       github.event_name == 'workflow_dispatch' ||
       (github.event_name == 'issue_comment' &&
        github.event.issue.pull_request &&
@@ -79,7 +84,7 @@ jobs:
           else
             # workflow_dispatch: compare this branch vs main.
             OUT_PR_NUM=""; OUT_HEAD_SHA=""; OUT_BRANCH="$DISPATCH_REF"
-            PAIRS=${DISPATCH_PAIRS:-14}
+            PAIRS=${DISPATCH_PAIRS:-2}   # TEMP(testing): 2 pairs for a fast push test; restore to 14 before merging
           fi
           # Clamp to [2,40] (even is ideal so AB/BA orders balance). 14 ~ resolves a 2% delta.
           if [ "$PAIRS" -lt 2 ] 2>/dev/null || [ "$PAIRS" -gt 40 ] 2>/dev/null; then
@@ -329,10 +334,12 @@ jobs:
           # (the BENCH_FEATURES env) to be on main — i.e. it only takes effect after merge.
           # REBUILD=1: each Vast box is fresh, GPU-specific hardware — always rebuild both
           # binaries (PTX is compiled for the detected arch); never trust a cached binary.
+          # TEMP(testing): checks out $REF_A (the branch harness) so the GPU path is exercised
+          # pre-merge; main's harness lacks BENCH_FEATURES. RESTORE to `origin/main` before merging.
           REMOTE="set -e; cd /workspace/lambda_vm; \
             command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \
             git fetch --force origin main; $FETCH; \
-            git checkout -f origin/main; \
+            git checkout -f $REF_A; \
             REBUILD=1 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \
             scripts/bench_abba.sh $REF_A origin/main $PAIRS"
 

From ecf615d3dd126f0cc8c3424417c7a111becfa0e5 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Mon, 29 Jun 2026 13:06:19 -0300
Subject: [PATCH 15/17] fix: pin cudarc to cuda-12080 and log nvidia-smi

---
 .github/workflows/benchmark-gpu.yml |  6 +++++-
 scripts/bench_abba.sh               | 12 +++++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index 40985bdf5..2bf175e22 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -336,11 +336,15 @@ jobs:
           # binaries (PTX is compiled for the detected arch); never trust a cached binary.
           # TEMP(testing): checks out $REF_A (the branch harness) so the GPU path is exercised
           # pre-merge; main's harness lacks BENCH_FEATURES. RESTORE to `origin/main` before merging.
+          # CUDARC_PIN: pin cudarc to a fixed CUDA version so it doesn't bind symbols newer
+          # than the box's driver (the boxes are CUDA 13.0; fallback-latest requested a symbol
+          # they don't export). nvidia-smi is logged for diagnosing driver issues.
           REMOTE="set -e; cd /workspace/lambda_vm; \
             command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \
+            nvidia-smi || true; \
             git fetch --force origin main; $FETCH; \
             git checkout -f $REF_A; \
-            REBUILD=1 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \
+            REBUILD=1 CUDARC_PIN=cuda-12080 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \
             scripts/bench_abba.sh $REF_A origin/main $PAIRS"
 
           # pipefail so a failed remote bench (e.g. a prove that dies) propagates through the
diff --git a/scripts/bench_abba.sh b/scripts/bench_abba.sh
index 9bcbb39cc..950b11ffa 100755
--- a/scripts/bench_abba.sh
+++ b/scripts/bench_abba.sh
@@ -110,7 +110,17 @@ if [ "$need_build" = "1" ]; then
   git worktree add --detach "$WT" "$SHA_B" >/dev/null
   build_cli() {  # $1=sha $2=out (shared target dir -> 2nd build is incremental)
     echo "==> Building cli @ ${1:0:10} -> $2  (features: $BENCH_FEATURES)"
-    git -C "$WT" checkout --quiet "$1"
+    # -f: discard any prior worktree edit (e.g. the CUDARC_PIN sed below) before switching
+    # refs, so the checkout can't conflict.
+    git -C "$WT" checkout --quiet -f "$1"
+    # CUDARC_PIN: pin math-cuda's cudarc to a fixed CUDA version and drop fallback-latest, so
+    # cudarc binds a known driver-symbol set instead of its newest (which can request symbols
+    # the rented box's driver doesn't export, e.g. cuDevSmResourceSplit -> runtime panic).
+    if [ -n "${CUDARC_PIN:-}" ]; then
+      sed -i "s/\"cuda-version-from-build-system\"/\"${CUDARC_PIN}\"/; /\"fallback-latest\"/d" \
+        "$WT/crypto/math-cuda/Cargo.toml"
+      echo "    cudarc pinned to ${CUDARC_PIN}"
+    fi
     if ! ( cd "$WT" && cargo build --release -p cli --features "$BENCH_FEATURES" >"$WORK/build_$2.log" 2>&1 ); then
       echo "ERROR: cargo build failed for $2 (@ ${1:0:10}). Tail of $WORK/build_$2.log:" >&2
       tail -40 "$WORK/build_$2.log" >&2

From 6e6a6105c6a866ad6b6e5714aed0827f8f49bae7 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Mon, 29 Jun 2026 13:25:22 -0300
Subject: [PATCH 16/17] remove test setup

---
 .github/workflows/benchmark-gpu.yml | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index 2bf175e22..661685f67 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -22,9 +22,6 @@ on:
         default: "14"
   issue_comment:
     types: [created]
-  # TEMP(testing): run from this branch pre-merge. REMOVE before merging.
-  push:
-    branches: [gpu_benchmarks]
 
 permissions:
   contents: read
@@ -53,9 +50,7 @@ jobs:
   benchmark-gpu:
     runs-on: ubuntu-latest
     # Skip unless: workflow_dispatch, or a "/bench-gpu" comment from a privileged author.
-    # TEMP(testing): the `push` clause lets branch pushes run it pre-merge. REMOVE before merging.
     if: >-
-      github.event_name == 'push' ||
       github.event_name == 'workflow_dispatch' ||
       (github.event_name == 'issue_comment' &&
        github.event.issue.pull_request &&
@@ -84,7 +79,7 @@ jobs:
           else
             # workflow_dispatch: compare this branch vs main.
             OUT_PR_NUM=""; OUT_HEAD_SHA=""; OUT_BRANCH="$DISPATCH_REF"
-            PAIRS=${DISPATCH_PAIRS:-2}   # TEMP(testing): 2 pairs for a fast push test; restore to 14 before merging
+            PAIRS=${DISPATCH_PAIRS:-14}
           fi
           # Clamp to [2,40] (even is ideal so AB/BA orders balance). 14 ~ resolves a 2% delta.
           if [ "$PAIRS" -lt 2 ] 2>/dev/null || [ "$PAIRS" -gt 40 ] 2>/dev/null; then
@@ -334,8 +329,6 @@ jobs:
           # (the BENCH_FEATURES env) to be on main — i.e. it only takes effect after merge.
           # REBUILD=1: each Vast box is fresh, GPU-specific hardware — always rebuild both
           # binaries (PTX is compiled for the detected arch); never trust a cached binary.
-          # TEMP(testing): checks out $REF_A (the branch harness) so the GPU path is exercised
-          # pre-merge; main's harness lacks BENCH_FEATURES. RESTORE to `origin/main` before merging.
           # CUDARC_PIN: pin cudarc to a fixed CUDA version so it doesn't bind symbols newer
           # than the box's driver (the boxes are CUDA 13.0; fallback-latest requested a symbol
           # they don't export). nvidia-smi is logged for diagnosing driver issues.
@@ -343,7 +336,7 @@ jobs:
             command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \
             nvidia-smi || true; \
             git fetch --force origin main; $FETCH; \
-            git checkout -f $REF_A; \
+            git checkout -f origin/main; \
             REBUILD=1 CUDARC_PIN=cuda-12080 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \
             scripts/bench_abba.sh $REF_A origin/main $PAIRS"
 

From 80b6963450a96dbc930d4667446e5e9cc8c135eb Mon Sep 17 00:00:00 2001
From: Mauro Toscano <12560266+MauroToscano@users.noreply.github.com>
Date: Mon, 29 Jun 2026 16:23:53 -0300
Subject: [PATCH 17/17] ci(bench-gpu): harden teardown, cap pairs at 32, fix
 CUDA comment (#736)

Review follow-ups on the GPU benchmark workflow:

- Teardown: fall back to destroying by the unique RUN_LABEL when no instance
  id was recorded. The id file is written only after `create` succeeds and its
  JSON parses, so a box created in that window (concurrency cancel, or a parse
  failure) could otherwise leak and bill indefinitely.
- Cap pairs at 32 (was 40) and round odd requests up to even (the AB/BA design
  wants even N); raise the job timeout to 210 min so a worst-case 32-pair run
  (64 proves + slow provisioning + dual CUDA build) fits without timing out
  after the expensive build.
- Fix the CUDARC_PIN comment: the boxes are ~CUDA 12.8 (matching cuda-12080 and
  the cuda_max_good>=12.8 offer floor), not 13.0; tie it to the MIN_DRIVER guard
  as the opposite end of the same compatibility window.
- Log only the needed fields of create.json instead of the full --raw response,
  so an unexpected sensitive field can't land in the run log.
- Validate the workflow_dispatch branch name before it is interpolated into the
  remote `bash -lc` command.
- Move the run-summary write into an always() step so workflow_dispatch failures
  are visible in the Actions summary rather than only the raw step log.
---
 .github/workflows/benchmark-gpu.yml | 99 ++++++++++++++++++++++-------
 1 file changed, 76 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index 661685f67..1e2ef01b1 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -56,8 +56,10 @@ jobs:
        github.event.issue.pull_request &&
        startsWith(github.event.comment.body, '/bench-gpu') &&
        contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association))
-    # ABBA on the GPU: dual cuda build (~15-30 min) + 2*pairs proves (~77s each).
-    timeout-minutes: 180
+    # ABBA on the GPU: provisioning + dual cuda build (~30 min) + 2*pairs proves
+    # (~95s each). At the max 32 pairs (64 proves) a slow-provision box runs ~3 hr,
+    # so allow headroom over that; teardown still always destroys the box.
+    timeout-minutes: 210
     steps:
       - name: Resolve PR ref + pair count
         id: config
@@ -81,11 +83,18 @@ jobs:
             OUT_PR_NUM=""; OUT_HEAD_SHA=""; OUT_BRANCH="$DISPATCH_REF"
             PAIRS=${DISPATCH_PAIRS:-14}
           fi
-          # Clamp to [2,40] (even is ideal so AB/BA orders balance). 14 ~ resolves a 2% delta.
-          if [ "$PAIRS" -lt 2 ] 2>/dev/null || [ "$PAIRS" -gt 40 ] 2>/dev/null; then
-            echo "::warning::pair count out of range [2,40], defaulting to 14"
+          # Accept only an integer in [2,32]; anything else -> default 14 (~ resolves a 2% delta).
+          # The ceiling keeps the worst-case run (64 proves + provisioning + dual build) under the
+          # job timeout. A non-integer makes `[` fail (status 2), which the negation also catches.
+          if ! { [ "$PAIRS" -ge 2 ] && [ "$PAIRS" -le 32 ]; } 2>/dev/null; then
+            echo "::warning::pair count is not an integer in [2,32], defaulting to 14"
             PAIRS=14
           fi
+          # Even is ideal so the AB/BA orders balance; round an odd request up by one.
+          if [ "$((PAIRS % 2))" -ne 0 ]; then
+            PAIRS=$((PAIRS + 1))
+            echo "::notice::rounded odd pair count up to $PAIRS so AB/BA orders balance"
+          fi
           {
             echo "pr_num=$OUT_PR_NUM"
             echo "head_sha=$OUT_HEAD_SHA"
@@ -198,7 +207,9 @@ jobs:
             --disk "$VAST_IMAGE_DISK" \
             --label "$RUN_LABEL" \
             --ssh --direct --raw > create.json
-          cat create.json
+          # Log only the fields we need rather than the full --raw response, which could carry
+          # an unexpected sensitive field into the (collaborator-/world-readable) run log.
+          jq '{success, new_contract: (.new_contract // .instances.new_contract)}' create.json
           IID=$(jq -r '.new_contract // .instances.new_contract // empty' create.json)
           if [ -z "$IID" ]; then
             echo "::error::Failed to create Vast instance"
@@ -315,6 +326,12 @@ jobs:
             FETCH="git fetch --force origin refs/pull/$PR_NUM/head"
             REF_A="$HEAD_SHA"
           else
+            # Reject anything outside the git-ref-safe charset before it reaches the remote
+            # `bash -lc` (defense-in-depth; workflow_dispatch is write-access only, but never
+            # interpolate an unvalidated ref into a remote shell command).
+            case "$BRANCH" in
+              ''|*[!A-Za-z0-9._/-]*) echo "::error::invalid branch name: '$BRANCH'"; exit 1 ;;
+            esac
             FETCH="git fetch --force origin $BRANCH"
             REF_A="origin/$BRANCH"
           fi
@@ -329,9 +346,13 @@ jobs:
           # (the BENCH_FEATURES env) to be on main — i.e. it only takes effect after merge.
           # REBUILD=1: each Vast box is fresh, GPU-specific hardware — always rebuild both
           # binaries (PTX is compiled for the detected arch); never trust a cached binary.
-          # CUDARC_PIN: pin cudarc to a fixed CUDA version so it doesn't bind symbols newer
-          # than the box's driver (the boxes are CUDA 13.0; fallback-latest requested a symbol
-          # they don't export). nvidia-smi is logged for diagnosing driver issues.
+          # CUDARC_PIN: pin cudarc to a fixed CUDA version (cuda-12080 = CUDA 12.8, matching the
+          # cuda_max_good>=12.8 offer floor) and drop fallback-latest, so cudarc binds a known
+          # symbol set instead of its newest. With fallback-latest cudarc requested a symbol the
+          # box's driver doesn't export (e.g. cuDevSmResourceSplit) -> runtime panic. This is the
+          # too-new end of the same compatibility window that MIN_DRIVER>=580 guards at the
+          # too-old end (older drivers lack cuCtxGetDevice_v2 and the GPU path falls back to CPU).
+          # nvidia-smi is logged for diagnosing driver issues.
           REMOTE="set -e; cd /workspace/lambda_vm; \
             command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \
             nvidia-smi || true; \
@@ -347,13 +368,25 @@ jobs:
           # Extract the result section for the PR comment (same marker bench-abba.yml uses).
           sed -n '/=== ABBA paired result/,$p' "$RUNNER_TEMP/abba_out.txt" > "$RUNNER_TEMP/abba_result.txt"
 
-          # Surface the result in the Actions run summary too (workflow_dispatch runs
-          # have no PR to comment on).
+      - name: Write run summary
+        # Always run so a failure (incl. workflow_dispatch, which has no PR comment step) is
+        # visible in the Actions run summary instead of only the raw step log.
+        if: always() && (steps.bench.outcome == 'success' || steps.bench.outcome == 'failure')
+        env:
+          OUTCOME: ${{ steps.bench.outcome }}
+        run: |
           {
             echo "## GPU ABBA — ethrex 20 transfers (vs main)"
-            echo '```'
-            cat "$RUNNER_TEMP/abba_result.txt"
-            echo '```'
+            if [ "$OUTCOME" = "success" ] && [ -s "$RUNNER_TEMP/abba_result.txt" ]; then
+              echo '```'
+              cat "$RUNNER_TEMP/abba_result.txt"
+              echo '```'
+            else
+              echo "❌ Run outcome: ${OUTCOME:-unknown}. Last log lines:"
+              echo '```'
+              tail -n 30 "$RUNNER_TEMP/abba_out.txt" 2>/dev/null || echo "(no output captured)"
+              echo '```'
+            fi
           } >> "$GITHUB_STEP_SUMMARY"
 
       - name: Comment ABBA result on PR
@@ -408,18 +441,38 @@ jobs:
       - name: Destroy instance
         if: always()
         run: |
-          if [ -f "$RUNNER_TEMP/vast_instance_id" ]; then
-            IID=$(cat "$RUNNER_TEMP/vast_instance_id")
-            echo "Destroying instance $IID"
-            # Retry transient failures (network/auth) so a paid box isn't stranded.
-            # --yes: skip the interactive [y/N] confirm (CI has no tty).
-            destroyed=""
+          # Retry transient failures (network/auth) so a paid box isn't stranded.
+          # --yes: skip the interactive [y/N] confirm (CI has no tty).
+          destroy() {
+            iid="$1"; destroyed=""
             for attempt in 1 2 3; do
-              if vastai destroy instance "$IID" --yes; then destroyed=1; break; fi
+              if vastai destroy instance "$iid" --yes; then destroyed=1; break; fi
               echo "destroy attempt $attempt failed; retrying in 10s..."
               sleep 10
             done
-            [ -n "$destroyed" ] || echo "::warning::Failed to destroy instance $IID after 3 attempts — check the Vast console (label $RUN_LABEL)"
+            [ -n "$destroyed" ] || echo "::warning::Failed to destroy instance $iid after 3 attempts — check the Vast console (label $RUN_LABEL)"
+          }
+          if [ -f "$RUNNER_TEMP/vast_instance_id" ]; then
+            IID=$(cat "$RUNNER_TEMP/vast_instance_id")
+            echo "Destroying instance $IID"
+            destroy "$IID"
           else
-            echo "No instance id recorded; nothing to destroy."
+            # The id file is written only AFTER create succeeds AND its JSON parses, so a box can
+            # exist unrecorded if the run was cancelled in that window (concurrency cancel) or the
+            # parse failed. Fall back to destroying by our unique RUN_LABEL so the box can't leak
+            # (bill indefinitely). RUN_LABEL is unique per run, so this never touches another run's box.
+            echo "No instance id recorded; searching Vast for any box labelled $RUN_LABEL..."
+            vastai show instances --raw > all_inst.json 2>/dev/null || { echo "::warning::Could not list Vast instances — check the Vast console for label $RUN_LABEL"; echo '[]' > all_inst.json; }
+            # Tolerate either a bare array or {instances:[...]}; match our exact label. Warn (don't fail) if it can't be parsed.
+            LEAKED=$(jq -r --arg L "$RUN_LABEL" \
+              '(if type=="array" then . else (.instances // []) end) | .[] | select(.label == $L) | .id' \
+              all_inst.json 2>/dev/null) || echo "::warning::Could not parse the Vast instance list — check the Vast console for label $RUN_LABEL"
+            if [ -z "$LEAKED" ]; then
+              echo "No instance labelled $RUN_LABEL found; nothing to destroy."
+            else
+              for IID in $LEAKED; do
+                echo "Destroying leaked instance $IID (label $RUN_LABEL)"
+                destroy "$IID"
+              done
+            fi
           fi