goceleris · FumingPower3925 · Jun 6, 2026 · Jun 6, 2026
diff --git a/.github/workflows/sync-benchmarks.yml b/.github/workflows/sync-benchmarks.yml
@@ -88,10 +88,19 @@ jobs:
           COMMIT: ${{ steps.payload.outputs.commit }}
         run: |
           set -euo pipefail
-          # The publisher pushed the cell in a separate commit; make sure we are
-          # validating exactly what was published, not a stale checkout.
-          git fetch origin "$COMMIT" || git fetch origin
-          git checkout "$COMMIT" -- results/ || echo "no results delta to sync"
+          # Restore results/ from origin/main so the local results/
+          # holds the FULL tree (every previously-published cell) and not
+          # just the one commit the publisher pushed. The old `git checkout
+          # $COMMIT -- results/` replaced the whole results/ with the
+          # single commit's delta, which made the index-sync see a
+          # truncated tree and emit a manifest that listed only the
+          # most-recent run-K (e.g. only run-2 when run-1 + run-2 +
+          # run-3 were on disk). Fetching main and checking out only its
+          # results/ path is what makes update-index.mjs's whole-
+          # tree walk authoritative.
+          git fetch origin "$COMMIT" || true
+          git fetch origin main
+          git checkout origin/main -- results/
 
       - name: Validate published cell
         if: steps.payload.outputs.path != ''

diff --git a/scripts/lib/results.mjs b/scripts/lib/results.mjs
@@ -65,12 +65,16 @@ export function listArches(root, version, date) {
 }
 
 // listRuns tolerates both the Phase 4 flat layout (four files directly under
-// the arch dir as run-1) and a future Phase 5 run-N/ subdirectory layout.
+// the arch dir as run-1) and a future Phase 5 run-N/ subdirectory layout,
+// and the run-N-rated/ subdirectory a back-to-back rated pass publishes
+// to alongside its saturation pass (see mage_tier.go's RatedRunIDSuffix).
+// The regex accepts the rated suffix as an optional tail so the same
+// walker can enumerate both panels of a back-to-back run.
 export function listRuns(root, version, date, arch) {
   const dir = join(root, version, date, arch);
   if (!existsSync(dir)) return [];
   const subRuns = readdirSync(dir, { withFileTypes: true })
-    .filter((e) => e.isDirectory() && /^run-\d+$/.test(e.name))
+    .filter((e) => e.isDirectory() && /^run-\d+(?:-rated)?$/.test(e.name))
     .map((e) => e.name);
   if (subRuns.length > 0) return subRuns.sort(runCmp);
   if (existsSync(join(dir, "summary.json"))) return [DEFAULT_RUN];
@@ -90,8 +94,24 @@ export function runFilePath(version, date, arch, runId, name) {
   return posix.join("results", version, date, arch, runId, name);
 }
 
+// runKey splits a run id into [numeric k, variant suffix, original id]
+// ("run-2" -> [2, "", "run-2"], "run-2-rated" -> [2, "rated", "run-2-rated"]).
+// The numeric part sorts first so the canonical order is run-1, run-1-rated,
+// run-2, run-2-rated, ...; the variant part breaks ties via localeCompare, so
+// a future variant (e.g. "run-1-soak") would naturally land after the
+// rated one with no comparator change. Unknown shapes sort as 0/"" to
+// keep the comparator total.
+function runKey(r) {
+  const m = /^run-(\d+)(?:-(.+))?$/.exec(r);
+  if (!m) return [0, "", r];
+  return [Number(m[1]), m[2] || "", r];
+}
+
 export function runCmp(a, b) {
-  return Number(a.replace("run-", "")) - Number(b.replace("run-", ""));
+  const [na, sa] = runKey(a);
+  const [nb, sb] = runKey(b);
+  if (na !== nb) return na - nb;
+  return sa.localeCompare(sb);
 }
 
 // versionCmpDesc: semver, newest first; no-prerelease sorts ahead of prerelease.

diff --git a/scripts/selftest.mjs b/scripts/selftest.mjs
@@ -17,6 +17,7 @@ import {
   readFileSync,
   existsSync,
   rmSync,
+  writeFileSync,
 } from "node:fs";
 import { join, dirname } from "node:path";
 import { fileURLToPath } from "node:url";
@@ -40,8 +41,8 @@ try {
     copyFileSync(join(fixtureDir, f), join(cellAbs, f));
   }
 
-  const run = (script, args) => {
-    const r = spawnSync("node", [join(scriptsDir, script), ...args], { cwd: tmp, encoding: "utf8" });
+  const run = (script, args, cwd = tmp) => {
+    const r = spawnSync("node", [join(scriptsDir, script), ...args], { cwd, encoding: "utf8" });
     const out = `${r.stdout || ""}${r.stderr || ""}`.trim();
     if (out) process.stdout.write(`  [${script}] ${out}\n`);
     if (r.status !== 0) throw new Error(`${script} exited with status ${r.status}`);
@@ -74,6 +75,86 @@ try {
   assert(run0 && run0.files && run0.files.summary && run0.files.timeseries, "run.files references all artifacts");
   assert(existsSync(join(tmp, "results", "latest", ARCH, "summary.json")), `latest/${ARCH}/summary.json mirrored`);
   assert(existsSync(join(tmp, "results", "latest", ARCH, "env.json")), `latest/${ARCH}/env.json mirrored`);
+
+  // ---- Back-to-back + rated subdirs (see mage_tier.go's RatedRunIDSuffix) ----
+  // A second self-test cell tree modelled on a real 3-pass saturation +
+  // 3-pass rated back-to-back run (only passes 2 and 3 are created). The
+  // single-cell fixture above validates the flat (run-1) layout; this
+  // one validates that sibling run-K/ + run-K-rated/ subdirs coexist,
+  // get enumerated individually in the index, and that default_run
+  // resolves to run-2 (the lowest saturation pass, not a rated subdir).
+  const multiTmp = mkdtempSync(join(tmpdir(), "results-selftest-multi-"));
+  try {
+    const multiRel = `results/${VER}/${DATE}/${ARCH}`;
+    const multiAbs = join(multiTmp, multiRel);
+    // Saturation subdirs (run-2, run-3) — copy the fixture and re-stamp
+    // env.run_id so validate-results.mjs accepts each cell against its
+    // own dispatch payload. The fixture's env carries run-1; every
+    // subdir needs its own id.
+    for (const k of [2, 3]) {
+      const subAbs = join(multiAbs, `run-${k}`);
+      mkdirSync(subAbs, { recursive: true });
+      for (const f of ["summary.json", "histograms.json.gz", "timeseries.json.gz"]) {
+        copyFileSync(join(fixtureDir, f), join(subAbs, f));
+      }
+      const envRaw = JSON.parse(readFileSync(join(fixtureDir, "env.json"), "utf8"));
+      envRaw.run_id = `run-${k}`;
+      writeFileSync(join(subAbs, "env.json"), JSON.stringify(envRaw));
+    }
+    // Rated subdirs (run-2-rated, run-3-rated) — same shape with the
+    // rated suffix; the producer's env.run_id is the full rated id.
+    for (const k of [2, 3]) {
+      const subAbs = join(multiAbs, `run-${k}-rated`);
+      mkdirSync(subAbs, { recursive: true });
+      for (const f of ["summary.json", "histograms.json.gz", "timeseries.json.gz"]) {
+        copyFileSync(join(fixtureDir, f), join(subAbs, f));
+      }
+      const envRaw = JSON.parse(readFileSync(join(fixtureDir, "env.json"), "utf8"));
+      envRaw.run_id = `run-${k}-rated`;
+      writeFileSync(join(subAbs, "env.json"), JSON.stringify(envRaw));
+    }
+
+    // Validate each cell against its dispatch payload. The selftest's
+    // single-cell case ships --run; for the rated pass we re-stamp env
+    // to match. We use the multiTmp cwd (not the single-cell tmp) so
+    // the relative --path resolves against the right tree.
+    for (const k of [2, 3]) {
+      run("validate-results.mjs", [
+        "--path", `${multiRel}/run-${k}`,
+        "--version", VER, "--arch", ARCH, "--date", DATE, "--run", `run-${k}`,
+      ], multiTmp);
+      run("validate-results.mjs", [
+        "--path", `${multiRel}/run-${k}-rated`,
+        "--version", VER, "--arch", ARCH, "--date", DATE, "--run", `run-${k}-rated`,
+      ], multiTmp);
+    }
+    run("update-index.mjs", [], multiTmp);
+
+    const multiIdx = JSON.parse(readFileSync(join(multiTmp, "results", "index.json"), "utf8"));
+    const multiRuns = multiIdx.versions?.[0]?.dates?.[0]?.arches?.[0]?.runs || [];
+    const runIds = multiRuns.map((r) => r.run_id);
+    assert(
+      JSON.stringify(runIds) === JSON.stringify(["run-2", "run-2-rated", "run-3", "run-3-rated"]),
+      `runs enumerated in sorted order: ${JSON.stringify(runIds)}`,
+    );
+    assert(
+      multiIdx.versions?.[0]?.dates?.[0]?.default_run === "run-2",
+      `default_run still picks the lowest run-K when run-1 absent, got ${multiIdx.versions?.[0]?.dates?.[0]?.default_run}`,
+    );
+    // The rated subdirs each carry their own files pointers, distinct
+    // from the saturation subdirs (proves the rated pass didn't
+    // overwrite the saturation grid at run-K).
+    const ratedFiles = multiRuns.find((r) => r.run_id === "run-2-rated")?.files || {};
+    assert(
+      ratedFiles.summary && ratedFiles.summary.endsWith("/run-2-rated/summary.json"),
+      `rated subdir files pointer stays in run-2-rated/, got ${ratedFiles.summary}`,
+    );
+  } catch (e) {
+    console.error("  ERROR (multi-run):", e.message);
+    failed = true;
+  } finally {
+    rmSync(multiTmp, { recursive: true, force: true });
+  }
 } catch (e) {
   console.error("  ERROR:", e.message);
   failed = true;