From 4b73fbc08d43de93ea25fa2aace52019e4606bd7 Mon Sep 17 00:00:00 2001
From: Aaron Webster <awebster@gmail.com>
Date: Fri, 5 Jun 2026 02:18:27 -0700
Subject: [PATCH] Add multi-compiler code-size benchmark PR comment

On each pull request that can affect generated code, compile a fixed all-features benchmark schema and the many_conditionals Ok() highlight across x86-64 (gcc + clang), ARM Cortex-M4 (gcc) and MicroBlaze (gcc) at -Os/-O2/-O0, and post a sticky comment with the .text size and objdump instruction count, compared against the merge-base.

The benchmark schema is held fixed and pulled forward from the PR head, so only the code generator under test varies between base and head; adding or editing other test .emb files cannot move the numbers. clang has no MicroBlaze back end, so that target is gcc-only.

Adds testdata/benchmark.emb (fixed fixture), scripts/size_bench.py (matrix compile + size/objdump measurement to JSON), scripts/size_comment.py (renders the sticky comment), and .github/workflows/code-size.yml (the pull_request workflow).
---
 .github/workflows/code-size.yml | 121 +++++++++++
 scripts/size_bench.py           | 350 ++++++++++++++++++++++++++++++++
 scripts/size_comment.py         | 162 +++++++++++++++
 testdata/benchmark.emb          | 132 ++++++++++++
 4 files changed, 765 insertions(+)
 create mode 100644 .github/workflows/code-size.yml
 create mode 100644 scripts/size_bench.py
 create mode 100644 scripts/size_comment.py
 create mode 100644 testdata/benchmark.emb

diff --git a/.github/workflows/code-size.yml b/.github/workflows/code-size.yml
new file mode 100644
index 00000000..c7144d90
--- /dev/null
+++ b/.github/workflows/code-size.yml
@@ -0,0 +1,121 @@
+name: Code size
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+    # Only run when something that can change generated code size moves.
+    paths:
+      - 'compiler/**'
+      - 'runtime/**'
+      - 'embossc'
+      - 'testdata/benchmark.emb'
+      - 'testdata/many_conditionals.emb'
+      - 'scripts/size_bench.py'
+      - 'scripts/size_comment.py'
+      - '.github/workflows/code-size.yml'
+
+# Read the PR's commits; post/update one sticky size comment.
+permissions:
+  contents: read
+  pull-requests: write
+
+# Supersede an in-flight size run when the PR is pushed again.
+concurrency:
+  group: code-size-${{ github.event.pull_request.number }}
+  cancel-in-progress: true
+
+jobs:
+  size-report:
+    name: Generated code size
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out PR head (full history for merge-base + revision checkouts)
+        uses: actions/checkout@v6
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}
+          fetch-depth: 0
+
+      # embossc formats generated headers with clang-format (the clang-format
+      # PyPI package, pinned in requirements.txt), so the codegen run needs the
+      # Emboss Python deps. setup-python avoids the runner's externally-managed
+      # system Python (PEP 668).
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.x'
+          cache: 'pip'
+          cache-dependency-path: requirements.txt
+      - name: Install Emboss Python deps (clang-format for codegen)
+        run: pip install -r requirements.txt
+
+      - name: Install host clang and the ARM (STM32 / Cortex-M4) gcc toolchain
+        run: sudo apt-get update && sudo apt-get install -y clang gcc-arm-none-eabi
+
+      # embedded code-size targets compile against the MicroBlaze gcc toolchain
+      # hard-coded under /opt/microblaze/...; cache the Bootlin tarball in a
+      # runner-writable path (caching /opt fights root perms on restore), then
+      # extract it into place. clang has no MicroBlaze back end, so MicroBlaze is
+      # gcc-only.
+      - name: Cache MicroBlaze (Bootlin) toolchain tarball
+        uses: actions/cache@v4
+        with:
+          path: ~/mb-toolchain/mb.tar.xz
+          key: bootlin-microblazebe--glibc--stable-2025.08-1
+      - name: Provision MicroBlaze toolchain
+        run: |
+          set -euo pipefail
+          url="https://toolchains.bootlin.com/downloads/releases/toolchains/microblazebe/tarballs/microblazebe--glibc--stable-2025.08-1.tar.xz"
+          if [ ! -f "$HOME/mb-toolchain/mb.tar.xz" ]; then
+            mkdir -p "$HOME/mb-toolchain"
+            curl -fsSL "$url" -o "$HOME/mb-toolchain/mb.tar.xz"
+          fi
+          sudo mkdir -p /opt/microblaze
+          sudo tar -C /opt/microblaze -xJf "$HOME/mb-toolchain/mb.tar.xz"
+          test -x /opt/microblaze/microblazebe--glibc--stable-2025.08-1/bin/microblaze-buildroot-linux-gnu-g++
+
+      - name: Measure generated code size (merge-base vs PR head)
+        env:
+          BASE_SHA: ${{ github.event.pull_request.base.sha }}
+          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
+        run: |
+          set -euo pipefail
+          # Baseline against the merge-base, NOT the base-branch tip, so codegen
+          # that landed on the base after this PR branched is not misattributed.
+          # size_bench.py holds the benchmark schema fixed (pulled forward from
+          # head), so only the generator/runtime under test differs.
+          base="$(git merge-base "$BASE_SHA" "$HEAD_SHA")"
+          echo "Comparing merge-base $base  ->  head $HEAD_SHA"
+          python3 scripts/size_bench.py --revisions "$base" "$HEAD_SHA" --out-dir "$RUNNER_TEMP/size"
+          # Fail loudly rather than post an empty table if the head build produced
+          # no data (missing toolchain/dep or a codegen break).
+          python3 -c 'import json,sys; d=json.load(open(sys.argv[1])); h=d["revisions"][-1]["results"]; sys.exit(0 if any(cfg.get("benchmark") for t in h.values() for c in t.values() for cfg in c.values()) else 1)' "$RUNNER_TEMP/size/size_bench.json" \
+            || { echo "::error::size_bench produced no head benchmark data (toolchain/codegen failure); see output above."; exit 1; }
+          python3 scripts/size_comment.py "$RUNNER_TEMP/size/size_bench.json" > "$RUNNER_TEMP/size/comment.md"
+
+      - name: Post / update size comment
+        # Fork PRs get a read-only GITHUB_TOKEN; same-repo PRs (the chain branches)
+        # can comment. (Fork-safe upgrade: a separate workflow_run job.)
+        if: ${{ github.event.pull_request.head.repo.full_name == github.repository }}
+        uses: actions/github-script@v7
+        env:
+          COMMENT_PATH: ${{ runner.temp }}/size/comment.md
+        with:
+          script: |
+            const fs = require('fs');
+            const marker = '<!-- emboss-size-bench -->';
+            let body;
+            try {
+              body = fs.readFileSync(process.env.COMMENT_PATH, 'utf8');
+            } catch (e) {
+              return;  // measure step failed; its red check is the signal.
+            }
+            const { owner, repo } = context.repo;
+            const issue_number = context.issue.number;
+            const comments = await github.paginate(github.rest.issues.listComments, {
+              owner, repo, issue_number, per_page: 100,
+            });
+            const existing = comments.find(c => c.body && c.body.includes(marker));
+            if (existing) {
+              await github.rest.issues.updateComment({ owner, repo, comment_id: existing.id, body });
+            } else {
+              await github.rest.issues.createComment({ owner, repo, issue_number, body });
+            }
diff --git a/scripts/size_bench.py b/scripts/size_bench.py
new file mode 100644
index 00000000..69f41c91
--- /dev/null
+++ b/scripts/size_bench.py
@@ -0,0 +1,350 @@
+#!/usr/bin/env python3
+
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Measures the size and instruction count of Emboss-generated code.
+
+Compiles a fixed benchmark schema (testdata/benchmark.emb) plus the
+many_conditionals Ok() highlight across a matrix of target x compiler x
+optimization level, reporting `.text` bytes and static instruction counts
+(from objdump disassembly -- a deterministic stand-in for a runtime benchmark).
+
+With --revisions BASE HEAD, each revision is measured with the benchmark schema
+held fixed (pulled forward from HEAD), so only the code generator under test
+varies between them. Results, including which compiler/version produced each
+number, are written as JSON.
+
+  python3 scripts/size_bench.py --revisions <base-sha> <head-sha> --out-dir out
+"""
+
+import argparse
+import json
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+
+REPO = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+# Common flags: C++17, per-function/data sections (so `size`/objdump are
+# comparable), and no exceptions/RTTI to match typical embedded builds.
+COMMON_FLAGS = (
+    "-std=c++17 -ffunction-sections -fdata-sections -fno-exceptions -fno-rtti"
+)
+CONFIGS = {"Os": "-Os", "O2": "-O2", "O0": "-O0"}
+
+_MB = "/opt/microblaze/microblazebe--glibc--stable-2025.08-1/bin/microblaze-buildroot-linux-gnu"
+
+# Measurement matrix. clang has no MicroBlaze back end and needs extra wiring to
+# find the bare-metal ARM C++ headers, so clang is x86-64-only here; cross
+# targets use their gcc toolchains with the matching `nm`/`objdump`. `.text`
+# size is read with the host binutils `size` for every target (see text_bytes).
+TARGETS = [
+    {
+        "target": "x86-64",
+        "compilers": [
+            {
+                "name": "gcc",
+                "cxx": "g++",
+                "nm": "nm",
+                "objdump": "objdump",
+                "flags": "",
+            },
+            {
+                "name": "clang",
+                "cxx": "clang++",
+                "nm": "nm",
+                "objdump": "objdump",
+                "flags": "",
+            },
+        ],
+    },
+    {
+        "target": "ARM Cortex-M4",
+        "compilers": [
+            {
+                "name": "gcc",
+                "cxx": "arm-none-eabi-g++",
+                "nm": "arm-none-eabi-nm",
+                "objdump": "arm-none-eabi-objdump",
+                "flags": "-mthumb -mcpu=cortex-m4 -mfloat-abi=soft",
+            },
+        ],
+    },
+    {
+        "target": "MicroBlaze",
+        "compilers": [
+            {
+                "name": "gcc",
+                "cxx": f"{_MB}-g++",
+                "nm": f"{_MB}-nm",
+                "objdump": f"{_MB}-objdump",
+                "flags": "",
+            },
+        ],
+    },
+]
+
+# Schema inputs held fixed across revisions so only the generator varies.
+PULL_FORWARD = ["testdata/benchmark.emb", "testdata/many_conditionals.emb"]
+
+# Driver over benchmark.emb: one fn per top-level view, each exercising Ok()
+# (validate/read) and CopyFrom() (read source + write dest). Compiled, never
+# run, so field values / conditional activeness don't matter -- only emitted
+# code. Keep the view list in sync with testdata/benchmark.emb.
+BENCHMARK_VIEWS = ["Scalars", "Bitfields", "Repeated", "Conditional", "Nested"]
+BENCHMARK_DRIVER = (
+    "#include <cstddef>\n#include <cstdint>\n\n"
+    '#include "testdata/benchmark.emb.h"\n\n'
+    "namespace { volatile ::std::uint64_t g_sink; }\n\n"
+    "#define BENCH(NAME, MAKER) \\\n"
+    '  extern "C" void bench_##NAME(char *a, char *b, ::std::size_t n) { \\\n'
+    "    auto va = ::emboss::benchmark::MAKER(a, n); \\\n"
+    "    auto vb = ::emboss::benchmark::MAKER(b, n); \\\n"
+    "    g_sink ^= static_cast<::std::uint64_t>(va.Ok()); \\\n"
+    "    va.CopyFrom(vb); \\\n"
+    "  }\n\n" + "".join(f"BENCH({v}, Make{v}View)\n" for v in BENCHMARK_VIEWS)
+)
+
+# Highlight driver: forces the optimized many_conditionals Ok() to be emitted.
+MANYCOND_DRIVER = (
+    "#include <cstdint>\n\n"
+    '#include "testdata/many_conditionals.emb.h"\n\n'
+    "volatile bool emboss_result_sink;\n"
+    'extern "C" void large_ok(const char *buf) {\n'
+    "  auto v = ::emboss::test::MakeLargeConditionalsView(buf, 100);\n"
+    "  emboss_result_sink = v.Ok();\n"
+    "}\n"
+)
+MANYCOND_OK_SYMBOL = r"GenericLargeConditionalsView<.*>::Ok\(\) const$"
+
+
+def run(args, **kw):
+    """Runs a command, returning stripped stdout; raises on non-zero exit."""
+    return subprocess.run(
+        args, cwd=kw.pop("cwd", REPO), capture_output=True, text=True, check=True, **kw
+    ).stdout.strip()
+
+
+def have(compiler):
+    """True if the compiler's executable is available."""
+    cxx = compiler["cxx"]
+    return bool(shutil.which(cxx) or (os.path.isabs(cxx) and os.path.exists(cxx)))
+
+
+def embossc(emb, out_include_dir):
+    """Generates the C++ header for `emb` under out_include_dir/testdata/."""
+    out = os.path.join(out_include_dir, "testdata")
+    os.makedirs(out, exist_ok=True)
+    name = os.path.basename(emb) + ".h"
+    run(
+        [
+            os.path.join(REPO, "embossc"),
+            "--import-dir=.",
+            "--import-dir=testdata",
+            "--output-file=" + name,
+            "--output-path=" + out,
+            os.path.join("testdata", os.path.basename(emb)),
+        ]
+    )
+
+
+def text_bytes(obj):
+    """`.text` size of an object file, via host binutils `size`."""
+    out = run(["size", obj])
+    return int(out.splitlines()[1].split()[0])
+
+
+def insn_count(objdump, obj):
+    """Static instruction count: disassembly lines that begin with an address."""
+    out = subprocess.run(
+        [objdump, "-d", obj], cwd=REPO, capture_output=True, text=True
+    ).stdout
+    return sum(1 for ln in out.splitlines() if re.match(r"\s+[0-9a-f]+:", ln))
+
+
+def symbol_bytes(nm, obj, pattern):
+    """Size in bytes of the last symbol matching `pattern` (nm --size-sort -S)."""
+    out = subprocess.run(
+        [nm, "--size-sort", "-S", "--demangle", obj],
+        cwd=REPO,
+        capture_output=True,
+        text=True,
+    ).stdout
+    matches = [ln for ln in out.splitlines() if re.search(pattern, ln)]
+    if not matches:
+        return None
+    return int(matches[-1].split()[1], 16)
+
+
+def compile_obj(compiler, flags, driver_path, include_dir, obj):
+    """Compiles a driver TU; returns True on success."""
+    cmd = (
+        [compiler["cxx"]]
+        + COMMON_FLAGS.split()
+        + flags.split()
+        + compiler["flags"].split()
+        + ["-I" + REPO, "-I" + include_dir, "-c", driver_path, "-o", obj]
+    )
+    return subprocess.run(cmd, cwd=REPO, capture_output=True, text=True).returncode == 0
+
+
+def measure_current(work):
+    """Measures the currently checked-out tree across the whole matrix."""
+    include_dir = os.path.join(work, "include")
+    os.makedirs(include_dir, exist_ok=True)
+
+    # Generate headers with the revision's own embossc (the generator under
+    # test). A revision that can't generate the fixed schema is reported as
+    # missing rather than skewing a total.
+    have_bench = have_manycond = True
+    try:
+        embossc("benchmark.emb", include_dir)
+    except subprocess.CalledProcessError:
+        have_bench = False
+    try:
+        embossc("many_conditionals.emb", include_dir)
+    except subprocess.CalledProcessError:
+        have_manycond = False
+
+    bench_drv = os.path.join(work, "benchmark_driver.cc")
+    manycond_drv = os.path.join(work, "manycond_driver.cc")
+    with open(bench_drv, "w") as f:
+        f.write(BENCHMARK_DRIVER)
+    with open(manycond_drv, "w") as f:
+        f.write(MANYCOND_DRIVER)
+
+    versions = {}
+    results = {}
+    for entry in TARGETS:
+        target = entry["target"]
+        results[target] = {}
+        for compiler in entry["compilers"]:
+            if not have(compiler):
+                continue
+            cname = compiler["name"]
+            try:
+                versions[f"{target}/{cname}"] = run(
+                    [compiler["cxx"], "--version"]
+                ).splitlines()[0]
+            except Exception:  # noqa: BLE001
+                versions[f"{target}/{cname}"] = "?"
+            results[target][cname] = {}
+            for cfg, cfg_flag in CONFIGS.items():
+                cell = {}
+                if have_bench:
+                    obj = os.path.join(work, f"bench_{target}_{cname}_{cfg}.o")
+                    if compile_obj(compiler, cfg_flag, bench_drv, include_dir, obj):
+                        cell["benchmark"] = {
+                            "text": text_bytes(obj),
+                            "insns": insn_count(compiler["objdump"], obj),
+                        }
+                if have_manycond:
+                    obj = os.path.join(work, f"mc_{target}_{cname}_{cfg}.o")
+                    if compile_obj(compiler, cfg_flag, manycond_drv, include_dir, obj):
+                        cell["many_conditionals_ok"] = {
+                            "text": symbol_bytes(
+                                compiler["nm"], obj, MANYCOND_OK_SYMBOL
+                            ),
+                            "insns": insn_count(compiler["objdump"], obj),
+                        }
+                results[target][cname][cfg] = cell
+    return {"compiler_versions": versions, "results": results}
+
+
+# ---- git revision driving (hold the schema fixed; vary only the generator) ----
+
+
+def current_ref():
+    r = subprocess.run(
+        ["git", "symbolic-ref", "--short", "HEAD"],
+        cwd=REPO,
+        capture_output=True,
+        text=True,
+    )
+    return r.stdout.strip() if r.returncode == 0 else run(["git", "rev-parse", "HEAD"])
+
+
+def measure_revision(rev, original_ref, work_root):
+    sha = run(["git", "rev-parse", rev])
+    run(["git", "checkout", sha])
+    # Pull the fixed schema forward from the starting ref so the only thing that
+    # differs between revisions is the generator (and runtime) under test.
+    for path in PULL_FORWARD:
+        subprocess.run(
+            ["git", "checkout", original_ref, "--", path], cwd=REPO, capture_output=True
+        )
+    try:
+        work = tempfile.mkdtemp(dir=work_root)
+        return {"sha": sha, **measure_current(work)}
+    finally:
+        run(["git", "reset", "--hard"])
+
+
+def main():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--revisions",
+        nargs="+",
+        default=["HEAD"],
+        help="Revisions to measure; with two, the first is the base for deltas.",
+    )
+    parser.add_argument("--out-dir", default="size_results")
+    args = parser.parse_args()
+
+    os.makedirs(args.out_dir, exist_ok=True)
+    work_root = tempfile.mkdtemp(dir=args.out_dir)
+
+    out = {"revisions": []}
+    if args.revisions == ["HEAD"]:
+        # Single-shot: measure the working tree as-is (no checkout dance).
+        out["revisions"].append(
+            {
+                "sha": run(["git", "rev-parse", "HEAD"]),
+                **measure_current(tempfile.mkdtemp(dir=work_root)),
+            }
+        )
+    else:
+        if subprocess.run(
+            ["git", "status", "--porcelain"], cwd=REPO, capture_output=True, text=True
+        ).stdout.strip():
+            print(
+                "error: working tree is dirty; commit or stash first.", file=sys.stderr
+            )
+            return 1
+        original = current_ref()
+        try:
+            for rev in args.revisions:
+                print(f"Measuring {rev}...", file=sys.stderr)
+                out["revisions"].append(measure_revision(rev, original, work_root))
+        finally:
+            run(["git", "checkout", original])
+            run(["git", "reset", "--hard"])
+
+    if len(out["revisions"]) >= 2:
+        out["base"] = out["revisions"][0]["sha"]
+        out["head"] = out["revisions"][-1]["sha"]
+
+    path = os.path.join(args.out_dir, "size_bench.json")
+    with open(path, "w") as f:
+        json.dump(out, f, indent=2)
+    print(f"Wrote {path}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/size_comment.py b/scripts/size_comment.py
new file mode 100644
index 00000000..bc3ea60c
--- /dev/null
+++ b/scripts/size_comment.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Renders a Markdown PR comment from size_bench.py JSON (base vs head)."""
+
+import json
+import re
+import sys
+
+MARKER = "<!-- emboss-size-bench -->"
+
+
+def short_ver(v):
+    """Pull an X.Y[.Z] version number out of a compiler --version line."""
+    # Compilers print the real version last (after any distro/buildroot tag).
+    matches = re.findall(r"\d+\.\d+(?:\.\d+)?", v or "")
+    return matches[-1] if matches else "?"
+
+
+CONFIGS = ["Os", "O2", "O0"]
+
+
+def cell(rev, target, compiler, config):
+    return (rev["results"].get(target, {}).get(compiler, {}) or {}).get(
+        config, {}
+    ) or {}
+
+
+def delta(base_v, head_v, unit=""):
+    """Render 'HEAD[unit]  🟢-5 (-0.4%)': head value plus colored delta vs base (smaller is better)."""
+    if head_v is None:
+        return "—"
+    s = f"{head_v}{unit}"
+    if base_v is None or head_v == base_v:
+        return s  # No baseline, or unchanged: show the bare head value.
+    d = head_v - base_v
+    # A zero baseline has no meaningful percentage; drop the "(...)" part
+    # entirely instead of rendering empty parentheses.
+    pct = f" ({d / base_v * 100:+.1f}%)" if base_v else ""
+    icon = "🟢" if d < 0 else "🔴"
+    return f"{s}  {icon}{d:+d}{pct}"
+
+
+def rows(base, head):
+    """Ordered (target, compiler) pairs present in head."""
+    out = []
+    for target in head["results"]:
+        for compiler in head["results"][target]:
+            out.append((target, compiler))
+    return out
+
+
+def table(base, head, config):
+    lines = [
+        "| Target · Compiler | Code size | Instructions | `many_conditionals Ok()` |",
+        "|---|--:|--:|--:|",
+    ]
+    for target, compiler in rows(base, head):
+        b = cell(base, target, compiler, config)
+        h = cell(head, target, compiler, config)
+        bb, hb = b.get("benchmark", {}), h.get("benchmark", {})
+        bm, hm = b.get("many_conditionals_ok", {}), h.get("many_conditionals_ok", {})
+        lines.append(
+            f"| {target} · {compiler} "
+            f"| {delta(bb.get('text'), hb.get('text'), ' B')} "
+            f"| {delta(bb.get('insns'), hb.get('insns'))} "
+            f"| {delta(bm.get('text'), hm.get('text'), ' B')} |"
+        )
+    return "\n".join(lines)
+
+
+def verdict(base, head):
+    """Headline focused on `.text` (size) regressions, with a worst-case call-out."""
+    worst = None  # (delta_bytes, pct, label)
+    improved = 0
+    for target, compiler in rows(base, head):
+        for config in CONFIGS:
+            b = cell(base, target, compiler, config)
+            h = cell(head, target, compiler, config)
+            for key, label in (
+                ("benchmark", "bench .text"),
+                ("many_conditionals_ok", "Ok()"),
+            ):
+                bv = (b.get(key) or {}).get("text")
+                hv = (h.get(key) or {}).get("text")
+                if bv is None or hv is None:
+                    continue
+                if hv > bv:
+                    pct = (hv - bv) / bv * 100 if bv else 0.0
+                    if worst is None or (hv - bv) > worst[0]:
+                        worst = (hv - bv, pct, f"{target} {compiler} {label} -{config}")
+                elif hv < bv:
+                    improved += 1
+    if worst:
+        d, pct, lbl = worst
+        return f"⚠️ largest size regression +{d} B ({pct:+.1f}%) on {lbl}"
+    if improved:
+        return f"🟢 smaller in {improved} place(s), none larger"
+    return "✅ no change"
+
+
+def render(data):
+    revs = data.get("revisions", [])
+    if len(revs) < 2:
+        return MARKER + "\n_Need a base and head revision to compare._"
+    base, head = revs[0], revs[-1]
+    versions = " · ".join(
+        f"{k.replace('/', ' ')} {short_ver(v)}"
+        for k, v in head.get("compiler_versions", {}).items()
+    )
+
+    out = [
+        MARKER,
+        "### 📐 Generated code size & instructions",
+        f"`{base['sha'][:9]} → {head['sha'][:9]}` · smaller is better · **{verdict(base, head)}**",
+        "",
+        "<sub>**Code size** (compiled `.text` bytes) and **Instructions** (objdump count) are totals "
+        "for the generated code of the all-features `benchmark.emb` fixture. **`many_conditionals Ok()`** "
+        "is the size (bytes) of the optimized conditional-validation method, a highlight. "
+        "Δ vs the merge-base · 🟢 smaller / 🔴 larger.</sub>",
+        "",
+        "#### `-Os` (embedded)",
+        table(base, head, "Os"),
+        "",
+        "<details><summary><code>-O2</code> / <code>-O0</code></summary>",
+        "",
+        "**`-O2`**",
+        table(base, head, "O2"),
+        "",
+        "**`-O0`**",
+        table(base, head, "O0"),
+        "",
+        "</details>",
+        "",
+        f"<sub>Compilers: {versions}. `benchmark.emb` is a fixed fixture (Ok()+CopyFrom over every "
+        "top-level view); it is pulled forward from head, so only the code generator under test "
+        "varies between base and head.</sub>",
+    ]
+    return "\n".join(out)
+
+
+def main():
+    path = sys.argv[1] if len(sys.argv) > 1 else "size_results/size_bench.json"
+    with open(path) as f:
+        print(render(json.load(f)))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/testdata/benchmark.emb b/testdata/benchmark.emb
new file mode 100644
index 00000000..a5caa0a3
--- /dev/null
+++ b/testdata/benchmark.emb
@@ -0,0 +1,132 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+-- FIXED BENCHMARK FIXTURE -- avoid editing.
+--
+-- This schema exists solely to measure the *size* and *instruction count* of
+-- the code Emboss generates, across a deliberately broad slice of language
+-- features. It is NOT a correctness test and is held fixed so the numbers stay
+-- comparable over time: a change here moves the measured baseline. Only edit it
+-- to cover a genuinely new language feature, accepting that the numbers shift.
+--
+-- The benchmark driver instantiates each parameter-free top-level struct
+-- (Scalars, Bitfields, Repeated, Conditional, Nested) and exercises Ok()
+-- (validate/read) and CopyFrom() (read source + write dest).
+
+[$default byte_order: "LittleEndian"]
+[(cpp) namespace: "emboss::benchmark"]
+
+
+enum Shape:
+  CIRCLE   = 0
+  SQUARE   = 1
+  TRIANGLE = 2
+  HEXAGON  = 7
+
+
+enum WideId:
+  [maximum_bits: 64]
+  LOW  = 0
+  HIGH = 0x7fff_ffff_ffff_ffff
+
+
+bits PackedFlags:
+  7 [+1]  Flag  alpha
+  6 [+1]  Flag  beta
+  2 [+4]  UInt  nibble
+  0 [+2]  UInt  pair
+
+
+struct Point:
+  0 [+2]  UInt  x
+  2 [+2]  UInt  y
+
+
+struct Scaled(scale: UInt:8):
+  0 [+4]  UInt  raw
+    [requires: this < 1000000]
+  let scaled = raw * scale
+
+
+# Scalar coverage: integer widths, signedness, an arbitrary byte width, an
+# opposite-endianness field, an enum, two floats, $next, and virtual fields.
+struct Scalars:
+  0     [+1]  UInt    u8
+  1     [+2]  UInt    u16
+  3     [+4]  UInt    u32
+  7     [+8]  UInt    u64
+  15    [+4]  Int     s32
+  19    [+3]  UInt    u24
+  22    [+4]  UInt    be32
+    [byte_order: "BigEndian"]
+  26    [+1]  Shape   shape
+  27    [+8]  WideId  wide
+  35    [+4]  Float   f32
+  39    [+8]  Float   f64
+  $next [+1]  UInt    trailer
+  let u32_doubled = u32 * 2
+  let u16_plus_u8 = u16 + u8
+
+
+# Bitfields: a named bits type, an inline anonymous bits block, and an inline
+# anonymous enum inside it.
+struct Bitfields:
+  0 [+1]  PackedFlags  flags
+  1 [+4]  bits:
+    31 [+1]  Flag  top
+    14 [+4]  enum  mode:
+      IDLE = 0
+      RUN  = 1
+    0  [+9]  UInt  low9
+  5 [+1]  UInt  tail
+
+
+# Arrays: fixed byte array, sub-byte-element array, array of structs, and a
+# dynamically-sized trailing array.
+struct Repeated:
+  0  [+1]      UInt:8      count
+  1  [+8]      UInt:8[8]   fixed_bytes
+  9  [+4]      UInt:16[2]  words
+  13 [+8]      Point[2]    points
+  21 [+count]  UInt:8[]    variable
+  let total = count + 8
+
+
+# Conditionals: a run of single-tag `if`s (switch-coalesced by the optimizer),
+# a disjunction, and a range conjunction.
+struct Conditional:
+  0 [+4]  UInt  tag
+  if tag == 0:
+    4 [+4]  UInt  a0
+  if tag == 1:
+    4 [+4]  UInt  a1
+  if tag == 2:
+    4 [+4]  UInt  a2
+  if tag == 3:
+    4 [+4]  UInt  a3
+  if tag == 4:
+    4 [+4]  UInt  a4
+  if tag == 10 || tag == 11 || tag == 12 || tag == 13:
+    4 [+4]  UInt  grouped
+  if tag >= 100 && tag <= 200:
+    8 [+4]  UInt  ranged
+
+
+# Nesting + parameter passing + a [requires] constraint.
+struct Nested:
+  0  [+1]  UInt       factor
+  1  [+4]  Scaled(2)  doubled
+  5  [+4]  Scaled(4)  quadrupled
+  9  [+4]  Point      origin
+  13 [+8]  Point[2]   corners