Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ansible/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@
docker rm -f probatorium-pg probatorium-redis probatorium-mc 2>/dev/null
docker run -d --rm --name probatorium-pg -p 127.0.0.1:54321:5432 \
-e POSTGRES_USER=bench -e POSTGRES_PASSWORD=bench -e POSTGRES_DB=bench \
postgres:17-alpine
postgres:17-alpine -c synchronous_commit=off
docker run -d --rm --name probatorium-redis -p 127.0.0.1:63791:6379 redis:8.2-alpine
docker run -d --rm --name probatorium-mc -p 127.0.0.1:21211:11211 memcached:1.6.41-alpine
args:
Expand Down
8 changes: 4 additions & 4 deletions ansible/tasks/run_bench_cell.yml
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,10 @@
# dbservices role only installs docker + pulls images; it never runs a
# container). Every driver-capable competitor reads its backend address
# from these PROBATORIUM_* vars; an unset var leaves that client nil and
# the driver cell 404s (→ capability-lie hard error). gin/celeris read
# PROBATORIUM_MEMCACHED_ADDR, echo reads PROBATORIUM_MC_ADDR, so we set
# both spellings. Harmless for static-only competitors (they ignore them).
# the driver cell 404s (→ capability-lie hard error). Every adapter reads
# PROBATORIUM_MEMCACHED_ADDR (standardized in v1.5.4 — fasthttp/echo/iris
# used to read PROBATORIUM_MC_ADDR). Harmless for static-only competitors
# (they ignore them).
# Ports MUST match bench.yml's container -p mappings AND validate.yml:
# postgres 54321 redis 63791 memcached 21211 (remapped off the
# defaults so a stray system postgres/redis on the bench host can't shadow
Expand All @@ -179,7 +180,6 @@
PROBATORIUM_PG_DSN: "postgres://bench:bench@127.0.0.1:54321/bench?sslmode=disable"
PROBATORIUM_REDIS_ADDR: "127.0.0.1:63791"
PROBATORIUM_MEMCACHED_ADDR: "127.0.0.1:21211"
PROBATORIUM_MC_ADDR: "127.0.0.1:21211"
# aspnet (.NET) is a FRAMEWORK-DEPENDENT publish: the native apphost
# (competitors/aspnet) must locate the shared .NET runtime at launch. We
# installed the SDK (which carries the runtime) pristinely under
Expand Down
4 changes: 2 additions & 2 deletions ansible/validate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@
docker run -d --rm --name probatorium-pg \
-p 127.0.0.1:54321:5432 \
-e POSTGRES_USER=bench -e POSTGRES_PASSWORD=bench -e POSTGRES_DB=bench \
postgres:17-alpine
postgres:17-alpine -c synchronous_commit=off
register: dbsvc_pg
changed_when: dbsvc_pg.rc == 0
failed_when: false
Expand Down Expand Up @@ -158,7 +158,7 @@
# run is a no-op.
PROBATORIUM_PG_DSN: "postgres://bench:bench@127.0.0.1:54321/bench?sslmode=disable"
PROBATORIUM_REDIS_ADDR: "127.0.0.1:63791"
PROBATORIUM_MC_ADDR: "127.0.0.1:21211"
PROBATORIUM_MEMCACHED_ADDR: "127.0.0.1:21211"
ansible.builtin.shell: |
cd {{ bench_root }}
ulimit -l unlimited
Expand Down
2 changes: 1 addition & 1 deletion budget/budget.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ func plural(n int) string {
// Profile. Both "headline" and "full" now cover the SAME grid — every
// registered server × every scenario, capability-gated (Globs "*/*") —
// so NEITHER silently drops servers or scenarios. They differ only by the
// per-cell window: "headline" (the weekly cadence) uses a shorter 60s/15s
// per-cell window: "headline" (the weekly cadence) uses a shorter 40s/12s
// window so the whole grid fits the 24h budget single-arch; "full" uses a
// longer 90s/20s window for the occasional exhaustive sweep (over 24h,
// run as a manual dispatch with a raised BENCH_BUDGET). The default (empty
Expand Down
27 changes: 26 additions & 1 deletion budget/budget_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ func TestForProfileDefaultHasFullCoverage(t *testing.T) {
t.Fatalf("default profile must cover the full grid (Globs '*/*'), got %v", def.Globs)
}
if def.Cells < 400 {
t.Errorf("default profile Cells: want >= 400 (the full matrix is ~800 capability-gated), got %d. "+
t.Errorf("default profile Cells: want >= 400 (the full matrix is ~1111 capability-gated), got %d. "+
"A value this low means the default was silently scoped down to a curated subset.",
def.Cells)
}
Expand All @@ -163,6 +163,31 @@ func TestGlobsAreNonEmpty(t *testing.T) {
}
}

// TestRatedRealizedCellsMatchSubset keeps the rated realized-cell pins in
// lockstep with the curated subset they describe (the auto-mix-111 class of
// drift). The rated sweep runs RatedScenarios x RatedServers, and every
// curated rated scenario is a plain-H1 static row that applies to every
// curated rated server, so capability gating removes nothing and the
// realized count is the full cross product. A stale entry in RatedScenarios
// (an unregistered scenario whose glob matches nothing) makes the pin and
// this cross product diverge from reality; failing here is preferable to
// silently shrinking the published rated grid while the budget
// over-projects.
func TestRatedRealizedCellsMatchSubset(t *testing.T) {
	scenarioCount, serverCount := len(RatedScenarios), len(RatedServers)
	want := scenarioCount * serverCount

	// Headline pin: must equal the full cross product.
	if got := HeadlineRatedRealizedCells; got != want {
		t.Errorf("HeadlineRatedRealizedCells = %d, want %d (len(RatedScenarios)=%d * len(RatedServers)=%d); "+
			"a mismatch means a rated scenario is unregistered or the pin is stale",
			got, want, scenarioCount, serverCount)
	}

	// Full pin: the rated sweep is the same curated subset.
	if got := FullRatedRealizedCells; got != want {
		t.Errorf("FullRatedRealizedCells = %d, want %d", got, want)
	}

	// Expanded glob set: one glob per (scenario, server) pair.
	if got := len(ratedGlobs()); got != want {
		t.Errorf("len(ratedGlobs()) = %d, want %d (the expanded glob set must match the pin)",
			got, want)
	}
}

// TestColumnWallClock pins the per-column projection the ansible hang
// guard is sized from. The "v3.8 rated column" case uses the REAL run
// config (33 capability-gated scenarios on celeris-epoll-h1-sync,
Expand Down
57 changes: 33 additions & 24 deletions budget/profiles.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,15 @@ import "time"
// because it is the expensive additive dimension — see RatedServers below.

// RatedScenarios is the curated rated/SLO subset (#156): the SLO-knee
// scenarios where throughput-at-SLO carries the most signal.
// scenarios where throughput-at-SLO carries the most signal. These are the
// two registered static rows that every rated server runs: a GET read
// ("get-json") and a 4 KiB POST ("post-4k"). A third entry, "auto-mix-111",
// used to be listed here, but the scenario was deleted and never registered,
// so the -cells filter silently matched nothing and the rated grid was 16
// cells while the pin claimed 24; it was removed from the rated pass in the
// v1.5.4 pre-run audit.
var RatedScenarios = []string{
"get-json",
"post-4k",
"auto-mix-111",
}

// RatedServers is the curated rated column subset: the four celeris modes
Expand Down Expand Up @@ -54,20 +58,20 @@ var RatedServers = []string{
// rated sweep stays curated, so HeadlineRatedRealizedCells is unchanged.
const (
HeadlineRealizedCells = FullRealizedCells
HeadlineRatedRealizedCells = 24 // 8 rated servers x 3 rated scenarios, capability-gated
HeadlineRatedRealizedCells = 16 // 8 rated servers x 2 rated scenarios, capability-gated

// Full profile: every server x every scenario, capability-gated. After
// the mid-size payload rows (get/post-json-8k/16k) and the native h2c
// columns (axum/ntex/hyper/aspnet/fastapi/hono/elysia -h2) landed, the
// nominal grid is ~36 columns x 45 rows ~ 1620; capability gating (the
// streaming / driver / chain / TLS cells, plus the h2c-noupg columns
// skipping every H1 row) lands the realized count near ~800. Pinned
// conservatively high so FitWithin over-projects slightly and a registry
// change that blows the budget fails loudly rather than overflowing the
// run. Recompute with the scheduler's Applicable gate when the registry
// grows again.
FullRealizedCells = 820
FullRatedRealizedCells = 24
// Full profile: every server x every scenario, capability-gated. This is
// the SAME realized "*/*" grid Fast runs (FullRealizedCells ==
// FastRealizedCells); the profiles differ only by per-cell window. The
// v1.5.4 redesign reshaped the grid — saturated static rows pruned (W1),
// the driver set deepened 4->10 (W3), WS/SSE coverage added to three more
// columns (W4), and the 12 middleware/chain scenarios REMOVED (pre-run
// audit: they compared unequal work across adapters) — so the realized
// count moved off the older ~800/1257/1111 pins to 835. Recompute with
// `cmd/runner -dry-run -cells '*/*' | grep -c '^run0'` when the registry
// changes; the grid is now 52 columns x 29 rows, capability-gated.
FullRealizedCells = 835
FullRatedRealizedCells = 16
)

// HeadlineWeekly is the config the benchmark-tier workflow runs on the
Expand All @@ -88,16 +92,18 @@ const (
// the correct loud failure, since the full grid x 2 serial arches cannot fit
// 24h until ArchParallel (#168, blocked on loadgen linux/arm64) lands.
//
// Budget: ~820 cells x (15+60+5+12)s x 1 arch = ~20.9h saturation + ~0.7h
// curated rated = ~21.6h < 24h. The rated sweep stays curated (RatedGlobs)
// because it is the expensive additive dimension; expanding it to the full
// grid would blow the budget many times over.
// Budget: ~835 cells x (12+40+5+12)s x 1 arch = ~16.0h saturation + ~0.7h
// curated rated = ~16.7h < 24h. The per-cell window stays at the v1.5.4
// 40s/12s (the chain-scenario removal dropped the grid 1111->835, so there
// is now ample headroom). The rated sweep stays curated (RatedGlobs) because
// it is the expensive additive dimension; expanding it to the full grid
// would blow the budget many times over.
func HeadlineWeekly() Profile {
return Profile{
Name: "headline",
Cells: HeadlineRealizedCells,
Duration: 60 * time.Second,
Warmup: 15 * time.Second,
Duration: 40 * time.Second,
Warmup: 12 * time.Second,
Cooldown: defaultCooldown,
Runs: 1,
Arches: 1,
Expand All @@ -116,7 +122,10 @@ func HeadlineWeekly() Profile {
// Recompute with `cmd/runner -dry-run -cells '*/*' | grep -c '^run0'` when
// the registry grows; FitWithin uses it to assert the fast profile still
// fits 24h, so an over-large grid fails loudly instead of overrunning.
const FastRealizedCells = 1257
// v1.5.4 redesign: 1257 -> 1111 -> 835 (W1 pruned saturated static rows; W3
// deepened drivers 4->10; W4 added WS/SSE to three columns; pre-run audit
// REMOVED the 12 middleware/chain scenarios as unfair).
const FastRealizedCells = 835

// Fast is the DEFAULT routine + weekly profile: the FULL grid (every server
// × every scenario, capability-gated, "*/*") in SATURATION ONLY — no rated
Expand All @@ -126,8 +135,8 @@ const FastRealizedCells = 1257
// per cell, the dominant cost) is intentionally OFF here and belongs in a
// separate, scoped dispatch when latency-under-controlled-load is the story.
//
// Budget: 1257 cells × (10+35+5+12)s × 1 arch = ~21.6h saturation, rated=0
// → ~21.6h < 24h. RatedPasses=0 makes BenchTier skip the rated flag entirely
// Budget: 835 cells × (10+35+5+12)s × 1 arch = ~14.4h saturation, rated=0
// → ~14.4h < 24h. RatedPasses=0 makes BenchTier skip the rated flag entirely
// (rated OFF for every cell), so this is the cheap, full-breadth mode.
func Fast() Profile {
return Profile{
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ go 1.26.4
require (
github.com/HdrHistogram/hdrhistogram-go v1.2.0
github.com/bradfitz/gomemcache v0.0.0-20260422231931-4d751bb6e37c
github.com/goceleris/loadgen v1.4.9
github.com/goceleris/loadgen v1.4.10
github.com/google/gofuzz v1.2.0
github.com/jackc/pgx/v5 v5.10.0
github.com/pierrec/lz4/v4 v4.1.27
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ github.com/dlclark/regexp2 v1.11.0 h1:G/nrcoOa7ZXlpoa/91N3X7mM3r8eIlMBBJZvsz/mxK
github.com/dlclark/regexp2 v1.11.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/goceleris/loadgen v1.4.9 h1:Kd/AmLHP520Su3azQ9tCNoc6tsaeEf7Nx8ECr4AdYfg=
github.com/goceleris/loadgen v1.4.9/go.mod h1:Olg2awQufUnRemRlCvFPFL6Ww3byUd+UvZYQAMJm6Co=
github.com/goceleris/loadgen v1.4.10 h1:j8qi6xQK4Bk1AqObhHBTbE1ZsG+s3vbpinLE4uPSfCI=
github.com/goceleris/loadgen v1.4.10/go.mod h1:9LvtFtzoZj8z3MkE4lFvOQ3VZt4jBBB0b/TXxfieIGA=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
Expand Down
18 changes: 15 additions & 3 deletions mage_bench_cellsglob_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,23 +83,30 @@ func TestCellsGlobServersWildcardUsesFullRegistry(t *testing.T) {
// the exclude).
// - the full negative "get-*/celeris-std-h1" then
// "!get-*/celeris-std-h1" leaves an empty include glob → fallback
// to "*", which keeps the 4 celeris servers via the implicit
// to "*", which keeps the celeris servers via the implicit
// include from the empty include. Documented behaviour: empty
// include with all excludes = "use the registry, but respect the
// excludes against it" — same as the runner.
func TestCellsGlobServersRespectsExcludes(t *testing.T) {
// Case 1: include "get-*/celeris-*" + exclude "get-simple/celeris-std-h1".
// The 4 celeris servers all match at least one (get-json etc.), so
// Every celeris engine column matches at least one (get-json etc.), so
// celeris-std-h1 stays (its get-json/celeris-std-h1 pair still
// matches the include, and no exclude covers the whole server).
// matches the include, and no exclude covers the whole server). The
// want set is the FULL celeris-* column family (the v1.5.4 redesign
// expanded it from 4 to the 9 engine modes below).
got, err := cellsGlobServers("get-*/celeris-*,!get-simple/celeris-std-h1")
if err != nil {
t.Fatalf("cellsGlobServers: %v", err)
}
want := []string{
"celeris-adaptive-auto+upg-async",
"celeris-adaptive-h1-async",
"celeris-epoll-auto+upg-async",
"celeris-epoll-h1-async",
"celeris-epoll-h1-sync",
"celeris-iouring-auto+upg-async",
"celeris-iouring-h1-async",
"celeris-iouring-h1-sync",
"celeris-std-h1",
}
if !reflect.DeepEqual(got, want) {
Expand All @@ -114,9 +121,14 @@ func TestCellsGlobServersRespectsExcludes(t *testing.T) {
t.Fatalf("cellsGlobServers: %v", err)
}
want = []string{
"celeris-adaptive-auto+upg-async",
"celeris-adaptive-h1-async",
"celeris-epoll-auto+upg-async",
"celeris-epoll-h1-async",
"celeris-epoll-h1-sync",
"celeris-iouring-auto+upg-async",
"celeris-iouring-h1-async",
"celeris-iouring-h1-sync",
"celeris-std-h1",
}
if !reflect.DeepEqual(got, want) {
Expand Down
16 changes: 8 additions & 8 deletions mage_tier.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@ import (
//
// Per-cell execution: a cell visits ONE (server, scenario) pair and
// runs the saturation pass unconditionally. If the scenario is in
// the rated subset (currently get-json / post-4k / auto-mix-111) and
// the runner is launched with BENCH_RATED=1, the same cell ALSO runs
// the rated sweep after the saturation pass. The cell's JSON carries
// both maps on the same row; the bench's published Document has a
// per-scenario SaturationModeRPS (every scenario) + a per-scenario
// LatencyAtSLO (only the rated 3).
// the rated subset (currently get-json / post-4k) and the runner is
// launched with BENCH_RATED=1, the same cell ALSO runs the rated
// sweep after the saturation pass. The cell's JSON carries both maps
// on the same row; the bench's published Document has a per-scenario
// SaturationModeRPS (every scenario) + a per-scenario LatencyAtSLO
// (only the rated 2).
//
// Flow:
//
Expand All @@ -57,8 +57,8 @@ import (
// grid (every server × every scenario,
// capability-gated); they differ ONLY by the
// per-cell window. headline (the weekly
// cadence) uses 60s/15s so the whole grid fits
// 24h single-arch (~21.6h); full uses 90s/20s
// cadence) uses 40s/12s so the whole grid fits
// 24h single-arch (~16.7h); full uses 90s/20s
// for the exhaustive sweep (~30h on one arch,
// needs a raised BENCH_BUDGET). Default: full.
// BENCH_TARGET=both msa2-server | msr1 | both (both = 2 arches)
Expand Down
49 changes: 46 additions & 3 deletions report/document.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package report

import (
"sort"
"strings"
"time"
)

Expand Down Expand Up @@ -192,9 +193,13 @@ func BuildDocument(in BuildInput) *Document {
// loadgen still had CPU headroom is NIC-limited, not server-limited.
// Its saturation RPS converges across every fast adapter and must not
// be read as a ranking — the CPU efficiency in Resources is the real
// signal. Only flagged when the fabric's line rate is known (the LAN;
// the Tailscale overlay reports 0 and flags nothing).
if isNetworkBound(c.BytesMedian, c.LoadgenCPUP95, in.Environment.FabricLineRateBitsPerSec) {
// signal. Runtime detection only fires when the fabric's line rate is
// known (the LAN; the Tailscale overlay reports 0 and flags nothing),
// so the wire-bound-by-design scenarios are OR'd in unconditionally:
// post-1m is a documented wire-bound datapoint, never a ranking row,
// regardless of whether the line rate was measurable.
if isWireBoundByDesign(c.ScenarioName) ||
isNetworkBound(c.BytesMedian, c.LoadgenCPUP95, in.Environment.FabricLineRateBitsPerSec) {
if sr.NetworkBound == nil {
sr.NetworkBound = map[string]bool{}
}
Expand Down Expand Up @@ -240,6 +245,44 @@ const (
networkBoundLoadgenCPUCeiling = 8.0
)

// isWireBoundByDesign reports whether scenarioName is classified as
// wire-bound by definition instead of by runtime measurement. Only post-1m
// qualifies: it is the documented 1 MiB-payload datapoint whose saturation
// RPS is set by the fabric rather than the server, so it always belongs in
// the wire-bound section and never at the head of a raw-RPS ranking — also
// on overlays where the line rate is unknown and isNetworkBound cannot fire.
func isWireBoundByDesign(scenarioName string) bool {
	switch scenarioName {
	case "post-1m":
		return true
	default:
		return false
	}
}

// isFanoutBound reports whether scenarioName is one of the fan-out cells
// whose throughput is paced by the server's fixed publish tick rather than
// by CPU. The hub-broadcast and SSE-fanout cells push to N subscribers on a
// 1 ms cadence, which caps their RPS near 1000*N no matter how much headroom
// the server has: that figure is a fan-out rate, not a throughput the field
// can be ranked by. Delivery latency (the tail-latency section) is their
// real signal, so they stay out of the headline ranking like the wire-bound
// cells do. The echo modes (ws-echo / ws-large-echo) are client-driven
// round-trips and remain ranked.
func isFanoutBound(scenarioName string) bool {
	tickPaced := [...]string{
		"ws-hub-broadcast-128",
		"ws-hub-broadcast-1024",
		"sse-fanout-128",
		"sse-fanout-1024",
	}
	for _, candidate := range tickPaced {
		if candidate == scenarioName {
			return true
		}
	}
	return false
}

// isLatencyProbeByDesign reports whether scenarioName denotes a
// single-connection latency probe. With one connection the requests
// serialize, so the saturation "RPS" is a latency reciprocal (1/RTT) that
// rewards low per-request latency rather than throughput; such a row must
// never head a raw-RPS ranking, and its real signal is the tail-latency
// section. The match is on the "-1c" suffix, the single-conn marker
// (scenarios.ProfileSingle); get-json-1c is the only such scenario today.
func isLatencyProbeByDesign(scenarioName string) bool {
	const singleConnMarker = "-1c"
	_, isProbe := strings.CutSuffix(scenarioName, singleConnMarker)
	return isProbe
}

// isNetworkBound reports whether a cell's achieved egress bandwidth sat at
// the fabric line rate (NIC-limited) rather than the server's CPU limit.
// bytesPerSec is the median across-runs throughput; loadgenCPUP95 is the
Expand Down
Loading