From 3373989420ee8b8cbc49c838c001a12ffb978126 Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Wed, 17 Jun 2026 10:12:19 +0200 Subject: [PATCH 01/27] fix(iouring): correct SEND_ZC opcode + CQE_F_NOTIF flag (zero-copy send was dead) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three wrong io_uring ABI constants silently disabled zero-copy send on every kernel: - opSENDZC was 53 (IORING_OP_FUTEX_WAITV); IORING_OP_SEND_ZC is 47. SEND_ZC SQEs carried the wrong opcode → kernel returned -EINVAL on the probe → SEND_ZC reported unsupported and was disabled everywhere. - cqeFNotif was 1<<2 (0x04 = IORING_CQE_F_SOCK_NONEMPTY); IORING_CQE_F_NOTIF is 1<<3 (0x08). Both the probe and the runtime notification handler (cqeIsNotif → handleSend) misread the zero-copy notification CQE. - probeSendZC hardcoded the 0x04 NOTIF check; now uses cqeFNotif. Also corrects the unused opSHUTDOWN constant (was 52=FUTEX_WAKE; IORING_OP_SHUTDOWN is 34) to prevent the same class of bug if it is ever used. Verified on the cluster (kernel 7.0.0-22): the SEND_ZC probe now reports "true zero-copy" and the engine selects send_zc=true (previously send_zc=false / 'kernel rejected SEND_ZC opcode'). Note: perf-neutral on the current HTTP benchmark suite (interleaved A/B: get-json +0.4%, get-json-64k +0.0% — small responses don't use ZC's benefit, large are bandwidth-bound), but it is a genuine correctness fix that enables io_uring's zero-copy path and may help CPU-copy-bound workloads. Refs #356 --- engine/iouring/consts.go | 6 +++--- engine/iouring/probe.go | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/engine/iouring/consts.go b/engine/iouring/consts.go index 3ff11afd..35010dc0 100644 --- a/engine/iouring/consts.go +++ b/engine/iouring/consts.go @@ -41,8 +41,8 @@ const ( // opPROVIDEBUFFERS = 31 removed in v1.5.0 (celeris#320) — the legacy // BufferGroup / PROVIDE_BUFFERS SQE was never wired into the engine. 
// The supported path is IORING_REGISTER_PBUF_RING (BufferRing above). - opSHUTDOWN = 52 // IORING_OP_SHUTDOWN (kernel 5.11+) - opSENDZC = 53 // IORING_OP_SEND_ZC (kernel 6.0+) + opSHUTDOWN = 34 // IORING_OP_SHUTDOWN (kernel 5.11+) + opSENDZC = 47 // IORING_OP_SEND_ZC (kernel 6.0+) ) // SQE flags. @@ -57,7 +57,7 @@ const ( const ( cqeFBuffer = 1 << 0 cqeFMore = 1 << 1 - cqeFNotif = 1 << 2 // IORING_CQE_F_NOTIF: zero-copy send notification + cqeFNotif = 1 << 3 // IORING_CQE_F_NOTIF: zero-copy send notification ) // Accept flags. diff --git a/engine/iouring/probe.go b/engine/iouring/probe.go index 19875881..3a9ce69e 100644 --- a/engine/iouring/probe.go +++ b/engine/iouring/probe.go @@ -201,7 +201,7 @@ func probeSendZC() (SendZCProbeResult, string) { entry = ring.cqeAt(cqHead) notifFlags := entry.Flags - isNotif := notifFlags&0x04 != 0 // CQE_F_NOTIF + isNotif := notifFlags&cqeFNotif != 0 // CQE_F_NOTIF (1<<3); 0x04 is SOCK_NONEMPTY notifRes := entry.Res ring.EndCQ(cqHead + 1) From 008b1321cc91c7ba7cee9524af886b0198496dac Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Wed, 17 Jun 2026 12:13:47 +0200 Subject: [PATCH 02/27] feat(core): adaptive inline-first dispatch under AsyncHandlers (#356) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Under Config.AsyncHandlers=true, a route that inherits the server/group default (not an explicit .Async()) is now ADAPTIVE: it runs INLINE on the event-loop worker (the ring-batched send path, which gets io_uring's ~33x syscall reduction) and is promoted to async dispatch only when its inline runs are observed to block (>50us) on adaptivePromoteStreak (8) consecutive requests. Trivial handlers thus keep the cheap inline path; genuinely-blocking handlers (DB/cache round-trips) still get goroutine isolation after a handful of inline runs. 
Why: the shipped iouring-h1-async config dispatched EVERY request to a goroutine that did a direct write(2) per response, bypassing the ring (~42% CPU in unix.Write) — making it slower than epoll. 
Measured: on CPU-bound chain cells the ring path beats epoll +2.4..6.7%; this lets the async config reach that path for non-blocking handlers. - router: adaptiveRoutes set (built at registration) + promoted sync.Map; routeAsync returns async || (adaptive && promoted). Explicit .Async()/.Sync() removes the route from the adaptive set (setAsync). - handler: HandleStream times an inline adaptive run and promotes on block. - Non-adaptive configs (no AsyncHandlers default) keep the empty-map fast path, zero added overhead. Contract change (reflected in router_async_test.go): AsyncHandlers=true now means inline-first-adaptive, not all-async. Explicit .Async() is unchanged. Refs #356 --- handler.go | 29 +++++++++++- router.go | 110 ++++++++++++++++++++++++++++++++++++++----- router_async_test.go | 65 ++++++++++++++++++++----- 3 files changed, 179 insertions(+), 25 deletions(-) diff --git a/handler.go b/handler.go index 7c0e618b..8f32ff51 100644 --- a/handler.go +++ b/handler.go @@ -131,7 +131,21 @@ func (a *routerAdapter) HandleStream(ctx context.Context, s *stream.Stream) erro c.handlers = handlers c.fullPath = fullPath - if err := c.Next(); err != nil { + // celeris#356: an adaptive route (inherited the AsyncHandlers=true default) + // runs INLINE here until observed to block. Time this inline run; if the + // handler chain exceeds adaptivePromoteThreshold it is genuinely blocking, + // so promote the route to async dispatch — future requests then run on a + // goroutine instead of stalling the event-loop worker. Non-adaptive configs + // (no AsyncHandlers default) hit the empty-map fast path and skip timing. 
+ rt := a.server.router + if rt.adaptiveRoutes[fullPath] && !rt.isPromoted(fullPath) { + start := time.Now() + err := c.Next() + rt.recordInlineRun(fullPath, time.Since(start) > adaptivePromoteThreshold) + if err != nil { + a.handleError(c, s, err) + } + } else if err := c.Next(); err != nil { a.handleError(c, s, err) } if c.buffered && !c.written { @@ -141,6 +155,19 @@ func (a *routerAdapter) HandleStream(ctx context.Context, s *stream.Stream) erro return nil } +// adaptivePromoteThreshold is the inline handler duration that counts as a +// "slow" run for adaptive promotion (celeris#356). A non-blocking handler +// (route + middleware, no I/O) returns in single-digit microseconds; a blocking +// one (DB/cache round-trip) takes 100µs+. 50µs separates them with margin for +// GC/scheduling jitter. +const adaptivePromoteThreshold = 50 * time.Microsecond + +// adaptivePromoteStreak is how many CONSECUTIVE slow inline runs promote an +// adaptive route to async. The consecutive requirement (a fast run resets the +// streak) makes a one-off cold start / GC pause harmless, while a handler that +// blocks on every request promotes within a handful of requests. +const adaptivePromoteStreak = 8 + // recoverAndRelease handles panic recovery and context release. Extracted to a // separate noinline function so that HandleStream's stack frame is not inflated // by the deferred closure and debug.Stack() call (P5). diff --git a/router.go b/router.go index a1f6b9e2..934c1dd4 100644 --- a/router.go +++ b/router.go @@ -5,6 +5,7 @@ import ( "log/slog" "slices" "sync" + "sync/atomic" ) // staticEntry holds the pre-composed handler chain and full path for a fully @@ -95,6 +96,25 @@ type router struct { // decide whether the async dispatch infrastructure is needed at // all (a server with zero async routes keeps the inline fast path). 
asyncRouteCount int + + // adaptiveRoutes (celeris#356) holds the fullPaths of routes that + // inherited the server-level AsyncHandlers=true default (rather than an + // explicit .Async()). They start INLINE (ring-batched send, the cheap + // path) and are promoted to async dispatch only when an inline run is + // observed to block — so trivial handlers keep io_uring's syscall + // batching while genuinely-blocking handlers still get goroutine + // isolation. Built at registration; read-only while serving. + adaptiveRoutes map[string]bool + // promoted records adaptive fullPaths that have been promoted to async + // after sustained blocking inline runs. sync.Map: concurrent reads on the + // hot path, rare writes at promotion time. + promoted sync.Map + // slowStreak tracks consecutive slow inline runs per adaptive fullPath + // (fullPath -> *atomic.Int32). Hysteresis: a one-off cold/GC outlier must + // not poison a fast route; only a handler that blocks on EVERY run (DB/ + // cache) accumulates adaptivePromoteStreak slow runs and gets promoted. A + // fast run resets the streak. + slowStreak sync.Map } // Route is an opaque handle to a registered route. Use the Name method to @@ -212,11 +232,25 @@ func (r *Route) setAsync(want bool) *Route { if r.node == nil { return r } - if r.node.async != want && r.router != nil { - if want { - r.router.asyncRouteCount++ - } else if r.router.asyncRouteCount > 0 { - r.router.asyncRouteCount-- + if r.router != nil { + // celeris#356: an explicit .Async()/.Sync() overrides the adaptive + // (server-default) classification — the route is no longer + // inline-first-with-promotion, so drop it from the adaptive set and + // clear any prior promotion. + if r.router.adaptiveRoutes[r.path] { + delete(r.router.adaptiveRoutes, r.path) + r.router.promoted.Delete(r.path) + // Adaptive routes were already counted (they may promote): keep the + // count when the explicit choice is async, drop it when sync. 
+ if !want && r.router.asyncRouteCount > 0 { + r.router.asyncRouteCount-- + } + } else if r.node.async != want { + if want { + r.router.asyncRouteCount++ + } else if r.router.asyncRouteCount > 0 { + r.router.asyncRouteCount-- + } } } r.node.async = want @@ -234,7 +268,41 @@ func (r *Route) setAsync(want bool) *Route { func newRouter() *router { return &router{ - namedRoutes: make(map[string]*Route), + namedRoutes: make(map[string]*Route), + adaptiveRoutes: make(map[string]bool), + } +} + +// isPromoted reports whether an adaptive route (celeris#356) has been promoted +// to async dispatch after a blocking inline run. +func (r *router) isPromoted(fullPath string) bool { + _, ok := r.promoted.Load(fullPath) + return ok +} + +// promoteRoute marks an adaptive route as async after sustained blocking inline +// runs. Idempotent; subsequent routeAsync lookups return true so the engine +// dispatches the route to a goroutine. +func (r *router) promoteRoute(fullPath string) { + r.promoted.Store(fullPath, struct{}{}) +} + +// recordInlineRun feeds one inline-run observation into the adaptive +// classifier (celeris#356). A fast run resets the slow streak; a slow run +// increments it, and once a route is slow on adaptivePromoteStreak CONSECUTIVE +// runs it is promoted to async. The consecutive requirement makes a single cold +// start / GC pause harmless while a genuinely-blocking handler (slow on every +// run) promotes within a handful of requests. 
+func (r *router) recordInlineRun(fullPath string, slow bool) { + if !slow { + if v, ok := r.slowStreak.Load(fullPath); ok { + v.(*atomic.Int32).Store(0) + } + return + } + v, _ := r.slowStreak.LoadOrStore(fullPath, new(atomic.Int32)) + if v.(*atomic.Int32).Add(1) >= adaptivePromoteStreak { + r.promoteRoute(fullPath) } } @@ -253,6 +321,15 @@ func (r *router) addRouteWithAsync(method, path string, handlers []HandlerFunc, validatePath(path) async := r.resolveAsync(as) + // celeris#356: a route that inherits the server-level AsyncHandlers=true + // default (rather than an explicit .Async()) starts INLINE and is promoted + // to async only when an inline run is observed to block. Explicit + // .Async(true)/.Async(false) is honored verbatim. + adaptive := as == asyncDefault && r.defaultAsync + if adaptive { + async = false + r.adaptiveRoutes[path] = true + } root := r.getTree(method) if root == nil { @@ -271,7 +348,7 @@ func (r *router) addRouteWithAsync(method, path string, handlers []HandlerFunc, root.async = async route.node = root r.setStaticEntry(method, "/", staticEntry{handlers: handlers, fullPath: "/", async: async}) - if async { + if async || adaptive { r.asyncRouteCount++ } return route @@ -296,7 +373,7 @@ func (r *router) addRouteWithAsync(method, path string, handlers []HandlerFunc, r.setStaticEntry(method, path, staticEntry{handlers: handlers, fullPath: path, async: async}) } - if async { + if async || adaptive { r.asyncRouteCount++ } @@ -321,19 +398,28 @@ func (r *router) routeAsync(method, path string) bool { if idx >= 0 { if m := r.staticRoutes[idx]; m != nil { if e, ok := m[path]; ok { - return e.async + return e.async || r.adaptivePromoted(e.fullPath) } } } else if r.customStatic != nil { if m := r.customStatic[method]; m != nil { if e, ok := m[path]; ok { - return e.async + return e.async || r.adaptivePromoted(e.fullPath) } } } var params Params - _, _, async := r.find(method, path, &params) - return async + _, fullPath, async := r.find(method, path, &params) 
+ return async || r.adaptivePromoted(fullPath) +} + +// adaptivePromoted reports whether an adaptive (server-default-async) route has +// been promoted to async dispatch after a blocking inline run (celeris#356). +// The adaptiveRoutes map is read-only while serving, so the lookup is a plain +// (lock-free) map read followed by a sync.Map load only for adaptive routes; +// non-adaptive configs (no AsyncHandlers default) keep the empty-map fast path. +func (r *router) adaptivePromoted(fullPath string) bool { + return r.adaptiveRoutes[fullPath] && r.isPromoted(fullPath) } // warnDuplicateRoute emits a warning when a route is registered twice for diff --git a/router_async_test.go b/router_async_test.go index 87e03bd4..b38b4564 100644 --- a/router_async_test.go +++ b/router_async_test.go @@ -21,16 +21,27 @@ func TestRouteAsync_ServerDefaultSync(t *testing.T) { } } -// TestRouteAsync_ServerDefaultAsync verifies routes inherit an async -// server default (Config.AsyncHandlers=true) when not overridden. -func TestRouteAsync_ServerDefaultAsync(t *testing.T) { +// TestRouteAsync_ServerDefaultAdaptive verifies the celeris#356 contract: a +// route that inherits the AsyncHandlers=true default (not an explicit .Async()) +// is ADAPTIVE — it starts INLINE (routeAsync=false, the ring-batched fast path) +// and is promoted to async only after a blocking inline run. hasAsyncRoutes +// stays true because adaptive routes may promote and need the async infra. 
+func TestRouteAsync_ServerDefaultAdaptive(t *testing.T) { s := New(Config{AsyncHandlers: true}) s.GET("/ping", noopHandler) - if !s.router.routeAsync("GET", "/ping") { - t.Fatal("route should inherit async server default") + if s.router.routeAsync("GET", "/ping") { + t.Fatal("adaptive route should start INLINE (not async) until it blocks") + } + if !s.router.adaptiveRoutes["/ping"] { + t.Fatal("/ping should be registered adaptive under AsyncHandlers=true") } if !s.router.hasAsyncRoutes() { - t.Fatal("hasAsyncRoutes should be true when default is async") + t.Fatal("hasAsyncRoutes should be true (adaptive routes may promote)") + } + // A blocking inline run promotes the route; it then resolves async. + s.router.promoteRoute("/ping") + if !s.router.routeAsync("GET", "/ping") { + t.Fatal("after promotion the adaptive route should resolve async") } } @@ -51,18 +62,25 @@ func TestRouteAsync_RouteOverrideOn(t *testing.T) { } } -// TestRouteAsync_RouteOverrideOff forces a single route sync on an -// async-default server. +// TestRouteAsync_RouteOverrideOff verifies that on an async-default server, an +// inherited route is adaptive (inline-first, celeris#356) while an explicit +// .Async(false) route is hard-sync and never adaptive. 
func TestRouteAsync_RouteOverrideOff(t *testing.T) { s := New(Config{AsyncHandlers: true}) - s.GET("/db", noopHandler) - s.GET("/cached", noopHandler).Async(false) - if !s.router.routeAsync("GET", "/db") { - t.Fatal("/db should inherit async default") + s.GET("/db", noopHandler) // inherits default → adaptive + s.GET("/cached", noopHandler).Async(false) // explicit sync → never adaptive + if s.router.routeAsync("GET", "/db") { + t.Fatal("/db inherits the async default → adaptive, starts inline") + } + if !s.router.adaptiveRoutes["/db"] { + t.Fatal("/db should be adaptive") } if s.router.routeAsync("GET", "/cached") { t.Fatal("/cached should be forced sync via Async(false)") } + if s.router.adaptiveRoutes["/cached"] { + t.Fatal("/cached is explicitly sync → must not be adaptive") + } } // TestRouteAsync_GroupInherit verifies group-level Async applies to its @@ -170,3 +188,26 @@ func TestRouteAsync_ResolverInterface(t *testing.T) { t.Fatal("adapter RouteAsync should report unmatched sync") } } + +// TestRouteAsync_AdaptiveHysteresis verifies celeris#356 promotion hysteresis: +// a single slow inline run (cold start / GC) must NOT promote a route, but +// adaptivePromoteStreak consecutive slow runs (a genuinely-blocking handler) +// must. A fast run resets the streak. +func TestRouteAsync_AdaptiveHysteresis(t *testing.T) { + s := New(Config{AsyncHandlers: true}) + s.GET("/h", noopHandler) + rt := s.router + // One slow outlier, then a fast run → no promotion. + rt.recordInlineRun("/h", true) + rt.recordInlineRun("/h", false) + if rt.routeAsync("GET", "/h") { + t.Fatal("a single slow run must not promote (cold-start hysteresis)") + } + // Sustained slowness (blocking handler) → promotion after the streak. 
+ for i := 0; i < adaptivePromoteStreak; i++ { + rt.recordInlineRun("/h", true) + } + if !rt.routeAsync("GET", "/h") { + t.Fatalf("%d consecutive slow runs must promote to async", adaptivePromoteStreak) + } +} From b6c51af7b9ba68634b2b45a4bbec22f785db4694 Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Wed, 17 Jun 2026 12:43:19 +0200 Subject: [PATCH 03/27] test(router): lock in #356 no-regression guard for explicit .Async() routes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Blocking driver routes (probatorium /cache, /db, /mc) register with an explicit .Async() on an async-default server. setAsync drops them from the adaptive set, so handler.go's inline-first gate skips them and they resolve hard-async — no inline window, no worker stall. Pin that contract so the driver no-regression guarantee can't silently break. --- router_async_test.go | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/router_async_test.go b/router_async_test.go index b38b4564..581259f8 100644 --- a/router_async_test.go +++ b/router_async_test.go @@ -83,6 +83,28 @@ func TestRouteAsync_RouteOverrideOff(t *testing.T) { } } +// TestRouteAsync_ExplicitAsyncOptsOutOfAdaptive is the celeris#356 +// no-regression guard for blocking handlers. A route explicitly marked +// .Async() on an async-default server (exactly how the probatorium driver +// routes — /cache, /db, /mc — register) must be hard-async, NOT adaptive: +// it must never enter the inline-first window, so a blocking handler never +// runs inline and stalls a worker. This is what makes #356 safe to ship +// without a driver-backend regression: trivial routes inherit (adaptive → +// inline win) while explicitly-async blocking routes stay always-async. 
+func TestRouteAsync_ExplicitAsyncOptsOutOfAdaptive(t *testing.T) { + s := New(Config{AsyncHandlers: true}) + s.GET("/cache/:key", noopHandler).Async() // blocking driver route + if s.router.adaptiveRoutes["/cache/:key"] { + t.Fatal("explicit .Async() route must be removed from the adaptive set") + } + if _, promoted := s.router.promoted.Load("/cache/:key"); promoted { + t.Fatal("explicit .Async() route must carry no promotion state") + } + if !s.router.routeAsync("GET", "/cache/42") { + t.Fatal("explicit .Async() route must resolve hard-async (never inline)") + } +} + // TestRouteAsync_GroupInherit verifies group-level Async applies to its // routes. func TestRouteAsync_GroupInherit(t *testing.T) { From 96581bcc8459f4f832e97c84f3ea8b1c32192c74 Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Wed, 17 Jun 2026 13:26:28 +0200 Subject: [PATCH 04/27] perf(h1): drop redundant upfront findHeaderEnd block scan (#359) parseHeaders already detects an incomplete header block (a final line with no CRLF yields lineEnd==-1 -> (false,nil)), and every ParseRequest caller Reset()s parser+req first, so a partial parse is always retried cleanly. The upfront findHeaderEnd CRLF-walked the same bytes parseHeaders walks again -- pure double work on the common single-read request. Slow-drip re-parse is bounded by MaxHeaderSize(64K)/MaxHeaderCount(200)/ReadHeaderTimeout, matching net/http and fasthttp which also re-parse on partial reads. Correctness: full h1 race suite + 46M fuzz execs (ParseRequest+ChunkedBody) pass; findHeaderEnd retained (asm + its own tests). 
--- protocol/h1/parser.go | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/protocol/h1/parser.go b/protocol/h1/parser.go index 89b0058e..36e4bf53 100644 --- a/protocol/h1/parser.go +++ b/protocol/h1/parser.go @@ -86,15 +86,12 @@ func (p *Parser) ParseRequest(req *Request) (int, error) { return 0, nil } - remaining := p.buf[p.pos:] - // Quick check: if remaining starts with \r\n, headers are empty (no headers). - // Otherwise use SIMD-accelerated findHeaderEnd to verify \r\n\r\n is present. - if len(remaining) < 2 || remaining[0] != '\r' || remaining[1] != '\n' { - if findHeaderEnd(remaining) < 0 { - return 0, nil - } - } - + // No upfront whole-block findHeaderEnd scan: parseHeaders detects an + // incomplete block itself (a final line with no CRLF yields lineEnd==-1 → + // (false,nil)), and every caller Reset()s parser+req before re-parsing, so + // a partial parse is always retried cleanly. Scanning here first would + // CRLF-walk the same bytes parseHeaders walks again — pure double work on + // the common single-read request. if p.noStringHeaders { req.Headers = req.Headers[:0] } else { From a53643e8b56c32f5e9598138e2fc49008cd4cbbb Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Wed, 17 Jun 2026 13:38:07 +0200 Subject: [PATCH 05/27] perf(core): reuse heap respHeaders backing across requests (#360) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a middleware stack pushes respHeaders past the inline 16-slot respHdrBuf (chain-fullstack: 17 headers), append() moved it onto a heap array that reset then dropped — forcing a fresh ~576B alloc every header-heavy request. Retain it as respHdrScratch and reuse it: one alloc per pooled Context, not per request. Also folds SetResponseHeaders' >16-header path onto the scratch and removes the old clear(respHdrBuf[:n>16]) clamp foot-gun (the cap check is the correct discriminator). 
NOTE: rps-neutral on the cluster (one small alloc/req was not the chain-fullstack bottleneck); this is a GC/RSS-hygiene + foot-gun-removal change. Verified 0 allocs/op (TestContextRespHeaderOverflowReuseZeroAlloc) + full root race suite + overflow regression guard. --- context.go | 43 +++++++++++++++++++++++++------------------ context_response.go | 4 ++++ context_test.go | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 18 deletions(-) diff --git a/context.go b/context.go index b0736478..f41e2dd4 100644 --- a/context.go +++ b/context.go @@ -168,6 +168,12 @@ type Context struct { hostOverride string respHdrBuf [16][2]string // reusable buffer for response headers (avoids heap escape) + // respHdrScratch retains the heap backing array the moment a middleware + // stack pushes respHeaders past respHdrBuf's 16 slots (e.g. chain-fullstack: + // 15 user headers + content-type + content-length = 17). Reused across + // requests so a >16-header route reallocates once per pooled Context, not + // once per request. nil until the first overflow. + respHdrScratch [][2]string trustedNets []*net.IPNet @@ -545,26 +551,27 @@ func (c *Context) reset() { c.fullPath = "" c.statusCode = 200 if n := len(c.respHeaders); n > 0 { - // respHeaders shares the respHdrBuf backing array up to its - // fixed capacity (16). Middleware stacks that emit more than 16 - // response headers (kitchen_sink with secure + cache + ratelimit - // + etag etc. easily exceeds it) trigger append() to allocate a - // new backing array — respHeaders now has > 16 entries but - // respHdrBuf is still just 16. Without this clamp, - // `clear(c.respHdrBuf[:n])` panics with "slice bounds out of - // range [:N] with length 16" — the panic propagates through - // recoverAndRelease and aborts the iouring/epoll async handler - // AFTER WriteResponse has queued bytes into the per-conn - // writeBuf but BEFORE flushSend pushes them to the socket, so - // the client sees an empty response. 
std-engine escapes the - // damage because Go's net/http writes the response inline - // before the cleanup panic. - if n > len(c.respHdrBuf) { - n = len(c.respHdrBuf) + // When a middleware stack emits >16 response headers (secure + cache + // + ratelimit + etag + ... easily exceeds it), append() has moved + // respHeaders onto a heap backing array. Retain it as respHdrScratch + // and reuse it next request rather than dropping it (which forced a + // fresh ~576B alloc on every header-heavy request). The cap check is + // the correct discriminator: cap<=16 means respHeaders is still the + // inline respHdrBuf, so clear only its used prefix; cap>16 means a + // heap array, clear its full length. (This also removes the old clamp + // that papered over a clear(respHdrBuf[:n>16]) bounds panic.) + if cap(c.respHeaders) > len(c.respHdrBuf) { + clear(c.respHeaders) + c.respHdrScratch = c.respHeaders + } else { + clear(c.respHdrBuf[:n]) } - clear(c.respHdrBuf[:n]) } - c.respHeaders = c.respHdrBuf[:0] + if c.respHdrScratch != nil { + c.respHeaders = c.respHdrScratch[:0] + } else { + c.respHeaders = c.respHdrBuf[:0] + } c.written = false c.aborted = false c.bytesWritten = 0 diff --git a/context_response.go b/context_response.go index ffe673c6..42dbacfd 100644 --- a/context_response.go +++ b/context_response.go @@ -761,9 +761,13 @@ func (c *Context) SetResponseHeaders(headers [][2]string) { if len(headers) <= len(c.respHdrBuf) { copy(c.respHdrBuf[:len(headers)], headers) c.respHeaders = c.respHdrBuf[:len(headers)] + } else if cap(c.respHdrScratch) >= len(headers) { + c.respHeaders = c.respHdrScratch[:len(headers)] + copy(c.respHeaders, headers) } else { c.respHeaders = make([][2]string, len(headers)) copy(c.respHeaders, headers) + c.respHdrScratch = c.respHeaders } } diff --git a/context_test.go b/context_test.go index 98429d31..b4ace633 100644 --- a/context_test.go +++ b/context_test.go @@ -249,6 +249,38 @@ func TestContextResetWithOverflowedRespHeaders(t *testing.T) { } } 
+// TestContextRespHeaderOverflowReuseZeroAlloc locks in celeris#360: once a +// middleware stack pushes respHeaders past the inline 16-slot respHdrBuf, the +// grown heap backing array is retained as respHdrScratch and reused on every +// subsequent request — so a >16-header route allocates the backing array ONCE +// per pooled Context, not once per request (chain-fullstack: 17 headers). +func TestContextRespHeaderOverflowReuseZeroAlloc(t *testing.T) { + s, _ := newTestStream("GET", "/test") + defer s.Release() + c := acquireContext(s) + defer releaseContext(c) + + // 17 clean lowercase headers > respHdrBuf's 16 slots → forces the heap path. + hdrs := [][2]string{ + {"h00", "v"}, {"h01", "v"}, {"h02", "v"}, {"h03", "v"}, {"h04", "v"}, + {"h05", "v"}, {"h06", "v"}, {"h07", "v"}, {"h08", "v"}, {"h09", "v"}, + {"h10", "v"}, {"h11", "v"}, {"h12", "v"}, {"h13", "v"}, {"h14", "v"}, + {"h15", "v"}, {"h16", "v"}, + } + avg := testing.AllocsPerRun(500, func() { + for _, h := range hdrs { + c.SetHeader(h[0], h[1]) + } + c.reset() + }) + if avg != 0 { + t.Fatalf("overflowed respHeaders must reuse the scratch backing array: got %.2f allocs/op, want 0", avg) + } + if cap(c.respHdrScratch) < len(hdrs) { + t.Fatalf("respHdrScratch should retain a >=%d cap backing array, got cap=%d", len(hdrs), cap(c.respHdrScratch)) + } +} + func TestContextFullPathResetOnRelease(t *testing.T) { s, _ := newTestStream("GET", "/test") defer s.Release() From 29ebaecec4fb13b841398086b8c796a17e4c8805 Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Wed, 17 Jun 2026 14:03:52 +0200 Subject: [PATCH 06/27] perf(core): settle non-blocking adaptive routes to stop per-req timing (#361) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The #356 classifier timed every inline run of an adaptive route (two time.Now() vDSO calls + recordInlineRun) forever. A profile of iouring-h1-async chain-fullstack showed time.runtimeNow at 3.22% CPU on routes that never block. 
Add a settled-fast terminal state: after adaptiveSettleStreak (256) CONSECUTIVE fast inline runs a route is proven non-blocking and leaves the timed path (adaptiveLearning short-circuits on a single settled sync.Map.Load — the same lookup the prior isPromoted check cost, minus the two time.Now()). A slow run resets the fast streak; explicit .Async()/.Sync() clears settled (setAsync). A/B (iouring-h1-async, interleaved): chain-api +1.4% (both rounds), get-json +0.4%. Full root race suite + new TestRouteAsync_AdaptiveSettles + existing #356 hysteresis/promotion tests pass. --- handler.go | 11 ++++++++++- router.go | 36 ++++++++++++++++++++++++++++++++++++ router_async_test.go | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 1 deletion(-) diff --git a/handler.go b/handler.go index 8f32ff51..2458c99e 100644 --- a/handler.go +++ b/handler.go @@ -138,7 +138,7 @@ func (a *routerAdapter) HandleStream(ctx context.Context, s *stream.Stream) erro // goroutine instead of stalling the event-loop worker. Non-adaptive configs // (no AsyncHandlers default) hit the empty-map fast path and skip timing. rt := a.server.router - if rt.adaptiveRoutes[fullPath] && !rt.isPromoted(fullPath) { + if rt.adaptiveRoutes[fullPath] && rt.adaptiveLearning(fullPath) { start := time.Now() err := c.Next() rt.recordInlineRun(fullPath, time.Since(start) > adaptivePromoteThreshold) @@ -168,6 +168,15 @@ const adaptivePromoteThreshold = 50 * time.Microsecond // blocks on every request promotes within a handful of requests. const adaptivePromoteStreak = 8 +// adaptiveSettleStreak is how many CONSECUTIVE fast inline runs SETTLE an +// adaptive route (celeris#361): proven non-blocking, it is removed from the +// timed path so the hot loop stops paying two time.Now() vDSO calls per +// request forever. High enough that only consistently-static routes settle (a +// slow run resets the streak); at scale a static route settles in well under a +// millisecond. 
A genuinely-blocking handler should be marked .Async() — it +// promotes long before it could settle. +const adaptiveSettleStreak = 256 + // recoverAndRelease handles panic recovery and context release. Extracted to a // separate noinline function so that HandleStream's stack frame is not inflated // by the deferred closure and debug.Stack() call (P5). diff --git a/router.go b/router.go index 934c1dd4..18b6105c 100644 --- a/router.go +++ b/router.go @@ -115,6 +115,16 @@ type router struct { // cache) accumulates adaptivePromoteStreak slow runs and gets promoted. A // fast run resets the streak. slowStreak sync.Map + // fastStreak tracks consecutive FAST inline runs per adaptive fullPath + // (fullPath -> *atomic.Int32). Once a route is fast on adaptiveSettleStreak + // consecutive runs it is provably non-blocking and gets SETTLED — future + // requests skip the per-request inline timing (two time.Now() vDSO calls) + // entirely (celeris#361). A slow run resets it. + fastStreak sync.Map + // settled holds adaptive fullPaths proven non-blocking (see fastStreak). + // A settled route is no longer timed/promotable; it runs inline like a + // plain sync route. Explicit .Async()/.Sync() clears it (setAsync). + settled sync.Map } // Route is an opaque handle to a registered route. Use the Name method to @@ -240,6 +250,9 @@ func (r *Route) setAsync(want bool) *Route { if r.router.adaptiveRoutes[r.path] { delete(r.router.adaptiveRoutes, r.path) r.router.promoted.Delete(r.path) + r.router.settled.Delete(r.path) + r.router.fastStreak.Delete(r.path) + r.router.slowStreak.Delete(r.path) // Adaptive routes were already counted (they may promote): keep the // count when the explicit choice is async, drop it when sync. 
if !want && r.router.asyncRouteCount > 0 { @@ -298,14 +311,37 @@ func (r *router) recordInlineRun(fullPath string, slow bool) { if v, ok := r.slowStreak.Load(fullPath); ok { v.(*atomic.Int32).Store(0) } + // celeris#361: a route fast on adaptiveSettleStreak CONSECUTIVE runs is + // provably non-blocking — settle it so adaptiveLearning short-circuits + // and handler.go stops timing every request forever. + fv, _ := r.fastStreak.LoadOrStore(fullPath, new(atomic.Int32)) + if fv.(*atomic.Int32).Add(1) >= adaptiveSettleStreak { + r.settled.Store(fullPath, struct{}{}) + } return } + if v, ok := r.fastStreak.Load(fullPath); ok { + v.(*atomic.Int32).Store(0) + } v, _ := r.slowStreak.LoadOrStore(fullPath, new(atomic.Int32)) if v.(*atomic.Int32).Add(1) >= adaptivePromoteStreak { r.promoteRoute(fullPath) } } +// adaptiveLearning reports whether an adaptive route is still being observed — +// i.e. neither settled (proven non-blocking, celeris#361) nor promoted (proven +// blocking, celeris#356). Only learning routes pay the per-request inline +// timing in handler.go; once decided, the hot path skips it. Steady state is a +// single sync.Map.Load (settled), the same lookup cost the prior isPromoted +// check carried, but without the two time.Now() vDSO calls per request. +func (r *router) adaptiveLearning(fullPath string) bool { + if _, ok := r.settled.Load(fullPath); ok { + return false + } + return !r.isPromoted(fullPath) +} + // addRoute registers a route inheriting the server-level async default. 
// Kept as the 3-arg form for existing callers/tests; addRouteWithAsync is // the underlying implementation that also accepts a per-route/group diff --git a/router_async_test.go b/router_async_test.go index 581259f8..daad24a3 100644 --- a/router_async_test.go +++ b/router_async_test.go @@ -233,3 +233,46 @@ func TestRouteAsync_AdaptiveHysteresis(t *testing.T) { t.Fatalf("%d consecutive slow runs must promote to async", adaptivePromoteStreak) } } + +// TestRouteAsync_AdaptiveSettles verifies celeris#361: a route fast on +// adaptiveSettleStreak CONSECUTIVE inline runs SETTLES (leaves the timed +// learning path so the hot loop stops paying two time.Now() per request), and a +// slow run before the streak completes resets it. A settled route still runs +// inline (not async). +func TestRouteAsync_AdaptiveSettles(t *testing.T) { + s := New(Config{AsyncHandlers: true}) + route := s.GET("/s", noopHandler) + rt := s.router + + // One short of the streak → still learning. + for i := 0; i < adaptiveSettleStreak-1; i++ { + rt.recordInlineRun("/s", false) + } + if !rt.adaptiveLearning("/s") { + t.Fatal("route must still be learning before the settle streak completes") + } + + // A slow run resets the fast streak; the next near-full run must NOT settle. + rt.recordInlineRun("/s", true) + for i := 0; i < adaptiveSettleStreak-1; i++ { + rt.recordInlineRun("/s", false) + } + if !rt.adaptiveLearning("/s") { + t.Fatal("a slow run must reset the fast streak — route not settled yet") + } + + // One more fast run completes the streak → settle. + rt.recordInlineRun("/s", false) + if rt.adaptiveLearning("/s") { + t.Fatalf("route must settle after %d consecutive fast runs", adaptiveSettleStreak) + } + if rt.routeAsync("GET", "/s") { + t.Fatal("a settled route runs INLINE, not async") + } + + // An explicit override clears the settled state (setAsync). 
+ route.Sync() + if _, settled := rt.settled.Load("/s"); settled { + t.Fatal("explicit .Sync() must clear the settled state") + } +} From d1baff8aaff2912f394521952a09cb1d566d925f Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Wed, 17 Jun 2026 14:33:46 +0200 Subject: [PATCH 07/27] =?UTF-8?q?fix(core):=20raise=20adaptive=20promote?= =?UTF-8?q?=20threshold=2050=C2=B5s->300=C2=B5s=20(#364=20hardening)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 50µs bar was below the CPU-jitter range: a transient GC/scheduling burst could push a CPU-bound middleware chain (chain-fullstack, ~20µs base) over 50µs for 8 consecutive runs and wrongly promote it to the slower async path, intermittently collapsing iouring-async chain-fullstack by ~32% for a whole run (#364). 300µs sits far above the CPU-bound range (even a heavy chain under jitter) while still catching genuinely-blocking handlers (sub-ms+) — which are marked .Async() in practice anyway. Hardening, not a proven fix: the collapse is rare (<10%, did not reproduce in 10 fresh-SUT runs) so elimination can't be directly measured. rps-neutral within the ~±2% A/B noise floor (chain-fullstack/api/get-json). Inherited adaptive routes are CPU-bound (never legitimately promote); the only behavior change is a moderate-latency unmarked-blocking route promotes later — such routes should use explicit .Async(). --- handler.go | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/handler.go b/handler.go index 2458c99e..a3875e5b 100644 --- a/handler.go +++ b/handler.go @@ -156,11 +156,17 @@ func (a *routerAdapter) HandleStream(ctx context.Context, s *stream.Stream) erro } // adaptivePromoteThreshold is the inline handler duration that counts as a -// "slow" run for adaptive promotion (celeris#356). A non-blocking handler -// (route + middleware, no I/O) returns in single-digit microseconds; a blocking -// one (DB/cache round-trip) takes 100µs+. 
50µs separates them with margin for -// GC/scheduling jitter. -const adaptivePromoteThreshold = 50 * time.Microsecond +// "slow" run for adaptive promotion (celeris#356). A non-blocking handler — +// even a heavy middleware chain — returns in tens of microseconds; a blocking +// one (DB/cache round-trip) takes hundreds of µs to ms. The bar sits well above +// the CPU-bound range so a transient GC/scheduling burst cannot push a +// CPU-bound chain over it for adaptivePromoteStreak consecutive runs and +// wrongly promote it to the slower async path — a 50µs bar did exactly that, +// intermittently collapsing iouring-async chain-fullstack (celeris#364). +// Genuinely-blocking routes are marked .Async() explicitly (opting out of +// adaptive); auto-promotion is a safety net for an unmarked handler that blocks +// on EVERY request. +const adaptivePromoteThreshold = 300 * time.Microsecond // adaptivePromoteStreak is how many CONSECUTIVE slow inline runs promote an // adaptive route to async. The consecutive requirement (a fast run resets the From 0a542326b49b8166d7e8d7ca7bd46bab14135a76 Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Wed, 17 Jun 2026 15:13:56 +0200 Subject: [PATCH 08/27] perf(h1): drop dead-store writes in ResetH1Stream (#346) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ResetH1Stream re-zeroed Headers/LazyRawHeaders/Method/Path/Scheme/Authority/ IsHEAD/EndStream/ResponseWriter + an atomic state.Store(StateIdle) that populateCachedStream + handleH1Request unconditionally overwrite on the next request (h1.go:852-941, single caller) — pure dead work + a redundant atomic on every keep-alive request. Keep only the fields the caller does NOT redo: rawBody=nil (load-bearing — a bodyless GET must not inherit a prior body), lazyHeadersBuilt, pseudoMaterialized, headersSent. 
Verified: caller sets all 10 dropped fields; full h1/stream/internal-conn race suites + h1 conformance pass (a stale-state leak would surface across keep-alive requests); new TestResetH1StreamClearsUniqueFields guards the rawBody contract; 0 allocs/op. --- protocol/h2/stream/stream.go | 16 ++++------- protocol/h2/stream/stream_test.go | 48 +++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 10 deletions(-) create mode 100644 protocol/h2/stream/stream_test.go diff --git a/protocol/h2/stream/stream.go b/protocol/h2/stream/stream.go index 77a95e56..46cf7b9b 100644 --- a/protocol/h2/stream/stream.go +++ b/protocol/h2/stream/stream.go @@ -314,6 +314,12 @@ func (s *Stream) resetAndPool() { } // ResetH1Stream performs a lightweight per-request reset for H1 stream reuse. +// It clears ONLY the fields populateCachedStream + handleH1Request do not +// unconditionally overwrite on the next request. rawBody is load-bearing: the +// caller installs it only when the request carries a body, so a bodyless GET +// would otherwise inherit the previous request's body. Headers/Method/Path/ +// Scheme/Authority/LazyRawHeaders/IsHEAD/EndStream/ResponseWriter/state are all +// re-set per request (h1.go:852-941), so re-zeroing them here is dead work. 
func ResetH1Stream(s *Stream) { if s.Data != nil { s.Data.Reset() @@ -321,19 +327,9 @@ func ResetH1Stream(s *Stream) { s.Data = nil } s.rawBody = nil - s.Headers = s.hdrBuf[:0] - s.LazyRawHeaders = nil s.lazyHeadersBuilt = false s.pseudoMaterialized = false - s.Method = "" - s.Path = "" - s.Scheme = "" - s.Authority = "" s.headersSent.Store(false) - s.EndStream = false - s.ResponseWriter = nil - s.IsHEAD = false - s.state.Store(int32(StateIdle)) } // MaterializeHeaders ensures that every header carried by this H1 stream diff --git a/protocol/h2/stream/stream_test.go b/protocol/h2/stream/stream_test.go new file mode 100644 index 00000000..854c6831 --- /dev/null +++ b/protocol/h2/stream/stream_test.go @@ -0,0 +1,48 @@ +package stream + +import "testing" + +// TestResetH1StreamClearsUniqueFields locks in the contract that ResetH1Stream +// owns: the fields the per-request caller does NOT unconditionally overwrite. +// rawBody is the load-bearing one — a bodyless GET that follows a request with +// a body must not inherit the stale body (celeris#346). 
+func TestResetH1StreamClearsUniqueFields(t *testing.T) { + s := NewH1Stream(1) + + s.SetRawBody([]byte("previous request body")) + s.GetBuf().WriteString("buffered data") + s.lazyHeadersBuilt = true + s.pseudoMaterialized = true + s.headersSent.Store(true) + + ResetH1Stream(s) + + if s.rawBody != nil { + t.Fatalf("rawBody = %q, want nil (stale body would leak into next request)", s.rawBody) + } + if s.Data != nil { + t.Fatalf("Data = %v, want nil (buffer must be returned to the pool)", s.Data) + } + if s.lazyHeadersBuilt { + t.Fatal("lazyHeadersBuilt = true, want false") + } + if s.pseudoMaterialized { + t.Fatal("pseudoMaterialized = true, want false") + } + if s.headersSent.Load() { + t.Fatal("headersSent = true, want false") + } +} + +func BenchmarkResetH1Stream(b *testing.B) { + s := NewH1Stream(1) + body := []byte("hello") + b.ReportAllocs() + for b.Loop() { + s.SetRawBody(body) + s.lazyHeadersBuilt = true + s.pseudoMaterialized = true + s.headersSent.Store(true) + ResetH1Stream(s) + } +} From 4cb0927d7adab9f5afaad25a4aaeb5e7eb4e1cd6 Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Wed, 17 Jun 2026 15:24:16 +0200 Subject: [PATCH 09/27] fix(adaptive): gate io_uring bias off by default (#341) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ioUringBias is a heuristic that estimates io_uring's advantage from connection count + CPU pressure but NEVER reads the standby engine's measured throughput. Ungated, biasModeledStandbyScore could fabricate a standby score high enough to switch adaptive onto an engine that is measurably SLOWER on the live workload (e.g. epoll-favored get-simple). Gate it behind CELERIS_ADAPTIVE_IOURING_BIAS (default off): with the bias off, bias=0 → the modeled standby never exceeds the active score → adaptive switches only on MEASURED active degradation vs a previously-observed standby, never speculatively onto an unmeasured/slower engine. The speculative bias stays opt-in for re-validation. 
Tests: TestControllerOrganicSwitch now forces biasEnabled=true (the bias is opt-in); new TestControllerNoSpeculativeSwitchBiasOff asserts the default-off no-switch on the same sweet-spot workload; score_test uses the 2-arg form. Full adaptive suite passes on linux/amd64 (cluster). --- adaptive/controller.go | 21 ++++++++++++++++--- adaptive/controller_test.go | 41 +++++++++++++++++++++++++++++++------ adaptive/score.go | 8 +++++++- adaptive/score_test.go | 2 +- 4 files changed, 61 insertions(+), 11 deletions(-) diff --git a/adaptive/controller.go b/adaptive/controller.go index 1687795a..9630e9ce 100644 --- a/adaptive/controller.go +++ b/adaptive/controller.go @@ -4,11 +4,21 @@ package adaptive import ( "log/slog" + "os" "time" "github.com/goceleris/celeris/engine" ) +// envIOUringBias gates the io_uring workload bias (celeris#341). It is OFF +// unless explicitly enabled, because the bias is a heuristic that does NOT read +// the standby engine's measured throughput — ungated it can speculatively +// switch adaptive onto an engine that is measurably SLOWER on the live +// workload. With it off, adaptive is purely measurement-driven and never leaves +// a faster active engine for an unmeasured estimate. Re-enable to re-validate +// once the bias is gated behind a real measured-parity check. 
+const envIOUringBias = "CELERIS_ADAPTIVE_IOURING_BIAS" + type controllerState struct { activeIsPrimary bool lastSwitch time.Time @@ -31,6 +41,7 @@ type controller struct { evalInterval time.Duration cooldown time.Duration threshold float64 + biasEnabled bool logger *slog.Logger } @@ -43,6 +54,7 @@ func newController(primary, secondary engine.Engine, sampler TelemetrySampler, l evalInterval: 5 * time.Second, cooldown: 30 * time.Second, threshold: 0.15, + biasEnabled: os.Getenv(envIOUringBias) == "1", logger: logger, state: controllerState{ activeIsPrimary: true, @@ -82,9 +94,12 @@ func (c *controller) evaluate(now time.Time, frozen bool) bool { activeSnap := c.sampler.Sample(active) baselineActiveScore := computeScore(activeSnap, c.weights) - // io_uring bias: the modeled io_uring advantage given the current workload - // (connection count + CPU pressure). Zero outside the empirical sweet spot. - bias := ioUringBias(activeSnap) + // io_uring bias: the modeled io_uring advantage for the current workload. + // Zero outside the empirical sweet spot, AND zero unless explicitly enabled + // (celeris#341, envIOUringBias). It never reads the standby's real + // throughput, so off-by-default keeps adaptive from speculatively switching + // onto a measurably-slower engine. + bias := ioUringBias(activeSnap, c.biasEnabled) // Apply the bias to the ACTIVE score: reinforce io_uring when it is already // active (resist leaving it), lightly penalise epoll when conditions favor diff --git a/adaptive/controller_test.go b/adaptive/controller_test.go index 8d216e5e..9c2f1733 100644 --- a/adaptive/controller_test.go +++ b/adaptive/controller_test.go @@ -13,12 +13,8 @@ import ( // TestControllerOrganicSwitch verifies that, in the io_uring sweet spot // (high connection count + high CPU), the controller eventually recommends an // epoll→io_uring switch driven purely by the io_uring bias — no pre-seeded -// standby history, no active degradation. 
-// -// This FAILS against the pre-fix logic: the standby was seeded at -// activeScore*0.80 and only decayed, so standby/active maxed out at ~0.70 and -// could never clear the 1+threshold (1.15) bar. The bias-modeled standby -// estimate makes the switch reachable. +// standby history, no active degradation. The bias is opt-in (celeris#341), so +// this exercises it with biasEnabled forced on. func TestControllerOrganicSwitch(t *testing.T) { primary := newMockEngine(engine.Epoll) // active secondary := newMockEngine(engine.IOUring) // standby @@ -27,6 +23,7 @@ func TestControllerOrganicSwitch(t *testing.T) { cfg := resource.Config{Protocol: engine.HTTP1} e := newFromEngines(primary, secondary, sampler, cfg) e.ctrl.cooldown = 0 + e.ctrl.biasEnabled = true // bias is opt-in; this test exercises it // Active epoll snapshot lands squarely in io_uring's empirical sweet spot. sampler.Set(engine.Epoll, TelemetrySnapshot{ @@ -54,6 +51,38 @@ func TestControllerOrganicSwitch(t *testing.T) { } } +// TestControllerNoSpeculativeSwitchBiasOff is the celeris#341 safety guard: with +// the io_uring bias OFF (the default), the SAME io_uring-sweet-spot workload +// must NOT switch — the standby has never been measured, so the only basis for a +// switch would be the fabricated bias estimate, which could land adaptive on a +// measurably-slower engine. Off-by-default keeps adaptive measurement-driven. +func TestControllerNoSpeculativeSwitchBiasOff(t *testing.T) { + primary := newMockEngine(engine.Epoll) // active + secondary := newMockEngine(engine.IOUring) // standby, never measured + sampler := newSyntheticSampler() + + cfg := resource.Config{Protocol: engine.HTTP1} + e := newFromEngines(primary, secondary, sampler, cfg) + e.ctrl.cooldown = 0 + if e.ctrl.biasEnabled { + t.Skip("CELERIS_ADAPTIVE_IOURING_BIAS set in env; default-off assertion N/A") + } + + // Squarely in the io_uring bias sweet spot — would switch if the bias were on. 
+ sampler.Set(engine.Epoll, TelemetrySnapshot{ + ThroughputRPS: 1000, + ActiveConnections: 2048, + CPUUtilization: 0.9, + }) + + now := time.Now() + for i := range 5 { + if e.ctrl.evaluate(now.Add(time.Duration(i+1)*time.Minute), false) { + t.Fatal("bias off: must NOT speculatively switch to the unmeasured io_uring standby") + } + } +} + // TestControllerNoSwitchOutsideSweetSpot is the inverse: low CPU or too few // connections yields zero bias, so the controller must NOT recommend a switch // (no degradation, no favorable conditions). diff --git a/adaptive/score.go b/adaptive/score.go index 85034d55..43592fc0 100644 --- a/adaptive/score.go +++ b/adaptive/score.go @@ -47,7 +47,13 @@ func computeScore(snap TelemetrySnapshot, w ScoreWeights) float64 { // x86 — see the bench data above — not a bug. If #318 identifies a // different cause (e.g. a PbufRing bottleneck, fixed by #322), update // this comment to point at the new finding. -func ioUringBias(snap TelemetrySnapshot) float64 { +func ioUringBias(snap TelemetrySnapshot, enabled bool) float64 { + // Off by default (celeris#341): this heuristic never reads the standby's + // measured throughput, so an ungated bias can speculatively switch adaptive + // onto a measurably-slower engine. Gated behind envIOUringBias. 
+ if !enabled { + return 0 + } conns := snap.ActiveConnections cpu := snap.CPUUtilization diff --git a/adaptive/score_test.go b/adaptive/score_test.go index 599c89da..9f9757a0 100644 --- a/adaptive/score_test.go +++ b/adaptive/score_test.go @@ -27,7 +27,7 @@ func TestIoUringBiasConnFactorFalloff(t *testing.T) { ActiveConnections: tt.conns, CPUUtilization: tt.cpu, } - got := ioUringBias(snap) + got := ioUringBias(snap, true) if absDiff(got, tt.wantBias) > tt.tolerance { t.Errorf("ioUringBias(conns=%d, cpu=%.2f) = %v, want %v ± %v", tt.conns, tt.cpu, got, tt.wantBias, tt.tolerance) From a557bcea077c1e210fff1151b70f059e402c7f3f Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Wed, 17 Jun 2026 15:37:14 +0200 Subject: [PATCH 10/27] perf(epoll): inherit TCP_NODELAY from the listen socket (#337) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set TCP_NODELAY once on the listen socket (Linux copies it onto every accepted socket at SYN time) and drop it from the per-accept sockopts.Options, removing one setsockopt syscall per accept on the hot path. rps-neutral on the cluster (churn-close within the ~±2% A/B noise; the bench is NIC-bound) — a syscall-count efficiency win, not a throughput change. Verified: full epoll suite passes; new linux tests assert the listen socket has TCP_NODELAY AND accepted conns inherit it (guards that removing the per-accept setsockopt does NOT silently re-enable Nagle on the hot path). 
--- engine/epoll/loop.go | 8 +- engine/epoll/nodelay_inherit_linux_test.go | 92 ++++++++++++++++++++++ 2 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 engine/epoll/nodelay_inherit_linux_test.go diff --git a/engine/epoll/loop.go b/engine/epoll/loop.go index 72c8e591..3fe877cd 100644 --- a/engine/epoll/loop.go +++ b/engine/epoll/loop.go @@ -167,8 +167,10 @@ func newLoop(id, cpuID int, handler stream.Handler, acceptPaused: acceptPaused, wake: make(chan struct{}), ready: make(chan error, 1), + // TCPNoDelay is omitted here: accepted sockets inherit TCP_NODELAY + // from the listen socket (set in createListenSocket), so per-accept + // ApplyFD doesn't need to re-issue it. sockOpts: sockopts.Options{ - TCPNoDelay: true, TCPQuickAck: true, SOBusyPoll: 50 * time.Microsecond, RecvBuf: resolved.SocketRecv, @@ -2379,6 +2381,10 @@ func createListenSocket(addr string) (int, error) { return -1, err } + // TCP_NODELAY on the listen socket: Linux copies it onto every accepted + // socket at SYN time, so per-accept ApplyFD can skip its own NODELAY + // setsockopt (one fewer syscall per accept on the hot path). + _ = unix.SetsockoptInt(fd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) // TCP_DEFER_ACCEPT: kernel holds connections until data arrives, // eliminating wasted accept+wait cycles for idle connections. _ = unix.SetsockoptInt(fd, unix.IPPROTO_TCP, unix.TCP_DEFER_ACCEPT, 1) diff --git a/engine/epoll/nodelay_inherit_linux_test.go b/engine/epoll/nodelay_inherit_linux_test.go new file mode 100644 index 00000000..b6d7bcdf --- /dev/null +++ b/engine/epoll/nodelay_inherit_linux_test.go @@ -0,0 +1,92 @@ +//go:build linux + +package epoll + +import ( + "net" + "testing" + "time" + + "golang.org/x/sys/unix" +) + +// TestListenSocketHasNoDelay asserts createListenSocket enables TCP_NODELAY on +// the listen socket itself — the precondition for accepted sockets inheriting +// it (#337). 
+func TestListenSocketHasNoDelay(t *testing.T) { + lfd, err := createListenSocket("127.0.0.1:0") + if err != nil { + t.Skipf("listen socket unavailable: %v", err) + } + t.Cleanup(func() { _ = unix.Close(lfd) }) + + v, err := unix.GetsockoptInt(lfd, unix.IPPROTO_TCP, unix.TCP_NODELAY) + if err != nil { + t.Fatalf("getsockopt TCP_NODELAY: %v", err) + } + if v == 0 { + t.Fatal("listen socket TCP_NODELAY not set") + } +} + +// TestAcceptedConnInheritsNoDelay verifies accepted sockets carry TCP_NODELAY +// without acceptAll re-issuing the per-accept setsockopt. The kernel copies +// TCP_NODELAY from the listen socket at SYN time; this guards that the +// per-accept NODELAY removal (#337) leaves accepted conns nagle-disabled. +func TestAcceptedConnInheritsNoDelay(t *testing.T) { + lfd, err := createListenSocket("127.0.0.1:0") + if err != nil { + t.Skipf("listen socket unavailable: %v", err) + } + t.Cleanup(func() { _ = unix.Close(lfd) }) + + la := boundAddr(lfd) + if la == nil { + t.Skip("could not resolve bound addr") + } + tcpAddr, ok := la.(*net.TCPAddr) + if !ok { + t.Skipf("unexpected addr type %T", la) + } + + cfd, err := unix.Socket(unix.AF_INET, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) + if err != nil { + t.Skipf("client socket unavailable: %v", err) + } + t.Cleanup(func() { _ = unix.Close(cfd) }) + + var sa unix.SockaddrInet4 + sa.Port = tcpAddr.Port + copy(sa.Addr[:], tcpAddr.IP.To4()) + if err := unix.Connect(cfd, &sa); err != nil { + t.Fatalf("connect: %v", err) + } + + // TCP_DEFER_ACCEPT is set on the listener, so the connection only becomes + // acceptable once data arrives — send a byte before accepting. 
+ if _, err := unix.Write(cfd, []byte{'x'}); err != nil { + t.Fatalf("write: %v", err) + } + + var afd int + for i := 0; i < 200; i++ { + afd, _, err = unix.Accept4(lfd, unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC) + if err == unix.EAGAIN { + time.Sleep(time.Millisecond) + continue + } + break + } + if err != nil { + t.Fatalf("accept: %v", err) + } + t.Cleanup(func() { _ = unix.Close(afd) }) + + v, err := unix.GetsockoptInt(afd, unix.IPPROTO_TCP, unix.TCP_NODELAY) + if err != nil { + t.Fatalf("getsockopt TCP_NODELAY on accepted fd: %v", err) + } + if v == 0 { + t.Fatal("accepted socket did not inherit TCP_NODELAY from listener") + } +} From ce1073cc8d827b6a959f6566b3231b1ad274a3eb Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Wed, 17 Jun 2026 15:37:58 +0200 Subject: [PATCH 11/27] perf(iouring): gate SEND_ZC by payload size (#332) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SEND_ZC adds a second (NOTIF) CQE per send and holds zcNotifPending across the buffer's DMA lifetime, stalling the next flush — a net loss on small payloads where the avoided memcpy is tiny. Gate it behind sendZCMinBytes=4096 via a single useSendZC(sendZC, linked, n) helper used at all four send sites (highTier + optionalTier PrepareSend, worker prepSendSQE): small/linked sends use plain SEND (1 CQE, immediate buffer reuse), large unlinked sends still use ZC. rps-neutral on the cluster (get-json/get-simple within ±2% noise — NIC-bound; get-json-64k confirms ZC still chosen for >=4096, no regression) → removes one CQE/req on small async sends (efficiency), not a throughput change. Link invariant preserved (linked sends never ZC). Full iouring suite + new gating tests (TestUseSendZC, TestPrepSendSQEGatesBySize, TestPrepSendSQELinkedNeverZC) pass on linux/amd64. 
--- engine/iouring/consts.go | 8 +++ engine/iouring/send_zc_gate_test.go | 81 +++++++++++++++++++++++++++++ engine/iouring/tier.go | 4 +- engine/iouring/worker.go | 15 ++++-- 4 files changed, 103 insertions(+), 5 deletions(-) create mode 100644 engine/iouring/send_zc_gate_test.go diff --git a/engine/iouring/consts.go b/engine/iouring/consts.go index 35010dc0..a8e24182 100644 --- a/engine/iouring/consts.go +++ b/engine/iouring/consts.go @@ -104,3 +104,11 @@ const ( sqeSize = 64 cqeSize = 16 ) + +// sendZCMinBytes is the payload threshold below which SEND_ZC is not worth its +// cost. Zero-copy adds a second (NOTIF) CQE per send and holds zcNotifPending +// across the buffer's DMA lifetime, stalling the next flush; it also forfeits +// the SEND→RECV link on the keep-alive fast path. Only above this size does the +// avoided memcpy outweigh the extra completion, so small responses use a plain +// linked SEND (1 CQE, immediate buffer reuse). +const sendZCMinBytes = 4096 diff --git a/engine/iouring/send_zc_gate_test.go b/engine/iouring/send_zc_gate_test.go new file mode 100644 index 00000000..dc99302f --- /dev/null +++ b/engine/iouring/send_zc_gate_test.go @@ -0,0 +1,81 @@ +//go:build linux + +package iouring + +import ( + "testing" + "unsafe" +) + +// TestUseSendZC pins the zero-copy gating policy (celeris#332): ZC is only +// chosen for unlinked sends whose payload is at least sendZCMinBytes. Below the +// threshold, or for any linked send, or when the capability is absent, the plain +// SEND path must win. 
+func TestUseSendZC(t *testing.T) { + cases := []struct { + name string + sendZC bool + linked bool + n int + want bool + }{ + {"large-unlinked-capable", true, false, sendZCMinBytes, true}, + {"above-threshold", true, false, sendZCMinBytes + 1, true}, + {"just-below-threshold", true, false, sendZCMinBytes - 1, false}, + {"small-unlinked", true, false, 100, false}, + {"empty", true, false, 0, false}, + {"large-linked", true, true, sendZCMinBytes * 4, false}, + {"large-no-capability", false, false, sendZCMinBytes * 4, false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := useSendZC(tc.sendZC, tc.linked, tc.n); got != tc.want { + t.Errorf("useSendZC(%v, %v, %d) = %v, want %v", + tc.sendZC, tc.linked, tc.n, got, tc.want) + } + }) + } +} + +// prepSendSQEOpcode runs prepSendSQE against a freshly zeroed SQE for a worker +// with ZC enabled and an unlinked send of payload size n, returning the chosen +// opcode byte. +func prepSendSQEOpcode(t *testing.T, n int) byte { + t.Helper() + var sqe [sqeSize]byte + w := &Worker{sendZC: true} + cs := &connState{fd: 7, sendBuf: make([]byte, n)} + w.prepSendSQE(unsafe.Pointer(&sqe[0]), cs, false) + return sqe[0] +} + +// TestPrepSendSQEGatesBySize verifies the worker async flush path emits a plain +// SEND for sub-threshold responses (1 CQE, no NOTIF stall) and SEND_ZC only once +// the payload reaches sendZCMinBytes. 
+func TestPrepSendSQEGatesBySize(t *testing.T) { + if op := prepSendSQEOpcode(t, 100); op != opSEND { + t.Errorf("small response opcode = %d, want opSEND(%d)", op, opSEND) + } + if op := prepSendSQEOpcode(t, sendZCMinBytes-1); op != opSEND { + t.Errorf("just-below-threshold opcode = %d, want opSEND(%d)", op, opSEND) + } + if op := prepSendSQEOpcode(t, sendZCMinBytes); op != opSENDZC { + t.Errorf("at-threshold opcode = %d, want opSENDZC(%d)", op, opSENDZC) + } +} + +// TestPrepSendSQELinkedNeverZC guards the link invariant: a linked send must +// never use SEND_ZC (the NOTIF CQE would break the SEND→RECV chain), regardless +// of payload size. +func TestPrepSendSQELinkedNeverZC(t *testing.T) { + var sqe [sqeSize]byte + w := &Worker{sendZC: true} + cs := &connState{fd: 7, sendBuf: make([]byte, sendZCMinBytes*4)} + w.prepSendSQE(unsafe.Pointer(&sqe[0]), cs, true) + if sqe[0] != opSEND { + t.Errorf("linked large send opcode = %d, want opSEND(%d)", sqe[0], opSEND) + } + if sqe[1]&sqeIOLink == 0 { + t.Errorf("linked send missing IOSQE_IO_LINK in flags 0x%02x", sqe[1]) + } +} diff --git a/engine/iouring/tier.go b/engine/iouring/tier.go index 7860c302..e990f726 100644 --- a/engine/iouring/tier.go +++ b/engine/iouring/tier.go @@ -156,7 +156,7 @@ func (t *highTier) PrepareSend(ring *Ring, fd int, buf []byte, linked bool) { if sqe == nil { return } - if t.sendZC && !linked { + if useSendZC(t.sendZC, linked, len(buf)) { if t.fixedFiles { prepSendZCFixed(sqe, fd, buf, false) } else { @@ -224,7 +224,7 @@ func (t *optionalTier) PrepareSend(ring *Ring, fd int, buf []byte, linked bool) if sqe == nil { return } - if t.sendZC && !linked { + if useSendZC(t.sendZC, linked, len(buf)) { // SEND_ZC cannot be linked (the notification CQE would break // the link chain), so fall back to regular SEND for linked ops. 
if t.fixedFiles { diff --git a/engine/iouring/worker.go b/engine/iouring/worker.go index c673c2d4..77163a99 100644 --- a/engine/iouring/worker.go +++ b/engine/iouring/worker.go @@ -3289,10 +3289,12 @@ func (w *Worker) flushSend(cs *connState) bool { } // prepSendSQE prepares a SEND or SEND_ZC SQE based on worker capabilities. -// SEND_ZC is only used for unlinked sends (the notification CQE would break -// the link chain). Linked sends always use regular SEND. +// SEND_ZC is only used for unlinked sends at or above sendZCMinBytes (the +// notification CQE would break the link chain, and on small payloads its extra +// completion costs more than the avoided memcpy). Smaller and linked sends use +// regular SEND. func (w *Worker) prepSendSQE(sqe unsafe.Pointer, cs *connState, linked bool) { - if w.sendZC && !linked { + if useSendZC(w.sendZC, linked, len(cs.sendBuf)) { if cs.fixedFile { prepSendZCFixed(sqe, cs.fd, cs.sendBuf, false) } else { @@ -3305,6 +3307,13 @@ func (w *Worker) prepSendSQE(sqe unsafe.Pointer, cs *connState, linked bool) { } } +// useSendZC decides whether a send should use zero-copy. ZC is only viable for +// unlinked sends whose payload is large enough that the saved memcpy outweighs +// the extra NOTIF CQE (see sendZCMinBytes). +func useSendZC(sendZC, linked bool, n int) bool { + return sendZC && !linked && n >= sendZCMinBytes +} + // flushSendLink is like flushSend but links a RECV SQE after the SEND using // IOSQE_IO_LINK. The kernel chains the operations: when SEND completes, RECV // starts automatically without another io_uring_enter. 
This eliminates one From b2bcf5fd5be9b9aabb1406ddc70585937f29f869 Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Wed, 17 Jun 2026 16:21:47 +0200 Subject: [PATCH 12/27] feat(adaptive): reversible io_uring bias, default on (#338) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #341 made the io_uring bias safe by DISABLING it, but that left adaptive parked on epoll at high concurrency (it starts on epoll), missing io_uring's measured +6.8% @1024c. Restore the win SAFELY by making the bias reversible: - the ACTIVE score is always the pure measurement (no reinforce/penalty), so leaving an engine is decided measured-vs-measured; - biasModeledStandbyScore boosts ONLY the io_uring standby (EXPLORE) and returns 0 for the epoll standby (never models it down) — so a wrongly- explored io_uring always REVERTS on measurement; - history records the unbiased measured score. Net: adaptive explores io_uring when the workload model favors it, keeps it only if it measures faster, and reverts otherwise. The 15% switch threshold + oscillation lock provide hysteresis (no thrash). Safe ON by default; CELERIS_ADAPTIVE_IOURING_BIAS=0 forces the conservative measurement-only controller (supersedes #341's default-off). Validated: full adaptive suite on linux/amd64 incl new explore/revert/ kill-switch/stability-under-fluctuation tests; cluster end-to-end confirmed — adaptive explored epoll→io_uring under sustained 1024c (892k→io_uring) and reverted to epoll when load stopped (0 throughput). Switch latency ~30s (deliberate observe-before-act; sustained-load win, tuning follow-up). 
--- adaptive/controller.go | 89 ++++++++++++++------------------- adaptive/controller_test.go | 99 ++++++++++++++++++++++++++++++++----- 2 files changed, 125 insertions(+), 63 deletions(-) diff --git a/adaptive/controller.go b/adaptive/controller.go index 9630e9ce..45752fc8 100644 --- a/adaptive/controller.go +++ b/adaptive/controller.go @@ -10,13 +10,14 @@ import ( "github.com/goceleris/celeris/engine" ) -// envIOUringBias gates the io_uring workload bias (celeris#341). It is OFF -// unless explicitly enabled, because the bias is a heuristic that does NOT read -// the standby engine's measured throughput — ungated it can speculatively -// switch adaptive onto an engine that is measurably SLOWER on the live -// workload. With it off, adaptive is purely measurement-driven and never leaves -// a faster active engine for an unmeasured estimate. Re-enable to re-validate -// once the bias is gated behind a real measured-parity check. +// envIOUringBias gates the io_uring workload bias. The bias is now REVERSIBLE +// (celeris#338): it only EXPLORES — boosting the io_uring standby so an +// epoll→io_uring switch is reachable when the workload model favors it — and +// never inflates the active score nor suppresses the epoll standby, so a +// wrongly-explored io_uring always reverts on measurement. That makes it safe +// ON by default (adaptive picks the real high-concurrency winner instead of +// parking on epoll); set CELERIS_ADAPTIVE_IOURING_BIAS=0 to force the +// conservative measurement-only controller. 
const envIOUringBias = "CELERIS_ADAPTIVE_IOURING_BIAS" type controllerState struct { @@ -54,7 +55,7 @@ func newController(primary, secondary engine.Engine, sampler TelemetrySampler, l evalInterval: 5 * time.Second, cooldown: 30 * time.Second, threshold: 0.15, - biasEnabled: os.Getenv(envIOUringBias) == "1", + biasEnabled: os.Getenv(envIOUringBias) != "0", logger: logger, state: controllerState{ activeIsPrimary: true, @@ -101,40 +102,30 @@ func (c *controller) evaluate(now time.Time, frozen bool) bool { // onto a measurably-slower engine. bias := ioUringBias(activeSnap, c.biasEnabled) - // Apply the bias to the ACTIVE score: reinforce io_uring when it is already - // active (resist leaving it), lightly penalise epoll when conditions favor - // io_uring (encourage leaving it). + // Reversible bias (celeris#338): the ACTIVE score is ALWAYS the pure + // measurement — never inflated or penalised — so leaving the active engine + // is decided measured-vs-measured, never blocked by a sticky bias bonus. activeScore := baselineActiveScore - if active.Type() == engine.IOUring { - activeScore *= (1.0 + bias) // Bonus: io_uring already active, reinforce - } else if bias > 0 { - activeScore *= (1.0 - bias*0.5) // Penalty: epoll active but conditions favor io_uring - } - // Store active's (biased) score for historical reference. - c.state.lastActiveScore[active.Type()] = activeScore + // Record the measured (unbiased) score as history, so a later revert + // compares real throughput rather than a biased estimate. + c.state.lastActiveScore[active.Type()] = baselineActiveScore c.state.lastActiveTime[active.Type()] = now - // Seed standby with 80% of active if no history exists. + // Seed standby with 80% of active if no history exists. Harmless: 0.8 never + // clears the switch threshold on its own — only the explore-bias does. 
if _, ok := c.state.lastActiveScore[standby.Type()]; !ok { - c.state.lastActiveScore[standby.Type()] = activeScore * 0.80 + c.state.lastActiveScore[standby.Type()] = baselineActiveScore * 0.80 c.state.lastActiveTime[standby.Type()] = now } - // Standby estimate. Two independent signals, combined by max: - // - // 1. Historical: the last directly-observed score for the standby engine, - // decayed at 1%/sec. This drives switching when the ACTIVE engine - // genuinely degrades below a previously-measured standby. - // - // 2. Bias-modeled: the standby score implied by the io_uring bias for the - // CURRENT workload, recomputed EACH tick. When io_uring is the standby - // and conditions favor it, the standby is modeled as the unbiased active - // baseline scaled up by the bias; when epoll is the standby (io_uring - // active and favored) it is scaled DOWN so we do not switch back. This - // is what makes an organic epoll→io_uring switch reachable — the - // historical-only path could never exceed the threshold (max attainable - // ratio ~0.70 < 1+threshold). + // Standby estimate. The historical (measured, decayed) score ALWAYS counts — + // it is what drives a measurement-based revert. The io_uring bias may + // additionally EXPLORE: it boosts the io_uring standby when the workload + // model favors it (making an organic epoll→io_uring switch reachable), but + // it NEVER suppresses the epoll standby — so reverting from a wrongly-explored + // io_uring is always allowed on measurement. A bad exploration self-corrects + // the next eval; the oscillation lock bounds any thrash. 
standbyScore := c.historicalScore(standby.Type(), now) if modeled := c.biasModeledStandbyScore(standby.Type(), baselineActiveScore, bias); modeled > standbyScore { standbyScore = modeled @@ -153,27 +144,21 @@ func (c *controller) evaluate(now time.Time, frozen bool) bool { return false } -// biasModeledStandbyScore models the standby engine's score for the CURRENT -// workload from the io_uring bias, using the unbiased active baseline as the -// reference point. The sign of the adjustment depends on which engine is the -// standby: -// -// - io_uring standby: when conditions favor io_uring (bias>0) it is modeled -// as bias-better than the active baseline → standby = baseline*(1+bias). -// - epoll standby (io_uring active and favored): epoll is modeled as -// bias-worse → standby = baseline*(1-bias), so a favorable-for-io_uring -// workload never recommends switching back to epoll. +// biasModeledStandbyScore models the io_uring standby's score for the CURRENT +// workload from the io_uring bias (celeris#338): when conditions favor io_uring +// it is modeled as bias-better than the active baseline → standby = +// baseline*(1+bias), making an organic epoll→io_uring EXPLORATION reachable +// (the historical-only path could never clear 1+threshold from a cold standby). // -// Recomputed every tick so the estimate tracks live conditions rather than a -// stale seed. +// It returns 0 for the epoll standby — the bias never models epoll DOWN. That +// asymmetry is the reversibility guarantee: a revert from a wrongly-explored +// io_uring back to epoll is driven purely by epoll's real (historical) +// measurement and is never blocked by the bias. 
func (c *controller) biasModeledStandbyScore(standby engine.EngineType, baselineActiveScore, bias float64) float64 { - if bias <= 0 { - return baselineActiveScore - } - if standby == engine.IOUring { - return baselineActiveScore * (1.0 + bias) + if bias <= 0 || standby != engine.IOUring { + return 0 } - return baselineActiveScore * (1.0 - bias) + return baselineActiveScore * (1.0 + bias) } // historicalScore returns the last known score for an engine type, decayed diff --git a/adaptive/controller_test.go b/adaptive/controller_test.go index 9c2f1733..7ea7b1df 100644 --- a/adaptive/controller_test.go +++ b/adaptive/controller_test.go @@ -51,12 +51,43 @@ func TestControllerOrganicSwitch(t *testing.T) { } } -// TestControllerNoSpeculativeSwitchBiasOff is the celeris#341 safety guard: with -// the io_uring bias OFF (the default), the SAME io_uring-sweet-spot workload -// must NOT switch — the standby has never been measured, so the only basis for a -// switch would be the fabricated bias estimate, which could land adaptive on a -// measurably-slower engine. Off-by-default keeps adaptive measurement-driven. -func TestControllerNoSpeculativeSwitchBiasOff(t *testing.T) { +// TestControllerRevertsFromSlowerExploredEngine is the celeris#338 reversibility +// guard — the core of the safe bias. io_uring is active (as if just explored to) +// in the bias sweet spot, but it MEASURES slower than epoll's known score. The +// controller MUST revert to epoll: the bias may explore but must never block a +// measurement-driven reversion (it neither inflates the active score nor +// suppresses the epoll standby). This is exactly the case the old sticky bias +// got wrong (it parked adaptive on the slower engine). 
+func TestControllerRevertsFromSlowerExploredEngine(t *testing.T) { + primary := newMockEngine(engine.IOUring) // active (explored-to) + secondary := newMockEngine(engine.Epoll) // standby, measured-faster historically + sampler := newSyntheticSampler() + + cfg := resource.Config{Protocol: engine.HTTP1} + e := newFromEngines(primary, secondary, sampler, cfg) + e.ctrl.cooldown = 0 + e.ctrl.biasEnabled = true + + now := time.Now() + // epoll was measured fast before; io_uring now measures slow IN the sweet spot. + e.ctrl.state.lastActiveScore[engine.Epoll] = 1000 + e.ctrl.state.lastActiveTime[engine.Epoll] = now + sampler.Set(engine.IOUring, TelemetrySnapshot{ + ThroughputRPS: 500, + ActiveConnections: 2048, + CPUUtilization: 0.9, + }) + + if !e.ctrl.evaluate(now, false) { + t.Fatal("io_uring measuring slower than epoll's historical must REVERT to epoll even in the io_uring bias sweet spot — the bias must not block measured reversion") + } +} + +// TestControllerBiasOffNoExplore verifies the kill-switch: with the bias forced +// off (CELERIS_ADAPTIVE_IOURING_BIAS=0), the controller is purely +// measurement-driven and does NOT explore the unmeasured io_uring standby even +// in the sweet spot. +func TestControllerBiasOffNoExplore(t *testing.T) { primary := newMockEngine(engine.Epoll) // active secondary := newMockEngine(engine.IOUring) // standby, never measured sampler := newSyntheticSampler() @@ -64,11 +95,8 @@ func TestControllerNoSpeculativeSwitchBiasOff(t *testing.T) { cfg := resource.Config{Protocol: engine.HTTP1} e := newFromEngines(primary, secondary, sampler, cfg) e.ctrl.cooldown = 0 - if e.ctrl.biasEnabled { - t.Skip("CELERIS_ADAPTIVE_IOURING_BIAS set in env; default-off assertion N/A") - } + e.ctrl.biasEnabled = false // kill-switch - // Squarely in the io_uring bias sweet spot — would switch if the bias were on. 
sampler.Set(engine.Epoll, TelemetrySnapshot{ ThroughputRPS: 1000, ActiveConnections: 2048, @@ -78,11 +106,60 @@ func TestControllerNoSpeculativeSwitchBiasOff(t *testing.T) { now := time.Now() for i := range 5 { if e.ctrl.evaluate(now.Add(time.Duration(i+1)*time.Minute), false) { - t.Fatal("bias off: must NOT speculatively switch to the unmeasured io_uring standby") + t.Fatal("bias off: must not explore the unmeasured io_uring standby") } } } +// TestControllerStableUnderFluctuation is the celeris#338 real-load stability +// guard: at the 1024c sweet spot where io_uring is marginally faster (~+7%, well +// under the 15% switch threshold), with ±4% telemetry jitter on both engines, +// the controller must EXPLORE to io_uring and SETTLE there without thrashing. +// The 15% threshold provides the hysteresis; the reversible bias provides the +// explore. (Unit tests cover static telemetry; this covers fluctuation.) +func TestControllerStableUnderFluctuation(t *testing.T) { + primary := newMockEngine(engine.Epoll) // start active + secondary := newMockEngine(engine.IOUring) // marginally faster at 1024c + sampler := newSyntheticSampler() + + cfg := resource.Config{Protocol: engine.HTTP1} + e := newFromEngines(primary, secondary, sampler, cfg) + c := e.ctrl + c.cooldown = 0 + c.biasEnabled = true + + now := time.Now() + switches := 0 + // activeType derives the controller's chosen engine from its decision state + // (recordSwitch toggles activeIsPrimary; the engine-level performSwitch is + // out of scope for a controller-logic test). + activeType := func() engine.EngineType { + if c.state.activeIsPrimary { + return primary.Type() + } + return secondary.Type() + } + for i := range 60 { + // Deterministic ±4% jitter (no rand): pattern over -2..+2. Both engines + // are set each tick; evaluate samples whichever it currently calls active. 
+ jit := func(base float64) float64 { return base * (1.0 + 0.04*float64((i%5)-2)/2) } + sampler.Set(engine.Epoll, TelemetrySnapshot{ThroughputRPS: jit(1240), ActiveConnections: 1024, CPUUtilization: 0.85}) + sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: jit(1330), ActiveConnections: 1024, CPUUtilization: 0.85}) + tn := now.Add(time.Duration(i+1) * time.Second) + if c.evaluate(tn, false) { + c.recordSwitch(tn) + switches++ + } + } + + if activeType() != engine.IOUring { + t.Fatalf("adaptive should explore + settle on the faster io_uring at 1024c, got %s", activeType()) + } + if switches > 3 { + t.Fatalf("excessive switching under fluctuation (%d) — possible thrash; 15%% threshold should hold once settled", switches) + } +} + // TestControllerNoSwitchOutsideSweetSpot is the inverse: low CPU or too few // connections yields zero bias, so the controller must NOT recommend a switch // (no degradation, no favorable conditions). From 09113c8eba6dc80e809cda570bf1c2fff6312e7c Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Wed, 17 Jun 2026 16:59:52 +0200 Subject: [PATCH 13/27] perf(core): reuse blobHdrScratch for >16-header responses (#374) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Blob assembled its response header list (content-type + content-length + user headers) via make([][2]string, 0, total) on EVERY response whose total exceeds respHdrBuf's 16 slots. An allocation profile of chain-fullstack (18 headers) showed this was the DOMINANT per-request alloc — ~77% of all allocations, ~1.16 GB/s → GC pressure → the throughput cost. (get-json and other <=14-user-header responses already hit the alloc-free inline fast path and are unaffected.) Reuse a per-Context blobHdrScratch (alloc once per pooled Context, not per request), mirroring respHdrScratch (#360). respHeaders never aliases it (separate buffer; the append copies the [2]string values). 
A/B (interleaved, 2 rounds, vs baseline): chain-fullstack +4.4% (iouring-h1- async) / +5.0% (epoll-h1-sync); get-json neutral (-0.2%/+0.1%, control — it never enters this path). Full root race suite + new TestContextBlobManyHeadersZeroAlloc (0 allocs/op) pass. --- context.go | 7 +++++++ context_response.go | 9 ++++++++- context_test.go | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 1 deletion(-) diff --git a/context.go b/context.go index f41e2dd4..c3f142aa 100644 --- a/context.go +++ b/context.go @@ -174,6 +174,13 @@ type Context struct { // requests so a >16-header route reallocates once per pooled Context, not // once per request. nil until the first overflow. respHdrScratch [][2]string + // blobHdrScratch is the reused assembly buffer for Blob's response header + // list (content-type + content-length + user headers) when the total exceeds + // respHdrBuf's 16 slots. Without it, Blob allocates make([][2]string,0,total) + // on EVERY many-header response (chain-fullstack: 18 headers ⇒ the dominant + // per-request alloc + GC pressure). Reused so it reallocates once per pooled + // Context, not once per request. nil until the first many-header Blob. + blobHdrScratch [][2]string trustedNets []*net.IPNet diff --git a/context_response.go b/context_response.go index 42dbacfd..7a5d749e 100644 --- a/context_response.go +++ b/context_response.go @@ -589,7 +589,14 @@ func (c *Context) Blob(code int, contentType string, data []byte) error { headers = append(headers, [2]string{"content-length", itoa(len(data))}) headers = append(headers, tmp[:nUser]...) } else { - headers = make([][2]string, 0, total) + // Reuse a per-Context scratch instead of allocating per request — the + // dominant chain-fullstack alloc (18 headers > respHdrBuf's 16) lived + // here. respHeaders never aliases blobHdrScratch (separate buffers; the + // append below copies the [2]string values). 
+ if cap(c.blobHdrScratch) < total { + c.blobHdrScratch = make([][2]string, 0, total) + } + headers = c.blobHdrScratch[:0] headers = append(headers, [2]string{"content-type", ct}) headers = append(headers, [2]string{"content-length", itoa(len(data))}) headers = append(headers, c.respHeaders...) diff --git a/context_test.go b/context_test.go index b4ace633..8f2be7a6 100644 --- a/context_test.go +++ b/context_test.go @@ -281,6 +281,41 @@ func TestContextRespHeaderOverflowReuseZeroAlloc(t *testing.T) { } } +// nopRW is a no-op ResponseWriter so an alloc test can isolate the Context path +// from the mock writer's own header/body copies. +type nopRW struct{} + +func (nopRW) WriteResponse(_ *stream.Stream, _ int, _ [][2]string, _ []byte) error { return nil } + +// TestContextBlobManyHeadersZeroAlloc locks in the chain-fullstack fix: when a +// response carries more than the inline-buffer's headers (15 user + content-type +// + content-length = 17 > 16), Blob must reuse blobHdrScratch instead of +// allocating make([][2]string,0,total) every request — that was the dominant +// per-request allocation (≈77% of chain-fullstack allocs → GC pressure). +func TestContextBlobManyHeadersZeroAlloc(t *testing.T) { + s, _ := newTestStream("GET", "/test") + defer s.Release() + s.ResponseWriter = nopRW{} // the default mock writer allocates; isolate Blob + c := acquireContext(s) + defer releaseContext(c) + + // 15 user headers ⇒ total 17 > respHdrBuf's 16 ⇒ Blob's many-header path. 
+ for i := 0; i < 15; i++ { + c.SetHeader("h"+strconv.Itoa(i), "v") + } + body := []byte("hello") + avg := testing.AllocsPerRun(500, func() { + c.written = false + _ = c.Blob(200, "application/json", body) + }) + if avg != 0 { + t.Fatalf("Blob many-header path must reuse blobHdrScratch: got %.2f allocs/op, want 0", avg) + } + if cap(c.blobHdrScratch) < 17 { + t.Fatalf("blobHdrScratch should retain a >=17 cap backing array, got cap=%d", cap(c.blobHdrScratch)) + } +} + func TestContextFullPathResetOnRelease(t *testing.T) { s, _ := newTestStream("GET", "/test") defer s.Release() From e905e39482f67cec6742051fcd869f8b98393065 Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Wed, 17 Jun 2026 17:25:09 +0200 Subject: [PATCH 14/27] fix(secure): default Cross-Origin-Embedder-Policy + X-Download-Options off (#338) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BEHAVIOR CHANGE. secure.New() no longer emits Cross-Origin-Embedder-Policy (require-corp) or X-Download-Options (noopen) by default — both are now opt-in. COEP=require-corp by default is a footgun: it blocks cross-origin resources (images/scripts without CORP/CORS), silently breaking many sites — the config's own comment warned about it, yet it was on by default. Helmet leaves COEP off for this reason; we now match. X-Download-Options only ever affected legacy IE and is obsolete. Set either field explicitly to re-enable. Default header count 11 -> 9 (HSTS still runtime-gated to HTTPS). Beyond the footgun fix, the smaller response flips chain-security to a WIN vs fasthttp (-1.2% -> +0.7%) and improves chain-fullstack (-6.0% -> -4.9%); chain-api (no secure mw) unchanged. secure suite + middleware + conformance pass; new coep/x-download opt-in test cases added. 
--- middleware/secure/config.go | 22 +++++++++++------- middleware/secure/doc.go | 18 +++++++------- middleware/secure/secure_test.go | 40 +++++++++++++++++--------------- 3 files changed, 45 insertions(+), 35 deletions(-) diff --git a/middleware/secure/config.go b/middleware/secure/config.go index f6151f4c..c630a409 100644 --- a/middleware/secure/config.go +++ b/middleware/secure/config.go @@ -67,12 +67,13 @@ type Config struct { CrossOriginResourcePolicy string `yaml:"cross_origin_resource_policy"` // CrossOriginEmbedderPolicy sets the Cross-Origin-Embedder-Policy header. - // Default: "require-corp". + // Default: "" (NOT emitted — opt-in). // - // WARNING: "require-corp" blocks cross-origin resources (images, scripts, - // etc.) that do not carry a Cross-Origin-Resource-Policy header or valid - // CORS headers. If your application loads third-party assets, use - // "credentialless" or [Suppress] to avoid breakage. + // COEP is off by default because "require-corp" blocks cross-origin + // resources (images, scripts, etc.) that lack a Cross-Origin-Resource-Policy + // or valid CORS headers — enabling it by default silently breaks many sites, + // so like helmet we leave it opt-in. Set "require-corp" or "credentialless" + // to enable. CrossOriginEmbedderPolicy string `yaml:"cross_origin_embedder_policy"` // XDNSPrefetchControl sets the X-DNS-Prefetch-Control header. @@ -88,7 +89,9 @@ type Config struct { OriginAgentCluster string `yaml:"origin_agent_cluster"` // XDownloadOptions sets the X-Download-Options header. - // Default: "noopen". + // Default: "" (NOT emitted — opt-in). This header only affected legacy + // Internet Explorer downloads and is obsolete on modern browsers; set + // "noopen" to restore it. 
XDownloadOptions string `yaml:"x_download_options"` } @@ -103,11 +106,14 @@ var defaultConfig = Config{ ReferrerPolicy: "strict-origin-when-cross-origin", CrossOriginOpenerPolicy: "same-origin", CrossOriginResourcePolicy: "same-origin", - CrossOriginEmbedderPolicy: "require-corp", + // COEP off by default (opt-in): require-corp breaks cross-origin resource + // loads; enabling it by default is a footgun (matches helmet). #338. + CrossOriginEmbedderPolicy: "", XDNSPrefetchControl: "off", XPermittedCrossDomain: "none", OriginAgentCluster: "?1", - XDownloadOptions: "noopen", + // X-Download-Options off by default (opt-in): legacy IE-only, obsolete. + XDownloadOptions: "", } func applyDefaults(cfg Config) Config { diff --git a/middleware/secure/doc.go b/middleware/secure/doc.go index 156acdb3..7c7025a4 100644 --- a/middleware/secure/doc.go +++ b/middleware/secure/doc.go @@ -14,14 +14,16 @@ // - Referrer-Policy: "strict-origin-when-cross-origin" // - Cross-Origin-Opener-Policy: "same-origin" // - Cross-Origin-Resource-Policy: "same-origin" -// - Cross-Origin-Embedder-Policy: "require-corp" // - X-DNS-Prefetch-Control: "off" // - X-Permitted-Cross-Domain-Policies: "none" // - Origin-Agent-Cluster: "?1" -// - X-Download-Options: "noopen" // -// Content-Security-Policy and Permissions-Policy are only included when -// their respective [Config] fields are non-empty. +// Cross-Origin-Embedder-Policy ("require-corp") and X-Download-Options +// ("noopen") are OPT-IN (off by default, #338): COEP-by-default breaks +// cross-origin resource loads (matching helmet, which leaves it off), and +// X-Download-Options only affected legacy Internet Explorer. Content-Security- +// Policy and Permissions-Policy are likewise only emitted when their [Config] +// fields are non-empty. 
// // # Usage // @@ -64,8 +66,8 @@ // # CORS Interaction // // When using CORS middleware alongside secure, note that the default -// CrossOriginEmbedderPolicy (require-corp) and CrossOriginResourcePolicy -// (same-origin) block cross-origin resource loading. APIs serving -// cross-origin requests should set CrossOriginResourcePolicy: "cross-origin" -// and CrossOriginEmbedderPolicy: [Suppress]. +// CrossOriginResourcePolicy (same-origin) blocks cross-origin resource +// loading; APIs serving cross-origin requests should set +// CrossOriginResourcePolicy: "cross-origin". COEP is off by default so it no +// longer needs suppressing; enable it explicitly only when isolating the page. package secure diff --git a/middleware/secure/secure_test.go b/middleware/secure/secure_test.go index 67beefd7..abadfee8 100644 --- a/middleware/secure/secure_test.go +++ b/middleware/secure/secure_test.go @@ -28,11 +28,13 @@ func TestDefaultConfigSetsAllHeaders(t *testing.T) { testutil.AssertHeader(t, rec, "referrer-policy", "strict-origin-when-cross-origin") testutil.AssertHeader(t, rec, "cross-origin-opener-policy", "same-origin") testutil.AssertHeader(t, rec, "cross-origin-resource-policy", "same-origin") - testutil.AssertHeader(t, rec, "cross-origin-embedder-policy", "require-corp") + // COEP (require-corp) and X-Download-Options (legacy IE) are OFF by default + // — opt-in (#338); COEP-by-default breaks cross-origin loads. 
+ testutil.AssertNoHeader(t, rec, "cross-origin-embedder-policy") testutil.AssertHeader(t, rec, "x-dns-prefetch-control", "off") testutil.AssertHeader(t, rec, "x-permitted-cross-domain-policies", "none") testutil.AssertHeader(t, rec, "origin-agent-cluster", "?1") - testutil.AssertHeader(t, rec, "x-download-options", "noopen") + testutil.AssertNoHeader(t, rec, "x-download-options") } func TestDefaultConfigOmitsCSPAndPermissionsPolicy(t *testing.T) { @@ -56,8 +58,8 @@ func TestHeaderDefaultsAndOverrides(t *testing.T) { {"xss-protection custom", &Config{XSSProtection: "1; mode=block"}, "x-xss-protection", "1; mode=block"}, {"origin-agent-cluster default", nil, "origin-agent-cluster", "?1"}, {"origin-agent-cluster custom", &Config{OriginAgentCluster: "?0"}, "origin-agent-cluster", "?0"}, - {"x-download-options default", nil, "x-download-options", "noopen"}, - {"x-download-options custom", &Config{XDownloadOptions: "custom"}, "x-download-options", "custom"}, + {"x-download-options custom (opt-in)", &Config{XDownloadOptions: "custom"}, "x-download-options", "custom"}, + {"coep opt-in", &Config{CrossOriginEmbedderPolicy: "require-corp"}, "cross-origin-embedder-policy", "require-corp"}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -275,8 +277,8 @@ func TestApplyDefaultsFillsEmptyFields(t *testing.T) { if cfg.CrossOriginResourcePolicy != "same-origin" { t.Fatalf("CrossOriginResourcePolicy: got %q, want %q", cfg.CrossOriginResourcePolicy, "same-origin") } - if cfg.CrossOriginEmbedderPolicy != "require-corp" { - t.Fatalf("CrossOriginEmbedderPolicy: got %q, want %q", cfg.CrossOriginEmbedderPolicy, "require-corp") + if cfg.CrossOriginEmbedderPolicy != "" { + t.Fatalf("CrossOriginEmbedderPolicy: got %q, want %q (opt-in default, #338)", cfg.CrossOriginEmbedderPolicy, "") } if cfg.XDNSPrefetchControl != "off" { t.Fatalf("XDNSPrefetchControl: got %q, want %q", cfg.XDNSPrefetchControl, "off") @@ -287,8 +289,8 @@ func TestApplyDefaultsFillsEmptyFields(t 
*testing.T) { if cfg.OriginAgentCluster != "?1" { t.Fatalf("OriginAgentCluster: got %q, want %q", cfg.OriginAgentCluster, "?1") } - if cfg.XDownloadOptions != "noopen" { - t.Fatalf("XDownloadOptions: got %q, want %q", cfg.XDownloadOptions, "noopen") + if cfg.XDownloadOptions != "" { + t.Fatalf("XDownloadOptions: got %q, want %q (opt-in default, #338)", cfg.XDownloadOptions, "") } } @@ -342,14 +344,14 @@ func TestBuildHeadersSkipsEmptyStrings(t *testing.T) { func TestBuildHeadersDefaultCount(t *testing.T) { cfg := applyDefaults(Config{}) headers := buildHeaders(cfg) - // Default config should produce 11 headers (no CSP, no PermissionsPolicy). - // x-content-type-options, x-frame-options, x-xss-protection, - // referrer-policy, cross-origin-opener-policy, cross-origin-resource-policy, - // cross-origin-embedder-policy, x-dns-prefetch-control, x-permitted-cross-domain-policies, - // origin-agent-cluster, x-download-options. - // Note: HSTS is not in buildHeaders (runtime check). - if len(headers) != 11 { - t.Fatalf("expected 11 headers from default config, got %d", len(headers)) + // Default config produces 9 headers (no CSP, no PermissionsPolicy; COEP + + // X-Download-Options are opt-in, #338): x-content-type-options, + // x-frame-options, x-xss-protection, referrer-policy, + // cross-origin-opener-policy, cross-origin-resource-policy, + // x-dns-prefetch-control, x-permitted-cross-domain-policies, + // origin-agent-cluster. Note: HSTS is not in buildHeaders (runtime check). + if len(headers) != 9 { + t.Fatalf("expected 9 headers from default config, got %d", len(headers)) } } @@ -359,9 +361,9 @@ func TestBuildHeadersWithCSPAndPermissions(t *testing.T) { PermissionsPolicy: "camera=()", }) headers := buildHeaders(cfg) - // 11 defaults + CSP + PermissionsPolicy = 13. - if len(headers) != 13 { - t.Fatalf("expected 13 headers, got %d", len(headers)) + // 9 defaults + CSP + PermissionsPolicy = 11. 
+ if len(headers) != 11 { + t.Fatalf("expected 11 headers, got %d", len(headers)) } } From fd1d6130c0145f16c3fd8fb0dbcd654790271d4a Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Wed, 17 Jun 2026 19:28:59 +0200 Subject: [PATCH 15/27] fix(iouring): stop sticky async-promotion of inline conns on split-recv bodies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A keep-alive connection that handled even one fixed-length body split across recvs was permanently promoted to the async dispatch goroutine (worker.go HasPendingData gate), then served every subsequent request via a blocking unix.Write + cross-goroutine condvar handoff instead of the inline io_uring linked SEND. Under sustained small-POST load ~11% of requests split, so essentially every long-lived conn was poisoned onto the slow path within its first few requests. A fixed-length body in progress resumes via the inline re-parse path (ProcessH1 bodyNeeded>0), which runs on the worker that owns h1State and is already async-checked (provably non-async) — exactly like the sync engine. Only buffered partial headers / chunked bodies genuinely need the InlineMode=false dispatch path. Split the gate: HasPendingDispatchState promotes for buffered-headers/chunked only, never for a fixed body. Also tighten pickRecvTarget: gate the zero-copy direct-into-bodyBuf recv bail on (w.async && cs.asyncPromoted) rather than blanket w.async, so inline-owned conns get the zero-copy body recv the sync path already uses. The worker still owns h1State for non-promoted conns, so no new races (this strictly REDUCES cross-goroutine handoff). Async-marked routes are still promoted at the fresh-parse site before the body, and partial headers still re-run the async check on completion. 
--- engine/iouring/worker.go | 13 ++++++++----- internal/conn/h1.go | 14 ++++++++++++++ 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/engine/iouring/worker.go b/engine/iouring/worker.go index 77163a99..aa287bb6 100644 --- a/engine/iouring/worker.go +++ b/engine/iouring/worker.go @@ -1918,7 +1918,7 @@ func (w *Worker) handleRecv(c *completionEntry, fd int, now int64) { // continuation runs on the dispatch goroutine — the partial-state // parse paths must not run inline (only the fresh-parse site // honors the async check). - if processErr == nil && cs.h1State.HasPendingData() { + if processErr == nil && cs.h1State.HasPendingDispatchState() { cs.asyncPromoted = true w.asyncPromoted.Add(1) } @@ -3052,10 +3052,13 @@ func (w *Worker) pickRecvTarget(cs *connState) []byte { // completed, so no kernel SQE still targets the old bodyBuf array. The // body path below re-sets the pin when it arms into bodyBuf again. cs.bodyRecvPin = nil - // Async mode: the dispatch goroutine owns h1State; the worker cannot - // safely observe NextRecvBuf without synchronization. Always use - // cs.buf so the goroutine handles body accumulation on its side. - if w.async || w.bufRing != nil || cs.h1State == nil { + // Async mode: only a PROMOTED conn hands h1State to the dispatch + // goroutine, which the worker cannot safely observe NextRecvBuf against. + // A non-promoted conn (celeris#356 inline-first) runs ProcessH1 on the + // worker itself (tryInline), so the worker owns h1State exactly as the + // sync path does and the zero-copy direct-into-bodyBuf recv is safe — + // gate the bail on cs.asyncPromoted, not blanket w.async. 
+ if (w.async && cs.asyncPromoted) || w.bufRing != nil || cs.h1State == nil { return cs.buf } if !w.h1Only && engine.Protocol(cs.protocol.Load()) != engine.HTTP1 { diff --git a/internal/conn/h1.go b/internal/conn/h1.go index ea5b0d7d..34cb8cbf 100644 --- a/internal/conn/h1.go +++ b/internal/conn/h1.go @@ -246,6 +246,20 @@ func (s *H1State) HasPendingData() bool { return s.buffer.Len() > 0 || s.bodyNeeded > 0 } +// HasPendingDispatchState reports whether ProcessH1 left partial state that +// MUST be continued on the dispatch goroutine (InlineMode=false): buffered +// partial headers or a chunked body in progress, both of which resume via the +// buffered parse path that does not re-run the per-route async check. A +// fixed-length body in progress (bodyNeeded > 0) is deliberately EXCLUDED: its +// continuation re-parses the already-async-checked (so provably non-async) +// request and dispatches the handler inline on the worker — exactly as the +// sync engine does — so it must NOT force the conn onto the slower async +// dispatch path. Promoting on it permanently poisoned keep-alive conns that +// hit even one split-across-recvs body (the post-4k regression). +func (s *H1State) HasPendingDispatchState() bool { + return s.buffer.Len() > 0 && s.bodyNeeded <= 0 +} + // UpdateWriteFn replaces the response adapter's write function. Called by // OnDetach to route StreamWriter writes through the mutex-guarded writeFn. func (s *H1State) UpdateWriteFn(fn func([]byte)) { From e1a0119d0c5226dcedbb9e8f1c676ca688220ed8 Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Wed, 17 Jun 2026 19:40:57 +0200 Subject: [PATCH 16/27] fix(iouring): never auto-select SQPOLL (#377) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The optionalTier was the only path that set IORING_SETUP_SQPOLL, reachable when the provided-buffers probe fails on an otherwise-High kernel. 
That path is doubly broken: (1) celeris runs one ring per worker, so SQPOLL spawns one kernel poll thread per worker — N spinning cores that starve the workers (measured -83% throughput, 75% idle CPU on a 16-worker box); (2) the dormant SQPOLL submit path has a latent SQ-tail-publish race in Ring.GetSQE (the shared tail is advanced before the SQE payload is written, safe only because io_uring_enter is the sync point on the non-SQPOLL path). optionalTier now uses the task-run completion model like highTier (DEFER_TASKRUN|SINGLE_ISSUER, or COOP_TASKRUN on 6.0), keeping provided buffers / multishot / SEND_ZC but never SQPOLL. SQPollIdle returns 0 so the worker SQPOLL branch is unreachable. Documented the GetSQE SQPOLL-unsafety so any future SQPOLL work fixes the tail-publish first. Test updated to assert the new contract (task-run, never SQPOLL). --- engine/iouring/engine_test.go | 38 ++++++++++++++++++++++++++--------- engine/iouring/ring.go | 8 ++++++++ engine/iouring/tier.go | 36 ++++++++++++++++++++------------- 3 files changed, 59 insertions(+), 23 deletions(-) diff --git a/engine/iouring/engine_test.go b/engine/iouring/engine_test.go index cdee385a..f44acdf8 100644 --- a/engine/iouring/engine_test.go +++ b/engine/iouring/engine_test.go @@ -197,9 +197,11 @@ func TestSelectTierOptionalWithoutSendZC(t *testing.T) { } } -func TestSelectTierOptionalWithDeferTaskrun(t *testing.T) { - // SQPOLL is incompatible with both DEFER_TASKRUN and COOP_TASKRUN. - // SQPOLL path must use neither IPI-related flag. +func TestSelectTierOptionalUsesTaskrunNotSQPoll(t *testing.T) { + // #377: SQPOLL is never selected. Per-worker rings would spawn one kernel + // poll thread per worker (N spinning cores), and the dormant SQPOLL submit + // path has a latent SQ-tail-publish race. The optional tier uses the + // task-run completion model like the high tier instead. 
profile := engine.CapabilityProfile{ IOUringTier: engine.Optional, CoopTaskrun: true, @@ -212,18 +214,36 @@ func TestSelectTierOptionalWithDeferTaskrun(t *testing.T) { t.Fatal("expected non-nil tier") } flags := tier.SetupFlags() - if flags&setupDeferTaskrun != 0 { - t.Error("DEFER_TASKRUN must not be set with SQPOLL (incompatible)") + if flags&setupSQPoll != 0 { + t.Error("SQPOLL must not be selected (#377)") } - if flags&setupCoopTaskrun != 0 { - t.Error("COOP_TASKRUN must not be set with SQPOLL (incompatible)") + if flags&setupDeferTaskrun == 0 { + t.Error("expected DEFER_TASKRUN when available") + } + if flags&setupSingleIssuer == 0 { + t.Error("expected SINGLE_ISSUER") } - if flags&setupSQPoll == 0 { - t.Error("expected SQPOLL in setup flags") + if tier.SQPollIdle() != 0 { + t.Error("SQPollIdle must be 0 when SQPOLL is disabled") } if !tier.SupportsFixedFiles() { t.Error("fixed files should be enabled when profile.FixedFiles is true") } + + // Without DEFER_TASKRUN (e.g. a 6.0 kernel), fall back to COOP_TASKRUN — + // still never SQPOLL. + coopProfile := engine.CapabilityProfile{ + IOUringTier: engine.Optional, + CoopTaskrun: true, + SQPoll: true, + } + coopFlags := SelectTier(coopProfile, 0).SetupFlags() + if coopFlags&setupSQPoll != 0 { + t.Error("SQPOLL must not be selected without DeferTaskrun either (#377)") + } + if coopFlags&setupCoopTaskrun == 0 { + t.Error("expected COOP_TASKRUN when DEFER_TASKRUN unavailable") + } } func TestSelectTierNone(t *testing.T) { diff --git a/engine/iouring/ring.go b/engine/iouring/ring.go index f2d3ed87..fd5921ec 100644 --- a/engine/iouring/ring.go +++ b/engine/iouring/ring.go @@ -231,6 +231,14 @@ func (r *Ring) mmap() error { } // GetSQE returns a pointer to the next available SQE, or nil if the ring is full. +// +// NOT SQPOLL-safe: this advances the SHARED SQ tail before the caller writes +// the SQE payload, which is correct only because io_uring_enter is the kernel +// sync point on the non-SQPOLL submit path. 
Under IORING_SETUP_SQPOLL the +// kernel poll thread reads the tail continuously and could consume a +// half-written SQE — so SQPOLL must not be enabled without first switching to +// a deferred local tail + a release-store publish after fill. No tier selects +// SQPOLL (see optionalTier / #377), so this path is never exercised today. func (r *Ring) GetSQE() unsafe.Pointer { var tail, head uint32 if r.singleIssuer { diff --git a/engine/iouring/tier.go b/engine/iouring/tier.go index e990f726..ad07d406 100644 --- a/engine/iouring/tier.go +++ b/engine/iouring/tier.go @@ -24,9 +24,9 @@ type TierStrategy interface { } // SelectTier returns the highest available tier strategy for the given profile. -// sqPollIdle is the objective-specific SQPOLL thread idle timeout; if zero, -// defaults to 2000ms. -func SelectTier(profile engine.CapabilityProfile, sqPollIdle time.Duration) TierStrategy { +// The sqPollIdle parameter is retained for signature stability but no longer +// used: SQPOLL is not selected by any tier (see optionalTier doc / #377). +func SelectTier(profile engine.CapabilityProfile, _ time.Duration) TierStrategy { switch { // DEFER_TASKRUN: completions run in worker's context (no extra kernel thread). // Preferred over SQPOLL because the SQPOLL kernel thread steals CPU from workers. @@ -40,12 +40,7 @@ func SelectTier(profile engine.CapabilityProfile, sqPollIdle time.Duration) Tier multishotRecv: profile.MultishotRecv, } case profile.IOUringTier >= engine.Optional && profile.SQPoll: - idle := uint32(sqPollIdle.Milliseconds()) - if idle == 0 { - idle = 2000 - } return &optionalTier{ - sqPollIdle: idle, deferTaskrun: profile.DeferTaskrun, fixedFiles: profile.FixedFiles, sendZC: profile.SendZC, @@ -171,9 +166,18 @@ func (t *highTier) PrepareSend(ring *Ring, fd int, buf []byte, linked bool) { setSQEUserData(sqe, encodeUserData(udSend, fd)) } -// optionalTier: kernel 6.0+, adds SQPOLL, SEND_ZC. With 6.1+: DEFER_TASKRUN, fixed files. 
+// optionalTier: kernel 6.0+ with provided buffers but below the High tier +// (e.g. a 6.x kernel where the provided-buffers probe disabled the High path). +// Uses the task-run completion model, NOT SQPOLL. +// +// SQPOLL is deliberately never used here: celeris runs one io_uring ring per +// worker, so SQPOLL would spawn one kernel poll thread PER worker (N spinning +// cores — measured -83% throughput / 75% idle CPU on a 16-worker box), and the +// dormant SQPOLL submit path has a latent SQ-tail-publish race in GetSQE +// (the shared tail is advanced before the SQE payload is written, which is +// safe only because io_uring_enter is the sync point on the non-SQPOLL path). +// See #377. SQPollIdle returns 0 so the worker never enters the SQPOLL branch. type optionalTier struct { - sqPollIdle uint32 deferTaskrun bool fixedFiles bool sendZC bool @@ -183,16 +187,20 @@ type optionalTier struct { func (t *optionalTier) Tier() engine.Tier { return engine.Optional } func (t *optionalTier) SetupFlags() uint32 { - // SQPOLL is incompatible with both DEFER_TASKRUN and COOP_TASKRUN. - // SINGLE_ISSUER is compatible and enables SQ head optimization. - return setupSQPoll | setupSingleIssuer + // Mirror highTier: task-run completions in the worker's own context, no + // extra kernel thread. (Was setupSQPoll|setupSingleIssuer — see the type + // doc and #377 for why SQPOLL is not used.) 
+ if t.deferTaskrun { + return setupDeferTaskrun | setupSingleIssuer + } + return setupCoopTaskrun | setupSingleIssuer } func (t *optionalTier) SupportsProvidedBuffers() bool { return true } func (t *optionalTier) SupportsMultishotAccept() bool { return t.multishotAccept } func (t *optionalTier) SupportsMultishotRecv() bool { return t.multishotRecv } func (t *optionalTier) SupportsFixedFiles() bool { return t.fixedFiles } func (t *optionalTier) SupportsSendZC() bool { return t.sendZC } -func (t *optionalTier) SQPollIdle() uint32 { return t.sqPollIdle } +func (t *optionalTier) SQPollIdle() uint32 { return 0 } // SQPOLL disabled — see type doc / #377 func (t *optionalTier) PrepareAccept(ring *Ring, listenFD int) { sqe := ring.GetSQE() From eacddc74c95a8086b789804979fd9a53a10d0f0b Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Wed, 17 Jun 2026 20:23:01 +0200 Subject: [PATCH 17/27] fix(router): make adaptive route promotion reversible (#364) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #356 adaptive promotion was terminal: once a route accumulated adaptivePromoteStreak slow inline runs it was pinned to async dispatch forever (adaptiveLearning returns false for promoted routes, and a promoted route runs async so it is never re-timed). A CPU-bound chain whose inline WALL-CLOCK briefly crossed adaptivePromoteThreshold under transient worker contention (not actual blocking) was therefore stuck on the ~32%-slower async path until process restart — the intermittent chain-fullstack collapse. Promotion now expires after adaptivePromoteTTL (5s): isPromoted drops the route from the promoted set and resets its slow streak once the stamp is older than the TTL, so the next request runs inline again and is re-timed. A genuinely-blocking route re-promotes within adaptivePromoteStreak runs; a transient false-positive stays inline and re-settles. 
The clock (nowNano, a test-stubbable package var) is read only for routes already in the promoted set, so the inline/learning/ settled fast paths are unchanged. Tests: promotion expires + slow-streak reset; de-promoted route settles when fast; still-blocking route re-promotes after expiry. NOTE: this reverts promotion at the ROUTING layer. A connection already promoted to its async dispatch goroutine (cs.asyncPromoted) stays there until it closes — the worker owns recv but the async->inline conn handoff is separate, larger work. So long-lived keep-alive conns recover on reconnect / new conns; full in-place conn recovery is a follow-up. --- handler.go | 11 +++++ router.go | 45 +++++++++++++++--- router_async_test.go | 108 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+), 7 deletions(-) diff --git a/handler.go b/handler.go index a3875e5b..2ecd2e19 100644 --- a/handler.go +++ b/handler.go @@ -183,6 +183,17 @@ const adaptivePromoteStreak = 8 // promotes long before it could settle. const adaptiveSettleStreak = 256 +// adaptivePromoteTTL bounds how long a promotion lasts before the route is +// re-evaluated inline (celeris#364). Promotion is otherwise terminal — a +// promoted route runs async and is never re-timed — so a CPU-bound chain that +// was falsely promoted by a transient load/jitter spike (inline wall-clock +// crossing adaptivePromoteThreshold under worker contention, not actual +// blocking) stayed on the ~32%-slower async path until restart. After the TTL +// the route runs inline again and re-settles if fast, or re-promotes within +// adaptivePromoteStreak runs if genuinely blocking. The clock is read only for +// already-promoted routes, so the fast path is unaffected. +const adaptivePromoteTTL = 5 * time.Second + // recoverAndRelease handles panic recovery and context release. 
Extracted to a // separate noinline function so that HandleStream's stack frame is not inflated // by the deferred closure and debug.Stack() call (P5). diff --git a/router.go b/router.go index 18b6105c..0ba24f68 100644 --- a/router.go +++ b/router.go @@ -6,8 +6,14 @@ import ( "slices" "sync" "sync/atomic" + "time" ) +// nowNano returns the current time in Unix nanoseconds. A package var so the +// adaptive promotion TTL (celeris#364) can be exercised deterministically in +// tests without sleeping. +var nowNano = func() int64 { return time.Now().UnixNano() } + // staticEntry holds the pre-composed handler chain and full path for a fully // static route, enabling O(1) map lookup instead of a trie walk. type staticEntry struct { @@ -286,18 +292,43 @@ func newRouter() *router { } } -// isPromoted reports whether an adaptive route (celeris#356) has been promoted -// to async dispatch after a blocking inline run. +// isPromoted reports whether an adaptive route (celeris#356) is currently +// promoted to async dispatch. +// +// Promotion is REVERSIBLE (celeris#364): it expires after adaptivePromoteTTL. +// Once expired, the route is dropped from the promoted set and its slow streak +// is cleared, so the next request runs INLINE again and is re-timed — both the +// routing decision (adaptivePromoted) and the per-request timing decision +// (adaptiveLearning) key off this method, so they flip back together. A route +// that is genuinely blocking re-promotes within adaptivePromoteStreak runs; a +// route that was promoted by a transient load/jitter spike (a CPU-bound chain +// whose inline wall-clock briefly crossed the threshold under contention) stays +// inline. Without this, a single spike pinned a route to the slower async path +// until process restart (the intermittent ~32% chain collapse). +// +// The clock is read only for routes actually in the promoted set, so learning +// and settled routes pay nothing here. 
func (r *router) isPromoted(fullPath string) bool { - _, ok := r.promoted.Load(fullPath) - return ok + v, ok := r.promoted.Load(fullPath) + if !ok { + return false + } + if nowNano()-v.(int64) > int64(adaptivePromoteTTL) { + // Expired: re-enter the learning/inline path to re-evaluate. + r.promoted.Delete(fullPath) + if sv, ok := r.slowStreak.Load(fullPath); ok { + sv.(*atomic.Int32).Store(0) + } + return false + } + return true } // promoteRoute marks an adaptive route as async after sustained blocking inline -// runs. Idempotent; subsequent routeAsync lookups return true so the engine -// dispatches the route to a goroutine. +// runs, stamped with the promotion time so isPromoted can expire it +// (celeris#364). Re-promotion refreshes the stamp. func (r *router) promoteRoute(fullPath string) { - r.promoted.Store(fullPath, struct{}{}) + r.promoted.Store(fullPath, nowNano()) } // recordInlineRun feeds one inline-run observation into the adaptive diff --git a/router_async_test.go b/router_async_test.go index daad24a3..0185159a 100644 --- a/router_async_test.go +++ b/router_async_test.go @@ -276,3 +276,111 @@ func TestRouteAsync_AdaptiveSettles(t *testing.T) { t.Fatal("explicit .Sync() must clear the settled state") } } + +// stubNowNano installs a controllable clock for the adaptive promotion TTL +// (celeris#364) and returns a pointer to advance it plus a restore func. +func stubNowNano() (clock *int64, restore func()) { + old := nowNano + var c int64 + nowNano = func() int64 { return c } + return &c, func() { nowNano = old } +} + +// TestRouteAsync_PromotionExpires verifies celeris#364: promotion is reversible. +// A route promoted by a transient spike must de-promote after adaptivePromoteTTL +// and run inline again, and the de-promotion must reset the slow streak so a +// single later slow run does not immediately re-promote. 
+func TestRouteAsync_PromotionExpires(t *testing.T) { + clock, restore := stubNowNano() + defer restore() + + s := New(Config{AsyncHandlers: true}) + s.GET("/d", noopHandler) + rt := s.router + + for i := 0; i < adaptivePromoteStreak; i++ { + rt.recordInlineRun("/d", true) + } + if !rt.routeAsync("GET", "/d") { + t.Fatal("expected promotion after the slow streak") + } + + // Within the TTL → still promoted. + *clock += int64(adaptivePromoteTTL) - 1 + if !rt.routeAsync("GET", "/d") { + t.Fatal("promotion must persist within the TTL") + } + + // Past the TTL → de-promoted, route runs inline again. + *clock += 2 + if rt.routeAsync("GET", "/d") { + t.Fatal("promotion must expire after the TTL (route runs inline again)") + } + if _, ok := rt.promoted.Load("/d"); ok { + t.Fatal("expired promotion must be removed from the promoted set") + } + + // The slow streak was reset: one slow run must NOT immediately re-promote. + rt.recordInlineRun("/d", true) + if rt.routeAsync("GET", "/d") { + t.Fatal("de-promotion must reset the slow streak (one slow run must not re-promote)") + } +} + +// TestRouteAsync_DePromotedRouteCanSettle verifies that after a falsely-promoted +// route de-promotes, sustained fast runs SETTLE it (proven non-blocking, inline +// forever) — i.e. re-evaluation works end to end. 
+func TestRouteAsync_DePromotedRouteCanSettle(t *testing.T) { + clock, restore := stubNowNano() + defer restore() + + s := New(Config{AsyncHandlers: true}) + s.GET("/d", noopHandler) + rt := s.router + + for i := 0; i < adaptivePromoteStreak; i++ { + rt.recordInlineRun("/d", true) + } + *clock += int64(adaptivePromoteTTL) + 1 + if rt.routeAsync("GET", "/d") { + t.Fatal("expected de-promotion at expiry") + } + + for i := 0; i < adaptiveSettleStreak; i++ { + rt.recordInlineRun("/d", false) + } + if rt.adaptiveLearning("/d") { + t.Fatal("a de-promoted route that runs fast must settle (inline forever)") + } + if rt.routeAsync("GET", "/d") { + t.Fatal("a settled route runs inline, not async") + } +} + +// TestRouteAsync_RePromotesAfterExpiryWhenBlocking verifies a genuinely-blocking +// adaptive route re-promotes after the TTL re-evaluation — de-promotion must not +// pin a blocking handler to the inline path. +func TestRouteAsync_RePromotesAfterExpiryWhenBlocking(t *testing.T) { + clock, restore := stubNowNano() + defer restore() + + s := New(Config{AsyncHandlers: true}) + s.GET("/b", noopHandler) + rt := s.router + + for i := 0; i < adaptivePromoteStreak; i++ { + rt.recordInlineRun("/b", true) + } + *clock += int64(adaptivePromoteTTL) + 1 + if rt.routeAsync("GET", "/b") { + t.Fatal("expected de-promotion at expiry") + } + + // Still blocking → re-promotes after a fresh streak. + for i := 0; i < adaptivePromoteStreak; i++ { + rt.recordInlineRun("/b", true) + } + if !rt.routeAsync("GET", "/b") { + t.Fatal("a still-blocking route must re-promote after expiry") + } +} From 28b28d4a4a39716a8e86417203270b03fce0881b Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Wed, 17 Jun 2026 20:54:45 +0200 Subject: [PATCH 18/27] fix(iouring): revert a promoted connection to inline on route de-promotion (#364) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completes the celeris#364 fix. 
The PR's first commit made ROUTE promotion reversible (TTL); this one makes the per-CONNECTION promotion reversible too, so a long-lived keep-alive conn that was promoted to its async dispatch goroutine returns to the inline fast path when the promoting route de-promotes — without it, such a conn stayed on the ~32%-slower blocking-write+handoff path until it closed (the bench scenario only recovered on reconnect). Mechanism: - The worker records the route that forced promotion (h1State.CurrentRoute, single-shot recv only) before starting the dispatch goroutine. - The dispatch goroutine, at its idle park point (asyncInBuf drained, last response written, no partial request), checks canRevertToInline, i.e. whether the promoting route's RouteAsync now returns false. If so it clears asyncPromoted and exits; the worker already owns recv and resumes the inline fast path on the next CQE. - cs.asyncPromoted becomes atomic.Bool: the goroutine clears it while the worker reads it on the recv hot path. The worker's feed path re-checks it under asyncInMu (the same lock the goroutine clears it under) before appending to asyncInBuf, closing the feed-vs-revert race. The revert happens only at a clean request boundary (HasPendingData false), so h1State ownership flips back to the worker exactly as for a fresh inline conn; the #256 bodyRecvPin is retained. Tests (engine, linux): TestAsyncConnRevertsOnRouteDepromotion proves revert via re-promotion (a still-async conn cannot re-promote); TestAsyncConnRevertRace hammers promote/feed/revert/re-promote across 64 keep-alive conns. Both pass under -race; full async-churn UAF suite stays green under -race.
--- engine/iouring/async_revert_test.go | 243 ++++++++++++++++++++++++++++ engine/iouring/conn.go | 22 ++- engine/iouring/worker.go | 62 ++++++- internal/conn/h1.go | 9 ++ 4 files changed, 325 insertions(+), 11 deletions(-) create mode 100644 engine/iouring/async_revert_test.go diff --git a/engine/iouring/async_revert_test.go b/engine/iouring/async_revert_test.go new file mode 100644 index 00000000..c298e2cd --- /dev/null +++ b/engine/iouring/async_revert_test.go @@ -0,0 +1,243 @@ +//go:build linux + +package iouring + +import ( + "bufio" + "context" + "fmt" + "io" + "net" + "strings" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/goceleris/celeris/engine" + "github.com/goceleris/celeris/protocol/h2/stream" + "github.com/goceleris/celeris/resource" +) + +// revertResolverHandler is an AsyncRouteResolver whose async classification is +// flipped at runtime, simulating celeris#356 route promotion (true) and the +// celeris#364 TTL de-promotion (false) without waiting on the real router TTL. 
+type revertResolverHandler struct{ async atomic.Bool } + +func (h *revertResolverHandler) HandleStream(_ context.Context, s *stream.Stream) error { + if s.ResponseWriter == nil { + return nil + } + return s.ResponseWriter.WriteResponse(s, 200, + [][2]string{{"content-type", "text/plain"}, {"content-length", "2"}}, []byte("ok")) +} +func (h *revertResolverHandler) RouteAsync(_, _ string) bool { return h.async.Load() } +func (h *revertResolverHandler) HasAsyncRoutes() bool { return true } + +func startRevertEngine(t *testing.T, h stream.Handler) *Engine { + t.Helper() + ln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("pick port: %v", err) + } + addr := ln.Addr().String() + _ = ln.Close() + + e, err := New(resource.Config{ + Addr: addr, + Protocol: engine.HTTP1, + Resources: resource.Resources{Workers: 4}, + AsyncHandlers: true, + }, h) + if err != nil { + t.Skipf("iouring engine unavailable: %v", err) + } + ctx, cancel := context.WithCancel(t.Context()) + errCh := make(chan error, 1) + go func() { errCh <- e.Listen(ctx) }() + t.Cleanup(func() { + cancel() + select { + case <-errCh: + case <-time.After(3 * time.Second): + } + }) + dl := time.Now().Add(30 * time.Second) + for time.Now().Before(dl) && e.Addr() == nil { + select { + case err := <-errCh: + if err != nil && (strings.Contains(err.Error(), "cannot allocate memory") || + strings.Contains(err.Error(), "io_uring_setup") || strings.Contains(err.Error(), "tier")) { + t.Skipf("io_uring unavailable on this runner: %v", err) + } + t.Fatalf("engine.Listen returned early: %v", err) + default: + } + time.Sleep(10 * time.Millisecond) + } + if e.Addr() == nil { + t.Fatal("engine did not bind in time") + } + return e +} + +// sendKeepAlive issues one keep-alive GET on an existing conn and reads the +// full 2-byte "ok" response, leaving the conn open for the next request. 
+func sendKeepAlive(c net.Conn, br *bufio.Reader, timeout time.Duration) error { + _ = c.SetDeadline(time.Now().Add(timeout)) + if _, err := c.Write([]byte("GET /x HTTP/1.1\r\nHost: x\r\n\r\n")); err != nil { + return fmt.Errorf("write: %w", err) + } + statusLine, err := br.ReadString('\n') + if err != nil { + return fmt.Errorf("read status: %w", err) + } + if len(statusLine) < 12 || statusLine[9:12] != "200" { + return fmt.Errorf("bad status: %q", statusLine) + } + for { + line, err := br.ReadString('\n') + if err != nil { + return fmt.Errorf("read header: %w", err) + } + if line == "\r\n" || line == "\n" { + break + } + } + body := make([]byte, 2) + if _, err := io.ReadFull(br, body); err != nil { + return fmt.Errorf("read body: %w", err) + } + if string(body) != "ok" { + return fmt.Errorf("bad body: %q", body) + } + return nil +} + +// TestAsyncConnRevertsOnRouteDepromotion is the celeris#364 conn-level revert +// regression: a keep-alive conn promoted to async dispatch must return to the +// inline fast path once its route de-promotes (RouteAsync flips false), proven +// by its ability to RE-promote afterwards (a still-async conn cannot re-promote +// — it never re-runs the inline ErrAsyncDispatch gate). +func TestAsyncConnRevertsOnRouteDepromotion(t *testing.T) { + h := &revertResolverHandler{} + h.async.Store(true) + e := startRevertEngine(t, h) + target := e.Addr().String() + + c, err := net.DialTimeout("tcp", target, 2*time.Second) + if err != nil { + t.Fatalf("dial: %v", err) + } + defer func() { _ = c.Close() }() + br := bufio.NewReader(c) + send := func(phase string) { + if err := sendKeepAlive(c, br, 2*time.Second); err != nil { + t.Fatalf("%s request failed: %v", phase, err) + } + } + + // Phase A — RouteAsync=true: the conn promotes to async dispatch. 
+ for i := 0; i < 20; i++ { + send("promote") + } + p1 := e.Metrics().AsyncPromotedConns + if p1 == 0 { + t.Fatal("conn never promoted under RouteAsync=true") + } + + // Phase B — RouteAsync=false: the dispatch goroutine must revert the conn + // to inline at its next idle point. Requests must keep succeeding. + h.async.Store(false) + for i := 0; i < 30; i++ { + send("revert") + time.Sleep(time.Millisecond) + } + + // Phase C — RouteAsync=true again: a reverted (now inline) conn re-promotes; + // a conn still stuck async would NOT, so AsyncPromotedConns must increase. + h.async.Store(true) + for i := 0; i < 30; i++ { + send("repromote") + } + p2 := e.Metrics().AsyncPromotedConns + if p2 <= p1 { + t.Fatalf("conn did not revert+re-promote (still stuck async): promoted before=%d after=%d", p1, p2) + } + t.Logf("promotions: after-A=%d after-C=%d (revert confirmed by re-promotion)", p1, p2) +} + +// TestAsyncConnRevertRace hammers the promote/feed/revert/re-promote interaction +// across many keep-alive conns with RouteAsync flipping continuously. Run under +// -race it validates that the worker recv path and the dispatch goroutine never +// race on cs.asyncPromoted / asyncInBuf during a revert. Gated on -short. +func TestAsyncConnRevertRace(t *testing.T) { + if testing.Short() { + t.Skip("revert race test needs sustained toggling load; -short skips it") + } + h := &revertResolverHandler{} + h.async.Store(true) + e := startRevertEngine(t, h) + target := e.Addr().String() + + const ( + concurrency = 64 + duration = 10 * time.Second + ) + stop := make(chan struct{}) + // Toggler: flip the route's async classification fast enough that conns are + // constantly promoting and reverting. 
+ go func() { + tk := time.NewTicker(3 * time.Millisecond) + defer tk.Stop() + for { + select { + case <-stop: + return + case <-tk.C: + h.async.Store(!h.async.Load()) + } + } + }() + + deadline := time.Now().Add(duration) + var ok, failed atomic.Int64 + var wg sync.WaitGroup + for i := 0; i < concurrency; i++ { + wg.Add(1) + go func() { + defer wg.Done() + c, err := net.DialTimeout("tcp", target, 2*time.Second) + if err != nil { + failed.Add(1) + return + } + defer func() { _ = c.Close() }() + br := bufio.NewReader(c) + for time.Now().Before(deadline) { + if err := sendKeepAlive(c, br, 2*time.Second); err != nil { + failed.Add(1) + return // conn broken; a corrupted response would surface here + } + ok.Add(1) + } + }() + } + wg.Wait() + close(stop) + + if ok.Load() < 1000 { + t.Fatalf("too few successful requests (ok=%d failed=%d) — server may have stalled", ok.Load(), failed.Load()) + } + // Liveness epilogue. + h.async.Store(false) + c, err := net.DialTimeout("tcp", target, 2*time.Second) + if err != nil { + t.Fatalf("engine not serving after revert churn: %v", err) + } + defer func() { _ = c.Close() }() + if err := sendKeepAlive(c, bufio.NewReader(c), 2*time.Second); err != nil { + t.Fatalf("engine not serving after revert churn: %v", err) + } + t.Logf("ok=%d failed=%d over %s", ok.Load(), failed.Load(), duration) +} diff --git a/engine/iouring/conn.go b/engine/iouring/conn.go index 06a8476f..4521400d 100644 --- a/engine/iouring/conn.go +++ b/engine/iouring/conn.go @@ -149,10 +149,20 @@ type connState struct { asyncClosed atomic.Bool // asyncPromoted is set once an async-marked route has been observed // on this conn while it ran inline on the worker (per-handler async, - // celeris #300). Sticky: once promoted every subsequent recv goes - // straight to the dispatch goroutine instead of retrying the inline - // fast path. Reset on release. - asyncPromoted bool + // celeris #300). Once promoted, recv goes to the dispatch goroutine. 
+ // REVERSIBLE (celeris#364): the dispatch goroutine clears it to revert + // the conn to inline when the promoting route de-promotes. Atomic because + // the worker reads it on the recv hot path while the goroutine may clear + // it; the worker re-reads it under asyncInMu before feeding to close the + // feed-vs-revert race. Reset on release. + asyncPromoted atomic.Bool + // promotedMethod/promotedPath record the route that forced this conn's + // promotion (celeris#364). Written by the worker before the dispatch + // goroutine starts (happens-before), read by the goroutine to decide + // revert. Empty => not revert-eligible (e.g. promoted for a buffered + // partial-header / chunked continuation, where no full route is known). + promotedMethod string + promotedPath string // asyncH2Promoted signals the worker that runAsyncHandler observed // ErrUpgradeH2C and completed the cs-local H1→H2 state swap under // detachMu. drainDetachQueue finishes the promotion by appending @@ -304,7 +314,9 @@ func releaseConnState(cs *connState) { cs.asyncOutBuf = cs.asyncOutBuf[:0] cs.asyncRun = false cs.asyncClosed.Store(false) - cs.asyncPromoted = false + cs.asyncPromoted.Store(false) + cs.promotedMethod = "" + cs.promotedPath = "" cs.asyncDetachUnlocked = false cs.asyncDetachPending = false cs.bodyBuf = nil diff --git a/engine/iouring/worker.go b/engine/iouring/worker.go index aa287bb6..fc74d0af 100644 --- a/engine/iouring/worker.go +++ b/engine/iouring/worker.go @@ -1424,7 +1424,7 @@ func (w *Worker) initProtocol(cs *connState) { // process on the first /ws or /events request. The // asyncPromoted gate restricts the Unlock to the dispatch- // goroutine path that actually holds the lock. See celeris#309. 
- if w.async && cs.asyncPromoted && cs.detachMu != nil && !cs.asyncDetachUnlocked { + if w.async && cs.asyncPromoted.Load() && cs.detachMu != nil && !cs.asyncDetachUnlocked { cs.asyncDetachUnlocked = true cs.detachMu.Unlock() } @@ -1725,8 +1725,20 @@ func (w *Worker) handleRecv(c *completionEntry, fd int, now int64) { // which point the conn is promoted and its stashed request handed to // the goroutine. This lets sync routes run inline on the worker // (no handoff) on a server that mixes sync + async handlers. - if w.async && cs.asyncPromoted && (w.h1Only || engine.Protocol(cs.protocol.Load()) == engine.HTTP1) { + asyncFeed := false + if w.async && cs.asyncPromoted.Load() && (w.h1Only || engine.Protocol(cs.protocol.Load()) == engine.HTTP1) { cs.asyncInMu.Lock() + // celeris#364: re-check under asyncInMu — the dispatch goroutine clears + // asyncPromoted (reverting the conn to inline) under this same lock. If + // it won the race, do NOT feed asyncInBuf (the goroutine is exiting); + // fall through to inline with the unconsumed `data` instead. + if cs.asyncPromoted.Load() { + asyncFeed = true + } else { + cs.asyncInMu.Unlock() + } + } + if asyncFeed { // Backpressure: drop the conn if the dispatch goroutine is // falling behind. Prevents a pipelining client from ballooning // asyncInBuf without bound. @@ -1793,7 +1805,7 @@ func (w *Worker) handleRecv(c *completionEntry, fd int, now int64) { // hasn't been promoted yet, run ProcessH1 inline in InlineMode so it // bails (ErrAsyncDispatch) when it hits an async route. The flag is // set only around the ProcessH1 call(s) below. 
- tryInline := w.async && !cs.asyncPromoted && cs.h1State != nil && + tryInline := w.async && !cs.asyncPromoted.Load() && cs.h1State != nil && (w.h1Only || engine.Protocol(cs.protocol.Load()) == engine.HTTP1) // h1Only mode (Protocol=HTTP1 + EnableH2Upgrade=false): no atomic // Load, no switch dispatch, no upgrade-handling block — ProcessH1 @@ -1891,7 +1903,15 @@ func (w *Worker) handleRecv(c *completionEntry, fd int, now int64) { // goes straight to the dispatch path (asyncPromoted guard above). if tryInline { if errors.Is(processErr, conn.ErrAsyncDispatch) { - cs.asyncPromoted = true + // celeris#364: record the route that forced promotion (single-shot + // recv only — the revert path assumes the worker-owned cs.buf recv + // model). Set BEFORE asyncPromoted/goroutine start so the dispatch + // goroutine observes it (happens-before). Empty path => the + // goroutine treats the conn as not revert-eligible. + if w.bufRing == nil { + cs.promotedMethod, cs.promotedPath = cs.h1State.CurrentRoute() + } + cs.asyncPromoted.Store(true) w.asyncPromoted.Add(1) stashed := cs.h1State.TakeBufferedBytes() // Flush any inline-handled response (pipelined sync request @@ -1919,7 +1939,9 @@ func (w *Worker) handleRecv(c *completionEntry, fd int, now int64) { // parse paths must not run inline (only the fresh-parse site // honors the async check). if processErr == nil && cs.h1State.HasPendingDispatchState() { - cs.asyncPromoted = true + // No complete request parsed yet (buffered headers / chunked), so + // no route to record — leave promotedPath empty: not revert-eligible. + cs.asyncPromoted.Store(true) w.asyncPromoted.Add(1) } } @@ -2688,6 +2710,21 @@ func (w *Worker) promoteConnToAsync(cs *connState, _ int, stashed []byte, c *com } } +// canRevertToInline reports whether a promoted conn should be reverted to the +// inline fast path (celeris#364). 
True when single-shot recv is in use, the +// conn recorded the route that promoted it, and that route's promotion has +// since expired (RouteAsync now false — the route-level TTL de-promotion). +// Called by runAsyncHandler ONLY while holding asyncInMu with asyncInBuf empty. +func (w *Worker) canRevertToInline(cs *connState) bool { + return w.bufRing == nil && cs.promotedPath != "" && cs.h1State != nil && + cs.h1State.RouteAsync != nil && + // Clean request boundary only: never revert mid-request (a partial body + // or buffered headers still accumulating), so h1State ownership flips + // back to the worker between requests, exactly like a fresh inline conn. + !cs.h1State.HasPendingData() && + !cs.h1State.RouteAsync(cs.promotedMethod, cs.promotedPath) +} + func (w *Worker) runAsyncHandler(cs *connState) { defer w.asyncWG.Done() defer func() { @@ -2720,6 +2757,19 @@ func (w *Worker) runAsyncHandler(cs *connState) { for { cs.asyncInMu.Lock() for len(cs.asyncInBuf) == 0 && !cs.asyncClosed.Load() { + // celeris#364: revert this conn to inline when the route that + // promoted it has de-promoted (its TTL expired). Safe ONLY here: + // asyncInBuf is empty (no in-flight input, last response already + // written) and we hold asyncInMu, which the worker's feed path + // re-acquires and re-checks asyncPromoted against — so clearing it + // here cannot race a concurrent feed. The worker owns recv and + // resumes the inline fast path on the next CQE. + if w.canRevertToInline(cs) { + cs.asyncPromoted.Store(false) + cs.asyncRun = false + cs.asyncInMu.Unlock() + return + } cs.asyncCond.Wait() } if cs.asyncClosed.Load() { @@ -3058,7 +3108,7 @@ func (w *Worker) pickRecvTarget(cs *connState) []byte { // worker itself (tryInline), so the worker owns h1State exactly as the // sync path does and the zero-copy direct-into-bodyBuf recv is safe — // gate the bail on cs.asyncPromoted, not blanket w.async. 
- if (w.async && cs.asyncPromoted) || w.bufRing != nil || cs.h1State == nil { + if (w.async && cs.asyncPromoted.Load()) || w.bufRing != nil || cs.h1State == nil { return cs.buf } if !w.h1Only && engine.Protocol(cs.protocol.Load()) != engine.HTTP1 { diff --git a/internal/conn/h1.go b/internal/conn/h1.go index 34cb8cbf..415f6164 100644 --- a/internal/conn/h1.go +++ b/internal/conn/h1.go @@ -260,6 +260,15 @@ func (s *H1State) HasPendingDispatchState() bool { return s.buffer.Len() > 0 && s.bodyNeeded <= 0 } +// CurrentRoute returns the method and path of the request currently parsed into +// the H1 state. The engine records these when a route forces an async promotion +// so the dispatch goroutine can revert the connection to inline once that +// route's promotion expires (celeris#364). req.Method/req.Path are interned +// (stable) strings, safe to retain past the recv buffer's lifetime. +func (s *H1State) CurrentRoute() (method, path string) { + return s.req.Method, s.req.Path +} + // UpdateWriteFn replaces the response adapter's write function. Called by // OnDetach to route StreamWriter writes through the mutex-guarded writeFn. func (s *H1State) UpdateWriteFn(fn func([]byte)) { From 112f5153300f0c5018c1031956a007c02363b9cc Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Thu, 18 Jun 2026 16:35:09 +0200 Subject: [PATCH 19/27] refactor(engine): remove dynamic worker scaler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The load-based worker scaler (pause/resume by connection count) is obsolete on kernel 7.0+ — its concentration premise has reversed (more workers win at every concurrency, 4-core and 16-core alike) and its down-scale strands keep-alive throughput on surge-after-quiet (-31% on get-simple-1024c in the harness sequence). Removing it recovers that throughput with no regression. 
Worker pool is now static = numCPU (Resources.Workers), all workers always active; the adaptive engine's accept-pause/suspend lifecycle is unaffected. --- adaptive/engine.go | 35 +- ...st.go => pauseaccept_h2_dial_race_test.go} | 27 +- adaptive/scaler.go | 114 ------ adaptive/scaler_test.go | 116 ------- config.go | 39 --- engine/engine.go | 18 - engine/epoll/engine.go | 12 - engine/epoll/loop.go | 13 +- engine/epoll/scaler.go | 62 ---- engine/epoll/scaler_test.go | 8 - engine/iouring/buf_ring_scale_test.go | 2 +- engine/iouring/engine.go | 12 - engine/iouring/scaler.go | 65 ---- engine/iouring/scaler_test.go | 10 - engine/iouring/worker.go | 50 ++- engine/scaler/doc.go | 16 - engine/scaler/scaler.go | 285 --------------- engine/scaler/scaler_test.go | 327 ------------------ resource/config.go | 97 ------ server_test.go | 38 -- 20 files changed, 49 insertions(+), 1297 deletions(-) rename adaptive/{scaler_h2_dial_race_test.go => pauseaccept_h2_dial_race_test.go} (72%) delete mode 100644 adaptive/scaler.go delete mode 100644 adaptive/scaler_test.go delete mode 100644 engine/epoll/scaler.go delete mode 100644 engine/epoll/scaler_test.go delete mode 100644 engine/iouring/scaler.go delete mode 100644 engine/iouring/scaler_test.go delete mode 100644 engine/scaler/doc.go delete mode 100644 engine/scaler/scaler.go delete mode 100644 engine/scaler/scaler_test.go diff --git a/adaptive/engine.go b/adaptive/engine.go index 10fe6644..66750f7b 100644 --- a/adaptive/engine.go +++ b/adaptive/engine.go @@ -15,7 +15,6 @@ import ( "github.com/goceleris/celeris/engine" "github.com/goceleris/celeris/engine/epoll" "github.com/goceleris/celeris/engine/iouring" - "github.com/goceleris/celeris/engine/scaler" "github.com/goceleris/celeris/protocol/h2/stream" "github.com/goceleris/celeris/resource" ) @@ -44,7 +43,7 @@ type Engine struct { freezeCooldown time.Duration // listenMu guards listenCancel/listenDone, which let Shutdown deterministically - // stop and JOIN the evaluation/scaler 
goroutines started by Listen. Without + // stop and JOIN the evaluation-loop goroutine started by Listen. Without // this, Shutdown could return (sub-engines stopped) while the eval loop is // still mid-Sample on the CPU monitor the server is about to close. listenMu sync.Mutex @@ -92,20 +91,12 @@ func New(cfg resource.Config, handler stream.Handler, cpuMon engine.CPUMonitor) } } - // Suppress the per-engine built-in scaler in both sub-engines — - // adaptive runs ONE higher-level scaler that delegates to whichever - // sub-engine is currently active. Two scalers fighting over the same - // worker pool produced -54 % to +118 % variance on pinning tests - // during the spike-B exploration; gating this way eliminates that. - subCfg := cfg - subCfg.SkipBuiltinScaler = true - - primary, err := epoll.New(subCfg, handler) + primary, err := epoll.New(cfg, handler) if err != nil { return nil, fmt.Errorf("epoll sub-engine: %w", err) } - secondary, err := iouring.New(subCfg, handler) + secondary, err := iouring.New(cfg, handler) if err != nil { return nil, fmt.Errorf("io_uring sub-engine: %w", err) } @@ -161,7 +152,7 @@ func (e *Engine) Listen(ctx context.Context) error { defer innerCancel() // Publish the cancel + a done channel so Shutdown can stop and join the - // goroutines this Listen owns (eval loop, scaler) before the server closes + // goroutine this Listen owns (the eval loop) before the server closes // shared resources such as the CPU monitor. done := make(chan struct{}) e.listenMu.Lock() @@ -264,22 +255,10 @@ bindLoop: e.runEvalLoop(innerCtx) }) - // Start the higher-level dynamic worker scaler. This is the only - // worker scaler that runs in an adaptive setup — sub-engines have - // their built-in scalers suppressed via Config.SkipBuiltinScaler. - // Typed cfg.WorkerScaling takes precedence over env vars. The - // algorithm lives in engine/scaler; adaptive provides a switch-aware - // Source via adaptive/scaler.go. 
- if scalerCfg := scaler.Resolve(e.cfg, e.cfg.Resources.Resolve().Workers); scalerCfg.Enabled { - wg.Go(func() { - e.runScaler(innerCtx, scalerCfg) - }) - } - select { case <-innerCtx.Done(): // Parent context cancelled, or Shutdown cancelled innerCtx directly to - // stop and join the eval/scaler goroutines. + // stop and join the eval-loop goroutine. case err := <-errCh: innerCancel() wg.Wait() @@ -423,8 +402,8 @@ func (e *Engine) maybeThawLocked() { // Shutdown gracefully shuts down both sub-engines. // -// It first cancels and JOINS the goroutines started by Listen (the evaluation -// loop and worker scaler), so that no controller tick can still be sampling +// It first cancels and JOINS the goroutine started by Listen (the evaluation +// loop), so that no controller tick can still be sampling // telemetry — including the CPU monitor the server closes immediately after // Shutdown returns — by the time this function completes. Only then are the // sub-engines shut down. This is purely a join/sequencing concern; it does not diff --git a/adaptive/scaler_h2_dial_race_test.go b/adaptive/pauseaccept_h2_dial_race_test.go similarity index 72% rename from adaptive/scaler_h2_dial_race_test.go rename to adaptive/pauseaccept_h2_dial_race_test.go index 5dfaafaa..28e0ecb0 100644 --- a/adaptive/scaler_h2_dial_race_test.go +++ b/adaptive/pauseaccept_h2_dial_race_test.go @@ -16,23 +16,27 @@ import ( "github.com/goceleris/celeris/resource" ) -// TestAdaptiveScaler_H2DialNoRSTRace mirrors TestAdaptiveH2DialNoRSTRace -// with the dynamic worker scaler enabled. Locks in that the scaler's -// start-high default does not pause workers before the listen FDs -// settle in the SO_REUSEPORT group — which would RST in-flight H2 -// prior-knowledge handshakes mid-flush. 
+// TestAdaptivePauseAccept_H2DialNoRSTRace is a second angle on the +// H2-dial-RST race that TestAdaptiveH2DialNoRSTRace guards, run with a +// larger (4) worker pool so the standby engine has more listen FDs to +// evict from the SO_REUSEPORT group before Addr() is published. The +// race is the same: if PauseAccept on the standby has not synchronously +// removed every listen FD from the routing pool by the time adaptive +// exposes Addr, a burst of dials gets split across active and standby, +// and the standby's FD close RSTs the conns that landed on it — fatal +// for H2 prior-knowledge handshakes mid-flush. // // Iterations bound at 3: the race only fires on engine spin-up, so we -// just need "scaler + H2 dial burst" coverage in addition to the -// no-scaler test's larger budget. -func TestAdaptiveScaler_H2DialNoRSTRace(t *testing.T) { +// just need an additional burst against a wider pool in addition to the +// main test's larger budget. +func TestAdaptivePauseAccept_H2DialNoRSTRace(t *testing.T) { const iterations = 3 for i := 0; i < iterations; i++ { - runScalerH2Once(t, i) + runPauseAcceptH2Once(t, i) } } -func runScalerH2Once(t *testing.T, iter int) { +func runPauseAcceptH2Once(t *testing.T, iter int) { t.Helper() ln, err := net.Listen("tcp", "127.0.0.1:0") if err != nil { @@ -47,9 +51,8 @@ func runScalerH2Once(t *testing.T, iter int) { Protocol: engine.Auto, EnableH2Upgrade: true, Resources: resource.Resources{ - Workers: 4, // 4 workers so MinActive=2 has 2 paused at start + Workers: 4, }, - WorkerScaling: &resource.WorkerScalingConfig{}, // zero value → start-high } e, err := New(cfg, &h2PrefaceHandler{}, nil) if err != nil { diff --git a/adaptive/scaler.go b/adaptive/scaler.go deleted file mode 100644 index b136d923..00000000 --- a/adaptive/scaler.go +++ /dev/null @@ -1,114 +0,0 @@ -//go:build linux - -package adaptive - -import ( - "context" - "log/slog" - "sync/atomic" - - "github.com/goceleris/celeris/engine" - 
"github.com/goceleris/celeris/engine/scaler" -) - -// adaptiveScalerSource adapts the adaptive Engine to the scaler.Source -// interface. The active sub-engine can change at runtime -// (performSwitch); this source watches for that and routes -// PauseWorker / ResumeWorker calls to whichever engine is active right -// now. Generation increments on each switch so the shared scaler loop -// can re-baseline the new active engine's worker pause state. -// -// Both sub-engines must implement engine.WorkerScaler — verified via a -// type-assertion in the constructor. Both must have the same NumWorkers; -// if they ever diverge (defensive), the smaller pool is used as the cap. -type adaptiveScalerSource struct { - e *Engine - primary engine.WorkerScaler - standby engine.WorkerScaler - gen atomic.Uint64 - - // lastActive tracks which sub-engine was active on the previous - // scaler-source method call. When activeFor() observes a different - // engine, gen is incremented so the scaler.Run loop notices and - // re-baselines. - lastActive atomic.Pointer[engine.Engine] -} - -// newAdaptiveScalerSource attempts to construct a Source. Returns nil if -// either sub-engine fails the engine.WorkerScaler assertion (defensive -// — should not happen in production since both iouring and epoll -// implement it). -func newAdaptiveScalerSource(e *Engine) *adaptiveScalerSource { - primary, ok1 := e.primary.(engine.WorkerScaler) - standby, ok2 := e.secondary.(engine.WorkerScaler) - if !ok1 || !ok2 { - return nil - } - return &adaptiveScalerSource{e: e, primary: primary, standby: standby} -} - -// activeFor returns the WorkerScaler view of the engine that is -// currently active in adaptive. Bumps Generation if the active engine -// changed since the previous call, so scaler.Run re-baselines. 
-func (s *adaptiveScalerSource) activeFor() engine.WorkerScaler { - cur := *s.e.active.Load() - prev := s.lastActive.Load() - if prev == nil || *prev != cur { - s.lastActive.Store(&cur) - s.gen.Add(1) - } - if cur == s.e.secondary { - return s.standby - } - return s.primary -} - -func (s *adaptiveScalerSource) NumWorkers() int { - a, b := s.primary.NumWorkers(), s.standby.NumWorkers() - if a < b { - return a - } - return b -} - -func (s *adaptiveScalerSource) ActiveConns() int64 { - // Sum BOTH sub-engines' active conns, not just the currently-active - // one. During an adaptive switch, in-flight conns on the OLD - // sub-engine continue to be served until their clients close — the - // scaler must count them so the desired-worker calculation doesn't - // undershoot the real CPU load (#300 G1). - return s.e.primary.Metrics().ActiveConnections + - s.e.secondary.Metrics().ActiveConnections -} - -func (s *adaptiveScalerSource) PauseWorker(i int) { - s.activeFor().PauseWorker(i) -} - -func (s *adaptiveScalerSource) ResumeWorker(i int) { - s.activeFor().ResumeWorker(i) -} - -func (s *adaptiveScalerSource) Generation() uint64 { - // Refresh: a tick reading Generation should also drive the - // active-engine detection so we don't miss a switch that happens - // between two ticks where neither Pause/Resume nor ActiveConns ran. - s.activeFor() - return s.gen.Load() -} - -func (s *adaptiveScalerSource) Logger() *slog.Logger { return s.e.logger } - -// runScaler is started by Engine.Listen when the scaler is enabled. The -// algorithm itself lives in engine/scaler; this just wires up the -// adaptive-flavoured source. 
-func (e *Engine) runScaler(ctx context.Context, cfg scaler.Config) { - src := newAdaptiveScalerSource(e) - if src == nil { - if e.logger != nil { - e.logger.Warn("adaptive scaler disabled: sub-engine does not implement WorkerScaler") - } - return - } - scaler.Run(ctx, src, cfg) -} diff --git a/adaptive/scaler_test.go b/adaptive/scaler_test.go deleted file mode 100644 index 5d5bec6b..00000000 --- a/adaptive/scaler_test.go +++ /dev/null @@ -1,116 +0,0 @@ -//go:build linux - -package adaptive - -import ( - "context" - "sync" - "testing" - "time" - - "github.com/goceleris/celeris/engine" - "github.com/goceleris/celeris/engine/scaler" - "github.com/goceleris/celeris/resource" -) - -// scalableMockEngine extends mockEngine with the engine.WorkerScaler -// interface so the adaptive scaler can exercise its -// PauseWorker / ResumeWorker delegation logic against a controllable -// fake. Pause/Resume calls are tracked per-worker and per-direction so -// tests can assert the scaler routed work to the active engine and -// not the standby. -type scalableMockEngine struct { - *mockEngine - numWorkers int - pauseLog []int - resumeLog []int - logMu sync.Mutex -} - -func newScalableMockEngine(et engine.EngineType, n int) *scalableMockEngine { - return &scalableMockEngine{mockEngine: newMockEngine(et), numWorkers: n} -} - -func (m *scalableMockEngine) NumWorkers() int { return m.numWorkers } - -func (m *scalableMockEngine) PauseWorker(i int) { - m.logMu.Lock() - m.pauseLog = append(m.pauseLog, i) - m.logMu.Unlock() -} - -func (m *scalableMockEngine) ResumeWorker(i int) { - m.logMu.Lock() - m.resumeLog = append(m.resumeLog, i) - m.logMu.Unlock() -} - -func (m *scalableMockEngine) snapshotLog() (pause, resume []int) { - m.logMu.Lock() - defer m.logMu.Unlock() - pause = append(pause, m.pauseLog...) - resume = append(resume, m.resumeLog...) 
- return -} - -// TestAdaptiveScaler_DelegatesToActiveEngine spins the higher-level -// scaler with two mock engines, sets connection counts on the active -// one, and verifies the scaler calls PauseWorker / ResumeWorker on -// the active engine — never on the standby. This is the architectural -// invariant that the v1.4.1 adaptive scaler refactor exists to enforce -// (the pre-refactor design ran two scalers, producing -54 % to +118 % -// pinning-test variance — see PR #257). -func TestAdaptiveScaler_DelegatesToActiveEngine(t *testing.T) { - primary := newScalableMockEngine(engine.Epoll, 8) - secondary := newScalableMockEngine(engine.IOUring, 8) - sampler := newSyntheticSampler() - cfg := resource.Config{} - e := newFromEngines(primary, secondary, sampler, cfg) - - // Active = primary (matches newFromEngines default). - // Set activeConns high enough that desired = 8 (max), so the scaler - // will call ResumeWorker on workers 4..7 of the active engine. - primary.SetMetrics(engine.EngineMetrics{ActiveConnections: 200}) - secondary.SetMetrics(engine.EngineMetrics{ActiveConnections: 0}) - - scfg := scaler.Config{ - Enabled: true, - MinActive: 4, - TargetConnsPerWorker: 20, - Interval: 10 * time.Millisecond, - ScaleUpStep: 4, // burst-resume to active=8 in one tick - ScaleDownStep: 1, - ScaleDownHysteresis: 1, - ScaleDownIdleTicks: 100, // never scale-down within the test budget - } - - ctx, cancel := context.WithCancel(t.Context()) - defer cancel() - go e.runScaler(ctx, scfg) - - // Give the scaler a few ticks to ramp up. - time.Sleep(100 * time.Millisecond) - cancel() - // Tiny grace for the scaler goroutine to observe the cancel. - time.Sleep(20 * time.Millisecond) - - pPause, pResume := primary.snapshotLog() - sPause, sResume := secondary.snapshotLog() - - // Initial state pauses workers 4..7 on the active engine. Then - // scaler resumes 4..7 on the ACTIVE (primary) engine to reach - // desired=8. 
- if len(pPause) < 4 { - t.Errorf("primary expected ≥4 PauseWorker calls (init), got %d: %v", len(pPause), pPause) - } - if len(pResume) == 0 { - t.Errorf("primary expected ResumeWorker calls (active scaling up), got 0") - } - // Standby (secondary) should NEVER see ResumeWorker — that's the - // invariant. The scaler routes scale-up to the active engine only. - if len(sResume) != 0 { - t.Errorf("secondary should NOT see ResumeWorker calls; got %v", sResume) - } - // And no spurious calls on the secondary either (it's the standby). - _ = sPause -} diff --git a/config.go b/config.go index ecb84547..7a9f8ef2 100644 --- a/config.go +++ b/config.go @@ -177,34 +177,6 @@ type Config struct { // - non-nil false: force disabled, even on Protocol=Auto. Useful when // the engine intentionally only serves HTTP/1. EnableH2Upgrade *bool - - // WorkerScaling configures the dynamic worker scaler. As of v1.4.6 - // the scaler is DEFAULT-ON — leaving this field nil resolves to a - // zero-value [resource.WorkerScalingConfig], which activates the - // scaler with the data-validated defaults (Strategy=StartHigh, - // MinActive=max(2, NumCPU/2), TargetConnsPerWorker=20, - // Interval=250ms, ScaleUpStep=2, ScaleDownStep=1, - // ScaleDownHysteresis=1, ScaleDownIdleTicks=4). This matches the - // "just-works" public design intent already in place for the - // Engine=Adaptive and Protocol=Auto defaults. - // - // Pre-v1.4.6 behaviour (scaler always disabled unless explicitly - // configured) is achievable by passing a non-nil struct that - // effectively makes the scaler a no-op: - // - // WorkerScaling: &resource.WorkerScalingConfig{ - // MinActive: runtime.GOMAXPROCS(0), // pin at NumCPU; never scale down - // } - // - // Set to a non-nil pointer with custom values to override one or - // more defaults. See [resource.WorkerScalingConfig] for tuning. 
- // The scaler keeps connections-per-active-worker around the target - // ratio by pausing/resuming workers; this dramatically improves - // CQE/event batching at low/mid concurrency where the static - // numCPU default would otherwise lose 30-90 % CPU/req to under- - // batched syscalls. See PR #257 / issue #281 for the full - // rationale and benchmark data. - WorkerScaling *resource.WorkerScalingConfig } // EngineMetrics is a point-in-time snapshot of engine-level performance counters. @@ -256,17 +228,6 @@ func (c Config) toResourceConfig() resource.Config { rc.OnExpectContinue = c.OnExpectContinue rc.OnConnect = c.OnConnect rc.OnDisconnect = c.OnDisconnect - // Dynamic worker scaler default-on (issue #281). A nil - // WorkerScaling field — i.e. the user did not configure it — now - // resolves to a zero-value struct so the scaler activates with - // the data-validated defaults. Opt-out is documented as setting - // MinActive=NumCPU (no-op scaler), which preserves backward - // compatibility for users who really do want pre-v1.4.6 behaviour. - if c.WorkerScaling != nil { - rc.WorkerScaling = c.WorkerScaling - } else { - rc.WorkerScaling = &resource.WorkerScalingConfig{} - } // h2c upgrade resolution. Nil → protocol-dependent default (Auto → true, // HTTP1/H2C → false). Non-nil → user override honored verbatim. diff --git a/engine/engine.go b/engine/engine.go index 295e375a..73ce169f 100644 --- a/engine/engine.go +++ b/engine/engine.go @@ -43,24 +43,6 @@ type SwitchFreezer interface { UnfreezeSwitching() } -// WorkerScaler is implemented by engines that support per-worker pause/resume -// for dynamic capacity adjustment based on load. Used by the higher-level -// scaler in the adaptive engine to delegate worker activation to whichever -// sub-engine is currently active. Per-worker pause is asynchronous — the -// worker drains in-flight connections before going SUSPENDED. 
Resume wakes a -// suspended worker so it re-creates its listen socket and rejoins the -// SO_REUSEPORT group. -type WorkerScaler interface { - // NumWorkers returns the total worker pool size (max active count). - NumWorkers() int - // PauseWorker deactivates worker i. Asynchronous; returns immediately. - // Idempotent — pausing an already-paused worker is a no-op. - PauseWorker(i int) - // ResumeWorker reactivates worker i. Wakes the worker if SUSPENDED. - // Idempotent — resuming an active worker is a no-op. - ResumeWorker(i int) -} - // SendfileCapable is an optional interface implemented by engines that // support zero-copy file responses via sendfile(2). The H1 static-file // response path type-asserts the engine for it; engines that do not diff --git a/engine/epoll/engine.go b/engine/epoll/engine.go index 8096b699..6bc69970 100644 --- a/engine/epoll/engine.go +++ b/engine/epoll/engine.go @@ -13,7 +13,6 @@ import ( "time" "github.com/goceleris/celeris/engine" - "github.com/goceleris/celeris/engine/scaler" "github.com/goceleris/celeris/internal/platform" "github.com/goceleris/celeris/protocol/h2/stream" "github.com/goceleris/celeris/resource" @@ -126,16 +125,6 @@ func (e *Engine) Listen(ctx context.Context) error { ) } - // Dynamic loop scaler. Typed cfg.WorkerScaling takes precedence over - // env vars. Suppressed when wrapped by adaptive — adaptive runs ONE - // higher-level scaler that delegates to the active sub-engine. The - // algorithm itself lives in engine/scaler. - if !e.cfg.SkipBuiltinScaler { - if scalerCfg := scaler.Resolve(e.cfg, len(e.loops)); scalerCfg.Enabled { - go e.runScaler(innerCtx, scalerCfg, &e.metrics.activeConns) - } - } - <-ctx.Done() wg.Wait() return nil @@ -250,7 +239,6 @@ func (e *Engine) ResumeAccept() error { var ( _ engine.Engine = (*Engine)(nil) _ engine.AcceptController = (*Engine)(nil) - _ engine.WorkerScaler = (*Engine)(nil) ) // Addr returns the bound listener address. 
diff --git a/engine/epoll/loop.go b/engine/epoll/loop.go index 3fe877cd..b8c67b6c 100644 --- a/engine/epoll/loop.go +++ b/engine/epoll/loop.go @@ -77,11 +77,6 @@ type Loop struct { wake chan struct{} wakeMu sync.Mutex suspended atomic.Bool - // inactive is the per-worker pause flag used by the dynamic worker - // scaler. ORed with acceptPaused (which is engine-wide) when computing - // effective paused state. The scaler flips this to deactivate idle - // loops under low load and reactivate them under burst load. - inactive atomic.Bool // listenFDClosed signals that the loop has closed its listen FD in // response to acceptPaused=true. PauseAccept polls this so it only // returns once the SO_REUSEPORT group has actually shed this listener @@ -284,8 +279,7 @@ func (l *Loop) run(ctx context.Context) { // Cache the atomic load: ACTIVE→DRAINING and SUSPENDED→ACTIVE // branches both read it. Saves 1 atomic load per event-loop // iteration on the steady-state hot path. - // OR with the per-loop inactive flag (dynamic scaler). - paused := l.acceptPaused.Load() || l.inactive.Load() + paused := l.acceptPaused.Load() if l.listenFD >= 0 && paused { // Drain any pending accepts from the kernel's listen queue so // they get a clean shutdown (FIN) rather than the RST that @@ -551,10 +545,9 @@ func (l *Loop) run(ctx context.Context) { } // DRAINING → SUSPENDED: no listen socket, no connections, events processed. - // Combined paused: engine-wide OR per-loop (dynamic scaler). 
- if l.listenFD < 0 && l.connCount == 0 && (l.acceptPaused.Load() || l.inactive.Load()) { + if l.listenFD < 0 && l.connCount == 0 && l.acceptPaused.Load() { l.wakeMu.Lock() - if !l.acceptPaused.Load() && !l.inactive.Load() { + if !l.acceptPaused.Load() { l.wakeMu.Unlock() continue } diff --git a/engine/epoll/scaler.go b/engine/epoll/scaler.go deleted file mode 100644 index f8e948ad..00000000 --- a/engine/epoll/scaler.go +++ /dev/null @@ -1,62 +0,0 @@ -//go:build linux - -package epoll - -import ( - "context" - "log/slog" - "sync/atomic" - - "github.com/goceleris/celeris/engine/scaler" -) - -// epollScalerSource adapts the epoll Engine to the scaler.Source -// interface. Generation always returns 0 — epoll isn't a meta-engine. -type epollScalerSource struct { - e *Engine - activeConns *atomic.Int64 -} - -func (s *epollScalerSource) NumWorkers() int { return s.e.NumWorkers() } -func (s *epollScalerSource) ActiveConns() int64 { return s.activeConns.Load() } -func (s *epollScalerSource) PauseWorker(i int) { s.e.PauseWorker(i) } -func (s *epollScalerSource) ResumeWorker(i int) { s.e.ResumeWorker(i) } -func (s *epollScalerSource) Generation() uint64 { return 0 } -func (s *epollScalerSource) Logger() *slog.Logger { return s.e.cfg.Logger } - -// runScaler is started by Engine.Listen when scaler config is enabled. -// All algorithm logic lives in engine/scaler. -func (e *Engine) runScaler(ctx context.Context, cfg scaler.Config, activeConns *atomic.Int64) { - scaler.Run(ctx, &epollScalerSource{e: e, activeConns: activeConns}, cfg) -} - -// PauseWorker deactivates loop i. The loop drains in-flight conns and -// goes SUSPENDED. Asynchronous; returns immediately. -func (e *Engine) PauseWorker(i int) { - e.mu.Lock() - defer e.mu.Unlock() - if i < 0 || i >= len(e.loops) { - return - } - e.loops[i].inactive.Store(true) -} - -// ResumeWorker reactivates loop i. Wakes the loop from SUSPENDED if it was -// already idle. 
-func (e *Engine) ResumeWorker(i int) { - e.mu.Lock() - defer e.mu.Unlock() - if i < 0 || i >= len(e.loops) { - return - } - l := e.loops[i] - l.inactive.Store(false) - l.listenFDClosed.Store(false) - l.wakeMu.Lock() - if l.suspended.Load() { - close(l.wake) - l.wake = make(chan struct{}) - l.suspended.Store(false) - } - l.wakeMu.Unlock() -} diff --git a/engine/epoll/scaler_test.go b/engine/epoll/scaler_test.go deleted file mode 100644 index ee7507ac..00000000 --- a/engine/epoll/scaler_test.go +++ /dev/null @@ -1,8 +0,0 @@ -//go:build linux - -package epoll - -// The scaler algorithm itself lives in engine/scaler and has its own -// test suite there. The epoll engine's scaler.go is just the -// engine.WorkerScaler adapter (PauseWorker / ResumeWorker on loops -// slice). diff --git a/engine/iouring/buf_ring_scale_test.go b/engine/iouring/buf_ring_scale_test.go index e535d782..6aa5a5fe 100644 --- a/engine/iouring/buf_ring_scale_test.go +++ b/engine/iouring/buf_ring_scale_test.go @@ -16,7 +16,7 @@ func TestResolveBufRingCountDefaults(t *testing.T) { targetConns int want int }{ - // Formula: 2 * TargetConnsPerWorker, rounded up to a power of 2, + // Formula: 2 * connsPerWorker, rounded up to a power of 2, // clamped to [bufRingCountMin=1024, bufRingCountMax]. The ring is // PER-WORKER, so the result MUST NOT depend on Workers (celeris#322 // follow-up — the prior 2*Workers*target over-sized every worker's diff --git a/engine/iouring/engine.go b/engine/iouring/engine.go index 722c64bf..e4ebdde5 100644 --- a/engine/iouring/engine.go +++ b/engine/iouring/engine.go @@ -12,7 +12,6 @@ import ( "time" "github.com/goceleris/celeris/engine" - "github.com/goceleris/celeris/engine/scaler" "github.com/goceleris/celeris/internal/platform" "github.com/goceleris/celeris/probe" "github.com/goceleris/celeris/protocol/h2/stream" @@ -235,16 +234,6 @@ func (e *Engine) Listen(ctx context.Context) error { ) } - // Dynamic worker scaler. 
Typed cfg.WorkerScaling takes precedence over - // env vars. Suppressed when wrapped by adaptive — adaptive runs ONE - // higher-level scaler that delegates to the active sub-engine. The - // algorithm itself lives in engine/scaler; this is just the call site. - if !e.cfg.SkipBuiltinScaler { - if scalerCfg := scaler.Resolve(e.cfg, len(workers)); scalerCfg.Enabled { - go e.runScaler(innerCtx, scalerCfg, &e.metrics.activeConns) - } - } - <-ctx.Done() // Workers use SubmitAndWaitTimeout and check ctx.Err() on each iteration, // so they will exit within ~100ms of context cancellation. @@ -388,7 +377,6 @@ var ( _ engine.Engine = (*Engine)(nil) _ engine.AcceptController = (*Engine)(nil) _ engine.EventLoopProvider = (*Engine)(nil) - _ engine.WorkerScaler = (*Engine)(nil) ) // NumWorkers returns the number of worker event loops available for diff --git a/engine/iouring/scaler.go b/engine/iouring/scaler.go deleted file mode 100644 index d4ef07fd..00000000 --- a/engine/iouring/scaler.go +++ /dev/null @@ -1,65 +0,0 @@ -//go:build linux - -package iouring - -import ( - "context" - "log/slog" - "sync/atomic" - - "github.com/goceleris/celeris/engine/scaler" -) - -// iouringScalerSource adapts the iouring Engine to the scaler.Source -// interface. Generation always returns 0 — iouring isn't a meta-engine -// and doesn't switch identities at runtime; only the adaptive engine's -// source returns non-zero generations. 
-type iouringScalerSource struct { - e *Engine - activeConns *atomic.Int64 -} - -func (s *iouringScalerSource) NumWorkers() int { return s.e.NumWorkers() } -func (s *iouringScalerSource) ActiveConns() int64 { return s.activeConns.Load() } -func (s *iouringScalerSource) PauseWorker(i int) { s.e.PauseWorker(i) } -func (s *iouringScalerSource) ResumeWorker(i int) { s.e.ResumeWorker(i) } -func (s *iouringScalerSource) Generation() uint64 { return 0 } -func (s *iouringScalerSource) Logger() *slog.Logger { return s.e.cfg.Logger } - -// runScaler is started by Engine.Listen when scaler config is enabled. -// All algorithm logic lives in engine/scaler; this is a thin source -// adapter so the iouring engine plugs into the shared loop. -func (e *Engine) runScaler(ctx context.Context, cfg scaler.Config, activeConns *atomic.Int64) { - scaler.Run(ctx, &iouringScalerSource{e: e, activeConns: activeConns}, cfg) -} - -// PauseWorker deactivates worker i. The worker drains in-flight conns -// and goes SUSPENDED. Asynchronous; returns immediately. -func (e *Engine) PauseWorker(i int) { - e.mu.Lock() - defer e.mu.Unlock() - if i < 0 || i >= len(e.workers) { - return - } - e.workers[i].inactive.Store(true) -} - -// ResumeWorker reactivates worker i. Wakes the worker from SUSPENDED if -// it was already idle. -func (e *Engine) ResumeWorker(i int) { - e.mu.Lock() - defer e.mu.Unlock() - if i < 0 || i >= len(e.workers) { - return - } - w := e.workers[i] - w.inactive.Store(false) - w.listenFDClosed.Store(false) - w.wakeMu.Lock() - if w.suspended.Load() { - close(w.wake) - w.wake = make(chan struct{}) - w.suspended.Store(false) - } - w.wakeMu.Unlock() -} diff --git a/engine/iouring/scaler_test.go b/engine/iouring/scaler_test.go deleted file mode 100644 index 91819e3d..00000000 --- a/engine/iouring/scaler_test.go +++ /dev/null @@ -1,10 +0,0 @@ -//go:build linux - -package iouring - -// The scaler algorithm itself lives in engine/scaler and has its own -// test suite there. 
The iouring engine's scaler.go is just the -// engine.WorkerScaler adapter (PauseWorker / ResumeWorker on workers -// slice). The runtime exercises this adapter via the existing engine -// integration tests (e.g., async_churn_uaf_test.go) and the spike-B -// strict-matrix runs that ran with CELERIS_DYN_WORKERS=1. diff --git a/engine/iouring/worker.go b/engine/iouring/worker.go index fc74d0af..fd029415 100644 --- a/engine/iouring/worker.go +++ b/engine/iouring/worker.go @@ -71,17 +71,24 @@ const bufRingCountMax = 1 << 15 // 32768 entries × 8 KiB = 256 MiB worst case ( // the case for very-high-concurrency benchmarks (16k+ connections) where // each worker may have more in-flight multishot recvs than the formula // anticipates. Setting 0 (or leaving the env var unset) reverts to -// auto-scaling from Workers × TargetConnsPerWorker. +// auto-scaling from the per-worker conn target. const envPbufCount = "CELERIS_IOURING_PBUF_COUNT" +// defaultConnsPerWorker is the per-worker connection target used to size +// the provided-buffer ring. The ring is sized at 2 buffers per conn at +// this target, giving comfortable headroom so the kernel rarely stalls +// waiting for buffer returns. Operators override the resulting ring size +// directly via CELERIS_IOURING_PBUF_COUNT. +const defaultConnsPerWorker = 20 + // resolveBufRingCount picks the provided-buffer-ring size for a worker. // The default formula is `nextPowerOf2(max(bufRingCountMin, 2 * -// TargetConnsPerWorker))`, i.e. two buffers per conn in the scaler's -// steady-state target — enough headroom that the kernel rarely stalls -// waiting for buffer returns. Above 1024 conns the previous hard-coded -// 1024 was too small: buffers were reused aggressively, the kernel -// stalled, and the very behaviour the ring is designed to optimise -// (multishot recv CQE batching) collapsed into CQE storms (celeris#322). +// connsPerWorker))`, i.e. 
two buffers per conn at the per-worker conn +// target — enough headroom that the kernel rarely stalls waiting for +// buffer returns. Above 1024 conns the previous hard-coded 1024 was too +// small: buffers were reused aggressively, the kernel stalled, and the +// very behaviour the ring is designed to optimise (multishot recv CQE +// batching) collapsed into CQE storms (celeris#322). // // The ring is PER-WORKER: NewBufferRing is created once per Worker on its // own ring, so the scaling MUST be driven by the per-worker conn target, @@ -90,7 +97,7 @@ const envPbufCount = "CELERIS_IOURING_PBUF_COUNT" // mmap'd RSS per worker and risking the kernel cap on large boxes // (celeris#322 follow-up). // Operators can override via CELERIS_IOURING_PBUF_COUNT. -func resolveBufRingCount(_ resource.ResolvedResources, scalerTargetConnsPerWorker int) int { +func resolveBufRingCount(_ resource.ResolvedResources, connsPerWorker int) int { if v := os.Getenv(envPbufCount); v != "" { if n, err := strconv.Atoi(v); err == nil && n > 0 { if n&(n-1) != 0 { @@ -99,9 +106,9 @@ func resolveBufRingCount(_ resource.ResolvedResources, scalerTargetConnsPerWorke return clampBufRingCount(n) } } - target := scalerTargetConnsPerWorker + target := connsPerWorker if target <= 0 { - target = 20 // mirrors scaler.Resolve's default + target = defaultConnsPerWorker } scaled := 2 * target if scaled < bufRingCountMin { @@ -224,11 +231,6 @@ type Worker struct { wake chan struct{} wakeMu sync.Mutex suspended atomic.Bool - // inactive is the per-worker pause flag used by the dynamic worker - // scaler. ORed with acceptPaused (which is engine-wide) when computing - // effective paused state. The scaler flips this to deactivate idle - // workers under low load and reactivate them under burst load. - inactive atomic.Bool // listenFDClosed signals that the worker has cancelled in-flight // accept SQEs and closed its listen FD in response to acceptPaused // being set. 
PauseAccept polls this so it only returns once the @@ -431,15 +433,11 @@ func (w *Worker) run(ctx context.Context) { // The ring size scales with the worker's per-conn target (celeris#322): // the previous hard-coded 1024 entries was undersized above ~1024 conns // and produced CQE storms as the kernel stalled waiting for buffer - // returns. The formula gives 2 buffers per conn at the scaler's - // steady-state target — comfortable headroom without runaway RSS. + // returns. The formula gives 2 buffers per conn at the per-worker conn + // target — comfortable headroom without runaway RSS. // CELERIS_IOURING_PBUF_COUNT overrides the auto-scaled value. if w.tier.SupportsMultishotRecv() && os.Getenv("CELERIS_IOURING_MULTISHOT_RECV") == "1" { - var targetConns int - if w.cfg.WorkerScaling != nil { - targetConns = w.cfg.WorkerScaling.TargetConnsPerWorker - } - bufRingCount := resolveBufRingCount(w.resolved, targetConns) + bufRingCount := resolveBufRingCount(w.resolved, defaultConnsPerWorker) br, err := NewBufferRing(w.ring, bufRingGroupID, bufRingCount, w.resolved.BufferSize) if err != nil { w.logger.Warn("ring-mapped buffer registration failed, using per-connection buffers", @@ -481,8 +479,7 @@ func (w *Worker) run(ctx context.Context) { // Cache the atomic load: same value used by the two branches // below and (further down) the SUSPENDED check. Saves 2 atomic // loads per event-loop iteration on the steady-state hot path. - // OR with the per-worker inactive flag (dynamic scaler). - paused := w.acceptPaused.Load() || w.inactive.Load() + paused := w.acceptPaused.Load() if w.listenFD >= 0 && paused { if sqe := w.ring.GetSQE(); sqe != nil { prepCancelFDSkipSuccess(sqe, w.listenFD) @@ -782,7 +779,6 @@ func (w *Worker) run(ctx context.Context) { // DRAINING → SUSPENDED: no listen socket, no connections, CQEs processed. // Checked after CQE processing so accept CQEs for connections that // completed before the listen socket close are served, not leaked. 
- // Combined paused: engine-wide OR per-worker (dynamic scaler). // // hasDriverConns gate: connCount only counts HTTP conns. An // EventLoopProvider driver may still have live conns in @@ -790,9 +786,9 @@ func (w *Worker) run(ctx context.Context) { // park its event loop and starve those driver conns of CQE // servicing. Stay active while any driver conn is registered (v1.5.0 // review 2.10). - if w.listenFD < 0 && w.connCount == 0 && !w.hasDriverConns.Load() && (w.acceptPaused.Load() || w.inactive.Load()) { + if w.listenFD < 0 && w.connCount == 0 && !w.hasDriverConns.Load() && w.acceptPaused.Load() { w.wakeMu.Lock() - if !w.acceptPaused.Load() && !w.inactive.Load() { + if !w.acceptPaused.Load() { w.wakeMu.Unlock() continue } diff --git a/engine/scaler/doc.go b/engine/scaler/doc.go deleted file mode 100644 index c5ebe959..00000000 --- a/engine/scaler/doc.go +++ /dev/null @@ -1,16 +0,0 @@ -// Package scaler implements the dynamic worker scaler shared by the -// iouring, epoll, and adaptive engines. The scaler tracks the active -// connection count and adjusts how many workers participate in the -// SO_REUSEPORT group so connections-per-active-worker stays near the -// configured target — this keeps CQE/event batching density in the -// sweet spot. -// -// Engines plug in via the [Source] interface. Per-engine sources -// (engine/iouring/scaler.go, engine/epoll/scaler.go, adaptive/scaler.go) -// adapt the engine to this contract; the algorithm in [Run] is shared. -// -// Implementation is Linux-only because the underlying engines that -// surface a [Source] are themselves Linux-only. This package compiles -// to an empty stub on non-Linux GOOS so godoc and reflection-based -// tooling still see the public types and contract. 
-package scaler diff --git a/engine/scaler/scaler.go b/engine/scaler/scaler.go deleted file mode 100644 index ebd06ea2..00000000 --- a/engine/scaler/scaler.go +++ /dev/null @@ -1,285 +0,0 @@ -//go:build linux - -// Package-level documentation lives in doc.go (build-unconstrained so -// pkg.go.dev can render it on non-Linux). Per-engine [Source] sources: -// iouring + epoll report Generation()=0 always; the adaptive engine's -// source increments it on each engine switch so the scaler can -// re-baseline the new active engine's worker pause state. - -package scaler - -import ( - "context" - "log/slog" - "os" - "strconv" - "time" - - "github.com/goceleris/celeris/resource" -) - -// Source is the engine-side interface the scaler uses to read load and -// adjust worker activation. Implementations must be safe for concurrent -// use; the scaler calls these from its own goroutine. -type Source interface { - // NumWorkers returns the total worker pool size (= max active count). - // Must be stable across the lifetime of a scaler run. - NumWorkers() int - // ActiveConns returns the current active connection count. Read every tick. - ActiveConns() int64 - // PauseWorker deactivates worker i. Idempotent. - PauseWorker(i int) - // ResumeWorker reactivates worker i. Idempotent. - ResumeWorker(i int) - // Generation returns a counter that increments whenever the - // underlying engine identity changes (e.g., adaptive engine switch). - // Per-engine sources return 0 always. The scaler uses this to detect - // switches and re-baseline the active count on the new engine. - Generation() uint64 - // Logger returns the slog.Logger used for scaler diagnostics. May - // return nil to suppress log output. - Logger() *slog.Logger -} - -// Config holds the resolved scaler parameters. All fields are mutable -// before Run starts; the loop reads them once at startup. 
Use -// [Resolve] to derive a Config from a resource.Config (handles both -// the typed WorkerScaling field and the env-var fallback). -type Config struct { - // Enabled gates the entire scaler. Resolve sets this to true when - // either the typed config or the CELERIS_DYN_WORKERS env is set. - Enabled bool - // StartHigh seeds the scaler at NumWorkers active and scales down - // on idle. Default true (data-validated; preserves SO_REUSEPORT - // distribution at startup, +34-78% over start-low on ramp scenarios). - StartHigh bool - // MinActive is the floor on active worker count. - MinActive int - // TargetConnsPerWorker is the scaling target. desired = ceil(conns/Target). - TargetConnsPerWorker int - // Interval is the tick cadence. Default 250ms. - Interval time.Duration - // ScaleUpStep is the max workers added per tick (burst limit). - ScaleUpStep int - // ScaleDownStep is the max workers removed per tick. - ScaleDownStep int - // ScaleDownHysteresis: scale-down requires desired ≤ active - hyst - 1. - ScaleDownHysteresis int - // ScaleDownIdleTicks: consecutive sub-threshold ticks needed to scale down. - ScaleDownIdleTicks int - // Trace logs every scaler decision when true. - Trace bool -} - -// Resolve produces a Config from the engine's resource.Config. Typed -// cfg.WorkerScaling takes precedence; env vars are the legacy fallback. -// Returns Enabled=false when neither configures the scaler. 
-func Resolve(cfg resource.Config, numWorkers int) Config { - if cfg.WorkerScaling != nil { - return fromTyped(cfg.WorkerScaling, numWorkers) - } - return fromEnv(numWorkers) -} - -func fromTyped(c *resource.WorkerScalingConfig, numWorkers int) Config { - minActive := c.MinActive - if minActive == 0 { - minActive = numWorkers / 2 - if minActive < 2 { - minActive = 2 - } - } - if minActive > numWorkers { - minActive = numWorkers - } - target := c.TargetConnsPerWorker - if target == 0 { - target = 20 - } - interval := c.Interval - if interval == 0 { - interval = 250 * time.Millisecond - } - upStep := c.ScaleUpStep - if upStep == 0 { - upStep = 2 - } - downStep := c.ScaleDownStep - if downStep == 0 { - downStep = 1 - } - hyst := c.ScaleDownHysteresis - if hyst == 0 { - hyst = 1 - } - idleTicks := c.ScaleDownIdleTicks - if idleTicks == 0 { - idleTicks = 4 - } - return Config{ - Enabled: true, - StartHigh: c.Strategy != resource.ScalingStrategyStartLow, - MinActive: minActive, - TargetConnsPerWorker: target, - Interval: interval, - ScaleUpStep: upStep, - ScaleDownStep: downStep, - ScaleDownHysteresis: hyst, - ScaleDownIdleTicks: idleTicks, - Trace: c.Trace, - } -} - -func fromEnv(numWorkers int) Config { - // The CELERIS_DYN_* env vars are the legacy fallback for the typed - // [resource.WorkerScalingConfig]. They remain active so deployed - // manifests that pre-date v1.4.6 (when the typed config shipped) - // keep working without an edit. New code should use - // celeris.Config.WorkerScaling directly. 
- getInt := func(k string, def int) int { - if v := os.Getenv(k); v != "" { - if n, err := strconv.Atoi(v); err == nil { - return n - } - } - return def - } - enabled := getInt("CELERIS_DYN_WORKERS", 0) != 0 - minActive := getInt("CELERIS_DYN_MIN", numWorkers/2) - if minActive < 1 { - minActive = 1 - } - if minActive > numWorkers { - minActive = numWorkers - } - return Config{ - Enabled: enabled, - // Defaults to true (data-validated; matches the typed-config - // Strategy=ScalingStrategyStartHigh default). start-low pauses - // workers BEFORE the engine has finished settling its listen - // FDs in the SO_REUSEPORT group, which races against incoming - // H2 prior-knowledge dials and produces RST mid-handshake (seen - // in the v1.4.1 strict-matrix run on get-json-64k-h2/adaptive - // before this change). Users who explicitly want start-low can - // set CELERIS_DYN_START_HIGH=0. - StartHigh: os.Getenv("CELERIS_DYN_START_HIGH") != "0", - MinActive: minActive, - TargetConnsPerWorker: getInt("CELERIS_DYN_TARGET", 20), - Interval: time.Duration(getInt("CELERIS_DYN_INTERVAL", 250)) * time.Millisecond, - ScaleUpStep: getInt("CELERIS_DYN_UPSTEP", 2), - ScaleDownStep: getInt("CELERIS_DYN_DOWNSTEP", 1), - ScaleDownHysteresis: getInt("CELERIS_DYN_DOWNHYST", 1), - ScaleDownIdleTicks: getInt("CELERIS_DYN_DOWNIDLE", 4), - Trace: os.Getenv("CELERIS_DYN_TRACE") == "1", - } -} - -// Run executes the scaler loop until ctx is canceled. Reads load from -// src.ActiveConns(), pauses/resumes workers via src.PauseWorker / -// src.ResumeWorker, and re-baselines on Generation changes. -// -// Caller must check cfg.Enabled before invoking Run; Run does not -// short-circuit when disabled. 
-func Run(ctx context.Context, src Source, cfg Config) { - totalWorkers := src.NumWorkers() - var active int - if cfg.StartHigh { - active = totalWorkers - } else { - active = cfg.MinActive - for i := cfg.MinActive; i < totalWorkers; i++ { - src.PauseWorker(i) - } - } - - if log := src.Logger(); log != nil { - log.Info("dynamic worker scaler started", - "min_active", cfg.MinActive, - "max", totalWorkers, - "target_conns_per_worker", cfg.TargetConnsPerWorker, - "interval_ms", int(cfg.Interval/time.Millisecond), - "start_high", cfg.StartHigh, - "up_step", cfg.ScaleUpStep, - "down_step", cfg.ScaleDownStep, - "down_hyst", cfg.ScaleDownHysteresis, - "down_idle_ticks", cfg.ScaleDownIdleTicks) - } - - ticker := time.NewTicker(cfg.Interval) - defer ticker.Stop() - idleTicks := 0 - lastGen := src.Generation() - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - if gen := src.Generation(); gen != lastGen { - // Engine switched (only meaningful for adaptive). Re-baseline - // the new engine to the current `active` count: 0..active-1 - // resumed, active..total-1 paused. This is idempotent against - // the new engine's existing per-worker state. - for i := 0; i < totalWorkers; i++ { - if i < active { - src.ResumeWorker(i) - } else { - src.PauseWorker(i) - } - } - lastGen = gen - idleTicks = 0 - continue - } - active, idleTicks = tick(src, cfg, active, idleTicks) - } - } -} - -// tick is a single iteration of the scaler decision loop. Split out -// for unit-testing the algorithm without spinning up a real engine. 
-func tick(src Source, cfg Config, active, idleTicks int) (int, int) { - totalWorkers := src.NumWorkers() - conns := src.ActiveConns() - desired := int((conns + int64(cfg.TargetConnsPerWorker) - 1) / int64(cfg.TargetConnsPerWorker)) - if desired < cfg.MinActive { - desired = cfg.MinActive - } - if desired > totalWorkers { - desired = totalWorkers - } - if cfg.Trace { - if log := src.Logger(); log != nil { - log.Info("scaler tick", - "conns", conns, "desired", desired, "active", active, "idle_ticks", idleTicks) - } - } - - switch { - case desired > active: - step := desired - active - if step > cfg.ScaleUpStep { - step = cfg.ScaleUpStep - } - for n := 0; n < step; n++ { - src.ResumeWorker(active) - active++ - } - idleTicks = 0 - case desired <= active-cfg.ScaleDownHysteresis-1: - idleTicks++ - if idleTicks >= cfg.ScaleDownIdleTicks { - step := active - desired - if step > cfg.ScaleDownStep { - step = cfg.ScaleDownStep - } - for n := 0; n < step && active > cfg.MinActive; n++ { - active-- - src.PauseWorker(active) - } - idleTicks = 0 - } - default: - idleTicks = 0 - } - return active, idleTicks -} diff --git a/engine/scaler/scaler_test.go b/engine/scaler/scaler_test.go deleted file mode 100644 index bcce2b7d..00000000 --- a/engine/scaler/scaler_test.go +++ /dev/null @@ -1,327 +0,0 @@ -//go:build linux - -package scaler - -import ( - "context" - "log/slog" - "sync" - "testing" - "time" - - "github.com/goceleris/celeris/resource" -) - -// TestResolve_Defaults locks in the data-validated defaults captured in -// the v1.4.1 spike-B sweep: start-high, min=numCPU/2, target=20, -// interval=250ms, upStep=2, downStep=1, hyst=1, idleTicks=4. 
-func TestResolve_Defaults(t *testing.T) { - t.Parallel() - rcfg := resource.Config{WorkerScaling: &resource.WorkerScalingConfig{}} - c := Resolve(rcfg, 8) - if !c.Enabled { - t.Fatal("typed config (non-nil) must enable the scaler") - } - if !c.StartHigh { - t.Errorf("StartHigh: expected true (zero value Strategy = StartHigh), got false") - } - if c.MinActive != 4 { - t.Errorf("MinActive: expected 4 (numCPU/2 with numCPU=8), got %d", c.MinActive) - } - if c.TargetConnsPerWorker != 20 { - t.Errorf("TargetConnsPerWorker: expected 20, got %d", c.TargetConnsPerWorker) - } - if c.Interval != 250*time.Millisecond { - t.Errorf("Interval: expected 250ms, got %v", c.Interval) - } - if c.ScaleUpStep != 2 { - t.Errorf("ScaleUpStep: expected 2, got %d", c.ScaleUpStep) - } - if c.ScaleDownStep != 1 { - t.Errorf("ScaleDownStep: expected 1, got %d", c.ScaleDownStep) - } - if c.ScaleDownHysteresis != 1 { - t.Errorf("ScaleDownHysteresis: expected 1, got %d", c.ScaleDownHysteresis) - } - if c.ScaleDownIdleTicks != 4 { - t.Errorf("ScaleDownIdleTicks: expected 4, got %d", c.ScaleDownIdleTicks) - } -} - -// TestResolve_StartLow verifies opt-in to start-low. -func TestResolve_StartLow(t *testing.T) { - t.Parallel() - rcfg := resource.Config{WorkerScaling: &resource.WorkerScalingConfig{ - Strategy: resource.ScalingStrategyStartLow, - }} - c := Resolve(rcfg, 8) - if c.StartHigh { - t.Errorf("StartHigh: expected false (Strategy=StartLow), got true") - } -} - -// TestResolve_MinActiveClamping covers the floor + cap on MinActive. 
-func TestResolve_MinActiveClamping(t *testing.T) { - t.Parallel() - cases := []struct { - name string - minActive int - numWorkers int - want int - }{ - {"zero defaults to numCPU/2", 0, 8, 4}, - {"zero with small pool floors to 2", 0, 2, 2}, - {"explicit 1 respected", 1, 8, 1}, - {"above pool clamped to pool", 16, 8, 8}, - {"explicit 3", 3, 8, 3}, - } - for _, tc := range cases { - tc := tc - t.Run(tc.name, func(t *testing.T) { - t.Parallel() - c := Resolve(resource.Config{WorkerScaling: &resource.WorkerScalingConfig{MinActive: tc.minActive}}, tc.numWorkers) - if c.MinActive != tc.want { - t.Errorf("MinActive: got %d, want %d", c.MinActive, tc.want) - } - }) - } -} - -// TestResolve_ConfigBeatsEnv verifies typed config wins over env vars. -func TestResolve_ConfigBeatsEnv(t *testing.T) { - t.Setenv("CELERIS_DYN_TARGET", "999") - rcfg := resource.Config{WorkerScaling: &resource.WorkerScalingConfig{TargetConnsPerWorker: 25}} - c := Resolve(rcfg, 4) - if c.TargetConnsPerWorker != 25 { - t.Errorf("typed config did not take precedence over env: got %d, want 25", c.TargetConnsPerWorker) - } - if !c.Enabled { - t.Error("typed config should produce Enabled=true") - } -} - -// TestResolve_EnvFallback verifies the env-var path is the fallback. -func TestResolve_EnvFallback(t *testing.T) { - t.Setenv("CELERIS_DYN_WORKERS", "1") - t.Setenv("CELERIS_DYN_TARGET", "33") - c := Resolve(resource.Config{}, 4) - if !c.Enabled { - t.Fatal("CELERIS_DYN_WORKERS=1 should enable the scaler") - } - if c.TargetConnsPerWorker != 33 { - t.Errorf("env target read incorrectly: got %d, want 33", c.TargetConnsPerWorker) - } -} - -// TestResolve_DisabledByDefault verifies the legacy zero-config path. 
-func TestResolve_DisabledByDefault(t *testing.T) { - t.Setenv("CELERIS_DYN_WORKERS", "") - c := Resolve(resource.Config{}, 4) - if c.Enabled { - t.Errorf("scaler should be disabled when neither env nor config provides it") - } -} - -// fakeSource implements Source for unit-testing the algorithm without -// spinning up an engine. PauseWorker / ResumeWorker calls are tracked -// per worker. -type fakeSource struct { - mu sync.Mutex - numWorkers int - conns int64 - gen uint64 - pauseLog []int - resumeLog []int - paused []bool -} - -func newFake(n int) *fakeSource { return &fakeSource{numWorkers: n, paused: make([]bool, n)} } - -func (s *fakeSource) NumWorkers() int { return s.numWorkers } -func (s *fakeSource) ActiveConns() int64 { - s.mu.Lock() - defer s.mu.Unlock() - return s.conns -} -func (s *fakeSource) PauseWorker(i int) { - s.mu.Lock() - defer s.mu.Unlock() - s.pauseLog = append(s.pauseLog, i) - if i >= 0 && i < len(s.paused) { - s.paused[i] = true - } -} -func (s *fakeSource) ResumeWorker(i int) { - s.mu.Lock() - defer s.mu.Unlock() - s.resumeLog = append(s.resumeLog, i) - if i >= 0 && i < len(s.paused) { - s.paused[i] = false - } -} -func (s *fakeSource) Generation() uint64 { - s.mu.Lock() - defer s.mu.Unlock() - return s.gen -} -func (s *fakeSource) Logger() *slog.Logger { return nil } - -func (s *fakeSource) setConns(n int64) { - s.mu.Lock() - s.conns = n - s.mu.Unlock() -} -func (s *fakeSource) bumpGen() { - s.mu.Lock() - s.gen++ - s.mu.Unlock() -} -func (s *fakeSource) numActive() int { - s.mu.Lock() - defer s.mu.Unlock() - n := 0 - for _, p := range s.paused { - if !p { - n++ - } - } - return n -} - -// runAsync starts Run in a goroutine and returns a stop func plus a -// Done chan so callers can poll the source state without depending on -// the run-loop's wall-clock progress. 
Older versions of these tests -// used a fixed-duration ctx and asserted at the end; under -race or -// on slow CPUs the time.Ticker can fire fewer than expected times in -// a 200ms window, leading to flakes (msa2-client hit "got 7" once). -func runAsync(t *testing.T, src *fakeSource, cfg Config) (stop func(), done <-chan struct{}) { - t.Helper() - ctx, cancel := context.WithCancel(context.Background()) - d := make(chan struct{}) - go func() { - defer close(d) - Run(ctx, src, cfg) - }() - return cancel, d -} - -// waitForActive polls src.numActive() until it matches want or -// timeout elapses. Returns true on success. Takes the place of a -// fixed-time sleep in scaler tests. -func waitForActive(t *testing.T, src *fakeSource, want int, timeout time.Duration) bool { - t.Helper() - deadline := time.Now().Add(timeout) - for time.Now().Before(deadline) { - if src.numActive() == want { - return true - } - time.Sleep(2 * time.Millisecond) - } - t.Errorf("numActive: waited %v for %d, last=%d", timeout, want, src.numActive()) - return false -} - -// TestRun_StartHighScalesDownOnIdle verifies the start-high default -// behaviour: starts at NumWorkers active, scales down to a floor of -// MinActive when load is low. -// -// Floor: with ScaleDownHysteresis=0 the floor IS MinActive. With -// hysteresis>0 the floor is MinActive+hysteresis to prevent flapping -// near the boundary; that's tested in -// TestRun_HysteresisFloorPreservedAboveMinActive. 
-func TestRun_StartHighScalesDownOnIdle(t *testing.T) { - src := newFake(8) - src.setConns(0) // idle from the start - cfg := Config{ - Enabled: true, StartHigh: true, - MinActive: 2, TargetConnsPerWorker: 20, - Interval: 5 * time.Millisecond, - ScaleUpStep: 2, ScaleDownStep: 1, - ScaleDownHysteresis: 0, ScaleDownIdleTicks: 2, - } - stop, done := runAsync(t, src, cfg) - waitForActive(t, src, 2, 2*time.Second) - stop() - <-done -} - -// TestRun_HysteresisFloorPreservedAboveMinActive verifies that with the -// default hysteresis=1 the algorithm stops scaling at MinActive+1, not -// at MinActive. This prevents flapping near the boundary when conn -// count oscillates ±1 around the threshold. -func TestRun_HysteresisFloorPreservedAboveMinActive(t *testing.T) { - src := newFake(8) - src.setConns(0) - cfg := Config{ - Enabled: true, StartHigh: true, - MinActive: 2, TargetConnsPerWorker: 20, - Interval: 5 * time.Millisecond, - ScaleUpStep: 2, ScaleDownStep: 1, - ScaleDownHysteresis: 1, ScaleDownIdleTicks: 2, - } - stop, done := runAsync(t, src, cfg) - // Floor is MinActive + hyst = 3. - waitForActive(t, src, 3, 2*time.Second) - stop() - <-done -} - -// TestRun_StartLowScalesUpOnLoad verifies the start-low path: starts at -// MinActive, scales up to ceil(conns/target) when load arrives. -func TestRun_StartLowScalesUpOnLoad(t *testing.T) { - src := newFake(8) - cfg := Config{ - Enabled: true, StartHigh: false, - MinActive: 2, TargetConnsPerWorker: 20, - Interval: 5 * time.Millisecond, - ScaleUpStep: 4, ScaleDownStep: 1, - ScaleDownHysteresis: 1, ScaleDownIdleTicks: 100, - } - // Set conns so desired = ceil(160/20) = 8 (full pool). - src.setConns(160) - stop, done := runAsync(t, src, cfg) - waitForActive(t, src, 8, 2*time.Second) - stop() - <-done -} - -// TestRun_GenerationChangeRebaselines verifies that incrementing -// Generation triggers re-baseline of the active count on the new -// underlying engine. This is the adaptive switch-handling invariant. 
-func TestRun_GenerationChangeRebaselines(t *testing.T) { - src := newFake(8) - src.setConns(60) // desired = 3, but with start-high we start at 8 - cfg := Config{ - Enabled: true, StartHigh: true, - MinActive: 2, TargetConnsPerWorker: 20, - Interval: 5 * time.Millisecond, - ScaleUpStep: 2, ScaleDownStep: 1, - ScaleDownHysteresis: 1, ScaleDownIdleTicks: 100, - } - // Run for a few ticks so the scaler settles. Then bump Generation - // — the next tick should re-baseline the (still-fake) source by - // pausing/resuming workers to enforce the current `active`. - ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) - defer cancel() - go Run(ctx, src, cfg) - time.Sleep(15 * time.Millisecond) - - // Reset the logs; the next bump should write a full re-baseline burst. - src.mu.Lock() - src.pauseLog = nil - src.resumeLog = nil - src.mu.Unlock() - src.bumpGen() - time.Sleep(15 * time.Millisecond) - cancel() - time.Sleep(10 * time.Millisecond) - - src.mu.Lock() - pauseLogLen := len(src.pauseLog) - resumeLogLen := len(src.resumeLog) - src.mu.Unlock() - if pauseLogLen+resumeLogLen < src.NumWorkers() { - t.Errorf("expected re-baseline to issue Pause+Resume calls covering all %d workers; got %d pauses + %d resumes", - src.NumWorkers(), pauseLogLen, resumeLogLen) - } -} diff --git a/resource/config.go b/resource/config.go index 25b0e2f1..32fffdef 100644 --- a/resource/config.go +++ b/resource/config.go @@ -83,103 +83,6 @@ type Config struct { // from celeris.Config.EnableH2Upgrade (pointer, may be nil) and Protocol. // Always a concrete value after WithDefaults. EnableH2Upgrade bool - // SkipBuiltinScaler suppresses the per-engine dynamic worker scaler - // loop. Set by the adaptive engine when it constructs its sub-engines — - // adaptive runs ONE higher-level scaler that delegates to the active - // sub-engine, so the iouring + epoll built-in scalers must stay quiet - // to avoid two scalers fighting over the same worker pool. 
- SkipBuiltinScaler bool - // WorkerScaling configures the dynamic worker scaler. As of v1.4.6 - // (issue #281), [celeris.Config.toResourceConfig] sets this to a - // zero-value struct whenever the user-facing field was nil — the - // scaler is therefore DEFAULT-ON for any user constructing the - // server via celeris.New. Direct callers of resource.Config (i.e. - // engine tests + the resource_test suite) still see nil-as-disabled - // to keep the legacy contract for that low-level surface. - // - // When set, the scaler activates and deactivates workers based on - // observed load to keep CQE/event batching density in the sweet - // spot. See WorkerScalingConfig for tuning details. - WorkerScaling *WorkerScalingConfig -} - -// WorkerScalingStrategy selects the seed strategy for the dynamic -// worker scaler. The zero value (ScalingStrategyStartHigh) is the -// recommended default: start at NumWorkers active, scale down once -// load is observably low. This preserves SO_REUSEPORT distribution at -// startup, which the spike-B sweep showed is dramatically better on -// ramp / oscil traffic patterns (+34-78 % across all three engines). -type WorkerScalingStrategy int - -const ( - // ScalingStrategyStartHigh seeds the scaler at NumWorkers active. - // Best for production where traffic ramps from idle and bursts. - // Zero value — selected when the field is unset. - ScalingStrategyStartHigh WorkerScalingStrategy = 0 - // ScalingStrategyStartLow seeds the scaler at MinActive. Best when - // the application has a long idle warmup before any conns arrive - // and saving CPU during that idle period matters more than peak - // throughput on the first burst. - ScalingStrategyStartLow WorkerScalingStrategy = 1 -) - -// WorkerScalingConfig controls the dynamic worker scaler used by the -// iouring, epoll, and adaptive engines. Zero values mean "use the -// scaler's default" — see field comments for what those are. 
Pass via -// celeris.Config.WorkerScaling to enable; nil disables the scaler -// entirely (the engine runs all configured workers all the time, like -// versions before the scaler was introduced). -// -// The scaler tracks the engine's activeConns counter and adjusts the -// number of "active" workers (workers participating in the SO_REUSEPORT -// group) so that conns / active is roughly TargetConnsPerWorker. Scale-up -// is reactive (next tick after a load increase). Scale-down is hysteretic -// — must observe ScaleDownIdleTicks consecutive ticks below the -// hysteresis threshold before reducing one worker. -type WorkerScalingConfig struct { - // Strategy picks the seed-state strategy. Zero value is - // ScalingStrategyStartHigh, which is the data-validated default for - // most production workloads. See WorkerScalingStrategy for tuning. - Strategy WorkerScalingStrategy - // MinActive is the floor on the active worker count. The scaler will - // never reduce active workers below this. Defaults to max(2, NumCPU/2). - // Set to NumCPU to force the scaler to always run at full capacity - // (effectively a static-w=NumCPU configuration). - MinActive int - // TargetConnsPerWorker is the active-worker scaling target. The scaler - // computes desired = ceil(activeConns / TargetConnsPerWorker), clamps - // to [MinActive, NumWorkers], and steers active toward that. Default 20. - // Higher values keep more conns per worker (better batching, less - // parallelism). Lower values prefer parallelism over batching. - TargetConnsPerWorker int - // Interval controls how often the scaler reevaluates active count. - // Default 250ms. Lower values respond to load changes faster but burn - // more CPU on the controller goroutine. - Interval time.Duration - // ScaleUpStep is the maximum number of workers the scaler will - // resume per tick. Default 2 — wider bursts disrupt SO_REUSEPORT - // load balancing more than they help. 
Bigger values are tempting on - // SPIKE workloads but produce worse throughput per the v1.4.1 - // SPIKE-test sweep (upStep=4 and upStep=8 both lost to upStep=2). - ScaleUpStep int - // ScaleDownStep is the maximum number of workers the scaler will - // pause per tick when load drops. Default 1 — scale-down too quickly - // and you can't recover throughput when load comes back. - ScaleDownStep int - // ScaleDownHysteresis adds a buffer between desired and active - // before scale-down fires: scale-down only if desired ≤ active - - // ScaleDownHysteresis - 1. Default 1 (so a desired-of-N triggers - // scale-down only when active is N+2 or higher). - ScaleDownHysteresis int - // ScaleDownIdleTicks is how many consecutive sub-threshold ticks - // must pass before a single scale-down step fires. Default 4 - // (= 1 second at the default 250ms interval). Tunes how patient - // the scaler is about temporary lulls: a request-rate dip of one - // tick will not trigger scale-down. - ScaleDownIdleTicks int - // Trace logs every scaler decision (active, desired, idle_ticks). - // Default false. Use when diagnosing scaling behaviour. - Trace bool } // Validate checks all config fields and returns any validation errors. 
diff --git a/server_test.go b/server_test.go index 73fa0538..aa98294e 100644 --- a/server_test.go +++ b/server_test.go @@ -11,7 +11,6 @@ import ( "github.com/goceleris/celeris/engine" "github.com/goceleris/celeris/protocol/h2/stream" - "github.com/goceleris/celeris/resource" ) func TestServerEngineInfo(t *testing.T) { @@ -1009,43 +1008,6 @@ func TestConfigZeroValuesNotMapped(t *testing.T) { } } -// TestConfigWorkerScalingDefaultOn pins the v1.4.6 default-on contract -// for issue #281: nil Config.WorkerScaling resolves to a non-nil, -// zero-value resource.WorkerScalingConfig — the scaler activates -// automatically with the data-validated defaults -// (Strategy=StartHigh, MinActive=max(2,NumCPU/2), -// TargetConnsPerWorker=20, Interval=250ms, ScaleUpStep=2, -// ScaleDownStep=1, ScaleDownHysteresis=1, ScaleDownIdleTicks=4). -// -// This brings the "just-works" public design intent (Adaptive engine, -// Auto protocol) into alignment for the third major default. -func TestConfigWorkerScalingDefaultOn(t *testing.T) { - cfg := Config{Addr: ":8080"} // no WorkerScaling field set - rc := cfg.toResourceConfig() - if rc.WorkerScaling == nil { - t.Fatal("nil Config.WorkerScaling did not resolve to a non-nil resource.WorkerScalingConfig — scaler default-on regressed") - } - if *rc.WorkerScaling != (resource.WorkerScalingConfig{}) { - t.Fatalf("expected zero-value WorkerScalingConfig (so scaler picks data-validated defaults), got %+v", *rc.WorkerScaling) - } -} - -// TestConfigWorkerScalingExplicitPreserved pins that user-supplied -// values are passed through verbatim: the default-on path only fires -// when the user did NOT configure the scaler. 
-func TestConfigWorkerScalingExplicitPreserved(t *testing.T) { - want := &resource.WorkerScalingConfig{ - Strategy: resource.ScalingStrategyStartLow, - MinActive: 7, - TargetConnsPerWorker: 99, - } - cfg := Config{Addr: ":8080", WorkerScaling: want} - rc := cfg.toResourceConfig() - if rc.WorkerScaling != want { - t.Fatalf("explicit WorkerScaling pointer was not preserved: got %p, want %p", rc.WorkerScaling, want) - } -} - // Phase 2.1 tests func TestRouteUseMiddleware(t *testing.T) { From 84e6263a9b305d394361aa1ec59380c93e2f763d Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Thu, 18 Jun 2026 19:56:24 +0200 Subject: [PATCH 20/27] feat(adaptive): kernel/feature-aware start engine + lazy standby MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pick the START engine from probed io_uring capabilities (chooseStartEngine): io_uring on bundles-era (6.10+) or the 6.1 fast tier (DEFER_TASKRUN+SINGLE_ISSUER+multishot+provided-buffers), epoll otherwise. On kernel 7.0 adaptive now starts on io_uring, capturing the high-conc keep-alive throughput the old epoll-start default stranded (~+12%). Standby construction is LAZY: only the start engine is built+Listen-ed eagerly; the other is constructed on the first switch that needs it. When the start engine is already best and never switches, the standby is never built — cutting the dual-engine tax from ~7% to ~0.9% (same-binary interleaved at 1024c on 7.0). Conns-per-worker UP/DOWN switching is gated OFF in production: pinned conns never migrate, so the start engine decides keep-alive throughput, and the down-revert otherwise fired on idle/warmup dips and stranded load. The io_uring error-rate safety revert stays always-on. The conns-per-worker controller + multi-signal telemetry (conns/worker, accept rate, bytes/req via new per-engine accept/close/byte counters) are kept, gated, for a future middle-tier kernel with a real crossover (to be validated by a multi-kernel sweep). 
Old CPU-bias score machinery removed. Switch-safety invariants unchanged (resume-before-pause, synchronous PauseAccept/H2-dial-RST, ASYNC_CANCEL, driver-FD refusal, freezeState). --- adaptive/controller.go | 257 ++++++++------ adaptive/controller_test.go | 403 ++++++++++++--------- adaptive/driver_provider_test.go | 18 +- adaptive/engine.go | 420 +++++++++++++++++----- adaptive/engine_test.go | 488 ++++++++++++++++++-------- adaptive/score.go | 89 ----- adaptive/score_test.go | 55 --- adaptive/telemetry.go | 23 ++ engine/engine.go | 23 ++ engine/epoll/engine.go | 20 +- engine/epoll/loop.go | 76 ++-- engine/epoll/review_v150_test.go | 72 ++-- engine/epoll/writer.go | 34 +- engine/iouring/check_timeouts_test.go | 1 + engine/iouring/engine.go | 20 +- engine/iouring/worker.go | 46 ++- 16 files changed, 1331 insertions(+), 714 deletions(-) delete mode 100644 adaptive/score.go delete mode 100644 adaptive/score_test.go diff --git a/adaptive/controller.go b/adaptive/controller.go index 45752fc8..48b8a7c6 100644 --- a/adaptive/controller.go +++ b/adaptive/controller.go @@ -4,21 +4,34 @@ package adaptive import ( "log/slog" - "os" "time" "github.com/goceleris/celeris/engine" ) -// envIOUringBias gates the io_uring workload bias. The bias is now REVERSIBLE -// (celeris#338): it only EXPLORES — boosting the io_uring standby so an -// epoll→io_uring switch is reachable when the workload model favors it — and -// never inflates the active score nor suppresses the epoll standby, so a -// wrongly-explored io_uring always reverts on measurement. That makes it safe -// ON by default (adaptive picks the real high-concurrency winner instead of -// parking on epoll); set CELERIS_ADAPTIVE_IOURING_BIAS=0 to force the -// conservative measurement-only controller. 
-const envIOUringBias = "CELERIS_ADAPTIVE_IOURING_BIAS" +// With conns-per-worker switching enabled, the controller moves between epoll +// and io_uring using a DIRECT conns-per-worker policy (no benchmark fingerprinting, no +// CPU monitor required). The thresholds come from the empirical crossover on +// this hardware: epoll and io_uring tie up to ~16 conns/worker; io_uring +// pulls ahead above ~20/worker and keeps scaling while epoll plateaus +// (io_uring ~+14% at 64 conns/worker); epoll wins at ~1 conn (lower latency). +// +// Policy: +// - On epoll, switch UP to io_uring when conns/worker sustains the up +// threshold for sustainTicks consecutive ticks, OR snap immediately when +// conns/worker crosses the heavy-load high-watermark (the fast path). +// - On io_uring, revert DOWN to epoll when conns/worker sustains BELOW the +// down threshold for sustainTicks ticks. The down threshold sits well +// under the up threshold so the band between them is a hysteresis zone +// that prevents flapping. +// - Large-payload workloads (avg bytes/req at or above largePayloadBytes) are +// link-bound — the engines tie — so an io_uring switch is suppressed to +// avoid pointless churn. +// - A safety revert fires if io_uring is active and the error rate climbs +// above errorRevertRate, regardless of load. +// +// The oscillation lock (3 switches in 5 min → 5 min lock) and the post-switch +// cooldown bound any residual thrash and hold io_uring after the fast snap. type controllerState struct { activeIsPrimary bool @@ -28,44 +41,73 @@ type controllerState struct { switchCount int locked bool lockUntil time.Time - lastActiveScore map[engine.EngineType]float64 - lastActiveTime map[engine.EngineType]time.Time + + // upTicks / downTicks count consecutive evaluations that satisfy the + // switch-up / switch-down condition; a normal switch needs sustainTicks + // of them, the heavy-load fast path needs only one. Reset on a switch + // (recordSwitch) and whenever the condition lapses.
+ upTicks int + downTicks int } type controller struct { - primary engine.Engine - secondary engine.Engine + primary engine.Engine // epoll (low-conns winner / starting engine) + secondary engine.Engine // io_uring (high-conns winner) sampler TelemetrySampler - weights ScoreWeights state controllerState evalInterval time.Duration cooldown time.Duration - threshold float64 - biasEnabled bool - logger *slog.Logger + + upThreshold float64 // conns/worker: epoll → io_uring + downThreshold float64 // conns/worker: io_uring → epoll (hysteresis low edge) + highWatermark float64 // conns/worker: heavy-load fast-path snap + largePayloadBytes float64 // avg bytes/req above which io_uring is suppressed + errorRevertRate float64 // io_uring error rate above which we revert to epoll + sustainTicks int // consecutive ticks required for a normal switch + + // connSwitchEnabled gates the conns-per-worker UP/DOWN switching. It is + // OFF for the kernel regimes where the feature-gated start engine is + // already the best at every concurrency (io_uring-best on bundles/6.10+, + // epoll-best on <6.1) — there, switching only churns and, worse, the + // down-revert fires during idle/warmup dips and strands load on the wrong + // engine (since pinned conns never migrate). The always-on error-revert + // below is independent of this flag. A future middle-tier kernel where a + // genuine crossover exists can re-enable it (validated by the kernel + // matrix). The engine sets it via SetConnSwitchEnabled from the profile. 
+ connSwitchEnabled bool + + logger *slog.Logger } func newController(primary, secondary engine.Engine, sampler TelemetrySampler, logger *slog.Logger) *controller { return &controller{ - primary: primary, - secondary: secondary, - sampler: sampler, - weights: DefaultWeights(), - evalInterval: 5 * time.Second, - cooldown: 30 * time.Second, - threshold: 0.15, - biasEnabled: os.Getenv(envIOUringBias) != "0", - logger: logger, + primary: primary, + secondary: secondary, + sampler: sampler, + evalInterval: 1 * time.Second, + cooldown: 30 * time.Second, + upThreshold: 20.0, + downThreshold: 12.0, + highWatermark: 32.0, + largePayloadBytes: 16384.0, + errorRevertRate: 0.05, + sustainTicks: 2, + // Default ON so the conns-per-worker unit tests exercise the policy; + // the production New() path sets it from the kernel/feature profile + // (currently OFF — the feature-gated start engine is authoritative). + connSwitchEnabled: true, + logger: logger, state: controllerState{ activeIsPrimary: true, - lastActiveScore: make(map[engine.EngineType]float64), - lastActiveTime: make(map[engine.EngineType]time.Time), }, } } -// evaluate checks whether a switch is warranted. Returns true if a switch should occur. +// evaluate decides whether a switch is warranted given the current load. It +// returns true when the engine should switch to its standby. The decision is +// driven entirely by conns-per-worker (with payload-size and error-rate +// refinements); the frozen check, oscillation lock and cooldown gate it. 
func (c *controller) evaluate(now time.Time, frozen bool) bool { if frozen { return false @@ -83,99 +125,108 @@ func (c *controller) evaluate(now time.Time, frozen bool) bool { return false } - var active, standby engine.Engine - if c.state.activeIsPrimary { - active = c.primary - standby = c.secondary - } else { - active = c.secondary - standby = c.primary + active := c.activeEngine() + snap := c.sampler.Sample(active) + cpw := snap.ConnsPerWorker + + if active.Type() == engine.IOUring { + // Safety error-revert is ALWAYS active, independent of connSwitchEnabled: + // if io_uring starts erroring on this deployment, fall back to epoll. + if snap.ErrorRate > c.errorRevertRate { + c.state.downTicks = 0 + c.state.upTicks = 0 + c.logSwitch("io_uring", "epoll", "error-rate safety revert", cpw, snap) + return true + } + if !c.connSwitchEnabled { + return false + } + return c.evaluateDown(snap, cpw) } - - activeSnap := c.sampler.Sample(active) - baselineActiveScore := computeScore(activeSnap, c.weights) - - // io_uring bias: the modeled io_uring advantage for the current workload. - // Zero outside the empirical sweet spot, AND zero unless explicitly enabled - // (celeris#341, envIOUringBias). It never reads the standby's real - // throughput, so off-by-default keeps adaptive from speculatively switching - // onto a measurably-slower engine. - bias := ioUringBias(activeSnap, c.biasEnabled) - - // Reversible bias (celeris#338): the ACTIVE score is ALWAYS the pure - // measurement — never inflated or penalised — so leaving the active engine - // is decided measured-vs-measured, never blocked by a sticky bias bonus. - activeScore := baselineActiveScore - - // Record the measured (unbiased) score as history, so a later revert - // compares real throughput rather than a biased estimate. - c.state.lastActiveScore[active.Type()] = baselineActiveScore - c.state.lastActiveTime[active.Type()] = now - - // Seed standby with 80% of active if no history exists. 
Harmless: 0.8 never - // clears the switch threshold on its own — only the explore-bias does. - if _, ok := c.state.lastActiveScore[standby.Type()]; !ok { - c.state.lastActiveScore[standby.Type()] = baselineActiveScore * 0.80 - c.state.lastActiveTime[standby.Type()] = now + // epoll active. + if !c.connSwitchEnabled { + return false } + return c.evaluateUp(snap, cpw) +} - // Standby estimate. The historical (measured, decayed) score ALWAYS counts — - // it is what drives a measurement-based revert. The io_uring bias may - // additionally EXPLORE: it boosts the io_uring standby when the workload - // model favors it (making an organic epoll→io_uring switch reachable), but - // it NEVER suppresses the epoll standby — so reverting from a wrongly-explored - // io_uring is always allowed on measurement. A bad exploration self-corrects - // the next eval; the oscillation lock bounds any thrash. - standbyScore := c.historicalScore(standby.Type(), now) - if modeled := c.biasModeledStandbyScore(standby.Type(), baselineActiveScore, bias); modeled > standbyScore { - standbyScore = modeled +// evaluateUp runs while epoll is active and considers a switch UP to io_uring. +func (c *controller) evaluateUp(snap TelemetrySnapshot, cpw float64) bool { + // Large payloads are link-bound: the engines tie, so never switch up. + // Keep upTicks pinned at zero so a later small-payload burst restarts the + // sustain count from scratch. + largePayload := snap.BytesPerReq >= c.largePayloadBytes + + switch { + case !largePayload && cpw >= c.highWatermark: + // Heavy-load fast path: snap immediately on a single tick. 
+ c.state.upTicks++ + c.state.downTicks = 0 + c.logSwitch("epoll", "io_uring", "heavy-load fast path", cpw, snap) + return true + case !largePayload && cpw >= c.upThreshold: + c.state.upTicks++ + c.state.downTicks = 0 + if c.state.upTicks >= c.sustainTicks { + c.logSwitch("epoll", "io_uring", "sustained high load", cpw, snap) + return true + } + default: + c.state.upTicks = 0 } + c.state.downTicks = 0 + return false +} - if standbyScore > activeScore*(1.0+c.threshold) { - c.logger.Info("switch recommended", - "active", active.Type().String(), - "standby", standby.Type().String(), - "active_score", activeScore, - "standby_score", standbyScore, - ) +// evaluateDown runs while io_uring is active and considers a revert to epoll. +func (c *controller) evaluateDown(snap TelemetrySnapshot, cpw float64) bool { + // Safety revert: an error storm on io_uring beats any load consideration. + if snap.ErrorRate > c.errorRevertRate { + c.state.downTicks = 0 + c.state.upTicks = 0 + c.logSwitch("io_uring", "epoll", "error-rate safety revert", cpw, snap) return true } + if cpw < c.downThreshold { + c.state.downTicks++ + if c.state.downTicks >= c.sustainTicks { + c.state.upTicks = 0 + c.logSwitch("io_uring", "epoll", "sustained low load", cpw, snap) + return true + } + } else { + c.state.downTicks = 0 + } + c.state.upTicks = 0 return false } -// biasModeledStandbyScore models the io_uring standby's score for the CURRENT -// workload from the io_uring bias (celeris#338): when conditions favor io_uring -// it is modeled as bias-better than the active baseline → standby = -// baseline*(1+bias), making an organic epoll→io_uring EXPLORATION reachable -// (the historical-only path could never clear 1+threshold from a cold standby). -// -// It returns 0 for the epoll standby — the bias never models epoll DOWN. 
That -// asymmetry is the reversibility guarantee: a revert from a wrongly-explored -// io_uring back to epoll is driven purely by epoll's real (historical) -// measurement and is never blocked by the bias. -func (c *controller) biasModeledStandbyScore(standby engine.EngineType, baselineActiveScore, bias float64) float64 { - if bias <= 0 || standby != engine.IOUring { - return 0 +func (c *controller) activeEngine() engine.Engine { + if c.state.activeIsPrimary { + return c.primary } - return baselineActiveScore * (1.0 + bias) + return c.secondary } -// historicalScore returns the last known score for an engine type, decayed -// at 1% per second since the score was recorded. -func (c *controller) historicalScore(et engine.EngineType, now time.Time) float64 { - score, ok := c.state.lastActiveScore[et] - if !ok { - return 0 - } - elapsed := now.Sub(c.state.lastActiveTime[et]).Seconds() - return score * max(0, 1.0-0.01*elapsed) +func (c *controller) logSwitch(from, to, reason string, cpw float64, snap TelemetrySnapshot) { + c.logger.Info("engine switch recommended", + "from", from, + "to", to, + "reason", reason, + "conns_per_worker", cpw, + "bytes_per_req", snap.BytesPerReq, + "error_rate", snap.ErrorRate, + "active_connections", snap.ActiveConnections, + ) } // recordSwitch updates controller state after a switch has been performed. 
func (c *controller) recordSwitch(now time.Time) { c.state.activeIsPrimary = !c.state.activeIsPrimary c.state.lastSwitch = now + c.state.upTicks = 0 + c.state.downTicks = 0 c.state.switchTimes[c.state.switchIdx%len(c.state.switchTimes)] = now c.state.switchIdx++ diff --git a/adaptive/controller_test.go b/adaptive/controller_test.go index 7ea7b1df..7130d1a0 100644 --- a/adaptive/controller_test.go +++ b/adaptive/controller_test.go @@ -3,198 +3,287 @@ package adaptive import ( + "log/slog" "testing" "time" "github.com/goceleris/celeris/engine" - "github.com/goceleris/celeris/resource" ) -// TestControllerOrganicSwitch verifies that, in the io_uring sweet spot -// (high connection count + high CPU), the controller eventually recommends an -// epoll→io_uring switch driven purely by the io_uring bias — no pre-seeded -// standby history, no active degradation. The bias is opt-in (celeris#341), so -// this exercises it with biasEnabled forced on. -func TestControllerOrganicSwitch(t *testing.T) { - primary := newMockEngine(engine.Epoll) // active - secondary := newMockEngine(engine.IOUring) // standby +// newCtrlEpollActive returns a controller with epoll active (primary) plus the +// synthetic sampler driving its telemetry, cooldown disabled for unit testing. +func newCtrlEpollActive() (*controller, *syntheticSampler) { sampler := newSyntheticSampler() + c := newController(newMockEngine(engine.Epoll), newMockEngine(engine.IOUring), sampler, testLogger()) + c.cooldown = 0 + return c, sampler +} + +// newCtrlIOUringActive returns a controller with io_uring active (primary). 
+func newCtrlIOUringActive() (*controller, *syntheticSampler) { + sampler := newSyntheticSampler() + c := newController(newMockEngine(engine.IOUring), newMockEngine(engine.Epoll), sampler, testLogger()) + c.cooldown = 0 + return c, sampler +} + +func testLogger() *slog.Logger { return slog.New(slog.DiscardHandler) } - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - e.ctrl.cooldown = 0 - e.ctrl.biasEnabled = true // bias is opt-in; this test exercises it +// TestConnSwitchDisabledGate verifies the kernel-aware gate: with conns-per-worker +// switching OFF (the production default — the feature-gated start engine is +// authoritative on io_uring-best/epoll-best kernels), neither the epoll up-switch +// nor the io_uring down-revert fires regardless of load, but the io_uring +// error-rate safety revert STILL fires. Guards the warmup-dip revert bug where an +// idle/ramp dip on a 7.0 box reverted io_uring→epoll and stranded the load. +func TestConnSwitchDisabledGate(t *testing.T) { + now := time.Now() + + c, sampler := newCtrlEpollActive() + c.connSwitchEnabled = false + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 64}) + for i := 0; i < 5; i++ { + if c.evaluate(now.Add(time.Duration(i)*time.Second), false) { + t.Fatalf("epoll up-switch fired with switching disabled (tick %d)", i) + } + } - // Active epoll snapshot lands squarely in io_uring's empirical sweet spot. 
- sampler.Set(engine.Epoll, TelemetrySnapshot{ - ThroughputRPS: 1000, - ActiveConnections: 2048, - CPUUtilization: 0.9, - }) + c2, s2 := newCtrlIOUringActive() + c2.connSwitchEnabled = false + s2.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 0}) + for i := 0; i < 5; i++ { + if c2.evaluate(now.Add(time.Duration(i)*time.Second), false) { + t.Fatalf("io_uring down-revert fired with switching disabled (tick %d)", i) + } + } - if e.ActiveEngine().Type() != engine.Epoll { - t.Fatal("expected epoll active initially") + c3, s3 := newCtrlIOUringActive() + c3.connSwitchEnabled = false + s3.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 64, ErrorRate: 0.5}) + if !c3.evaluate(now, false) { + t.Fatal("error-rate safety revert did NOT fire with switching disabled") } +} + +// TestUpSwitchRequiresSustain verifies the normal epoll→io_uring switch needs +// sustainTicks (2) consecutive ticks at or above the up threshold (20). +func TestUpSwitchRequiresSustain(t *testing.T) { + c, sampler := newCtrlEpollActive() + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 20}) + + now := time.Now() + if c.evaluate(now, false) { + t.Fatal("one tick at the up threshold must not switch (sustainTicks=2)") + } + if c.state.upTicks != 1 { + t.Fatalf("upTicks = %d, want 1 after first qualifying tick", c.state.upTicks) + } + if !c.evaluate(now.Add(time.Second), false) { + t.Fatal("second consecutive tick at the up threshold must switch") + } +} + +// TestUpSwitchSustainResetsOnDip verifies a dip below the up threshold resets +// the sustain counter, so the count must restart. +func TestUpSwitchSustainResetsOnDip(t *testing.T) { + c, sampler := newCtrlEpollActive() + now := time.Now() + + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 22}) + if c.evaluate(now, false) { + t.Fatal("first tick must not switch") + } + // Dip into the hysteresis band — resets upTicks. 
+ sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 15}) + if c.evaluate(now.Add(time.Second), false) { + t.Fatal("dip must not switch") + } + if c.state.upTicks != 0 { + t.Fatalf("upTicks = %d, want 0 after a dip below the up threshold", c.state.upTicks) + } + // Back above threshold: needs two more ticks now. + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 22}) + if c.evaluate(now.Add(2*time.Second), false) { + t.Fatal("first tick after reset must not switch") + } + if !c.evaluate(now.Add(3*time.Second), false) { + t.Fatal("second tick after reset must switch") + } +} + +// TestFastPathSnapsImmediately verifies conns/worker at/above the high +// watermark (32) switches on a single tick. +func TestFastPathSnapsImmediately(t *testing.T) { + c, sampler := newCtrlEpollActive() + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 32}) + + if !c.evaluate(time.Now(), false) { + t.Fatal("conns/worker at the high watermark must snap on one tick") + } +} + +// TestHysteresisBandNoFlap verifies epoll does not switch up inside the 12–20 band. +func TestHysteresisBandNoFlap(t *testing.T) { + for _, cpw := range []float64{12, 14, 16, 18, 19.9} { + c, sampler := newCtrlEpollActive() + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: cpw}) + now := time.Now() + for i := range 4 { + if c.evaluate(now.Add(time.Duration(i)*time.Second), false) { + t.Fatalf("cpw=%.1f inside hysteresis band must not switch up", cpw) + } + } + } +} + +// TestLargePayloadSuppression verifies a large average payload suppresses the +// switch even well above the high watermark. +func TestLargePayloadSuppression(t *testing.T) { + c, sampler := newCtrlEpollActive() + // Exactly at the threshold counts as large (>=). + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 64, BytesPerReq: 16384}) now := time.Now() - switched := false - // Advance well past any cooldown/observation window and let the estimate - // settle over a few ticks.
for i := range 5 { - if e.ctrl.evaluate(now.Add(time.Duration(i+1)*time.Minute), false) { - switched = true - break + if c.evaluate(now.Add(time.Duration(i)*time.Second), false) { + t.Fatal("large payload must suppress io_uring switch") } } - if !switched { - t.Fatal("expected organic epoll→io_uring switch in the io_uring sweet spot") + + // Drop below the large-payload threshold → fast path fires. + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 64, BytesPerReq: 16383}) + if !c.evaluate(now.Add(10*time.Second), false) { + t.Fatal("small payload above high watermark must switch") } } -// TestControllerRevertsFromSlowerExploredEngine is the celeris#338 reversibility -// guard — the core of the safe bias. io_uring is active (as if just explored to) -// in the bias sweet spot, but it MEASURES slower than epoll's known score. The -// controller MUST revert to epoll: the bias may explore but must never block a -// measurement-driven reversion (it neither inflates the active score nor -// suppresses the epoll standby). This is exactly the case the old sticky bias -// got wrong (it parked adaptive on the slower engine). -func TestControllerRevertsFromSlowerExploredEngine(t *testing.T) { - primary := newMockEngine(engine.IOUring) // active (explored-to) - secondary := newMockEngine(engine.Epoll) // standby, measured-faster historically - sampler := newSyntheticSampler() - - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - e.ctrl.cooldown = 0 - e.ctrl.biasEnabled = true +// TestRevertRequiresSustain verifies the io_uring→epoll revert needs +// sustainTicks below the down threshold (12). +func TestRevertRequiresSustain(t *testing.T) { + c, sampler := newCtrlIOUringActive() + sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 8}) now := time.Now() - // epoll was measured fast before; io_uring now measures slow IN the sweet spot. 
- e.ctrl.state.lastActiveScore[engine.Epoll] = 1000 - e.ctrl.state.lastActiveTime[engine.Epoll] = now - sampler.Set(engine.IOUring, TelemetrySnapshot{ - ThroughputRPS: 500, - ActiveConnections: 2048, - CPUUtilization: 0.9, - }) - - if !e.ctrl.evaluate(now, false) { - t.Fatal("io_uring measuring slower than epoll's historical must REVERT to epoll even in the io_uring bias sweet spot — the bias must not block measured reversion") - } -} - -// TestControllerBiasOffNoExplore verifies the kill-switch: with the bias forced -// off (CELERIS_ADAPTIVE_IOURING_BIAS=0), the controller is purely -// measurement-driven and does NOT explore the unmeasured io_uring standby even -// in the sweet spot. -func TestControllerBiasOffNoExplore(t *testing.T) { - primary := newMockEngine(engine.Epoll) // active - secondary := newMockEngine(engine.IOUring) // standby, never measured - sampler := newSyntheticSampler() + if c.evaluate(now, false) { + t.Fatal("one low tick must not revert (sustainTicks=2)") + } + if !c.evaluate(now.Add(time.Second), false) { + t.Fatal("second consecutive low tick must revert") + } +} - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - e.ctrl.cooldown = 0 - e.ctrl.biasEnabled = false // kill-switch +// TestErrorRevertImmediate verifies an error rate above errorRevertRate (0.05) +// reverts io_uring→epoll immediately regardless of load. +func TestErrorRevertImmediate(t *testing.T) { + c, sampler := newCtrlIOUringActive() + sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 64, ErrorRate: 0.06}) - sampler.Set(engine.Epoll, TelemetrySnapshot{ - ThroughputRPS: 1000, - ActiveConnections: 2048, - CPUUtilization: 0.9, - }) + if !c.evaluate(time.Now(), false) { + t.Fatal("error rate above errorRevertRate must revert on one tick") + } +} + +// TestNoRevertWhenBusy verifies io_uring stays put above the down threshold +// with a clean error rate. 
+func TestNoRevertWhenBusy(t *testing.T) { + c, sampler := newCtrlIOUringActive() + sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 16, ErrorRate: 0.0}) now := time.Now() for i := range 5 { - if e.ctrl.evaluate(now.Add(time.Duration(i+1)*time.Minute), false) { - t.Fatal("bias off: must not explore the unmeasured io_uring standby") + if c.evaluate(now.Add(time.Duration(i)*time.Second), false) { + t.Fatal("must not revert while busy with a clean error rate") } } } -// TestControllerStableUnderFluctuation is the celeris#338 real-load stability -// guard: at the 1024c sweet spot where io_uring is marginally faster (~+7%, well -// under the 15% switch threshold), with ±4% telemetry jitter on both engines, -// the controller must EXPLORE to io_uring and SETTLE there without thrashing. -// The 15% threshold provides the hysteresis; the reversible bias provides the -// explore. (Unit tests cover static telemetry; this covers fluctuation.) -func TestControllerStableUnderFluctuation(t *testing.T) { - primary := newMockEngine(engine.Epoll) // start active - secondary := newMockEngine(engine.IOUring) // marginally faster at 1024c - sampler := newSyntheticSampler() +// TestCooldownGatesRevert verifies the cooldown blocks a revert immediately +// after a switch, but the first switch (no prior switch) is not gated. +func TestCooldownGatesRevert(t *testing.T) { + c, sampler := newCtrlEpollActive() + c.cooldown = 1 * time.Hour + + now := time.Now() + // First switch is not gated (lastSwitch is zero). + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 40}) + if !c.evaluate(now, false) { + t.Fatal("first switch must not be cooldown-gated") + } + c.recordSwitch(now) + + // io_uring now active and idle, but the revert is cooldown-gated. 
+ sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 0}) + if c.evaluate(now.Add(time.Minute), false) { + t.Fatal("revert within cooldown must be blocked") + } +} + +// TestFrozenBlocksSwitch verifies the frozen flag short-circuits evaluate. +func TestFrozenBlocksSwitch(t *testing.T) { + c, sampler := newCtrlEpollActive() + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 100}) + if c.evaluate(time.Now(), true) { + t.Fatal("frozen must block all switches") + } +} - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - c := e.ctrl +// TestOscillationLockExpiry verifies 3 switches in 5 minutes locks the +// controller, and the lock expires after 5 minutes. +func TestOscillationLockExpiry(t *testing.T) { + sampler := newSyntheticSampler() + c := newController(newMockEngine(engine.Epoll), newMockEngine(engine.IOUring), sampler, testLogger()) c.cooldown = 0 - c.biasEnabled = true now := time.Now() - switches := 0 - // activeType derives the controller's chosen engine from its decision state - // (recordSwitch toggles activeIsPrimary; the engine-level performSwitch is - // out of scope for a controller-logic test). - activeType := func() engine.EngineType { - if c.state.activeIsPrimary { - return primary.Type() - } - return secondary.Type() - } - for i := range 60 { - // Deterministic ±4% jitter (no rand): pattern over -2..+2. Both engines - // are set each tick; evaluate samples whichever it currently calls active. 
- jit := func(base float64) float64 { return base * (1.0 + 0.04*float64((i%5)-2)/2) } - sampler.Set(engine.Epoll, TelemetrySnapshot{ThroughputRPS: jit(1240), ActiveConnections: 1024, CPUUtilization: 0.85}) - sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: jit(1330), ActiveConnections: 1024, CPUUtilization: 0.85}) - tn := now.Add(time.Duration(i+1) * time.Second) - if c.evaluate(tn, false) { - c.recordSwitch(tn) - switches++ - } + for range 3 { + c.recordSwitch(now) + now = now.Add(time.Second) + } + if !c.state.locked { + t.Fatal("expected lock after 3 switches within 5 minutes") } - if activeType() != engine.IOUring { - t.Fatalf("adaptive should explore + settle on the faster io_uring at 1024c, got %s", activeType()) - } - if switches > 3 { - t.Fatalf("excessive switching under fluctuation (%d) — possible thrash; 15%% threshold should hold once settled", switches) - } -} - -// TestControllerNoSwitchOutsideSweetSpot is the inverse: low CPU or too few -// connections yields zero bias, so the controller must NOT recommend a switch -// (no degradation, no favorable conditions). 
-func TestControllerNoSwitchOutsideSweetSpot(t *testing.T) { - cases := []struct { - name string - snap TelemetrySnapshot - }{ - { - name: "low CPU", - snap: TelemetrySnapshot{ThroughputRPS: 1000, ActiveConnections: 2048, CPUUtilization: 0.10}, - }, - { - name: "too few connections", - snap: TelemetrySnapshot{ThroughputRPS: 1000, ActiveConnections: 32, CPUUtilization: 0.9}, - }, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - primary := newMockEngine(engine.Epoll) - secondary := newMockEngine(engine.IOUring) - sampler := newSyntheticSampler() - - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - e.ctrl.cooldown = 0 - - sampler.Set(engine.Epoll, tc.snap) - - now := time.Now() - for i := range 5 { - if e.ctrl.evaluate(now.Add(time.Duration(i+1)*time.Minute), false) { - t.Fatalf("unexpected switch outside io_uring sweet spot (%s)", tc.name) - } - } - }) + // Locked: a strong load must not switch. + sampler.Set(c.activeEngine().Type(), TelemetrySnapshot{ConnsPerWorker: 100}) + if c.evaluate(now, false) { + t.Fatal("locked controller must not switch") + } + + // After lock expiry, evaluation resumes (lock cleared on the next eval). + after := c.state.lockUntil.Add(time.Second) + _ = c.evaluate(after, false) + if c.state.locked { + t.Fatal("lock should clear once lockUntil has passed") + } +} + +// TestRecordSwitchResetsTicks verifies recordSwitch zeroes both tick counters. +func TestRecordSwitchResetsTicks(t *testing.T) { + c, _ := newCtrlEpollActive() + c.state.upTicks = 5 + c.state.downTicks = 3 + c.recordSwitch(time.Now()) + if c.state.upTicks != 0 || c.state.downTicks != 0 { + t.Fatalf("recordSwitch must reset ticks, got up=%d down=%d", c.state.upTicks, c.state.downTicks) + } + if c.state.activeIsPrimary { + t.Fatal("recordSwitch must toggle activeIsPrimary") + } +} + +// TestDefaultThresholds guards against config drift in the documented thresholds.
+func TestDefaultThresholds(t *testing.T) { + c, _ := newCtrlEpollActive() + if c.upThreshold != 20 || c.downThreshold != 12 || c.highWatermark != 32 { + t.Fatalf("threshold drift: up=%.0f down=%.0f hwm=%.0f", c.upThreshold, c.downThreshold, c.highWatermark) + } + if c.largePayloadBytes != 16384 || c.errorRevertRate != 0.05 || c.sustainTicks != 2 { + t.Fatalf("policy drift: largePayload=%.0f errRevert=%.3f sustain=%d", + c.largePayloadBytes, c.errorRevertRate, c.sustainTicks) + } + if c.evalInterval != time.Second { + t.Fatalf("evalInterval = %v, want 1s", c.evalInterval) + } + if c.cooldown != 0 { // overridden to 0 by the helper + t.Fatalf("test helper should zero cooldown, got %v", c.cooldown) } } diff --git a/adaptive/driver_provider_test.go b/adaptive/driver_provider_test.go index dddffd18..510c01b8 100644 --- a/adaptive/driver_provider_test.go +++ b/adaptive/driver_provider_test.go @@ -12,6 +12,7 @@ import ( "golang.org/x/sys/unix" "github.com/goceleris/celeris/engine" + "github.com/goceleris/celeris/probe" "github.com/goceleris/celeris/protocol/h2/stream" "github.com/goceleris/celeris/resource" ) @@ -29,6 +30,14 @@ var _ stream.Handler = noopHandler{} // (we need both primary + secondary up for a meaningful switch test). func newBoundAdaptive(t *testing.T) (*Engine, func()) { t.Helper() + // These tests need BOTH sub-engines for a meaningful switch. Since New() + // now LAZILY builds the standby (and falls back to an epoll start when + // io_uring is unavailable), a no-io_uring host would let New() succeed yet + // fail the first switch when the io_uring standby cannot be built. Skip up + // front when io_uring is genuinely unavailable. 
+ if !probe.Probe().IOUringTier.Available() { + t.Skip("io_uring unavailable: switch test needs both sub-engines") + } cfg := resource.Config{ Addr: "127.0.0.1:0", Protocol: engine.HTTP1, @@ -347,13 +356,8 @@ func TestAdaptiveSwitchAfterDriverQuiescence(t *testing.T) { e, stop := newBoundAdaptive(t) defer stop() - // Seed historical scores so ForceSwitch actually flips active. - now := time.Now() - e.ctrl.state.lastActiveScore[engine.Epoll] = 100 - e.ctrl.state.lastActiveScore[engine.IOUring] = 500 - e.ctrl.state.lastActiveTime[engine.Epoll] = now - e.ctrl.state.lastActiveTime[engine.IOUring] = now - + // ForceSwitch bypasses the decision policy and flips the active engine + // directly, so no telemetry seeding is needed here. local, peer := socketpairNonblocking(t) defer func() { _ = unix.Close(peer) }() defer func() { _ = unix.Close(local) }() diff --git a/adaptive/engine.go b/adaptive/engine.go index 66750f7b..77ce3011 100644 --- a/adaptive/engine.go +++ b/adaptive/engine.go @@ -8,6 +8,7 @@ import ( "fmt" "log/slog" "net" + "os" "sync" "sync/atomic" "time" @@ -15,6 +16,7 @@ import ( "github.com/goceleris/celeris/engine" "github.com/goceleris/celeris/engine/epoll" "github.com/goceleris/celeris/engine/iouring" + "github.com/goceleris/celeris/probe" "github.com/goceleris/celeris/protocol/h2/stream" "github.com/goceleris/celeris/resource" ) @@ -25,9 +27,19 @@ var ( ) // Engine is an adaptive meta-engine that switches between io_uring and epoll. +// +// The two sub-engine slots map to a fixed protocol direction the controller +// keys off: primary is ALWAYS the epoll engine (the controller's +// activeIsPrimary==true means epoll is active) and secondary is ALWAYS the +// io_uring engine. On the public New() path only the START engine is built +// eagerly; the other slot stays nil until the first switch actually needs it +// (see buildStandby + performSwitch). 
On a modern kernel that starts on +// io_uring and never reverts, the epoll standby is never constructed, so its +// GC-rooted heap never exists. newFromEngines (tests) populates BOTH slots +// eagerly, exercising the standby-already-exists switch path. type Engine struct { - primary engine.Engine // io_uring - secondary engine.Engine // epoll + primary engine.Engine // epoll (nil until built when it is the lazy standby) + secondary engine.Engine // io_uring (nil until built when it is the lazy standby) active atomic.Pointer[engine.Engine] ctrl *controller cfg resource.Config @@ -38,6 +50,24 @@ type Engine struct { frozen atomic.Bool logger *slog.Logger + // startType is the engine type chosen for the eager start engine. The + // standby is the other type; buildStandby constructs it on demand. + startType engine.EngineType + + // buildStandby constructs the LAZY standby sub-engine on first switch. + // It captures cfg + handler (+ cpuMon for the sampler symmetry) and is + // nil on the newFromEngines (tests) path where both engines are eager. + buildStandby func() (engine.Engine, error) + + // listenCtx / listenWG are captured by Listen so performSwitch can start a + // freshly-built standby's Listen goroutine under the SAME context and wait + // group as the active engine. Shutdown then joins it implicitly via the + // wait group (wg.Wait in Listen) — a never-built standby added nothing to + // the group, so there is nothing to join. Guarded by mu (performSwitch + // holds mu across the whole switch; Listen sets these once under mu). + listenCtx context.Context + listenWG *sync.WaitGroup + // freezeCooldown is the duration to suppress further switches after a switch. // Zero means no cooldown (default). freezeCooldown time.Duration @@ -62,12 +92,57 @@ type Engine struct { switchRejected atomic.Uint64 // telemetry: how many switches were blocked by driver FDs } -// New creates a new adaptive engine with epoll as primary and io_uring as secondary. 
-// Epoll starts first because it has lower H2 latency on current kernels (single-pass -// read→process→write vs io_uring's two-iteration CQE model). The controller may -// switch to io_uring if telemetry indicates it would perform better for the workload. -// Both sub-engines get the full resource config. This is safe because standby -// workers are fully suspended (zero CPU, zero connections, listen sockets closed). +// chooseStartEngine selects which sub-engine the adaptive meta-engine should +// start (and build eagerly) given the probed io_uring capability profile. +// +// io_uring loses to epoll on old kernels (pre-LTS-stability bugs, missing the +// fast-path setup flags) but wins on modern ones for thin-HTTP, so the start +// engine is feature-gated: +// +// - IOUring when io_uring is mature for thin-HTTP — either the kernel is in +// the "bundles" era (6.10+, where multishot + provided buffers + defer +// taskrun are all stable and tuned) OR the 6.1+ fast tier is present +// (DEFER_TASKRUN + SINGLE_ISSUER + MULTISHOT_RECV + PROVIDED_BUFFERS). +// - Epoll otherwise (old kernels, or io_uring missing the fast tier). +// +// The env var CELERIS_ADAPTIVE_START overrides the rule: +// +// iouring | epoll — force that start engine. +// auto (or unset) — use the capability rule above. +// +// NOTE: the exact capability rule will be refined by a kernel-matrix sweep; +// treat the thresholds here as the current best estimate, not a final answer. +func chooseStartEngine(p engine.CapabilityProfile) engine.EngineType { + switch os.Getenv("CELERIS_ADAPTIVE_START") { + case "iouring": + return engine.IOUring + case "epoll": + return engine.Epoll + case "auto", "": + // fall through to the capability rule + default: + // Unknown value: fall through to auto rather than fail hard.
+ } + + bundlesEra := p.KernelMajor > 6 || (p.KernelMajor == 6 && p.KernelMinor >= 10) + fastTier := p.DeferTaskrun && p.SingleIssuer && p.MultishotRecv && p.ProvidedBuffers + if bundlesEra || fastTier { + return engine.IOUring + } + return engine.Epoll +} + +// New creates a new adaptive engine. Only the START engine is built and +// Listen'd eagerly; the other engine (the standby) is constructed lazily on the +// first switch that actually needs it. The start engine is chosen by +// chooseStartEngine from the probed io_uring capabilities (feature-gated, with a +// CELERIS_ADAPTIVE_START env override). +// +// Both sub-engines bind the SAME SO_REUSEPORT port so the adaptive switch is +// transparent: resolvePort pins a concrete port up front, and the lazily-built +// standby reuses it. Building only the start engine eliminates the parked +// standby's GC-rooted heap — on a modern kernel that starts on io_uring and +// never reverts, the epoll standby is never constructed (≈0 standby tax). // // cpuMon is an engine.CPUMonitor (the public interface); when non-nil it // supplies the live sampler with CPU utilization data so the io_uring bias can @@ -91,39 +166,99 @@ func New(cfg resource.Config, handler stream.Handler, cpuMon engine.CPUMonitor) } } - primary, err := epoll.New(cfg, handler) - if err != nil { - return nil, fmt.Errorf("epoll sub-engine: %w", err) - } - - secondary, err := iouring.New(cfg, handler) - if err != nil { - return nil, fmt.Errorf("io_uring sub-engine: %w", err) - } + // probe.Probe() reads kernel version + io_uring setup feature bits WITHOUT + // constructing an engine, so it is cheap enough for the start decision. + startType := chooseStartEngine(probe.Probe()) sampler := newLiveSampler(cpuMon) logger := cfg.Logger + if logger == nil { + logger = slog.Default() + } + + // Constructors for each slot. The standby's constructor is stored on the + // Engine and only invoked on the first switch. 
The io_uring constructor + // does not take cpuMon (iouring.New has no such parameter); cpuMon already + // feeds the shared sampler via newLiveSampler above. + buildEpoll := func() (engine.Engine, error) { + eng, err := epoll.New(cfg, handler) + if err != nil { + return nil, fmt.Errorf("epoll sub-engine: %w", err) + } + return eng, nil + } + buildIOUring := func() (engine.Engine, error) { + eng, err := iouring.New(cfg, handler) + if err != nil { + return nil, fmt.Errorf("io_uring sub-engine: %w", err) + } + return eng, nil + } e := &Engine{ - primary: primary, - secondary: secondary, cfg: cfg, handler: handler, logger: logger, + startType: startType, } - e.ctrl = newController(primary, secondary, sampler, logger) - - // Start with primary (epoll) for all protocols. Epoll has better H2 - // throughput on current kernels and matches H1 performance. - var initialActive engine.Engine = primary - e.ctrl.state.activeIsPrimary = true - e.active.Store(&initialActive) + var startEngine engine.Engine + if startType == engine.IOUring { + // io_uring is the eager start; epoll is the lazy standby. + // io_uring construction can fail on a kernel that probed as capable + // but cannot actually set up the ring (e.g. low RLIMIT_MEMLOCK). Fall + // back to starting on epoll rather than failing New outright. + eng, err := buildIOUring() + if err != nil { + logger.Warn("io_uring start engine unavailable, falling back to epoll start", "error", err) + startType = engine.Epoll + e.startType = engine.Epoll + eng, err = buildEpoll() + if err != nil { + return nil, err + } + startEngine = eng + e.primary = eng + e.buildStandby = buildIOUring + } else { + startEngine = eng + e.secondary = eng + e.buildStandby = buildEpoll + } + } else { + // epoll is the eager start; io_uring is the lazy standby. 
+ eng, err := buildEpoll() + if err != nil { + return nil, err + } + startEngine = eng + e.primary = eng + e.buildStandby = buildIOUring + } + // The controller needs BOTH engine TYPES to decide switch direction even + // while the standby engine is nil, but it only ever dereferences the + // ACTIVE engine (activeEngine()). Pass the start engine for the active slot + // and nil for the lazy standby slot — newController stores them; activeIsPrimary + // records which slot the start engine occupies (primary==epoll). + e.ctrl = newController(e.primary, e.secondary, sampler, logger) + e.ctrl.state.activeIsPrimary = e.startType == engine.Epoll + // Conns-per-worker UP/DOWN switching is OFF in production: the feature-gated + // chooseStartEngine already selects the engine that is best at every + // concurrency on this kernel, and (because pinned conns never migrate) the + // down-revert would only fire on idle/warmup dips and strand load on the + // wrong engine. The always-on error-revert in the controller is unaffected. + // A future middle-tier kernel with a genuine crossover can flip this on + // (validated by the kernel matrix). + e.ctrl.connSwitchEnabled = false + + e.active.Store(&startEngine) return e, nil } -// newFromEngines creates an adaptive engine from pre-built engines (for testing). +// newFromEngines creates an adaptive engine from pre-built engines (for +// testing). BOTH slots are populated eagerly and buildStandby is left nil, so +// performSwitch exercises the "standby already exists" path (no lazy build). 
func newFromEngines(primary, secondary engine.Engine, sampler TelemetrySampler, cfg resource.Config) *Engine { logger := cfg.Logger if logger == nil { @@ -135,6 +270,7 @@ func newFromEngines(primary, secondary engine.Engine, sampler TelemetrySampler, secondary: secondary, cfg: cfg, logger: logger, + startType: engine.Epoll, } e.ctrl = newController(primary, secondary, sampler, logger) @@ -146,7 +282,9 @@ func newFromEngines(primary, secondary engine.Engine, sampler TelemetrySampler, return e } -// Listen starts both sub-engines and the evaluation loop. +// Listen starts ONLY the active sub-engine and the evaluation loop. The standby +// is built and Listen'd lazily by performSwitch on the first switch (joined +// under the same ctx + wait group captured here). func (e *Engine) Listen(ctx context.Context) error { innerCtx, innerCancel := context.WithCancel(ctx) defer innerCancel() @@ -163,23 +301,25 @@ func (e *Engine) Listen(ctx context.Context) error { var wg sync.WaitGroup - errCh := make(chan error, 2) + // Publish ctx + wg so performSwitch can launch the lazily-built standby's + // Listen goroutine under the same lifetime (Shutdown joins it via wg.Wait). + e.mu.Lock() + e.listenCtx = innerCtx + e.listenWG = &wg + e.mu.Unlock() - wg.Go(func() { - if err := e.primary.Listen(innerCtx); err != nil { - errCh <- fmt.Errorf("primary (epoll): %w", err) - } - }) + errCh := make(chan error, 2) + active := *e.active.Load() wg.Go(func() { - if err := e.secondary.Listen(innerCtx); err != nil { - errCh <- fmt.Errorf("secondary (io_uring): %w", err) + if err := active.Listen(innerCtx); err != nil { + errCh <- fmt.Errorf("active (%s): %w", active.Type().String(), err) } }) - // Wait for both engines to bind their addresses. + // Wait for the ACTIVE engine to bind its address. 
// io_uring may need multiple tier fallback attempts, so allow ample time — - // but if either sub-engine has already returned an error to errCh + // but if the active sub-engine has already returned an error to errCh // (e.g. ENOMEM at io_uring_setup under low RLIMIT_MEMLOCK), surface it // immediately instead of waiting out the deadline. deadline := time.Now().Add(20 * time.Second) @@ -189,7 +329,7 @@ func (e *Engine) Listen(ctx context.Context) error { defer bindWait.Stop() var startErr error bindLoop: - for e.primary.Addr() == nil || e.secondary.Addr() == nil { + for active.Addr() == nil { select { case startErr = <-errCh: break bindLoop @@ -204,50 +344,22 @@ bindLoop: wg.Wait() return fmt.Errorf("sub-engine startup failed: %w", startErr) } - if e.primary.Addr() == nil || e.secondary.Addr() == nil { + if active.Addr() == nil { innerCancel() wg.Wait() - return fmt.Errorf("sub-engines failed to initialize within 20s deadline") - } - - // Pause standby engine's accept BEFORE publishing Addr so the - // SO_REUSEPORT group only contains the active engine by the time - // callers start dialing. Without this gate, a burst of incoming - // connections in the window between secondary.Listen() succeeding - // and PauseAccept taking effect lands some dials on the standby's - // accept queue; closing that queue then RSTs them. H1 clients - // reconnect transparently but H2 prior-knowledge clients see the - // dial fail mid-handshake. Read c.state.activeIsPrimary under - // switchMu to avoid racing with performSwitch → recordSwitch (a - // concurrent ForceSwitch or controller tick can fire before Listen - // finishes its own setup). 
- e.switchMu.Lock() - activeIsPrimary := e.ctrl.state.activeIsPrimary - e.switchMu.Unlock() - if activeIsPrimary { - if ac, ok := e.secondary.(engine.AcceptController); ok { - if err := ac.PauseAccept(); err != nil { - innerCancel() - wg.Wait() - return fmt.Errorf("pause secondary: %w", err) - } - } - } else { - if ac, ok := e.primary.(engine.AcceptController); ok { - if err := ac.PauseAccept(); err != nil { - innerCancel() - wg.Wait() - return fmt.Errorf("pause primary: %w", err) - } - } + return fmt.Errorf("active sub-engine failed to initialize within 20s deadline") } - addr := e.primary.Addr() + // No standby to pause: only the active engine is in the SO_REUSEPORT group, + // so publishing Addr cannot expose a dial to a phantom standby listener. + // (The original pause-standby-before-publish-Addr step guarded that window; + // with a lazy standby there is no standby listening here.) + addr := active.Addr() e.addr.Store(&addr) e.logger.Info("adaptive engine listening", "addr", e.cfg.Addr, - "active", (*e.active.Load()).Type().String(), + "active", active.Type().String(), ) // Start evaluation loop. @@ -289,6 +401,47 @@ func (e *Engine) runEvalLoop(ctx context.Context) { } } +// buildAndStartStandby constructs the lazy standby sub-engine, launches its +// Listen goroutine under the same ctx + wait group Listen captured (so Shutdown +// joins it), and waits — bounded — for it to bind the shared SO_REUSEPORT port. +// The caller holds e.mu. wantType is purely for error messages. On any failure +// it returns an error and the engine state is left untouched (no slot stored), +// so the current active keeps serving. 
+func (e *Engine) buildAndStartStandby(wantType engine.EngineType) (engine.Engine, error) { + if e.buildStandby == nil { + return nil, fmt.Errorf("no standby builder for %s", wantType.String()) + } + if e.listenCtx == nil || e.listenWG == nil { + return nil, fmt.Errorf("cannot build standby before Listen has started") + } + + built, err := e.buildStandby() + if err != nil { + return nil, fmt.Errorf("build %s standby: %w", wantType.String(), err) + } + + ctx := e.listenCtx + wg := e.listenWG + wg.Go(func() { + if lerr := built.Listen(ctx); lerr != nil { + e.logger.Warn("lazy standby Listen returned error", + "standby", built.Type().String(), "error", lerr) + } + }) + + // Wait (bounded) for the standby to bind the shared port — once Addr() is + // non-nil it has joined the SO_REUSEPORT group and is accepting, so the + // resume-before-pause overlap is real and connections are never dropped. + deadline := time.Now().Add(5 * time.Second) + for built.Addr() == nil { + if time.Now().After(deadline) || ctx.Err() != nil { + return nil, fmt.Errorf("%s standby failed to bind within 5s", wantType.String()) + } + time.Sleep(5 * time.Millisecond) + } + return built, nil +} + func (e *Engine) performSwitch() { e.mu.Lock() defer e.mu.Unlock() @@ -318,21 +471,92 @@ func (e *Engine) performSwitch() { e.freezeState.Unlock() return } + // Release freezeState across the (possibly slow) lazy standby build + + // Listen + bind-wait below; re-acquired before the active.Store commit. + // Holding it across a multi-second build would block driver + // register/unregister flows (same reasoning as the PauseAccept release + // at the end of this function). e.mu (held for the whole function) + // already serialises performSwitch against itself, so no other switch + // can race the build. + e.freezeState.Unlock() now := time.Now() + // Determine the direction. activeIsPrimary toggles on recordSwitch, so it + // always reflects the engine we are switching AWAY from. 
e.switchMu.Lock() + switchingFromPrimary := e.ctrl.state.activeIsPrimary + e.switchMu.Unlock() + + // Resolve the standby slot for this direction. On the lazy New() path the + // target slot may be nil and must be built + Listen'd now (it binds the + // shared SO_REUSEPORT port and joins the accept pool). On the + // newFromEngines (tests) path both slots are pre-populated and buildStandby + // is nil, so the build is skipped. + freshlyBuilt := false + if switchingFromPrimary { + // primary (active) → secondary (standby). + if e.secondary == nil { + built, err := e.buildAndStartStandby(engine.IOUring) + if err != nil { + e.logger.Warn("aborting switch: lazy standby build failed; staying on current active", + "standby", engine.IOUring.String(), "error", err) + return + } + // Publish the built engine to both the Engine and controller + // slots under switchMu so a concurrent evaluate (ForceSwitch + // racing the eval loop) never reads a torn controller slot. + e.switchMu.Lock() + e.secondary = built + e.ctrl.secondary = built + e.switchMu.Unlock() + freshlyBuilt = true + } + } else { + // secondary (active) → primary (standby). + if e.primary == nil { + built, err := e.buildAndStartStandby(engine.Epoll) + if err != nil { + e.logger.Warn("aborting switch: lazy standby build failed; staying on current active", + "standby", engine.Epoll.String(), "error", err) + return + } + e.switchMu.Lock() + e.primary = built + e.ctrl.primary = built + e.switchMu.Unlock() + freshlyBuilt = true + } + } + var newActive, newStandby engine.Engine - if e.ctrl.state.activeIsPrimary { - // Switching: primary → secondary. + if switchingFromPrimary { newActive = e.secondary newStandby = e.primary } else { - // Switching: secondary → primary. newActive = e.primary newStandby = e.secondary } - e.switchMu.Unlock() + + // Re-acquire freezeState for the commit and RE-CHECK driverFDs: a driver + // may have registered during the build window above. 
If so, abort — but the + // freshly-built standby stays cached for the next attempt. Pause its accept + // first so it does not sit in the SO_REUSEPORT pool alongside the (still + // active) old engine; the next switch ResumeAccepts it. + e.freezeState.Lock() + if e.driverFDs.Load() > 0 { + e.switchRejected.Add(1) + e.logger.Warn("refusing engine switch: driver FDs registered during standby build", + "driver_fds", e.driverFDs.Load(), + ) + e.freezeState.Unlock() + if freshlyBuilt { + if ac, ok := newActive.(engine.AcceptController); ok { + _ = ac.PauseAccept() + } + } + return + } // Resume new active BEFORE pausing old — this creates a brief overlap // where both engines listen (via SO_REUSEPORT), which is correct. The @@ -426,16 +650,40 @@ func (e *Engine) Shutdown(ctx context.Context) error { } } - return errors.Join( - e.primary.Shutdown(ctx), - e.secondary.Shutdown(ctx), - ) + // Only shut down engines that exist. On the lazy New() path the standby + // slot is nil if no switch ever built it; cancelling listenCtx (above) + // already unwound the active engine's Listen goroutine and any lazily + // started standby Listen goroutine (both share that ctx + wait group). + e.mu.Lock() + primary := e.primary + secondary := e.secondary + e.mu.Unlock() + + var errs []error + if primary != nil { + errs = append(errs, primary.Shutdown(ctx)) + } + if secondary != nil { + errs = append(errs, secondary.Shutdown(ctx)) + } + return errors.Join(errs...) } -// Metrics aggregates metrics from both sub-engines. +// Metrics aggregates metrics from whichever sub-engines exist. On the lazy +// New() path a never-built standby is nil and contributes nothing. 
func (e *Engine) Metrics() engine.EngineMetrics { - pm := e.primary.Metrics() - sm := e.secondary.Metrics() + e.mu.Lock() + primary := e.primary + secondary := e.secondary + e.mu.Unlock() + + var pm, sm engine.EngineMetrics + if primary != nil { + pm = primary.Metrics() + } + if secondary != nil { + sm = secondary.Metrics() + } // Both sub-engines were built from the same handler + cfg, so their // AsyncRoutes counts are identical; take one (not the sum) for the // adaptive view. AsyncPromotedConns IS additive — promotions on the diff --git a/adaptive/engine_test.go b/adaptive/engine_test.go index 8bd0df0f..0a8a314d 100644 --- a/adaptive/engine_test.go +++ b/adaptive/engine_test.go @@ -4,6 +4,7 @@ package adaptive import ( "context" + "errors" "net" "sync" "sync/atomic" @@ -99,92 +100,168 @@ func TestInitialBias(t *testing.T) { } } -func TestSwitchTrigger(t *testing.T) { +// newAdaptiveStartingOnEpoll builds an adaptive engine whose active engine is +// epoll (primary), the production starting configuration. The synthetic +// sampler returned lets the test drive conns-per-worker for the active engine. +func newAdaptiveStartingOnEpoll(t *testing.T) (*Engine, *syntheticSampler) { + t.Helper() + primary := newMockEngine(engine.Epoll) // active + secondary := newMockEngine(engine.IOUring) // standby + sampler := newSyntheticSampler() + e := newFromEngines(primary, secondary, sampler, resource.Config{Protocol: engine.HTTP1}) + e.ctrl.cooldown = 0 + return e, sampler +} + +// newAdaptiveStartingOnIOUring builds an adaptive engine whose active engine is +// io_uring (primary), for exercising the revert-to-epoll path. 
+func newAdaptiveStartingOnIOUring(t *testing.T) (*Engine, *syntheticSampler) { + t.Helper() primary := newMockEngine(engine.IOUring) secondary := newMockEngine(engine.Epoll) sampler := newSyntheticSampler() - - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - e.ctrl.evalInterval = 1 * time.Millisecond + e := newFromEngines(primary, secondary, sampler, resource.Config{Protocol: engine.HTTP1}) e.ctrl.cooldown = 0 + return e, sampler +} + +// TestSwitchTrigger: sustained high conns/worker on the active epoll engine +// triggers an epoll→io_uring switch after sustainTicks ticks. +func TestSwitchTrigger(t *testing.T) { + e, sampler := newAdaptiveStartingOnEpoll(t) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 24}) - // Make active (io_uring) score 100. - sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: 100, ErrorRate: 0.01}) + if e.ActiveEngine().Type() != engine.Epoll { + t.Fatal("expected epoll initially") + } - // Pre-seed standby historical score (epoll was previously active with score 150). - e.ctrl.state.lastActiveScore[engine.Epoll] = 150 - e.ctrl.state.lastActiveTime[engine.Epoll] = time.Now() + now := time.Now() + // First tick arms the sustain counter but must not switch yet. + if e.ctrl.evaluate(now, false) { + t.Fatal("switch must require sustainTicks consecutive ticks, not one") + } + // Second tick crosses sustainTicks → switch. + if !e.ctrl.evaluate(now.Add(time.Second), false) { + t.Fatal("expected switch after sustained high conns/worker") + } + e.performSwitch() if e.ActiveEngine().Type() != engine.IOUring { - t.Fatal("expected io_uring initially") + t.Errorf("expected io_uring after switch, got %v", e.ActiveEngine().Type()) } +} + +// TestSwitchFastPath: conns/worker above the high watermark snaps to io_uring +// on a single tick (no sustain wait). 
+func TestSwitchFastPath(t *testing.T) { + e, sampler := newAdaptiveStartingOnEpoll(t) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 40}) + + if !e.ctrl.evaluate(time.Now(), false) { + t.Fatal("expected heavy-load fast-path snap on a single tick") + } +} + +// TestNoSwitchInHysteresisBand: conns/worker between the down and up +// thresholds (12–20) must not switch in either direction. +func TestNoSwitchInHysteresisBand(t *testing.T) { + e, sampler := newAdaptiveStartingOnEpoll(t) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 16}) - // Run evaluation — should trigger switch (150 > 100*1.15). now := time.Now() - switched := e.ctrl.evaluate(now, false) - if !switched { - t.Fatal("expected switch to be recommended") + for i := range 5 { + if e.ctrl.evaluate(now.Add(time.Duration(i)*time.Second), false) { + t.Fatal("must not switch up inside the 12–20 hysteresis band") + } } +} - e.performSwitch() +// TestLargePayloadSuppressesSwitch: even far above the high watermark, a +// large average payload (link-bound) must suppress the io_uring switch. +func TestLargePayloadSuppressesSwitch(t *testing.T) { + e, sampler := newAdaptiveStartingOnEpoll(t) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 64, BytesPerReq: 32768}) - if e.ActiveEngine().Type() != engine.Epoll { - t.Errorf("expected epoll after switch, got %v", e.ActiveEngine().Type()) + now := time.Now() + for i := range 5 { + if e.ctrl.evaluate(now.Add(time.Duration(i)*time.Second), false) { + t.Fatal("large payload (link-bound) must suppress io_uring switch") + } } } -func TestHysteresis(t *testing.T) { - primary := newMockEngine(engine.IOUring) - secondary := newMockEngine(engine.Epoll) - sampler := newSyntheticSampler() +// TestRevertOnLowLoad: io_uring active, conns/worker drops below the down +// threshold for sustainTicks → revert to epoll. 
+func TestRevertOnLowLoad(t *testing.T) { + e, sampler := newAdaptiveStartingOnIOUring(t) + sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 4}) - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - e.ctrl.cooldown = 1 * time.Hour // Very long cooldown to test blocking. + now := time.Now() + if e.ctrl.evaluate(now, false) { + t.Fatal("revert must require sustainTicks below the down threshold") + } + if !e.ctrl.evaluate(now.Add(time.Second), false) { + t.Fatal("expected revert to epoll after sustained low load") + } +} + +// TestErrorRevert: an error rate above errorRevertRate forces a revert from +// io_uring to epoll regardless of load (even at high conns/worker). +func TestErrorRevert(t *testing.T) { + e, sampler := newAdaptiveStartingOnIOUring(t) + sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 40, ErrorRate: 0.10}) + + if !e.ctrl.evaluate(time.Now(), false) { + t.Fatal("expected error-rate safety revert from io_uring") + } +} + +// TestNoRevertWhenIOUringBusy: io_uring active with conns/worker above the +// down threshold and no errors must stay on io_uring. +func TestNoRevertWhenIOUringBusy(t *testing.T) { + e, sampler := newAdaptiveStartingOnIOUring(t) + sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 24}) - // Pre-seed standby historical score so initial switch triggers. now := time.Now() - e.ctrl.state.lastActiveScore[engine.Epoll] = 200 - e.ctrl.state.lastActiveTime[engine.Epoll] = now + for i := range 5 { + if e.ctrl.evaluate(now.Add(time.Duration(i)*time.Second), false) { + t.Fatal("must not revert while io_uring is still busy") + } + } +} - // Trigger initial switch. - sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: 100}) +// TestCooldownGate: after a switch, a revert is blocked until the cooldown +// window elapses, even when the revert condition holds. 
+func TestCooldownGate(t *testing.T) { + e, sampler := newAdaptiveStartingOnEpoll(t) + e.ctrl.cooldown = 1 * time.Hour + now := time.Now() + // Fast-path switch epoll→io_uring. + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 40}) if !e.ctrl.evaluate(now, false) { - t.Fatal("expected initial switch") + t.Fatal("expected initial fast-path switch") } e.performSwitch() - // Immediately try to switch back — should be blocked by cooldown. - sampler.Set(engine.Epoll, TelemetrySnapshot{ThroughputRPS: 100}) - + // Now io_uring active and idle — but cooldown blocks the revert. + sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 0}) if e.ctrl.evaluate(now.Add(1*time.Second), false) { - t.Error("switch should be blocked by cooldown") + t.Error("revert should be blocked by cooldown") } } func TestConnectionDraining(t *testing.T) { - primary := newMockEngine(engine.IOUring) - secondary := newMockEngine(engine.Epoll) - sampler := newSyntheticSampler() - - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - e.ctrl.cooldown = 0 + e, sampler := newAdaptiveStartingOnEpoll(t) + primary := e.primary.(*mockEngine) + secondary := e.secondary.(*mockEngine) - // Initial state: primary active, secondary should be paused. - // Simulate the initial pause that Listen() would do. + // Simulate the initial pause that Listen() would do on the standby. _ = secondary.PauseAccept() secondary.pauseCalls.Store(0) // reset counter - // Pre-seed standby historical score. now := time.Now() - e.ctrl.state.lastActiveScore[engine.Epoll] = 200 - e.ctrl.state.lastActiveTime[engine.Epoll] = now - - // Trigger switch. 
- sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: 100}) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 40}) // fast-path if !e.ctrl.evaluate(now, false) { t.Fatal("expected switch") @@ -202,28 +279,19 @@ func TestConnectionDraining(t *testing.T) { } func TestOscillationLock(t *testing.T) { - primary := newMockEngine(engine.IOUring) - secondary := newMockEngine(engine.Epoll) - sampler := newSyntheticSampler() - - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - e.ctrl.cooldown = 0 + e, sampler := newAdaptiveStartingOnEpoll(t) now := time.Now() - // Perform 3 switches rapidly. + // Perform 3 switches rapidly via the fast path; each iteration sets a + // load that triggers the active engine's switch direction. for range 3 { - activeType := e.ActiveEngine().Type() - sampler.Set(activeType, TelemetrySnapshot{ThroughputRPS: 50}) - other := engine.Epoll - if activeType == engine.Epoll { - other = engine.IOUring + switch e.ActiveEngine().Type() { + case engine.Epoll: + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 40}) // switch up + case engine.IOUring: + sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 40, ErrorRate: 0.5}) // error revert } - // Pre-seed standby historical score for each iteration. - e.ctrl.state.lastActiveScore[other] = 200 - e.ctrl.state.lastActiveTime[other] = now - if !e.ctrl.evaluate(now, false) { t.Fatal("expected switch before lock") } @@ -231,16 +299,13 @@ func TestOscillationLock(t *testing.T) { now = now.Add(10 * time.Second) } - // Fourth switch should be locked. - activeType := e.ActiveEngine().Type() - sampler.Set(activeType, TelemetrySnapshot{ThroughputRPS: 50}) - other := engine.Epoll - if activeType == engine.Epoll { - other = engine.IOUring + // Fourth switch should be locked (3 switches within 5 minutes). 
+ switch e.ActiveEngine().Type() { + case engine.Epoll: + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 40}) + case engine.IOUring: + sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 40, ErrorRate: 0.5}) } - e.ctrl.state.lastActiveScore[other] = 200 - e.ctrl.state.lastActiveTime[other] = now - if e.ctrl.evaluate(now, false) { t.Error("expected oscillation lock to prevent switch") } @@ -250,123 +315,242 @@ func TestOscillationLock(t *testing.T) { } func TestOverloadFreeze(t *testing.T) { - primary := newMockEngine(engine.IOUring) - secondary := newMockEngine(engine.Epoll) - sampler := newSyntheticSampler() - - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - e.ctrl.cooldown = 0 + e, sampler := newAdaptiveStartingOnEpoll(t) now := time.Now() - sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: 50}) - - // Pre-seed standby historical score. - e.ctrl.state.lastActiveScore[engine.Epoll] = 200 - e.ctrl.state.lastActiveTime[engine.Epoll] = now + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 40}) // fast-path load // Freeze switching. e.FreezeSwitching() - if e.ctrl.evaluate(now, e.frozen.Load()) { t.Error("expected freeze to block evaluation") } // Unfreeze. e.UnfreezeSwitching() - if !e.ctrl.evaluate(now, e.frozen.Load()) { t.Error("expected evaluation to proceed after unfreeze") } } -func TestHistoricalScoreDecay(t *testing.T) { - primary := newMockEngine(engine.IOUring) - secondary := newMockEngine(engine.Epoll) - sampler := newSyntheticSampler() - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) +// TestSwitchTriggersFreezeSuppression is deferred until SetFreezeSuppressor +// is implemented on *Engine (post-v1.0.0). 
+ +// --- Part A: feature-gated start-engine selection ------------------------ - base := time.Now() - e.ctrl.state.lastActiveScore[engine.Epoll] = 100.0 - e.ctrl.state.lastActiveTime[engine.Epoll] = base +// TestChooseStartEngine covers the capability rule (no env override). +func TestChooseStartEngine(t *testing.T) { + t.Setenv("CELERIS_ADAPTIVE_START", "") // ensure auto - // At t=0, score should be 100. - got := e.ctrl.historicalScore(engine.Epoll, base) - if got != 100.0 { - t.Errorf("at t=0: score = %f, want 100.0", got) + tests := []struct { + name string + profile engine.CapabilityProfile + want engine.EngineType + }{ + { + name: "7.0 bundles era", + profile: engine.CapabilityProfile{KernelMajor: 7, KernelMinor: 0}, + want: engine.IOUring, + }, + { + name: "6.10 bundles era boundary", + profile: engine.CapabilityProfile{KernelMajor: 6, KernelMinor: 10}, + want: engine.IOUring, + }, + { + name: "6.1 fast tier (defer+single+multishotrecv+providedbuffers)", + profile: engine.CapabilityProfile{ + KernelMajor: 6, KernelMinor: 1, + DeferTaskrun: true, SingleIssuer: true, + MultishotRecv: true, ProvidedBuffers: true, + }, + want: engine.IOUring, + }, + { + name: "5.15 LTS (no fast tier)", + profile: engine.CapabilityProfile{KernelMajor: 5, KernelMinor: 15}, + want: engine.Epoll, + }, + { + name: "6.1 missing one fast-tier bit -> epoll", + profile: engine.CapabilityProfile{ + KernelMajor: 6, KernelMinor: 1, + DeferTaskrun: true, SingleIssuer: true, + MultishotRecv: true, ProvidedBuffers: false, + }, + want: engine.Epoll, + }, } - // At t=50s, score should be 50 (50% decay). - got = e.ctrl.historicalScore(engine.Epoll, base.Add(50*time.Second)) - if got != 50.0 { - t.Errorf("at t=50s: score = %f, want 50.0", got) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := chooseStartEngine(tt.profile); got != tt.want { + t.Errorf("chooseStartEngine = %v, want %v", got, tt.want) + } + }) } +} - // At t=100s, score should be 0 (100% decay). 
- got = e.ctrl.historicalScore(engine.Epoll, base.Add(100*time.Second)) - if got != 0.0 { - t.Errorf("at t=100s: score = %f, want 0.0", got) +// TestChooseStartEngineEnvOverride covers CELERIS_ADAPTIVE_START. +func TestChooseStartEngineEnvOverride(t *testing.T) { + // An old-kernel profile that the capability rule would send to epoll. + oldKernel := engine.CapabilityProfile{KernelMajor: 5, KernelMinor: 15} + // A modern profile the rule would send to io_uring. + newKernel := engine.CapabilityProfile{KernelMajor: 7, KernelMinor: 0} + + tests := []struct { + val string + profile engine.CapabilityProfile + want engine.EngineType + }{ + {"iouring", oldKernel, engine.IOUring}, // force iouring on old kernel + {"epoll", newKernel, engine.Epoll}, // force epoll on new kernel + {"auto", oldKernel, engine.Epoll}, // auto -> rule + {"auto", newKernel, engine.IOUring}, // auto -> rule + {"garbage", newKernel, engine.IOUring}, // unknown -> falls back to rule } - // At t=200s, score should still be 0 (clamped). - got = e.ctrl.historicalScore(engine.Epoll, base.Add(200*time.Second)) - if got != 0.0 { - t.Errorf("at t=200s: score = %f, want 0.0", got) + for _, tt := range tests { + t.Run(tt.val, func(t *testing.T) { + t.Setenv("CELERIS_ADAPTIVE_START", tt.val) + if got := chooseStartEngine(tt.profile); got != tt.want { + t.Errorf("CELERIS_ADAPTIVE_START=%q chooseStartEngine = %v, want %v", tt.val, got, tt.want) + } + }) } } -func TestHistoricalScoreSeeding(t *testing.T) { - primary := newMockEngine(engine.IOUring) - secondary := newMockEngine(engine.Epoll) +// --- Part B: lazy standby build on first switch + reuse ------------------ + +// newLazyAdaptive builds an adaptive engine in the LAZY shape: epoll is the +// eager active (primary), io_uring is nil and built on demand by a counting +// buildStandby. listenCtx/listenWG are populated as Listen would, so +// performSwitch can launch + bind-wait the lazily-built standby. 
The returned +// counter pointer records how many times buildStandby ran. +func newLazyAdaptive(t *testing.T) (*Engine, *syntheticSampler, *mockEngine, *int) { + t.Helper() + active := newMockEngine(engine.Epoll) sampler := newSyntheticSampler() - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) + var builtCount int + lazyStandby := newMockEngine(engine.IOUring) + e := &Engine{ + primary: active, // epoll active + secondary: nil, // io_uring lazy standby + cfg: resource.Config{Protocol: engine.HTTP1}, + logger: testLogger(), + startType: engine.Epoll, + } + e.buildStandby = func() (engine.Engine, error) { + builtCount++ + return lazyStandby, nil + } + e.ctrl = newController(e.primary, e.secondary, sampler, e.logger) + e.ctrl.state.activeIsPrimary = true e.ctrl.cooldown = 0 + var ap engine.Engine = active + e.active.Store(&ap) + + // Wire the Listen-owned ctx + wait group so performSwitch can start the + // lazily-built standby's Listen goroutine. + ctx, cancel := context.WithCancel(t.Context()) + t.Cleanup(cancel) + var wg sync.WaitGroup + // Hold the wait group above zero for the test's lifetime (mirrors the + // eval-loop goroutine being a wg member in production), so wg.Go from + // performSwitch is always called while the counter is positive. + wg.Add(1) + t.Cleanup(func() { wg.Done() }) + e.listenCtx = ctx + e.listenWG = &wg + + return e, sampler, lazyStandby, &builtCount +} - // Active (io_uring) has score 100. No standby history yet. - sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: 100}) - - now := time.Now() - e.ctrl.evaluate(now, false) +// TestLazyStandbyBuiltOnFirstSwitchAndReused verifies the lazy New() path +// builds the standby exactly once (on the first switch) and reuses it on a +// subsequent switch back. +func TestLazyStandbyBuiltOnFirstSwitchAndReused(t *testing.T) { + e, _, lazyStandby, builtCount := newLazyAdaptive(t) - // Standby should be seeded at 80% of active. 
- standbyScore, ok := e.ctrl.state.lastActiveScore[engine.Epoll] - if !ok { - t.Fatal("expected standby score to be seeded") + if e.secondary != nil { + t.Fatal("standby must be nil before the first switch") + } + if *builtCount != 0 { + t.Fatalf("buildStandby ran %d times before any switch, want 0", *builtCount) } - // Active score = 1.0*100 - 2.0*0 = 100. 80% = 80. - if standbyScore != 80.0 { - t.Errorf("standby seed score = %f, want 80.0", standbyScore) + // First switch: epoll -> io_uring. Must build + cache + activate the standby. + e.performSwitch() + + if *builtCount != 1 { + t.Fatalf("buildStandby ran %d times after first switch, want 1", *builtCount) + } + if e.secondary == nil { + t.Fatal("standby must be cached after the first switch") + } + if e.ActiveEngine().Type() != engine.IOUring { + t.Fatalf("active = %v after first switch, want io_uring", e.ActiveEngine().Type()) + } + // The lazily-built standby must have been Listen'd (its goroutine ran). + select { + case <-lazyStandby.listenStarted: + case <-time.After(2 * time.Second): + t.Fatal("lazy standby Listen was never started") + } + // New active (io_uring) must have been resumed; old active (epoll) paused. + if lazyStandby.resumeCalls.Load() == 0 { + t.Error("expected ResumeAccept on the freshly-built (now active) standby") + } + if e.primary.(*mockEngine).pauseCalls.Load() == 0 { + t.Error("expected PauseAccept on the old active (epoll)") } -} -func TestSwitchAfterActiveDegrades(t *testing.T) { - primary := newMockEngine(engine.IOUring) - secondary := newMockEngine(engine.Epoll) - sampler := newSyntheticSampler() + // Second switch: io_uring -> epoll. The standby slot for THIS direction is + // epoll (e.primary, already non-nil), so no new build happens. The io_uring + // engine stays cached too. 
+ e.performSwitch() - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - e.ctrl.cooldown = 0 + if *builtCount != 1 { + t.Fatalf("buildStandby ran %d times after second switch, want 1 (reuse)", *builtCount) + } + if e.ActiveEngine().Type() != engine.Epoll { + t.Fatalf("active = %v after second switch, want epoll", e.ActiveEngine().Type()) + } + if e.secondary != lazyStandby { + t.Error("io_uring standby must remain cached (same instance) after switching back") + } - now := time.Now() + // Third switch: epoll -> io_uring again. Reuses the cached io_uring standby. + e.performSwitch() + if *builtCount != 1 { + t.Fatalf("buildStandby ran %d times after third switch, want 1 (reuse)", *builtCount) + } + if e.ActiveEngine().Type() != engine.IOUring { + t.Fatalf("active = %v after third switch, want io_uring", e.ActiveEngine().Type()) + } +} - // Pre-seed standby (epoll) with a strong historical score. - e.ctrl.state.lastActiveScore[engine.Epoll] = 100.0 - e.ctrl.state.lastActiveTime[engine.Epoll] = now +// TestLazyStandbyBuildFailureAbortsSwitch verifies a buildStandby error aborts +// the switch cleanly: the active engine is unchanged and the standby slot stays +// nil. +func TestLazyStandbyBuildFailureAbortsSwitch(t *testing.T) { + e, _, _, _ := newLazyAdaptive(t) + e.buildStandby = func() (engine.Engine, error) { + return nil, errBuildFailed + } - // Active (io_uring) degrades to 50 RPS. - sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: 50}) + e.performSwitch() - // Standby historical = 100 * (1 - 0.01*0) = 100. - // Active score = 50. 100 > 50*1.15 = 57.5 → switch. 
- if !e.ctrl.evaluate(now, false) { - t.Error("expected switch when active degrades below standby historical") + if e.ActiveEngine().Type() != engine.Epoll { + t.Fatalf("active changed to %v after a failed build; must stay epoll", e.ActiveEngine().Type()) + } + if e.secondary != nil { + t.Fatal("standby slot must stay nil after a failed build") + } + if e.ctrl.state.activeIsPrimary != true { + t.Fatal("controller direction must be unchanged after a failed build") } } -// TestSwitchTriggersFreezeSuppression is deferred until SetFreezeSuppressor -// is implemented on *Engine (post-v1.0.0). +var errBuildFailed = errors.New("build failed") diff --git a/adaptive/score.go b/adaptive/score.go deleted file mode 100644 index 43592fc0..00000000 --- a/adaptive/score.go +++ /dev/null @@ -1,89 +0,0 @@ -//go:build linux - -package adaptive - -// ScoreWeights defines the weighting for each telemetry signal in score -// computation. Higher throughput weight favors faster engines; higher error -// weight penalizes unreliable ones. -type ScoreWeights struct { - // Throughput is the weight applied to requests-per-second in the score. - Throughput float64 - // ErrorRate is the penalty weight applied to the error fraction. - ErrorRate float64 -} - -// DefaultWeights returns the default score weights. -func DefaultWeights() ScoreWeights { - return ScoreWeights{ - Throughput: 1.0, - ErrorRate: 2.0, - } -} - -// computeScore produces a weighted score from a telemetry snapshot. -// Higher is better. -func computeScore(snap TelemetrySnapshot, w ScoreWeights) float64 { - return w.Throughput*snap.ThroughputRPS - w.ErrorRate*snap.ErrorRate -} - -// ioUringBias returns a positive value when conditions favor io_uring over -// epoll, based on benchmark data. The bias is added to io_uring's score (or -// subtracted from epoll's) to enable proactive switching without requiring -// a throughput advantage. 
-// -// Signals from 3-run median benchmarks (2026-03-27): -// - io_uring CPU efficiency: 2-8% less CPU at 256-4096 connections -// - io_uring p99 tail: 17-39% better at 1024-4096 connections -// - Throughput: equivalent (±2%) at 256-4096 connections -// - io_uring loses at <64 connections (fixed overhead) and >8192 on x86 -// -// Pre-v1.5.0 this function was effectively dead: liveSampler.cpuMon was -// never assigned (celeris#316), so snap.CPUUtilization was always 0 and -// cpuFactor was always 0, making the function return 0 unconditionally. -// v1.5.0 wires the CPU monitor at engine construction; the bias now fires -// when CPU is above 30% and conns land in the 128-8192 sweet spot. -// -// The conn-factor falloff above 8192 is the empirical cost structure on -// x86 — see the bench data above — not a bug. If #318 identifies a -// different cause (e.g. a PbufRing bottleneck, fixed by #322), update -// this comment to point at the new finding. -func ioUringBias(snap TelemetrySnapshot, enabled bool) float64 { - // Off by default (celeris#341): this heuristic never reads the standby's - // measured throughput, so an ungated bias can speculatively switch adaptive - // onto a measurably-slower engine. Gated behind envIOUringBias. - if !enabled { - return 0 - } - conns := snap.ActiveConnections - cpu := snap.CPUUtilization - - // No bias at very low or very high connection counts. - if conns < 128 || conns > 8192 { - return 0 - } - - // Base bias: proportional to connection count in the sweet spot. - // Peaks at 1024-4096 connections where io_uring's advantages are strongest. - var connFactor float64 - switch { - case conns < 256: - connFactor = 0.1 - case conns < 1024: - connFactor = 0.3 - case conns <= 4096: - connFactor = 0.5 // Peak: io_uring's best range - default: - connFactor = 0.2 // 4096-8192: still beneficial but declining - } - - // CPU factor: bias increases when CPU-bound (io_uring's efficiency advantage - // matters most when cores are saturated). 
No bias below 30% CPU. - cpuFactor := 0.0 - if cpu > 0.3 { - cpuFactor = (cpu - 0.3) / 0.7 // 0.0 at 30% CPU, 1.0 at 100% - } - - // Combined bias: connection suitability × CPU pressure. - // Maximum bias is 0.5 (50% of throughput score). - return connFactor * cpuFactor -} diff --git a/adaptive/score_test.go b/adaptive/score_test.go deleted file mode 100644 index 9f9757a0..00000000 --- a/adaptive/score_test.go +++ /dev/null @@ -1,55 +0,0 @@ -//go:build linux - -package adaptive - -import "testing" - -func TestIoUringBiasConnFactorFalloff(t *testing.T) { - tests := []struct { - name string - conns int64 - cpu float64 - wantBias float64 - tolerance float64 - }{ - {name: "below 128: zero", conns: 64, cpu: 0.9, wantBias: 0.0}, - {name: "at 128 boundary: connFactor 0.1", conns: 128, cpu: 1.0, wantBias: 0.1 * (1.0 - 0.3) / 0.7}, - {name: "256 conns peak: 0.5", conns: 2048, cpu: 1.0, wantBias: 0.5 * (1.0 - 0.3) / 0.7}, - {name: "4096 boundary: 0.5 still", conns: 4096, cpu: 1.0, wantBias: 0.5 * (1.0 - 0.3) / 0.7}, - {name: "4097: drops to 0.2", conns: 4097, cpu: 1.0, wantBias: 0.2 * (1.0 - 0.3) / 0.7}, - {name: "above 8192: zero", conns: 9000, cpu: 0.9, wantBias: 0.0}, - {name: "low CPU: zero cpuFactor", conns: 2048, cpu: 0.20, wantBias: 0.5 * 0.0}, - {name: "exactly 30% CPU: zero cpuFactor", conns: 2048, cpu: 0.30, wantBias: 0.5 * 0.0}, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - snap := TelemetrySnapshot{ - ActiveConnections: tt.conns, - CPUUtilization: tt.cpu, - } - got := ioUringBias(snap, true) - if absDiff(got, tt.wantBias) > tt.tolerance { - t.Errorf("ioUringBias(conns=%d, cpu=%.2f) = %v, want %v ± %v", - tt.conns, tt.cpu, got, tt.wantBias, tt.tolerance) - } - }) - } -} - -func TestComputeScoreTwoTerm(t *testing.T) { - w := DefaultWeights() - snap := TelemetrySnapshot{ThroughputRPS: 1000, ErrorRate: 0.05} - got := computeScore(snap, w) - want := 1.0*1000.0 - 2.0*0.05 - if got != want { - t.Errorf("computeScore = %v, want %v", got, want) 
- } -} - -func absDiff(a, b float64) float64 { - d := a - b - if d < 0 { - return -d - } - return d -} diff --git a/adaptive/telemetry.go b/adaptive/telemetry.go index 992d74d2..5b52d1a1 100644 --- a/adaptive/telemetry.go +++ b/adaptive/telemetry.go @@ -22,6 +22,20 @@ type TelemetrySnapshot struct { // CPUUtilization is the estimated CPU usage fraction (0.0-1.0). Read from // the live sampler's CPUMonitor; zero when no monitor is wired. CPUUtilization float64 + // ConnsPerWorker is ActiveConnections divided by the engine's worker + // count. This is the PRIMARY load signal driving engine selection: + // epoll and io_uring tie at low conns/worker, but io_uring pulls ahead + // and keeps scaling above ~20/worker while epoll plateaus. + ConnsPerWorker float64 + // AcceptRate is the new-connection arrival rate (accepts/sec) over the + // last sampling interval, derived like ThroughputRPS. A secondary load + // signal: a high accept rate indicates connection churn. + AcceptRate float64 + // BytesPerReq is the average payload size (read+written bytes per + // request) over the last interval. When this exceeds the controller's + // large-payload threshold the workload is link-bound — the engines tie + // — so the controller suppresses an io_uring switch to avoid churn. + BytesPerReq float64 } // TelemetrySampler produces telemetry snapshots from an engine. @@ -54,6 +68,11 @@ func (s *liveSampler) Sample(e engine.Engine) TelemetrySnapshot { ActiveConnections: m.ActiveConnections, } + // ConnsPerWorker is a point-in-time ratio (not a delta), so it needs no + // prior sample. max(Workers, 1) guards the pre-Listen window where the + // worker count is still zero. 
+ snap.ConnsPerWorker = float64(m.ActiveConnections) / float64(max(m.Workers, 1)) + prev, hasPrev := s.prevMetrics[et] prevT, hasT := s.prevTime[et] if hasPrev && hasT { @@ -61,9 +80,13 @@ func (s *liveSampler) Sample(e engine.Engine) TelemetrySnapshot { if elapsed > 0 { deltaReqs := m.RequestCount - prev.RequestCount deltaErrs := m.ErrorCount - prev.ErrorCount + deltaAccepts := m.AcceptCount - prev.AcceptCount + deltaBytes := (m.BytesRead + m.BytesWritten) - (prev.BytesRead + prev.BytesWritten) snap.ThroughputRPS = float64(deltaReqs) / elapsed + snap.AcceptRate = float64(deltaAccepts) / elapsed if deltaReqs > 0 { snap.ErrorRate = float64(deltaErrs) / float64(deltaReqs) + snap.BytesPerReq = float64(deltaBytes) / float64(deltaReqs) } } } diff --git a/engine/engine.go b/engine/engine.go index 73ce169f..aeeaf6b2 100644 --- a/engine/engine.go +++ b/engine/engine.go @@ -106,4 +106,27 @@ type EngineMetrics struct { //nolint:revive // user-approved name // how often the inline → goroutine handoff fires vs the pure-sync // inline fast path. AsyncPromotedConns uint64 + // Workers is the number of I/O workers (io_uring) or event loops + // (epoll) the engine is running. Static after Listen. The adaptive + // controller divides ActiveConnections by it to derive the + // conns-per-worker load signal that drives engine selection. + Workers int + // AcceptCount is the cumulative number of connections accepted by this + // engine since start. Together with elapsed time it yields the accept + // rate (new-connection arrival rate) used as a secondary load signal. + AcceptCount uint64 + // CloseCount is the cumulative number of connections closed by this + // engine since start. AcceptCount - CloseCount tracks the live count; + // a high close rate relative to accepts indicates short-lived + // churn-style connections. + CloseCount uint64 + // BytesRead is the cumulative number of payload bytes received from + // the network across all connections. 
Used with BytesWritten and + // RequestCount to derive the average bytes-per-request signal that + // suppresses io_uring selection for link-bound (large-payload) + // workloads where the engines tie. + BytesRead uint64 + // BytesWritten is the cumulative number of payload bytes sent to the + // network across all connections. See BytesRead. + BytesWritten uint64 } diff --git a/engine/epoll/engine.go b/engine/epoll/engine.go index 6bc69970..a1f49d14 100644 --- a/engine/epoll/engine.go +++ b/engine/epoll/engine.go @@ -33,6 +33,17 @@ type Engine struct { // asyncPromoted counts inline → dispatch-goroutine promotions // across all loops (celeris #300). asyncPromoted atomic.Uint64 + // acceptCount / closeCount track cumulative connection lifecycle + // events; bytesRead / bytesWritten track cumulative payload bytes. + // All four feed the adaptive controller's load signals. Bytes are + // batched per-loop and flushed once per event-loop iteration + // (mirroring reqCount) to avoid hot-path cache-line bouncing; + // accepts/closes are infrequent so they increment directly like + // activeConns. + acceptCount atomic.Uint64 + closeCount atomic.Uint64 + bytesRead atomic.Uint64 + bytesWritten atomic.Uint64 } // asyncRoutes is the static AsyncRoutes count snapshotted at // construction from the handler's AsyncRouteCount (#300 G3). 
@@ -90,7 +101,9 @@ func (e *Engine) Listen(ctx context.Context) error { l := newLoop(i, cpus[i], e.handler, resolved, e.cfg, &e.metrics.reqCount, &e.metrics.activeConns, &e.metrics.errCount, - &e.metrics.asyncPromoted, &e.acceptPaused) + &e.metrics.asyncPromoted, &e.acceptPaused, + &e.metrics.acceptCount, &e.metrics.closeCount, + &e.metrics.bytesRead, &e.metrics.bytesWritten) e.loops[i] = l } e.mu.Unlock() @@ -165,6 +178,11 @@ func (e *Engine) Metrics() engine.EngineMetrics { ErrorCount: e.metrics.errCount.Load(), AsyncRoutes: e.asyncRoutes, AsyncPromotedConns: e.metrics.asyncPromoted.Load(), + Workers: len(e.loops), + AcceptCount: e.metrics.acceptCount.Load(), + CloseCount: e.metrics.closeCount.Load(), + BytesRead: e.metrics.bytesRead.Load(), + BytesWritten: e.metrics.bytesWritten.Load(), } } diff --git a/engine/epoll/loop.go b/engine/epoll/loop.go index b8c67b6c..58369767 100644 --- a/engine/epoll/loop.go +++ b/engine/epoll/loop.go @@ -88,14 +88,20 @@ type Loop struct { // re-arms the signal. 
listenFDClosed atomic.Bool - reqCount *atomic.Uint64 - activeConns *atomic.Int64 - errCount *atomic.Uint64 - asyncPromoted *atomic.Uint64 // cumulative inline → dispatch promotions (#300) - reqBatch uint64 // batched request count, flushed to reqCount per iteration - tickCounter uint32 - consecutiveEmpty uint32 // consecutive iterations with no events (for adaptive timeout) - cachedNow int64 // cached time.Now().UnixNano(), refreshed once per events return + reqCount *atomic.Uint64 + activeConns *atomic.Int64 + errCount *atomic.Uint64 + asyncPromoted *atomic.Uint64 // cumulative inline → dispatch promotions (#300) + acceptCount *atomic.Uint64 // cumulative accepts (engine-wide, shared) + closeCount *atomic.Uint64 // cumulative closes (engine-wide, shared) + bytesRead *atomic.Uint64 // cumulative recv payload bytes (engine-wide, shared) + bytesWritten *atomic.Uint64 // cumulative send payload bytes (engine-wide, shared) + reqBatch uint64 // batched request count, flushed to reqCount per iteration + bytesReadBatch uint64 // batched recv bytes, flushed to bytesRead per iteration + bytesWrittenBatch uint64 // batched send bytes, flushed to bytesWritten per iteration + tickCounter uint32 + consecutiveEmpty uint32 // consecutive iterations with no events (for adaptive timeout) + cachedNow int64 // cached time.Now().UnixNano(), refreshed once per events return // fdCapDrops counts accepted fds that fell outside the l.conns table // (fd >= connTableSize) and were force-closed in acceptAll. 
Worker- @@ -143,7 +149,8 @@ type Loop struct { func newLoop(id, cpuID int, handler stream.Handler, resolved resource.ResolvedResources, cfg resource.Config, reqCount *atomic.Uint64, activeConns *atomic.Int64, errCount *atomic.Uint64, - asyncPromoted *atomic.Uint64, acceptPaused *atomic.Bool) *Loop { + asyncPromoted *atomic.Uint64, acceptPaused *atomic.Bool, + acceptCount, closeCount, bytesRead, bytesWritten *atomic.Uint64) *Loop { return &Loop{ id: id, @@ -175,6 +182,10 @@ func newLoop(id, cpuID int, handler stream.Handler, activeConns: activeConns, errCount: errCount, asyncPromoted: asyncPromoted, + acceptCount: acceptCount, + closeCount: closeCount, + bytesRead: bytesRead, + bytesWritten: bytesWritten, h2cfg: conn.H2Config{ MaxConcurrentStreams: cfg.MaxConcurrentStreams, InitialWindowSize: cfg.InitialWindowSize, @@ -461,7 +472,7 @@ func (l *Loop) run(ctx context.Context) { if mu := cs.detachMu; mu != nil { mu.Lock() } - err := flushWrites(cs) + err := l.flushWrites(cs, true) if err != nil { // Surface I/O failure to detached middleware before closing. if cs.h1State != nil && cs.h1State.OnError != nil { @@ -506,7 +517,7 @@ func (l *Loop) run(ctx context.Context) { if cs != nil && cs.h2State != nil && cs.h2State.WriteQueuePending() { cs.h2State.DrainWriteQueue(cs.writeFn) if cs.writePos < len(cs.writeBuf) { - if fErr := flushWrites(cs); fErr != nil { + if fErr := l.flushWrites(cs, true); fErr != nil { l.removeDirty(cs) l.closeConn(fd) } else if cs.writePos < len(cs.writeBuf) { @@ -527,6 +538,17 @@ func (l *Loop) run(ctx context.Context) { l.reqBatch = 0 } + // Flush batched payload-byte counters with the same per-iteration + // cadence as reqCount, for the same cache-line-contention reason. + if l.bytesReadBatch > 0 { + l.bytesRead.Add(l.bytesReadBatch) + l.bytesReadBatch = 0 + } + if l.bytesWrittenBatch > 0 { + l.bytesWritten.Add(l.bytesWrittenBatch) + l.bytesWrittenBatch = 0 + } + // Check connection timeouts. 
Default cadence is every 1024 iterations // (~100ms under load); when detached conns exist with idle deadlines // the gate tightens to every 32 iterations (~50ms idle wall time) @@ -656,6 +678,7 @@ func (l *Loop) acceptAll(ctx context.Context, now int64) { } cs.writeFn = l.makeWriteFn(cs) l.activeConns.Add(1) + l.acceptCount.Add(1) if l.cfg.OnConnect != nil { l.cfg.OnConnect(cs.remoteAddr) @@ -714,7 +737,7 @@ func (l *Loop) drainRead(fd int, now int64) { if mu := cs.detachMu; mu != nil { mu.Lock() } - _ = flushWrites(cs) + _ = l.flushWrites(cs, true) // Surface read failure to detached middleware (e.g. WS). if cs.h1State != nil && cs.h1State.OnError != nil { cs.h1State.OnError(err) @@ -729,7 +752,7 @@ func (l *Loop) drainRead(fd int, now int64) { if mu := cs.detachMu; mu != nil { mu.Lock() } - _ = flushWrites(cs) + _ = l.flushWrites(cs, true) // Surface peer-close (EOF) to detached middleware. if cs.h1State != nil && cs.h1State.OnError != nil { cs.h1State.OnError(errPeerClosed) @@ -742,6 +765,9 @@ func (l *Loop) drainRead(fd int, now int64) { } cs.lastActivity = now + // n > 0 here (the err and EOF cases returned above): bytes read on + // this iteration, into either cs.buf or the zero-copy bodyBuf. 
+ l.bytesReadBatch += uint64(n) // Direct-into-bodyBuf completion path: we read straight into // H1State.bodyBuf; dispatch the handler if the body is full, @@ -764,7 +790,7 @@ func (l *Loop) drainRead(fd int, now int64) { if mu := cs.detachMu; mu != nil { mu.Lock() } - _ = flushWrites(cs) + _ = l.flushWrites(cs, true) if mu := cs.detachMu; mu != nil { mu.Unlock() } @@ -781,7 +807,7 @@ func (l *Loop) drainRead(fd int, now int64) { if mu := cs.detachMu; mu != nil { mu.Lock() } - _ = flushWrites(cs) + _ = l.flushWrites(cs, true) if mu := cs.detachMu; mu != nil { mu.Unlock() } @@ -792,7 +818,7 @@ func (l *Loop) drainRead(fd int, now int64) { if mu := cs.detachMu; mu != nil { mu.Lock() } - if err := flushWrites(cs); err != nil { + if err := l.flushWrites(cs, true); err != nil { if mu := cs.detachMu; mu != nil { mu.Unlock() } @@ -942,7 +968,7 @@ func (l *Loop) drainRead(fd int, now int64) { // skipped by the `continue` so we must flush explicitly here, // otherwise the client blocks forever waiting for the 101. 
if cs.writePos < len(cs.writeBuf) { - if fErr := flushWrites(cs); fErr != nil { + if fErr := l.flushWrites(cs, true); fErr != nil { l.closeConn(fd) return } @@ -1003,7 +1029,7 @@ func (l *Loop) drainRead(fd int, now int64) { if mu := cs.detachMu; mu != nil { mu.Lock() } - _ = flushWrites(cs) + _ = l.flushWrites(cs, true) cs.pendingBytes = 0 if cs.h1State != nil && cs.h1State.OnError != nil { cs.h1State.OnError(processErr) @@ -1024,7 +1050,7 @@ func (l *Loop) drainRead(fd int, now int64) { mu.Lock() } if csWritePending(cs) { - if fErr := flushWrites(cs); fErr != nil { + if fErr := l.flushWrites(cs, true); fErr != nil { if cs.h1State != nil && cs.h1State.OnError != nil { cs.h1State.OnError(fErr) } @@ -1118,6 +1144,7 @@ func (l *Loop) hijackConn(fd int) (net.Conn, error) { l.conns[fd] = nil l.connCount-- l.activeConns.Add(-1) + l.closeCount.Add(1) f := os.NewFile(uintptr(fd), "tcp") c, err := net.FileConn(f) @@ -1650,7 +1677,10 @@ func (l *Loop) runAsyncHandler(cs *connState) { // flushWrites may partially complete; residual bytes // stay in cs.writeBuf and the worker retries via // markDirty once drainDetachQueue picks us up. - if err := flushWrites(cs); err != nil { + // Dispatch-goroutine call site: not the loop thread, so + // byte accounting must hit the shared atomic, not the + // per-loop batch (false → atomic path). + if err := l.flushWrites(cs, false); err != nil { promoteErr = err } } @@ -1733,7 +1763,8 @@ func (l *Loop) runAsyncHandler(cs *connState) { // race the middleware's own write path. if !errors.Is(processErr, conn.ErrHijacked) && (cs.writePos < len(cs.writeBuf) || len(cs.bodyBuf) > 0) { - flushErr = flushWrites(cs) + // Dispatch-goroutine call site (false → shared-atomic byte path). + flushErr = l.flushWrites(cs, false) } // Resync pendingBytes with actual buffer state. 
makeWriteFn uses // pendingBytes to enforce writeCap backpressure; without this @@ -1940,7 +1971,7 @@ func (l *Loop) handleWritable(cs *connState) { if mu := cs.detachMu; mu != nil { mu.Lock() } - err := flushWrites(cs) + err := l.flushWrites(cs, true) drained := err == nil && !csWritePending(cs) if err == nil { if drained { @@ -2233,6 +2264,7 @@ func (l *Loop) closeConn(fd int) { l.conns[fd] = nil l.connCount-- l.activeConns.Add(-1) + l.closeCount.Add(1) if l.cfg.OnDisconnect != nil { l.cfg.OnDisconnect(cs.remoteAddr) diff --git a/engine/epoll/review_v150_test.go b/engine/epoll/review_v150_test.go index 64538b57..745c2b0a 100644 --- a/engine/epoll/review_v150_test.go +++ b/engine/epoll/review_v150_test.go @@ -80,11 +80,15 @@ func TestCheckTimeoutsClosesAllExpired(t *testing.T) { t.Cleanup(func() { _ = unix.Close(epfd) }) l := &Loop{ - epollFD: epfd, - conns: make([]*connState, 1024), - liveConns: make([]int, 0, 8), - activeConns: &atomic.Int64{}, - cfg: resource.Config{IdleTimeout: time.Second}, + epollFD: epfd, + conns: make([]*connState, 1024), + liveConns: make([]int, 0, 8), + activeConns: &atomic.Int64{}, + closeCount: &atomic.Uint64{}, + acceptCount: &atomic.Uint64{}, + bytesRead: &atomic.Uint64{}, + bytesWritten: &atomic.Uint64{}, + cfg: resource.Config{IdleTimeout: time.Second}, } past := time.Now().Add(-time.Hour).UnixNano() @@ -168,17 +172,21 @@ func TestAcceptAllDrainsBacklogNoStrand(t *testing.T) { } l := &Loop{ - epollFD: epfd, - listenFD: lfd, - conns: make([]*connState, connTableSize), - liveConns: make([]int, 0, 256), - activeConns: &atomic.Int64{}, - errCount: &atomic.Uint64{}, - reqCount: &atomic.Uint64{}, - eventFD: -1, - timerFD: -1, - resolved: resource.ResolvedResources{BufferSize: 8192}, - cfg: resource.Config{}, + epollFD: epfd, + listenFD: lfd, + conns: make([]*connState, connTableSize), + liveConns: make([]int, 0, 256), + activeConns: &atomic.Int64{}, + errCount: &atomic.Uint64{}, + reqCount: &atomic.Uint64{}, + acceptCount: 
&atomic.Uint64{}, + closeCount: &atomic.Uint64{}, + bytesRead: &atomic.Uint64{}, + bytesWritten: &atomic.Uint64{}, + eventFD: -1, + timerFD: -1, + resolved: resource.ResolvedResources{BufferSize: 8192}, + cfg: resource.Config{}, } const want = 200 @@ -261,12 +269,16 @@ func TestHijackDefersReleaseWhileAsyncGoroutineActive(t *testing.T) { } l := &Loop{ - epollFD: epfd, - conns: make([]*connState, connTableSize), - liveConns: make([]int, 0, 8), - activeConns: &atomic.Int64{}, - eventFD: -1, - cfg: resource.Config{}, + epollFD: epfd, + conns: make([]*connState, connTableSize), + liveConns: make([]int, 0, 8), + activeConns: &atomic.Int64{}, + closeCount: &atomic.Uint64{}, + acceptCount: &atomic.Uint64{}, + bytesRead: &atomic.Uint64{}, + bytesWritten: &atomic.Uint64{}, + eventFD: -1, + cfg: resource.Config{}, } // Async-mode conn: detachMu set, dispatch goroutine "alive" (asyncRun). @@ -345,12 +357,16 @@ func TestHijackSyncReleasesImmediately(t *testing.T) { } l := &Loop{ - epollFD: epfd, - conns: make([]*connState, connTableSize), - liveConns: make([]int, 0, 8), - activeConns: &atomic.Int64{}, - eventFD: -1, - cfg: resource.Config{}, + epollFD: epfd, + conns: make([]*connState, connTableSize), + liveConns: make([]int, 0, 8), + activeConns: &atomic.Int64{}, + closeCount: &atomic.Uint64{}, + acceptCount: &atomic.Uint64{}, + bytesRead: &atomic.Uint64{}, + bytesWritten: &atomic.Uint64{}, + eventFD: -1, + cfg: resource.Config{}, } // Sync mode: detachMu nil, no dispatch goroutine. cs := &connState{fd: local, liveIdx: -1} diff --git a/engine/epoll/writer.go b/engine/epoll/writer.go index 75f58dc6..7f4526f4 100644 --- a/engine/epoll/writer.go +++ b/engine/epoll/writer.go @@ -16,16 +16,33 @@ import ( // writev(2) with iovec = [writeBuf[writePos:], bodyBuf]. Saves one // full body-sized memcpy per request compared to appending the body // into writeBuf first. 
-func flushWrites(cs *connState) error { +// onLoopThread distinguishes the worker (event-loop) call sites from the +// async-dispatch-goroutine ones. Loop-thread bytes accumulate in the +// non-atomic per-loop batch (flushed once per iteration, like reqBatch); +// goroutine bytes go straight to the shared atomic. Keeping the two paths +// apart is what makes the batch field worker-thread-only and therefore +// race-free — the dispatch goroutine never touches bytesWrittenBatch. +func (l *Loop) addWrittenBytes(n int, onLoopThread bool) { + if n <= 0 { + return + } + if onLoopThread { + l.bytesWrittenBatch += uint64(n) + return + } + l.bytesWritten.Add(uint64(n)) +} + +func (l *Loop) flushWrites(cs *connState, onLoopThread bool) error { if len(cs.bodyBuf) > 0 { - return flushWritesV(cs) + return l.flushWritesV(cs, onLoopThread) } if cs.writePos >= len(cs.writeBuf) { cs.writeBuf = cs.writeBuf[:0] cs.writePos = 0 // writeBuf is empty — if a zero-copy sendfile response is pending, // drive it now (after any prior pipelined writeBuf bytes flushed). - return flushSendfile(cs) + return l.flushSendfile(cs, onLoopThread) } n, err := unix.Write(cs.fd, cs.writeBuf[cs.writePos:]) if err != nil { @@ -36,6 +53,7 @@ func flushWrites(cs *connState) error { cs.writePos = 0 return err } + l.addWrittenBytes(n, onLoopThread) cs.writePos += n if cs.writePos >= len(cs.writeBuf) { // Fully flushed @@ -43,7 +61,7 @@ func flushWrites(cs *connState) error { cs.writePos = 0 // writeBuf drained this call — continue into the sendfile body if a // sendfile response is pending. Keeps header→body ordering correct. - return flushSendfile(cs) + return l.flushSendfile(cs, onLoopThread) } else if cs.writePos > cap(cs.writeBuf)/2 { // Amortized compaction: only copy when more than half the buffer is consumed. // This reduces the average cost from O(n) per partial write to O(1) amortized. @@ -66,12 +84,13 @@ func flushWrites(cs *connState) error { // the writeBuf path above. 
On a real I/O error the file is released and // the error is surfaced (the caller closes the conn). On completion the // dup'd file is closed and cs.sendfile cleared. -func flushSendfile(cs *connState) error { +func (l *Loop) flushSendfile(cs *connState, onLoopThread bool) error { st := cs.sendfile if st == nil { return nil } - _, err := st.advance(cs.fd) + sent, err := st.advance(cs.fd) + l.addWrittenBytes(int(sent), onLoopThread) if err != nil { if err == unix.EAGAIN || err == unix.EWOULDBLOCK { return nil // send buffer full; resume on next writable edge @@ -120,7 +139,7 @@ func csPendingBytes(cs *connState) int { // is non-nil; clears bodyBuf once fully sent. Partial writev collapses // the remainder into writeBuf so the next flushWrites call uses the // plain write path. -func flushWritesV(cs *connState) error { +func (l *Loop) flushWritesV(cs *connState, onLoopThread bool) error { headerRem := cs.writeBuf[cs.writePos:] iovs := [2][]byte{headerRem, cs.bodyBuf} n, err := unix.Writev(cs.fd, iovs[:]) @@ -133,6 +152,7 @@ func flushWritesV(cs *connState) error { cs.bodyBuf = nil return err } + l.addWrittenBytes(n, onLoopThread) total := len(headerRem) + len(cs.bodyBuf) switch { case n >= total: diff --git a/engine/iouring/check_timeouts_test.go b/engine/iouring/check_timeouts_test.go index 50178e81..8d0c6512 100644 --- a/engine/iouring/check_timeouts_test.go +++ b/engine/iouring/check_timeouts_test.go @@ -65,6 +65,7 @@ func TestCheckTimeoutsSwapRemoveNoSkip(t *testing.T) { liveConns: make([]int, 0, n), errCount: &atomic.Uint64{}, activeConns: &atomic.Int64{}, + closeCount: &atomic.Uint64{}, cfg: resource.Config{IdleTimeout: time.Second}, } past := time.Now().Add(-time.Hour).UnixNano() diff --git a/engine/iouring/engine.go b/engine/iouring/engine.go index e4ebdde5..2b4f67e2 100644 --- a/engine/iouring/engine.go +++ b/engine/iouring/engine.go @@ -35,6 +35,17 @@ type Engine struct { // asyncPromoted counts the cumulative inline → dispatch-goroutine // promotions 
across all workers (celeris #300). asyncPromoted atomic.Uint64 + // acceptCount / closeCount track cumulative connection lifecycle + // events; bytesRead / bytesWritten track cumulative payload bytes. + // All four feed the adaptive controller's load signals. Bytes are + // batched per-worker and flushed once per event-loop iteration + // (mirroring reqCount) to avoid hot-path cache-line bouncing; + // accepts/closes are infrequent so they increment directly like + // activeConns. + acceptCount atomic.Uint64 + closeCount atomic.Uint64 + bytesRead atomic.Uint64 + bytesWritten atomic.Uint64 } // asyncRoutes is cached from the handler's HasAsyncRoutes/route count // at construction so Metrics() doesn't pay the type-assertion per @@ -248,7 +259,9 @@ func (e *Engine) createWorkers(tier TierStrategy, cpus []int, w, err := newWorker(i, cpus[i], tier, e.handler, resolved, e.cfg, &e.metrics.reqCount, &e.metrics.activeConns, &e.metrics.errCount, - &e.metrics.asyncPromoted, &e.acceptPaused) + &e.metrics.asyncPromoted, &e.acceptPaused, + &e.metrics.acceptCount, &e.metrics.closeCount, + &e.metrics.bytesRead, &e.metrics.bytesWritten) if err != nil { // Clean up already-created workers. for _, prev := range workers[:i] { @@ -303,6 +316,11 @@ func (e *Engine) Metrics() engine.EngineMetrics { ErrorCount: e.metrics.errCount.Load(), AsyncRoutes: e.asyncRoutes, AsyncPromotedConns: e.metrics.asyncPromoted.Load(), + Workers: len(e.workers), + AcceptCount: e.metrics.acceptCount.Load(), + CloseCount: e.metrics.closeCount.Load(), + BytesRead: e.metrics.bytesRead.Load(), + BytesWritten: e.metrics.bytesWritten.Load(), } } diff --git a/engine/iouring/worker.go b/engine/iouring/worker.go index fd029415..d7f31de9 100644 --- a/engine/iouring/worker.go +++ b/engine/iouring/worker.go @@ -241,11 +241,17 @@ type Worker struct { // RST as it pauses). 
listenFDClosed atomic.Bool - reqCount *atomic.Uint64 - activeConns *atomic.Int64 - errCount *atomic.Uint64 - asyncPromoted *atomic.Uint64 // cumulative inline → dispatch promotions (#300) - reqBatch uint64 // batched request count, flushed to reqCount per iteration + reqCount *atomic.Uint64 + activeConns *atomic.Int64 + errCount *atomic.Uint64 + asyncPromoted *atomic.Uint64 // cumulative inline → dispatch promotions (#300) + acceptCount *atomic.Uint64 // cumulative accepts (engine-wide, shared) + closeCount *atomic.Uint64 // cumulative closes (engine-wide, shared) + bytesRead *atomic.Uint64 // cumulative recv payload bytes (engine-wide, shared) + bytesWritten *atomic.Uint64 // cumulative send payload bytes (engine-wide, shared) + reqBatch uint64 // batched request count, flushed to reqCount per iteration + bytesReadBatch uint64 // batched recv bytes, flushed to bytesRead per iteration + bytesWrittenBatch uint64 // batched send bytes, flushed to bytesWritten per iteration tickCounter uint32 cachedNow int64 // cached time.Now().UnixNano(), refreshed every 64 iterations @@ -317,7 +323,8 @@ type Worker struct { func newWorker(id, cpuID int, tier TierStrategy, handler stream.Handler, resolved resource.ResolvedResources, cfg resource.Config, reqCount *atomic.Uint64, activeConns *atomic.Int64, errCount *atomic.Uint64, - asyncPromoted *atomic.Uint64, acceptPaused *atomic.Bool) (*Worker, error) { //nolint:unparam // error return used by callers for future fallible init + asyncPromoted *atomic.Uint64, acceptPaused *atomic.Bool, + acceptCount, closeCount, bytesRead, bytesWritten *atomic.Uint64) (*Worker, error) { //nolint:unparam // error return used by callers for future fallible init // Listen socket creation is deferred to run() (after CPU pinning and NUMA // binding) so that the kernel allocates socket internal buffers on the @@ -344,6 +351,10 @@ func newWorker(id, cpuID int, tier TierStrategy, handler stream.Handler, activeConns: activeConns, errCount: errCount, 
asyncPromoted: asyncPromoted, + acceptCount: acceptCount, + closeCount: closeCount, + bytesRead: bytesRead, + bytesWritten: bytesWritten, acceptPaused: acceptPaused, wake: make(chan struct{}), ready: make(chan error, 1), @@ -671,6 +682,17 @@ func (w *Worker) run(ctx context.Context) { w.reqBatch = 0 } + // Flush batched payload-byte counters with the same per-iteration + // cadence as reqCount, for the same cache-line-contention reason. + if w.bytesReadBatch > 0 { + w.bytesRead.Add(w.bytesReadBatch) + w.bytesReadBatch = 0 + } + if w.bytesWrittenBatch > 0 { + w.bytesWritten.Add(w.bytesWrittenBatch) + w.bytesWrittenBatch = 0 + } + // Single atomic publish for all batched buffer returns (P0). if w.hasBufReturns { w.bufRing.PublishBuffers() @@ -1180,6 +1202,7 @@ func (w *Worker) onAcceptedFD(ctx context.Context, newFD int, now int64, isFixed } cs.writeFn = w.makeWriteFn(cs) w.activeConns.Add(1) + w.acceptCount.Add(1) if w.cfg.OnConnect != nil { w.cfg.OnConnect(cs.remoteAddr) @@ -1221,6 +1244,7 @@ func (w *Worker) hijackConn(fd int) (net.Conn, error) { w.conns[fd] = nil w.connCount-- w.activeConns.Add(-1) + w.closeCount.Add(1) // Cancel-then-release discipline, hijack variant: a single-shot // recv SQE is virtually always still armed on cs.buf here. The fd // lives on under the caller's net.Conn, so an uncancelled recv would @@ -1569,6 +1593,9 @@ func (w *Worker) handleRecv(c *completionEntry, fd int, now int64) { } cs.lastActivity = now + // c.Res > 0 here (the c.Res <= 0 cases returned above): bytes received + // on this recv CQE, regardless of which buffer they landed in. + w.bytesReadBatch += uint64(c.Res) // Direct-into-bodyBuf path: the previous recv SQE targeted // H1State.bodyBuf (NextRecvBuf). 
The CQE's Res applies to bodyBuf, @@ -2137,6 +2164,11 @@ func (w *Worker) completeSend(cs *connState, fd int, sent int, now int64) { return } + // sent >= 0 here: payload bytes flushed by this send completion + // (covers regular SEND and the SEND_ZC NOTIF path, both of which + // reach completeSend with the byte count). + w.bytesWrittenBatch += uint64(sent) + // Partial-send handling, split by whether we issued a plain SEND // (sendBuf only) or a WRITEV (sendBuf + sendBody). Partial WRITEV // responses collapse the remainder into sendBuf so the retry path @@ -2488,6 +2520,7 @@ func (w *Worker) finishClose(fd int) { w.conns[fd] = nil w.connCount-- w.activeConns.Add(-1) + w.closeCount.Add(1) if w.cfg.OnDisconnect != nil && cs != nil { w.cfg.OnDisconnect(cs.remoteAddr) @@ -2588,6 +2621,7 @@ func (w *Worker) finishCloseDetached(fd int, cs *connState) { w.conns[fd] = nil w.connCount-- w.activeConns.Add(-1) + w.closeCount.Add(1) if w.cfg.OnDisconnect != nil { w.cfg.OnDisconnect(cs.remoteAddr) From 72d947c40b087f5205fcac5852b86d338e5b0113 Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Fri, 19 Jun 2026 10:10:00 +0200 Subject: [PATCH 21/27] feat(adaptive): kernel/memlock/protocol-gated start engine + new-conn switch Redesign the adaptive start-engine decision around connection pinning: an established conn cannot migrate between epoll and io_uring, so the START engine decides keep-alive throughput and the workload concurrency is unknowable at Listen() time. chooseStartEngine now gates only on t0-knowable facts: env override -> io_uring not viable (kernel fast-tier AND RLIMIT_MEMLOCK can fund the workers) -> Protocol==H2C -> WorkloadHint==HighConcurrency -> default. The default flips from io_uring-on-modern-kernels to EPOLL: every server ramps from zero connections (the low-concurrency regime where epoll wins on both throughput and tail latency). io_uring starts only on an explicit WorkloadHint=HighConcurrency (new operator field) when kernel + memlock allow. 
New helpers/fields: - iouring.MaxWorkersForMemlock(): the memlock worker ceiling, exported so the start decision avoids io_uring's silent 1-worker collapse proactively, not just on construction failure. capWorkersToMemlock derives from it. - resource.Resources.WorkloadHint + root celeris.WorkloadHint (Unspecified/LowConcurrency/HighConcurrency). Runtime switch (controller) re-enabled but constrained: only on the epoll-start path with io_uring viable and a non-h2c protocol, it promotes NEW connections to io_uring when conns/worker sustains the crossover. Pinning keeps the switch inert for a pure keep-alive burst; it helps ramps/churn. The load-driven down-revert is disabled (pinning makes it harmful); the io_uring error-revert stays always-on. Thresholds tuned to the epoll-vs-io_uring sweep: up 20->24, high-watermark 32->48, large-payload suppression 16384->8192 bytes. Empirical basis (msa2-server, kernel 7.0, real NIC): epoll wins <=32 conns (+~20%, ~40% lower tail), tie 64-256, io_uring wins >=~384 conns (~24/worker, +8-13%); io_uring's edge is h1-small-payload only (h2c/large payloads tie) and collapses under low RLIMIT_MEMLOCK (1 worker ~= 1/5 throughput). Validated on the cluster: resource/adaptive/iouring/epoll/root suites pass on real io_uring; default adaptive starts epoll, env/hint force io_uring, and a 1024c load fires the epoll->io_uring switch. Cross-engine connection migration (transplant pinned conns) deferred to a v1.6.0 spike (#383): only H1-idle epoll->io_uring is feasible; H1-mid-request and H2 are impossible under the current parser/HPACK/stream architecture. 
--- adaptive/controller.go | 47 +++++++++----- adaptive/controller_test.go | 24 +++---- adaptive/engine.go | 121 ++++++++++++++++++++++++++---------- adaptive/engine_test.go | 96 +++------------------------- adaptive/start_test.go | 92 +++++++++++++++++++++++++++ config.go | 26 ++++++++ engine/iouring/ring.go | 36 ++++++++--- resource/resource.go | 27 ++++++++ 8 files changed, 311 insertions(+), 158 deletions(-) create mode 100644 adaptive/start_test.go diff --git a/adaptive/controller.go b/adaptive/controller.go index 48b8a7c6..d1a1aef6 100644 --- a/adaptive/controller.go +++ b/adaptive/controller.go @@ -66,17 +66,24 @@ type controller struct { errorRevertRate float64 // io_uring error rate above which we revert to epoll sustainTicks int // consecutive ticks required for a normal switch - // connSwitchEnabled gates the conns-per-worker UP/DOWN switching. It is - // OFF for the kernel regimes where the feature-gated start engine is - // already the best at every concurrency (io_uring-best on bundles/6.10+, - // epoll-best on <6.1) — there, switching only churns and, worse, the - // down-revert fires during idle/warmup dips and strands load on the wrong - // engine (since pinned conns never migrate). The always-on error-revert - // below is independent of this flag. A future middle-tier kernel where a - // genuine crossover exists can re-enable it (validated by the kernel - // matrix). The engine sets it via SetConnSwitchEnabled from the profile. + // connSwitchEnabled gates the conns-per-worker UP switch (epoll→io_uring). + // Production sets it true ONLY on the epoll-start path with io_uring viable + // and a non-h2c protocol: there, a sustained high-concurrency ramp should + // promote NEW connections to io_uring (it wins ≥~24 conns/worker for h1 + // small payloads). It is false when io_uring is the start engine (nothing + // better to switch up to), when io_uring is unviable, or for h2c. The + // always-on error-revert below is independent of this flag. 
The engine sets + // it directly from the profile in New(). connSwitchEnabled bool + // loadDownRevert gates the LOAD-driven io_uring→epoll revert (evaluateDown). + // It is OFF in production: because pinned conns never migrate, reverting on + // a load dip strands established io_uring keep-alives and routes new conns + // back to epoll mid-ramp — pure harm. The always-on error-revert is + // independent of this flag. Defaults true in newController so the + // conns-per-worker unit tests exercise evaluateDown; New() sets it false. + loadDownRevert bool + logger *slog.Logger } @@ -87,16 +94,21 @@ func newController(primary, secondary engine.Engine, sampler TelemetrySampler, l sampler: sampler, evalInterval: 1 * time.Second, cooldown: 30 * time.Second, - upThreshold: 20.0, + // Thresholds from the epoll-vs-io_uring sweep: io_uring overtakes epoll + // at ~24 conns/worker for h1 small payloads (was 20); the heavy-load + // fast-path snaps only well past the crossover (48); large payloads are + // link-bound (engines tie) above 8 KB so suppress the switch there. + upThreshold: 24.0, downThreshold: 12.0, - highWatermark: 32.0, - largePayloadBytes: 16384.0, + highWatermark: 48.0, + largePayloadBytes: 8192.0, errorRevertRate: 0.05, sustainTicks: 2, - // Default ON so the conns-per-worker unit tests exercise the policy; - // the production New() path sets it from the kernel/feature profile - // (currently OFF — the feature-gated start engine is authoritative). + // Both default ON so the conns-per-worker unit tests exercise the policy; + // the production New() path sets connSwitchEnabled from the kernel/feature + // profile and loadDownRevert=false (pinning makes load-revert harmful). 
connSwitchEnabled: true, + loadDownRevert: true, logger: logger, state: controllerState{ activeIsPrimary: true, @@ -138,7 +150,10 @@ func (c *controller) evaluate(now time.Time, frozen bool) bool { c.logSwitch("io_uring", "epoll", "error-rate safety revert", cpw, snap) return true } - if !c.connSwitchEnabled { + if !c.connSwitchEnabled || !c.loadDownRevert { + // Load-driven down-revert is disabled in production: pinned conns + // never migrate, so reverting only strands io_uring keep-alives and + // routes new conns to epoll mid-ramp. Only the error-revert above moves us. return false } return c.evaluateDown(snap, cpw) diff --git a/adaptive/controller_test.go b/adaptive/controller_test.go index 7130d1a0..beaf4c6d 100644 --- a/adaptive/controller_test.go +++ b/adaptive/controller_test.go @@ -65,10 +65,10 @@ func TestConnSwitchDisabledGate(t *testing.T) { } // TestUpSwitchRequiresSustain verifies the normal epoll→io_uring switch needs -// sustainTicks (2) consecutive ticks at or above the up threshold (20). +// sustainTicks (2) consecutive ticks at or above the up threshold (24). func TestUpSwitchRequiresSustain(t *testing.T) { c, sampler := newCtrlEpollActive() - sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 20}) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 24}) now := time.Now() if c.evaluate(now, false) { @@ -88,7 +88,7 @@ func TestUpSwitchSustainResetsOnDip(t *testing.T) { c, sampler := newCtrlEpollActive() now := time.Now() - sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 22}) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 26}) if c.evaluate(now, false) { t.Fatal("first tick must not switch") } @@ -101,7 +101,7 @@ func TestUpSwitchSustainResetsOnDip(t *testing.T) { t.Fatalf("upTicks = %d, want 0 after a dip below the up threshold", c.state.upTicks) } // Back above threshold: needs two more ticks now. 
- sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 22}) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 26}) if c.evaluate(now.Add(2*time.Second), false) { t.Fatal("first tick after reset must not switch") } @@ -111,17 +111,17 @@ func TestUpSwitchSustainResetsOnDip(t *testing.T) { } // TestFastPathSnapsImmediately verifies conns/worker at/above the high -// watermark (32) switches on a single tick. +// watermark (48) switches on a single tick. func TestFastPathSnapsImmediately(t *testing.T) { c, sampler := newCtrlEpollActive() - sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 32}) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 48}) if !c.evaluate(time.Now(), false) { t.Fatal("conns/worker at the high watermark must snap on one tick") } } -// TestHysteresisBandNoFlap verifies the 12–20 band switches neither way. +// TestHysteresisBandNoFlap verifies the 12–24 band switches neither way. func TestHysteresisBandNoFlap(t *testing.T) { for _, cpw := range []float64{12, 14, 16, 18, 19.9} { c, sampler := newCtrlEpollActive() @@ -140,7 +140,7 @@ func TestHysteresisBandNoFlap(t *testing.T) { func TestLargePayloadSuppression(t *testing.T) { c, sampler := newCtrlEpollActive() // Exactly at the threshold counts as large (>=). - sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 64, BytesPerReq: 16384}) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 64, BytesPerReq: 8192}) now := time.Now() for i := range 5 { @@ -150,7 +150,7 @@ func TestLargePayloadSuppression(t *testing.T) { } // Drop below the large-payload threshold → fast path fires. 
- sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 64, BytesPerReq: 16383}) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 64, BytesPerReq: 8191}) if !c.evaluate(now.Add(10*time.Second), false) { t.Fatal("small payload above high watermark must switch") } @@ -204,7 +204,7 @@ func TestCooldownGatesRevert(t *testing.T) { now := time.Now() // First switch is not gated (lastSwitch is zero). - sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 40}) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50}) if !c.evaluate(now, false) { t.Fatal("first switch must not be cooldown-gated") } @@ -273,10 +273,10 @@ func TestRecordSwitchResetsTicks(t *testing.T) { // guards against a config drift in the documented thresholds. func TestDefaultThresholds(t *testing.T) { c, _ := newCtrlEpollActive() - if c.upThreshold != 20 || c.downThreshold != 12 || c.highWatermark != 32 { + if c.upThreshold != 24 || c.downThreshold != 12 || c.highWatermark != 48 { t.Fatalf("threshold drift: up=%.0f down=%.0f hwm=%.0f", c.upThreshold, c.downThreshold, c.highWatermark) } - if c.largePayloadBytes != 16384 || c.errorRevertRate != 0.05 || c.sustainTicks != 2 { + if c.largePayloadBytes != 8192 || c.errorRevertRate != 0.05 || c.sustainTicks != 2 { t.Fatalf("policy drift: largePayload=%.0f errRevert=%.3f sustain=%d", c.largePayloadBytes, c.errorRevertRate, c.sustainTicks) } diff --git a/adaptive/engine.go b/adaptive/engine.go index 77ce3011..3c83a34f 100644 --- a/adaptive/engine.go +++ b/adaptive/engine.go @@ -33,10 +33,12 @@ var ( // activeIsPrimary==true means epoll is active) and secondary is ALWAYS the // io_uring engine. On the public New() path only the START engine is built // eagerly; the other slot stays nil until the first switch actually needs it -// (see buildStandby + performSwitch). On a modern kernel that starts on -// io_uring and never reverts, the epoll standby is never constructed, so its -// GC-rooted heap never exists. 
newFromEngines (tests) populates BOTH slots -// eagerly, exercising the standby-already-exists switch path. +// (see buildStandby + performSwitch). Under the default policy the start engine +// is epoll, so the io_uring standby is built lazily — and only if a sustained +// high-concurrency ramp promotes new conns to it; an engine that never switches +// never constructs its standby, so that heap never exists. newFromEngines +// (tests) populates BOTH slots eagerly, exercising the standby-already-exists +// switch path. type Engine struct { primary engine.Engine // epoll (nil until built when it is the lazy standby) secondary engine.Engine // io_uring (nil until built when it is the lazy standby) @@ -92,41 +94,80 @@ type Engine struct { switchRejected atomic.Uint64 // telemetry: how many switches were blocked by driver FDs } -// chooseStartEngine selects which sub-engine the adaptive meta-engine should -// start (and build eagerly) given the probed io_uring capability profile. +// ioUringViable reports whether io_uring is worth running at all on this host: +// the kernel must expose the fast tier AND RLIMIT_MEMLOCK must be able to fund +// the requested worker count. These are the two t0-knowable disqualifiers from +// the epoll-vs-io_uring sweep: // -// io_uring loses to epoll on old kernels (pre-LTS-stability bugs, missing the -// fast-path setup flags) but wins on modern ones for thin-HTTP, so the start -// engine is feature-gated: -// -// - IOUring when io_uring is mature for thin-HTTP — either the kernel is in -// the "bundles" era (>6.10, where multishot + provided buffers + defer -// taskrun are all stable and tuned) OR the 6.1+ fast tier is present -// (DEFER_TASKRUN + SINGLE_ISSUER + MULTISHOT_RECV + PROVIDED_BUFFERS). -// - Epoll otherwise (old kernels, or io_uring missing the fast tier). 
+// - Kernel/feature: io_uring loses to epoll on old kernels (missing the +// fast-path setup flags); require the "bundles" era (6.10+) OR the 6.1+ +// fast tier (DEFER_TASKRUN + SINGLE_ISSUER + MULTISHOT_RECV + PROVIDED_BUFFERS). +// - Memlock: io_uring's provided-buffer rings need locked pages per worker +// (minMemlockPerWorker). If RLIMIT_MEMLOCK can't fund the requested workers, +// io_uring caps to a fraction of them and its throughput collapses; epoll +// does not memlock buffer rings, so it keeps all workers. In that case +// io_uring is never the right engine. +func ioUringViable(p engine.CapabilityProfile, cfg resource.Config) bool { + bundlesEra := p.KernelMajor > 6 || (p.KernelMajor == 6 && p.KernelMinor >= 10) + fastTier := p.DeferTaskrun && p.SingleIssuer && p.MultishotRecv && p.ProvidedBuffers + if !bundlesEra && !fastTier { + return false + } + wantWorkers := cfg.Resources.Resolve().Workers + if maxW := maxWorkersForMemlock(); maxW != -1 && maxW < wantWorkers { + return false + } + return true +} + +// maxWorkersForMemlock is the io_uring memlock worker-ceiling probe behind a var +// so tests can inject a low cap without mutating the process RLIMIT_MEMLOCK. +var maxWorkersForMemlock = iouring.MaxWorkersForMemlock + +// chooseStartEngine selects which sub-engine the adaptive meta-engine starts +// (and builds eagerly), from facts knowable at Listen() time only. // -// The env var CELERIS_ADAPTIVE_START overrides the rule: +// THE PINNING CONSTRAINT: an established connection cannot migrate between +// epoll and io_uring, so the START engine decides keep-alive throughput; the +// runtime switch can only route NEW connections. And the workload's +// concurrency — the thing that actually decides which engine wins — is +// unknowable here (no connections exist yet). So the start decision is gated +// only on t0-knowable disqualifiers, with a safe default: // -// iouring | epoll — force that start engine. 
-// auto (or unset) — use the capability rule above. +// 1. env override CELERIS_ADAPTIVE_START=iouring|epoll (operator escape hatch). +// 2. io_uring not viable (old kernel / missing fast tier / memlock too low) → epoll. +// 3. configured Protocol == H2C → epoll (io_uring's win is h1-small-payload only; +// h2c never benefits — its framing/HPACK cost dwarfs the engine delta). +// 4. explicit operator WorkloadHint == HighConcurrency → io_uring (the ONLY +// input that can express a high-concurrency expectation up front). +// 5. DEFAULT → epoll. Every server ramps from zero connections, i.e. the +// low-concurrency regime where epoll wins on throughput AND tail latency; +// the runtime switch then promotes new conns to io_uring if sustained +// high load develops. // -// NOTE: the exact capability rule will be refined by a kernel-matrix sweep; -// treat the thresholds here as the current best estimate, not a final answer. -func chooseStartEngine(p engine.CapabilityProfile) engine.EngineType { +// This flips the previous default (io_uring on modern kernels): io_uring now +// wins the start only on an explicit high-concurrency hint, because the +// benchmark-shaped "saturating burst at t0" is the only case where defaulting +// io_uring helps, and it costs the common low/mid-conc + latency cases. +func chooseStartEngine(p engine.CapabilityProfile, cfg resource.Config) engine.EngineType { switch os.Getenv("CELERIS_ADAPTIVE_START") { case "iouring": return engine.IOUring case "epoll": return engine.Epoll case "auto", "": - // fall through to the capability rule + // fall through to the policy below default: // Unknown value: fall through to auto rather than fail hard. 
} - bundlesEra := p.KernelMajor > 6 || (p.KernelMajor == 6 && p.KernelMinor >= 10) - fastTier := p.DeferTaskrun && p.SingleIssuer && p.MultishotRecv && p.ProvidedBuffers - if bundlesEra || fastTier { + if !ioUringViable(p, cfg) { + return engine.Epoll + } + if cfg.Protocol == engine.H2C { + return engine.Epoll + } + if cfg.Resources.WorkloadHint == resource.WorkloadHighConcurrency { return engine.IOUring } return engine.Epoll @@ -168,7 +209,8 @@ func New(cfg resource.Config, handler stream.Handler, cpuMon engine.CPUMonitor) // probe.Probe() reads kernel version + io_uring setup feature bits WITHOUT // constructing an engine, so it is cheap enough for the start decision. - startType := chooseStartEngine(probe.Probe()) + profile := probe.Probe() + startType := chooseStartEngine(profile, cfg) sampler := newLiveSampler(cpuMon) logger := cfg.Logger @@ -243,14 +285,25 @@ func New(cfg resource.Config, handler stream.Handler, cpuMon engine.CPUMonitor) // records which slot the start engine occupies (primary==epoll). e.ctrl = newController(e.primary, e.secondary, sampler, logger) e.ctrl.state.activeIsPrimary = e.startType == engine.Epoll - // Conns-per-worker UP/DOWN switching is OFF in production: the feature-gated - // chooseStartEngine already selects the engine that is best at every - // concurrency on this kernel, and (because pinned conns never migrate) the - // down-revert would only fire on idle/warmup dips and strand load on the - // wrong engine. The always-on error-revert in the controller is unaffected. - // A future middle-tier kernel with a genuine crossover can flip this on - // (validated by the kernel matrix). - e.ctrl.connSwitchEnabled = false + // Re-enable the conns-per-worker UP switch ONLY on the epoll-start path with + // io_uring viable and a non-h2c protocol. 
Rationale from the sweep: + // - When we START on epoll (the new default), a sustained high-concurrency + // ramp should promote NEW connections to io_uring (it wins ≥~24 conns/ + // worker for h1 small payloads). The switch routes new SYNs only — + // pinned conns stay on epoll — so it helps ramps/churn, and is inert for + // a pure keep-alive burst (which is fine; that case wants WorkloadHint). + // - When we START on io_uring there is nothing better to switch UP to, and + // a load-driven DOWN-revert would only strand pinned io_uring conns — so + // leave switching OFF there (the always-on error-revert still applies). + // - h2c never benefits from io_uring, so never switch up for it. + // The controller's load-driven DOWN-revert is disabled regardless (pinning); + // only the always-on io_uring error-revert can move us back to epoll. + e.ctrl.connSwitchEnabled = e.startType == engine.Epoll && + ioUringViable(profile, cfg) && + cfg.Protocol != engine.H2C + // Load-driven down-revert is always off in production (pinning makes it + // harmful); only the always-on io_uring error-revert can return us to epoll. + e.ctrl.loadDownRevert = false e.active.Store(&startEngine) return e, nil diff --git a/adaptive/engine_test.go b/adaptive/engine_test.go index 0a8a314d..cf277ba0 100644 --- a/adaptive/engine_test.go +++ b/adaptive/engine_test.go @@ -155,7 +155,7 @@ func TestSwitchTrigger(t *testing.T) { // on a single tick (no sustain wait). func TestSwitchFastPath(t *testing.T) { e, sampler := newAdaptiveStartingOnEpoll(t) - sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 40}) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50}) if !e.ctrl.evaluate(time.Now(), false) { t.Fatal("expected heavy-load fast-path snap on a single tick") @@ -238,7 +238,7 @@ func TestCooldownGate(t *testing.T) { now := time.Now() // Fast-path switch epoll→io_uring. 
- sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 40}) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50}) if !e.ctrl.evaluate(now, false) { t.Fatal("expected initial fast-path switch") } @@ -261,7 +261,7 @@ func TestConnectionDraining(t *testing.T) { secondary.pauseCalls.Store(0) // reset counter now := time.Now() - sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 40}) // fast-path + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50}) // fast-path if !e.ctrl.evaluate(now, false) { t.Fatal("expected switch") @@ -288,7 +288,7 @@ func TestOscillationLock(t *testing.T) { for range 3 { switch e.ActiveEngine().Type() { case engine.Epoll: - sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 40}) // switch up + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50}) // switch up case engine.IOUring: sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 40, ErrorRate: 0.5}) // error revert } @@ -302,7 +302,7 @@ func TestOscillationLock(t *testing.T) { // Fourth switch should be locked (3 switches within 5 minutes). switch e.ActiveEngine().Type() { case engine.Epoll: - sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 40}) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50}) case engine.IOUring: sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 40, ErrorRate: 0.5}) } @@ -318,7 +318,7 @@ func TestOverloadFreeze(t *testing.T) { e, sampler := newAdaptiveStartingOnEpoll(t) now := time.Now() - sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 40}) // fast-path load + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50}) // fast-path load // Freeze switching. e.FreezeSwitching() @@ -338,87 +338,9 @@ func TestOverloadFreeze(t *testing.T) { // --- Part A: feature-gated start-engine selection ------------------------ -// TestChooseStartEngine covers the capability rule (no env override). 
-func TestChooseStartEngine(t *testing.T) { - t.Setenv("CELERIS_ADAPTIVE_START", "") // ensure auto - - tests := []struct { - name string - profile engine.CapabilityProfile - want engine.EngineType - }{ - { - name: "7.0 bundles era", - profile: engine.CapabilityProfile{KernelMajor: 7, KernelMinor: 0}, - want: engine.IOUring, - }, - { - name: "6.10 bundles era boundary", - profile: engine.CapabilityProfile{KernelMajor: 6, KernelMinor: 10}, - want: engine.IOUring, - }, - { - name: "6.1 fast tier (defer+single+multishotrecv+providedbuffers)", - profile: engine.CapabilityProfile{ - KernelMajor: 6, KernelMinor: 1, - DeferTaskrun: true, SingleIssuer: true, - MultishotRecv: true, ProvidedBuffers: true, - }, - want: engine.IOUring, - }, - { - name: "5.15 LTS (no fast tier)", - profile: engine.CapabilityProfile{KernelMajor: 5, KernelMinor: 15}, - want: engine.Epoll, - }, - { - name: "6.1 missing one fast-tier bit -> epoll", - profile: engine.CapabilityProfile{ - KernelMajor: 6, KernelMinor: 1, - DeferTaskrun: true, SingleIssuer: true, - MultishotRecv: true, ProvidedBuffers: false, - }, - want: engine.Epoll, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := chooseStartEngine(tt.profile); got != tt.want { - t.Errorf("chooseStartEngine = %v, want %v", got, tt.want) - } - }) - } -} - -// TestChooseStartEngineEnvOverride covers CELERIS_ADAPTIVE_START. -func TestChooseStartEngineEnvOverride(t *testing.T) { - // An old-kernel profile that the capability rule would send to epoll. - oldKernel := engine.CapabilityProfile{KernelMajor: 5, KernelMinor: 15} - // A modern profile the rule would send to io_uring. 
- newKernel := engine.CapabilityProfile{KernelMajor: 7, KernelMinor: 0} - - tests := []struct { - val string - profile engine.CapabilityProfile - want engine.EngineType - }{ - {"iouring", oldKernel, engine.IOUring}, // force iouring on old kernel - {"epoll", newKernel, engine.Epoll}, // force epoll on new kernel - {"auto", oldKernel, engine.Epoll}, // auto -> rule - {"auto", newKernel, engine.IOUring}, // auto -> rule - {"garbage", newKernel, engine.IOUring}, // unknown -> falls back to rule - } - - for _, tt := range tests { - t.Run(tt.val, func(t *testing.T) { - t.Setenv("CELERIS_ADAPTIVE_START", tt.val) - if got := chooseStartEngine(tt.profile); got != tt.want { - t.Errorf("CELERIS_ADAPTIVE_START=%q chooseStartEngine = %v, want %v", tt.val, got, tt.want) - } - }) - } -} +// chooseStartEngine + env-override + ioUringViable coverage now lives in +// start_test.go (the policy was redesigned: default epoll, io_uring only on an +// explicit high-concurrency hint with a viable kernel + memlock + non-h2c). // --- Part B: lazy standby build on first switch + reuse ------------------ diff --git a/adaptive/start_test.go b/adaptive/start_test.go new file mode 100644 index 00000000..ac278191 --- /dev/null +++ b/adaptive/start_test.go @@ -0,0 +1,92 @@ +//go:build linux + +package adaptive + +import ( + "testing" + + "github.com/goceleris/celeris/engine" + "github.com/goceleris/celeris/resource" +) + +// viableProfile is a modern kernel with the full io_uring fast tier. +func viableProfile() engine.CapabilityProfile { + return engine.CapabilityProfile{ + KernelMajor: 6, KernelMinor: 12, + DeferTaskrun: true, SingleIssuer: true, MultishotRecv: true, ProvidedBuffers: true, + } +} + +// oldProfile is a pre-fast-tier kernel (io_uring not worth running). +func oldProfile() engine.CapabilityProfile { + return engine.CapabilityProfile{KernelMajor: 5, KernelMinor: 15} +} + +// withMemlock overrides the memlock probe for the duration of a test. 
+func withMemlock(t *testing.T, max int) { + t.Helper() + prev := maxWorkersForMemlock + maxWorkersForMemlock = func() int { return max } + t.Cleanup(func() { maxWorkersForMemlock = prev }) +} + +// TestChooseStartEngine_GateOrder asserts the new start-engine policy: the +// DEFAULT is epoll (the flipped default), io_uring only on an explicit +// high-concurrency hint with a viable kernel + memlock + non-h2c protocol. +func TestChooseStartEngine_GateOrder(t *testing.T) { + cfg := func(proto engine.Protocol, hint resource.WorkloadHint) resource.Config { + return resource.Config{Protocol: proto, Resources: resource.Resources{WorkloadHint: hint}} + } + tests := []struct { + name string + profile engine.CapabilityProfile + cfg resource.Config + memlock int // -1 = no cap + want engine.EngineType + }{ + {"default-is-epoll (viable, no hint)", viableProfile(), cfg(engine.HTTP1, resource.WorkloadUnspecified), -1, engine.Epoll}, + {"low-conc hint -> epoll", viableProfile(), cfg(engine.HTTP1, resource.WorkloadLowConcurrency), -1, engine.Epoll}, + {"high-conc hint + viable -> iouring", viableProfile(), cfg(engine.HTTP1, resource.WorkloadHighConcurrency), -1, engine.IOUring}, + {"high-conc hint but old kernel -> epoll", oldProfile(), cfg(engine.HTTP1, resource.WorkloadHighConcurrency), -1, engine.Epoll}, + {"high-conc hint but h2c -> epoll", viableProfile(), cfg(engine.H2C, resource.WorkloadHighConcurrency), -1, engine.Epoll}, + {"high-conc hint but memlock-starved -> epoll", viableProfile(), cfg(engine.HTTP1, resource.WorkloadHighConcurrency), 1, engine.Epoll}, + {"auto protocol + high-conc hint -> iouring", viableProfile(), cfg(engine.Auto, resource.WorkloadHighConcurrency), -1, engine.IOUring}, + } + t.Setenv("CELERIS_ADAPTIVE_START", "") // neutralize any ambient override + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + withMemlock(t, tc.memlock) + if got := chooseStartEngine(tc.profile, tc.cfg); got != tc.want { + t.Fatalf("chooseStartEngine = %v, want 
%v", got, tc.want) + } + }) + } +} + +// TestChooseStartEngine_EnvOverride asserts the env escape hatch wins over policy. +func TestChooseStartEngine_EnvOverride(t *testing.T) { + withMemlock(t, -1) + t.Setenv("CELERIS_ADAPTIVE_START", "iouring") + if got := chooseStartEngine(oldProfile(), resource.Config{Protocol: engine.H2C}); got != engine.IOUring { + t.Fatalf("env=iouring should force IOUring even on old/h2c, got %v", got) + } + t.Setenv("CELERIS_ADAPTIVE_START", "epoll") + if got := chooseStartEngine(viableProfile(), resource.Config{Resources: resource.Resources{WorkloadHint: resource.WorkloadHighConcurrency}}); got != engine.Epoll { + t.Fatalf("env=epoll should force Epoll even with high-conc hint, got %v", got) + } +} + +// TestIOUringViable covers the two t0 disqualifiers. +func TestIOUringViable(t *testing.T) { + withMemlock(t, -1) + if !ioUringViable(viableProfile(), resource.Config{}) { + t.Fatal("viable profile + unlimited memlock should be viable") + } + if ioUringViable(oldProfile(), resource.Config{}) { + t.Fatal("old kernel should be non-viable") + } + withMemlock(t, 1) // 1 worker < resolved workers -> non-viable + if ioUringViable(viableProfile(), resource.Config{}) { + t.Fatal("memlock-starved should be non-viable") + } +} diff --git a/config.go b/config.go index 7a9f8ef2..16226a2e 100644 --- a/config.go +++ b/config.go @@ -40,6 +40,25 @@ const ( // String returns the engine type name. func (t EngineType) String() string { return engine.EngineType(t).String() } +// WorkloadHint is an OPTIONAL declaration of expected steady-state concurrency, +// used only by the Adaptive engine's start-engine decision. Because connections +// cannot migrate between epoll and io_uring, the START engine decides keep-alive +// throughput — and the concurrency is unknowable when the server binds. 
This +// hint is the ONLY way to make Adaptive START on io_uring; without it Adaptive +// starts on epoll (best for the ramp-from-zero / low-concurrency / latency case) +// and promotes new connections to io_uring under sustained high load. +type WorkloadHint resource.WorkloadHint + +const ( + // WorkloadUnspecified (default) → start epoll, promote under load. + WorkloadUnspecified WorkloadHint = WorkloadHint(resource.WorkloadUnspecified) + // WorkloadLowConcurrency → thin/latency-sensitive traffic; stay epoll. + WorkloadLowConcurrency WorkloadHint = WorkloadHint(resource.WorkloadLowConcurrency) + // WorkloadHighConcurrency → many h1 keep-alive conns/worker; start io_uring + // (when the kernel and RLIMIT_MEMLOCK allow it). + WorkloadHighConcurrency WorkloadHint = WorkloadHint(resource.WorkloadHighConcurrency) +) + // Config holds the public server configuration. type Config struct { // Addr is the TCP address to listen on (e.g. ":8080"). @@ -52,6 +71,12 @@ type Config struct { // Workers is the number of I/O worker goroutines (default GOMAXPROCS). Workers int + // WorkloadHint optionally declares the expected steady-state concurrency. + // It only affects the Adaptive engine's start-engine choice (see WorkloadHint): + // the zero value starts on epoll; WorkloadHighConcurrency starts on io_uring + // when the kernel + memlock allow. + WorkloadHint WorkloadHint + // ReadTimeout is the max duration for reading the entire request. // Zero uses the default (60s). Set to -1 for no timeout. 
ReadTimeout time.Duration @@ -222,6 +247,7 @@ func (c Config) toResourceConfig() resource.Config { if c.MaxConns > 0 { rc.Resources.MaxConns = c.MaxConns } + rc.Resources.WorkloadHint = resource.WorkloadHint(c.WorkloadHint) rc.MaxRequestBodySize = c.MaxRequestBodySize rc.AsyncHandlers = c.AsyncHandlers diff --git a/engine/iouring/ring.go b/engine/iouring/ring.go index fd5921ec..d4f54d75 100644 --- a/engine/iouring/ring.go +++ b/engine/iouring/ring.go @@ -19,6 +19,29 @@ import ( // we already promised it'd fit. const minMemlockPerWorker = 12 * 1024 * 1024 +// MaxWorkersForMemlock returns the io_uring worker ceiling imposed by +// RLIMIT_MEMLOCK, or -1 when the soft limit is unlimited (RLIM_INFINITY) or +// cannot be read — i.e. "no cap". It performs no syscall beyond Getrlimit and +// constructs no ring, so it is safe to call from the adaptive start decision +// BEFORE building the io_uring engine. The returned ceiling is at least 1 when +// a finite limit is present (createWorkers/NewRing will surface a precise +// ENOMEM if even one worker doesn't fit). This is the single source of truth +// for the rlim.Cur / minMemlockPerWorker math (capWorkersToMemlock calls it). +func MaxWorkersForMemlock() int { + var rlim unix.Rlimit + if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil { + return -1 + } + if rlim.Cur == ^uint64(0) { + return -1 + } + maxByMemlock := int(rlim.Cur / minMemlockPerWorker) + if maxByMemlock < 1 { + maxByMemlock = 1 + } + return maxByMemlock +} + // capWorkersToMemlock returns min(want, RLIMIT_MEMLOCK / minMemlockPerWorker). // When the soft limit is unlimited (RLIM_INFINITY), the request is honoured // as-is. 
When the cap forces a reduction, the chosen logger is informed so @@ -30,20 +53,15 @@ func capWorkersToMemlock(want int, logger *slog.Logger) int { if want <= 1 { return want } - var rlim unix.Rlimit - if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil { - return want - } - if rlim.Cur == ^uint64(0) { + maxByMemlock := MaxWorkersForMemlock() + if maxByMemlock == -1 { return want } - maxByMemlock := int(rlim.Cur / minMemlockPerWorker) - if maxByMemlock < 1 { - maxByMemlock = 1 - } if maxByMemlock >= want { return want } + var rlim unix.Rlimit + _ = unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlim) if logger != nil { logger.Warn("io_uring workers capped by RLIMIT_MEMLOCK", "requested", want, diff --git a/resource/resource.go b/resource/resource.go index 756ed1d6..7325ef43 100644 --- a/resource/resource.go +++ b/resource/resource.go @@ -1,5 +1,29 @@ package resource +// WorkloadHint is an OPTIONAL operator declaration of the expected steady-state +// concurrency, consumed only by the adaptive engine's start-engine decision. +// +// Because established connections cannot migrate between epoll and io_uring, +// the START engine decides keep-alive throughput; and the expected concurrency +// is unknowable at Listen() time (no connections exist yet). This hint is the +// ONLY input that can make the adaptive engine START on io_uring. Absent it, +// the engine defaults to epoll (every server ramps from zero connections — the +// low-concurrency regime where epoll wins on both throughput and tail latency) +// and lets the runtime switch route NEW connections up to io_uring under load. +type WorkloadHint int + +const ( + // WorkloadUnspecified (zero value) leaves the start-engine choice to the + // default policy: start epoll, promote new conns to io_uring under load. + WorkloadUnspecified WorkloadHint = iota + // WorkloadLowConcurrency explicitly declares thin/latency-sensitive traffic + // — start (and stay) on epoll. 
+ WorkloadLowConcurrency + // WorkloadHighConcurrency explicitly declares many h1 keep-alive + // connections per worker — start on io_uring (when kernel + memlock allow). + WorkloadHighConcurrency +) + // Resources allows user overrides of default resource values. // Zero values mean "use engine default". type Resources struct { @@ -13,6 +37,9 @@ type Resources struct { SocketSend int // MaxConns is the max simultaneous connections per worker (0 = unlimited). MaxConns int + // WorkloadHint is an OPTIONAL operator concurrency declaration; the only + // input that can make the adaptive engine START on io_uring (see WorkloadHint). + WorkloadHint WorkloadHint } // ResolvedResources contains the final computed values after applying defaults, From 90839abb91cb69ffd89074e8fa0d966ba32f4cfb Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Fri, 19 Jun 2026 11:22:56 +0200 Subject: [PATCH 22/27] perf(async): effective AsyncHandlers for drivers + immediate-promote + UsesDriver Close the async/sync handler review's footguns: #1 Server.AsyncHandlers() reports the EFFECTIVE async state (config.AsyncHandlers || router.hasAsyncRoutes()) instead of the raw config flag. WithEngine drivers select their netpoll-park fast path from this, so the recommended "AsyncHandlers=false + mark DB routes .Async()" idiom no longer silently drops the driver onto its slow busy-spin mini-loop. Only the driver registry consults this method, so the change is targeted. The value is read at driver construction, so open WithEngine drivers AFTER registering .Async() routes (or set AsyncHandlers=true); documented on the field. #3 adaptiveBlockingThreshold (2ms): a single unambiguously-blocking inline run promotes the adaptive route immediately (router.promoteRouteImmediate), skipping the adaptivePromoteStreak (8) hysteresis, so a forgotten-.Async() blocking handler stalls a worker for at most one request. The 300us/8-streak path still owns the 300us-2ms band; a CPU chain cannot cross 2ms under jitter. 
#4 Route.UsesDriver() - intent-revealing alias for .Async() on driver routes. #2 Config.AsyncHandlers doc rewritten (effective-state behavior, construction- order caveat, recommend AsyncHandlers=true OR .Async()/.UsesDriver() for driver routes; the adaptive net only auto-promotes handlers slower than 300us). Driver benchmark (msa2-server, 128c, footgun config = AsyncHandlers=false + per-route .Async(), before vs after #1): the fix recovers the async win and ~halves p99, matching the global-AsyncHandlers=true fast path - redis iouring 87k->107k (+23%) p99 2.8->1.4ms ; epoll 86k->107k memcached iouring 92k->136k (+48%) p99 3.0->1.2ms postgres iouring 62k-> 84k (+35%) p99 3.1->1.9ms Tests: 8 white-box unit (async_improvements_test.go) + 1 epoll integration (async_promote_integration_linux_test.go: single >2ms run promotes immediately); full celeris + driver suites green on real io_uring. --- async_improvements_test.go | 132 ++++++++++++++++++++++++ async_promote_integration_linux_test.go | 75 ++++++++++++++ config.go | 21 ++-- handler.go | 19 +++- router.go | 28 +++++ server.go | 11 +- 6 files changed, 277 insertions(+), 9 deletions(-) create mode 100644 async_improvements_test.go create mode 100644 async_promote_integration_linux_test.go diff --git a/async_improvements_test.go b/async_improvements_test.go new file mode 100644 index 00000000..e72a2a5c --- /dev/null +++ b/async_improvements_test.go @@ -0,0 +1,132 @@ +package celeris + +import ( + "sync/atomic" + "testing" +) + +// --- #1: Server.AsyncHandlers() reports the EFFECTIVE async state --------- + +// TestAsyncHandlersEffective_PerRouteAsync verifies the driver-footgun fix: a +// server with Config.AsyncHandlers=false but a .Async() route reports +// AsyncHandlers()==true, so a WithEngine driver picks its netpoll fast path +// under the recommended per-route-async idiom (instead of the slow mini-loop). 
+func TestAsyncHandlersEffective_PerRouteAsync(t *testing.T) { + s := New(Config{Engine: Std, AsyncHandlers: false}) + if s.AsyncHandlers() { + t.Fatal("no async routes + AsyncHandlers=false must report false") + } + s.GET("/db", noopHandler).Async() + if !s.AsyncHandlers() { + t.Fatal("a .Async() route must make AsyncHandlers() effective-true (driver fast path)") + } +} + +// TestAsyncHandlersEffective_ServerDefault verifies the server-level flag alone +// still reports true (unchanged behavior). +func TestAsyncHandlersEffective_ServerDefault(t *testing.T) { + s := New(Config{Engine: Std, AsyncHandlers: true}) + if !s.AsyncHandlers() { + t.Fatal("AsyncHandlers=true must report true") + } +} + +// TestAsyncHandlersEffective_GroupAsync verifies a group-level .Async() route +// also flips the effective state. +func TestAsyncHandlersEffective_GroupAsync(t *testing.T) { + s := New(Config{Engine: Std, AsyncHandlers: false}) + api := s.Group("/api").Async() + api.GET("/x", noopHandler) + if !s.AsyncHandlers() { + t.Fatal("a group .Async() route must make AsyncHandlers() effective-true") + } +} + +// --- #4: .UsesDriver() is an intent-revealing alias for .Async() ---------- + +func TestUsesDriver(t *testing.T) { + s := New(Config{Engine: Std, AsyncHandlers: false}) + s.GET("/cache/:key", noopHandler).UsesDriver() + if !s.router.routeAsync("GET", "/cache/anything") { + t.Fatal(".UsesDriver() must resolve hard-async") + } + if s.router.adaptiveRoutes["/cache/:key"] { + t.Fatal(".UsesDriver() (== .Async()) must not be adaptive") + } + if !s.AsyncHandlers() { + t.Fatal(".UsesDriver() must make AsyncHandlers() effective-true") + } +} + +// --- #3: immediate promotion of an unambiguously-blocking inline run ------- + +// TestAdaptiveBlockingThresholdConst guards the threshold ordering: the +// immediate-promote bar must sit above the slow bar so the streak path still +// owns the borderline band. 
+func TestAdaptiveBlockingThresholdConst(t *testing.T) { + if adaptiveBlockingThreshold <= adaptivePromoteThreshold { + t.Fatalf("blocking threshold %v must exceed slow threshold %v", + adaptiveBlockingThreshold, adaptivePromoteThreshold) + } +} + +// TestPromoteRouteImmediate verifies a single unambiguously-blocking inline run +// promotes the route immediately (no adaptivePromoteStreak wait), so a +// forgotten-async blocking handler stalls a worker for at most one request. +func TestPromoteRouteImmediate(t *testing.T) { + s := New(Config{AsyncHandlers: true}) + s.GET("/slow", noopHandler) // inherits default → adaptive + r := s.router + if !r.adaptiveRoutes["/slow"] { + t.Fatal("/slow should be adaptive under AsyncHandlers=true") + } + if r.isPromoted("/slow") { + t.Fatal("must not be promoted before any blocking run") + } + r.promoteRouteImmediate("/slow") // one >adaptiveBlockingThreshold run + if !r.isPromoted("/slow") { + t.Fatal("a single blocking inline run must promote immediately") + } + if !r.routeAsync("GET", "/slow") { + t.Fatal("a promoted route must resolve async") + } +} + +// TestPromoteImmediate_ClearsFastStreak verifies the immediate promotion clears +// any accumulated fast streak so the route cannot wrongly settle on stale fast +// runs after the promotion TTL expires. +func TestPromoteImmediate_ClearsFastStreak(t *testing.T) { + s := New(Config{AsyncHandlers: true}) + s.GET("/slow", noopHandler) + r := s.router + for i := 0; i < 10; i++ { + r.recordInlineRun("/slow", false) // accumulate fast runs + } + r.promoteRouteImmediate("/slow") + if !r.isPromoted("/slow") { + t.Fatal("must be promoted") + } + if v, ok := r.fastStreak.Load("/slow"); ok { + if got := v.(*atomic.Int32).Load(); got != 0 { + t.Fatalf("fast streak must be cleared on immediate promote, got %d", got) + } + } +} + +// TestStreakStillRequiresEight verifies the slow-streak path is unchanged: a +// borderline-slow (300µs–2ms) route still needs adaptivePromoteStreak runs. 
+func TestStreakStillRequiresEight(t *testing.T) { + s := New(Config{AsyncHandlers: true}) + s.GET("/borderline", noopHandler) + r := s.router + for i := 0; i < adaptivePromoteStreak-1; i++ { + r.recordInlineRun("/borderline", true) + } + if r.isPromoted("/borderline") { + t.Fatalf("must not promote before %d consecutive slow runs", adaptivePromoteStreak) + } + r.recordInlineRun("/borderline", true) // the 8th + if !r.isPromoted("/borderline") { + t.Fatalf("must promote on the %dth consecutive slow run", adaptivePromoteStreak) + } +} diff --git a/async_promote_integration_linux_test.go b/async_promote_integration_linux_test.go new file mode 100644 index 00000000..736c844a --- /dev/null +++ b/async_promote_integration_linux_test.go @@ -0,0 +1,75 @@ +//go:build linux + +package celeris + +import ( + "context" + "net" + "net/http" + "testing" + "time" +) + +// TestAdaptiveImmediatePromote_Epoll verifies improvement #3 end-to-end on the +// real epoll engine (which runs HandleStream's inline timing): under +// AsyncHandlers=true, an UNMARKED route whose handler blocks for longer than +// adaptiveBlockingThreshold is promoted to async dispatch on the FIRST request, +// not after adaptivePromoteStreak (8) of them. A fast route used only for +// readiness must NOT promote. 
+func TestAdaptiveImmediatePromote_Epoll(t *testing.T) { + s := New(Config{Engine: Epoll, AsyncHandlers: true}) + s.GET("/ping", func(c *Context) error { return c.String(http.StatusOK, "ok") }) + s.GET("/slow", func(c *Context) error { + time.Sleep(3 * time.Millisecond) // > adaptiveBlockingThreshold (2ms) + return c.String(http.StatusOK, "slow") + }) + if !s.router.adaptiveRoutes["/slow"] || !s.router.adaptiveRoutes["/ping"] { + t.Fatal("both routes must be adaptive under AsyncHandlers=true") + } + + ln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("listen: %v", err) + } + go func() { _ = s.StartWithListener(ln) }() + defer func() { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = s.Shutdown(ctx) + }() + + base := "http://" + ln.Addr().String() + client := &http.Client{} + // Readiness on the FAST route only (so /slow is untouched until our 1 probe). + deadline := time.Now().Add(5 * time.Second) + ready := false + for time.Now().Before(deadline) { + if resp, err := client.Get(base + "/ping"); err == nil { + _ = resp.Body.Close() + ready = true + break + } + time.Sleep(10 * time.Millisecond) + } + if !ready { + t.Fatal("server did not become ready") + } + + if s.router.isPromoted("/slow") { + t.Fatal("/slow must not be promoted before any request to it") + } + // Exactly ONE request to the blocking route. + resp, err := client.Get(base + "/slow") + if err != nil { + t.Fatalf("GET /slow: %v", err) + } + _ = resp.Body.Close() + + if !s.router.isPromoted("/slow") { + t.Fatal("#3: a single >2ms inline run must promote /slow immediately (got not-promoted)") + } + // The fast route hammered for readiness must never promote. 
+ if s.router.isPromoted("/ping") { + t.Fatal("/ping (fast) must not be promoted") + } +} diff --git a/config.go b/config.go index 16226a2e..e69a25d9 100644 --- a/config.go +++ b/config.go @@ -158,13 +158,20 @@ type Config struct { // groups can override it per handler with [Route.Async] / // [RouteGroup.Async] (most-specific wins: route > group > this // default), so a server with mostly CPU routes + a few DB routes can - // keep this false and mark just the DB routes .Async(), or set this - // true and mark hot CPU routes .Async(false). On HTTP/2 the override - // is honored per-stream (sync routes run inline on the event loop, - // async routes dispatch to the worker pool). Note: celeris drivers - // opened WithEngine(srv) consult the server-level AsyncHandlers (not - // per-route overrides) for their auto-async path selection — set this - // true when using WithEngine drivers under per-route async. + // keep this false and mark just the DB routes .Async() / .UsesDriver(), + // or set this true and mark hot CPU routes .Async(false). On HTTP/2 the + // override is honored per-stream (sync routes run inline on the event + // loop, async routes dispatch to the worker pool). + // + // DRIVERS: celeris drivers opened WithEngine(srv) pick their netpoll-park + // fast path from the server's EFFECTIVE async state — true when this flag + // is set OR any route is .Async(). So "keep this false + mark DB routes + // .Async()/.UsesDriver()" selects the fast driver path too, PROVIDED the + // driver is opened AFTER those routes are registered (the effective state + // is read at driver construction); otherwise set this true. Setting this + // true also enables the adaptive safety net that auto-promotes any unmarked + // handler slower than ~300µs, at a small learning-phase cost that settles to + // zero for static routes. // // Default: false. 
AsyncHandlers bool diff --git a/handler.go b/handler.go index 2ecd2e19..1eb3b060 100644 --- a/handler.go +++ b/handler.go @@ -141,7 +141,14 @@ func (a *routerAdapter) HandleStream(ctx context.Context, s *stream.Stream) erro if rt.adaptiveRoutes[fullPath] && rt.adaptiveLearning(fullPath) { start := time.Now() err := c.Next() - rt.recordInlineRun(fullPath, time.Since(start) > adaptivePromoteThreshold) + dur := time.Since(start) + if dur > adaptiveBlockingThreshold { + // Unambiguously a blocking I/O round-trip: promote on the first such + // run rather than waiting for adaptivePromoteStreak slow runs. + rt.promoteRouteImmediate(fullPath) + } else { + rt.recordInlineRun(fullPath, dur > adaptivePromoteThreshold) + } if err != nil { a.handleError(c, s, err) } @@ -168,6 +175,16 @@ func (a *routerAdapter) HandleStream(ctx context.Context, s *stream.Stream) erro // on EVERY request. const adaptivePromoteThreshold = 300 * time.Microsecond +// adaptiveBlockingThreshold is the inline duration that is UNAMBIGUOUSLY a +// blocking I/O round-trip (not CPU work under contention). A single inline run +// over this bar promotes the route IMMEDIATELY, skipping the +// adaptivePromoteStreak hysteresis — a genuinely-blocking handler that an +// operator forgot to mark .Async() then stalls a worker for at most one request +// instead of adaptivePromoteStreak of them. The bar sits far above any CPU-bound +// chain's wall-clock (even under GC/scheduling jitter), so it cannot misfire on +// a CPU route; the 300µs/streak path still handles the borderline 300µs–2ms band. +const adaptiveBlockingThreshold = 2 * time.Millisecond + // adaptivePromoteStreak is how many CONSECUTIVE slow inline runs promote an // adaptive route to async. 
The consecutive requirement (a fast run resets the // streak) makes a one-off cold start / GC pause harmless, while a handler that diff --git a/router.go b/router.go index 0ba24f68..50697047 100644 --- a/router.go +++ b/router.go @@ -241,6 +241,22 @@ func (r *Route) Sync() *Route { return r.setAsync(false) } +// UsesDriver marks a route whose handler performs a blocking backend round-trip +// via a celeris driver (postgres / redis / memcached) opened WithEngine(srv). +// It is exactly equivalent to .Async() — the route is dispatched off the worker +// so the blocking call parks the handler goroutine on netpoll instead of +// stalling an I/O worker — but reads as intent at the call site and is the +// recommended way to mark driver routes: +// +// srv.GET("/users/:id", getUser).UsesDriver() // == .Async(), clearer intent +// +// The adaptive safety net (Config.AsyncHandlers=true) only auto-promotes +// handlers slower than ~300µs, so a fast localhost driver call (sub-300µs) would +// otherwise keep blocking a worker every request — mark such routes explicitly. +func (r *Route) UsesDriver() *Route { + return r.setAsync(true) +} + // setAsync is the shared implementation of [Route.Async] and [Route.Sync]. // Updates the node, the router's asyncRouteCount, and the static-fast-path // map entry so all three observers see the same async flag. @@ -331,6 +347,18 @@ func (r *router) promoteRoute(fullPath string) { r.promoted.Store(fullPath, nowNano()) } +// promoteRouteImmediate promotes an adaptive route after a single +// unambiguously-blocking inline run (> adaptiveBlockingThreshold), skipping the +// adaptivePromoteStreak hysteresis. The fast streak is cleared so a burst of +// prior fast runs cannot settle the route once the promotion TTL expires and it +// re-enters the learning path. 
+func (r *router) promoteRouteImmediate(fullPath string) { + if fv, ok := r.fastStreak.Load(fullPath); ok { + fv.(*atomic.Int32).Store(0) + } + r.promoteRoute(fullPath) +} + // recordInlineRun feeds one inline-run observation into the adaptive // classifier (celeris#356). A fast run resets the slow streak; a slow run // increments it, and once a route is slow on adaptivePromoteStreak CONSECUTIVE diff --git a/server.go b/server.go index 6cf14e09..e6977a6c 100644 --- a/server.go +++ b/server.go @@ -476,7 +476,16 @@ func (s *Server) EventLoopProvider() engine.EventLoopProvider { // (their net.TCPConn goroutines keep running regardless of which // sub-engine is active). func (s *Server) AsyncHandlers() bool { - if !s.config.AsyncHandlers { + // Report the EFFECTIVE async state, not the raw Config flag: the server + // stands up async-dispatch infrastructure when the server-level default is + // set OR any route opted in via .Async() (doPrepare flips the engine cfg the + // same way via hasAsyncRoutes). WithEngine drivers consult this to pick their + // netpoll-park fast path; keying on the raw config alone put a driver called + // from a per-route-.Async() handler on the slower mini-loop path — the + // documented footgun. NOTE: hasAsyncRoutes reflects routes registered so far, + // so open WithEngine drivers AFTER registering .Async() routes (or set + // Config.AsyncHandlers=true) to be order-independent. + if !s.config.AsyncHandlers && !s.router.hasAsyncRoutes() { return false } switch s.config.Engine { From a2c48bd9f7872465a4dacdd8cef1de1ee7278b26 Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Sun, 21 Jun 2026 15:58:25 +0200 Subject: [PATCH 23/27] release(v1.5.3): bump Version 1.5.0->1.5.3 + middleware celeris pins celeris.Version was stale at 1.5.0 (unchanged since the 1.5.0 tag, so even v1.5.2 shipped "1.5.0"). Bump to 1.5.3. 
Bump the four publishable middleware submodules' `require github.com/goceleris/celeris` from v1.4.4 to v1.5.3 so the tagged submodules resolve the matching core (release.yml's tag-submodules job warns when they drift); the local `replace => ../../` keeps in-tree builds against the live core. --- middleware/compress/go.mod | 2 +- middleware/metrics/go.mod | 2 +- middleware/otel/go.mod | 2 +- middleware/protobuf/go.mod | 2 +- server.go | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/middleware/compress/go.mod b/middleware/compress/go.mod index d14eeb27..d68ab9da 100644 --- a/middleware/compress/go.mod +++ b/middleware/compress/go.mod @@ -4,7 +4,7 @@ go 1.26.4 require ( github.com/andybalholm/brotli v1.2.1 - github.com/goceleris/celeris v1.4.4 + github.com/goceleris/celeris v1.5.3 github.com/klauspost/compress v1.18.6 ) diff --git a/middleware/metrics/go.mod b/middleware/metrics/go.mod index 33d52836..98f40569 100644 --- a/middleware/metrics/go.mod +++ b/middleware/metrics/go.mod @@ -3,7 +3,7 @@ module github.com/goceleris/celeris/middleware/metrics go 1.26.4 require ( - github.com/goceleris/celeris v1.4.4 + github.com/goceleris/celeris v1.5.3 github.com/prometheus/client_golang v1.23.2 github.com/prometheus/client_model v0.6.2 github.com/prometheus/common v0.68.1 diff --git a/middleware/otel/go.mod b/middleware/otel/go.mod index fc135432..61423c64 100644 --- a/middleware/otel/go.mod +++ b/middleware/otel/go.mod @@ -3,7 +3,7 @@ module github.com/goceleris/celeris/middleware/otel go 1.26.4 require ( - github.com/goceleris/celeris v1.4.4 + github.com/goceleris/celeris v1.5.3 go.opentelemetry.io/otel v1.44.0 go.opentelemetry.io/otel/metric v1.44.0 go.opentelemetry.io/otel/sdk v1.44.0 diff --git a/middleware/protobuf/go.mod b/middleware/protobuf/go.mod index 948f256a..bad3be60 100644 --- a/middleware/protobuf/go.mod +++ b/middleware/protobuf/go.mod @@ -3,7 +3,7 @@ module github.com/goceleris/celeris/middleware/protobuf go 1.26.4 require ( - 
github.com/goceleris/celeris v1.4.4 + github.com/goceleris/celeris v1.5.3 google.golang.org/protobuf v1.36.11 ) diff --git a/server.go b/server.go index e6977a6c..515f1e99 100644 --- a/server.go +++ b/server.go @@ -21,7 +21,7 @@ import ( ) // Version is the semantic version of the celeris module. -const Version = "1.5.0" +const Version = "1.5.3" // ErrAlreadyStarted is returned when Start or StartWithContext is called on a // server that is already running. From 1397991635cf047fb16aa31b7ee97168c8e8b54e Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Sun, 21 Jun 2026 15:58:25 +0200 Subject: [PATCH 24/27] docs(security): update SECURITY.md for the 1.5.x supported line The Supported Versions table + policy still named the 1.4.x line as supported even though 1.5.0-1.5.2 had shipped. Mark >= 1.5.0 supported, 1.4.x and earlier unsupported, and add a v1.5.x Security Improvements section (io_uring gen-tagged-CQE UAF hardening, H1 request-smuggling hardening, cpuMon + epoll-detach data-race fixes, middleware/secure COEP + X-Download-Options default-off behavior change, Go 1.26.4 toolchain bump). --- SECURITY.md | 62 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/SECURITY.md b/SECURITY.md index 29345ae4..0b103a0f 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -2,14 +2,58 @@ ## Supported Versions -| Version | Supported | -|---------------|-----------| -| >= 1.4.0 | Yes | -| < 1.4.0 | No | - -Security updates are issued only for the 1.4.x line. Earlier versions -(1.3.x and below) no longer receive fixes, including critical ones — -upgrade to the latest 1.4.x to remain covered. +| Version | Supported | +|----------|-----------| +| >= 1.5.0 | Yes | +| 1.4.x | No | +| < 1.4.0 | No | + +Security updates are issued only for the 1.5.x line. The 1.4.x line and +earlier (1.3.x and below) no longer receive fixes, including critical +ones — upgrade to the latest 1.5.x to remain covered. 
+ +### v1.5.x Security Improvements + +The 1.5.x line is the core-engine performance milestone (io_uring / epoll / +adaptive). Several of its changes are memory-safety or DoS-posture +relevant: + +- **io_uring use-after-free hardening under churn**: the io_uring engine + serializes close behind in-flight completions (cancel-then-release) and + tags each completion's `user_data` with a 16-bit per-connection + generation counter. A stale CQE arriving after a socket's slot has been + recycled is now detected by the generation mismatch and dropped instead + of being misrouted to the new connection occupying that slot — closing a + gen-collision window that could cross-wire two connections' buffers or + corrupt the heap under sustained POST / connection-churn load. + +- **H1 parser request-smuggling hardening**: the H1 request parser was + hardened against request-smuggling vectors and RFC 9110/9112 framing + violations (conflicting / duplicate `Content-Length`, `Transfer-Encoding` + vs `Content-Length` ambiguity, malformed chunk framing), so a + front-end / back-end desync cannot be induced through celeris. + +- **Latent data-race / memory-safety fixes**: two races that are + memory-safety bugs under the Go memory model were closed — a + `Start` / `Shutdown` race on the server CPU monitor, and the epoll + async-detach path's `h1State.Detached` flag, now an `atomic.Bool` + published with a release barrier so a detaching connection cannot be + observed half-initialized by the engine loop. + +- **`middleware/secure` default hardening (behavior change)**: + `Cross-Origin-Embedder-Policy` (`require-corp`) and `X-Download-Options` + (`noopen`) are now **off by default** and opt-in, matching Helmet's + posture. The previous COEP default silently broke cross-origin resources + without a corresponding security win for most apps; set + `CrossOriginEmbedderPolicy: "require-corp"` (or `"credentialless"`) + explicitly where cross-origin isolation is required. 
The default + secure-header count drops from 11 to 9 — **audit your deployment if you + relied on the implicit COEP header**. + +- **Go toolchain bump (1.26.3 → 1.26.4)**: every `go.mod` in the repo + (and the loadgen sub-module) moves to `go 1.26.4` for the stdlib + security fixes in that patch release; CI pins the explicit patch version + so a stale runner cache cannot regress. ### v1.4.2 Security Improvements @@ -164,7 +208,7 @@ posture is conservative: ## Historical (unsupported) -Per-version security notes for the 1.3.x line and earlier are preserved in the git history of this file (`git log SECURITY.md`). Those releases no longer receive fixes — upgrade to the latest 1.4.x to remain covered. +Per-version security notes for the 1.3.x line and earlier are preserved in the git history of this file (`git log SECURITY.md`). Those releases no longer receive fixes — upgrade to the latest 1.5.x to remain covered. ## Reporting a Vulnerability From f630b3fddc3700af32d7850784afdd0a37eb41c2 Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Sun, 21 Jun 2026 15:58:25 +0200 Subject: [PATCH 25/27] test(probe): move CAP_SYS_NICE side-effect test to the linux-tagged file TestCheckCapSysNiceIsSideEffectFree lived in the untagged probe_test.go but called getNice(), which is defined only in probe_caps_linux_test.go (//go:build linux). The package failed to compile on darwin/non-linux, so `go test ./...` / `go vet ./...` broke for local devs (CI was unaffected: linux jobs compile it, the macos job only runs go build). Move the test next to getNice under the linux tag. 
--- probe/probe_caps_linux_test.go | 26 ++++++++++++++++++++++++++ probe/probe_test.go | 28 ---------------------------- 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/probe/probe_caps_linux_test.go b/probe/probe_caps_linux_test.go index fcd2234e..84c949e7 100644 --- a/probe/probe_caps_linux_test.go +++ b/probe/probe_caps_linux_test.go @@ -16,3 +16,29 @@ func getNice(t *testing.T) (int, error) { t.Helper() return unix.Getpriority(unix.PRIO_PROCESS, 0) } + +// TestCheckCapSysNiceIsSideEffectFree guards against a regression of the +// dangerous Setpriority-based probe (finding 5.2): the real read-only +// implementation must not change the process scheduling priority. We +// snapshot the current nice value, run the check, and assert it is +// unchanged. The boolean result itself is environment-dependent and not +// asserted. Linux-only (lives here with getNice so the cross-platform +// probe_test.go compiles on darwin/non-linux). +func TestCheckCapSysNiceIsSideEffectFree(t *testing.T) { + sp := defaultProber() + if sp.CheckCapSysNice == nil { + t.Skip("no CheckCapSysNice on this platform") + } + before, err := getNice(t) + if err != nil { + t.Skipf("cannot read nice value: %v", err) + } + _ = sp.CheckCapSysNice() // result is environment-dependent; we test for side effects + after, err := getNice(t) + if err != nil { + t.Fatalf("cannot read nice value after check: %v", err) + } + if before != after { + t.Fatalf("CheckCapSysNice mutated process nice: before=%d after=%d", before, after) + } +} diff --git a/probe/probe_test.go b/probe/probe_test.go index 8debb548..60944acd 100644 --- a/probe/probe_test.go +++ b/probe/probe_test.go @@ -314,34 +314,6 @@ func TestSQPollGateAtOptionalTier(t *testing.T) { } } -// TestCheckCapSysNiceIsSideEffectFree guards against a regression of the -// dangerous Setpriority-based probe (finding 5.2): the real read-only -// implementation must not change the process scheduling priority. 
We -// snapshot the current nice value, run the check, and assert it is -// unchanged. The boolean result itself is environment-dependent and not -// asserted. -func TestCheckCapSysNiceIsSideEffectFree(t *testing.T) { - if runtime.GOOS != "linux" { - t.Skip("CAP_SYS_NICE check is linux-only") - } - sp := defaultProber() - if sp.CheckCapSysNice == nil { - t.Skip("no CheckCapSysNice on this platform") - } - before, err := getNice(t) - if err != nil { - t.Skipf("cannot read nice value: %v", err) - } - _ = sp.CheckCapSysNice() // result is environment-dependent; we test for side effects - after, err := getNice(t) - if err != nil { - t.Fatalf("cannot read nice value after check: %v", err) - } - if before != after { - t.Fatalf("CheckCapSysNice mutated process nice: before=%d after=%d", before, after) - } -} - // TestProbeSendfileAndZerocopy pins the sendfile / zerocopy capability // flags (celeris#317). Sendfile is unconditional on Linux (kernel // 2.6.23+, every distro past 2.6.33) and is now actually wired into the From ae02d09b60bb605b6b3bbd5cc134e08555416a8e Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Sun, 21 Jun 2026 16:11:04 +0200 Subject: [PATCH 26/27] deps: refresh dependencies before the v1.5.3 tag - middleware/metrics: prometheus/common v0.68.1 -> v0.69.0 (the published submodule's only outdated direct dep; supersedes dependabot #381). - test/drivercmp/memcached + test/perfmatrix: bradfitz/gomemcache -> latest; perfmatrix also goceleris/loadgen v1.4.8 -> v1.4.9 (test-only modules). - CI: actions/checkout v6 -> v7 across ci/release/drivers (supersedes dependabot #382). Root module + the other three published middleware modules already pin current direct deps. metrics builds+tests green on 0.69.0; actionlint clean. 
--- .github/workflows/ci.yml | 12 ++++++------ .github/workflows/drivers.yml | 14 +++++++------- .github/workflows/release.yml | 4 ++-- middleware/metrics/go.mod | 2 +- middleware/metrics/go.sum | 4 ++-- test/drivercmp/memcached/go.mod | 2 +- test/drivercmp/memcached/go.sum | 4 ++-- test/perfmatrix/go.mod | 4 ++-- test/perfmatrix/go.sum | 8 ++++---- 9 files changed, 27 insertions(+), 27 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index addbc320..02cb57c4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,7 +24,7 @@ jobs: name: Lint runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -64,7 +64,7 @@ jobs: name: Unit (root + middleware sub-modules) runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -107,7 +107,7 @@ jobs: name: Conformance runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -144,7 +144,7 @@ jobs: CELERIS_PG_DSN: postgres://celeris:celeris@127.0.0.1:5432/celeristest?sslmode=disable CELERIS_REDIS_ADDR: 127.0.0.1:6379 steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -164,7 +164,7 @@ jobs: matrix: os: [ubuntu-latest, macos-latest] steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -187,7 +187,7 @@ jobs: name: Vulnerability Check runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" diff --git a/.github/workflows/drivers.yml b/.github/workflows/drivers.yml index 9f4b9d5c..fc48462a 100644 --- a/.github/workflows/drivers.yml +++ b/.github/workflows/drivers.yml @@ -91,7 +91,7 @@ 
jobs: env: CELERIS_PG_DSN: postgres://celeris:celeris@127.0.0.1:5432/celeristest?sslmode=disable steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -132,7 +132,7 @@ jobs: env: CELERIS_REDIS_ADDR: 127.0.0.1:6379 steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -173,7 +173,7 @@ jobs: env: CELERIS_REDIS_ADDR: 127.0.0.1:6379 steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -210,7 +210,7 @@ jobs: env: CELERIS_MEMCACHED_ADDR: 127.0.0.1:11211 steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -263,7 +263,7 @@ jobs: --health-timeout 5s --health-retries 10 steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -296,7 +296,7 @@ jobs: REDIS_IMAGE: redis:${{ matrix.redis }} CELERIS_REDIS_CLUSTER_ADDRS: "127.0.0.1:7000,127.0.0.1:7001,127.0.0.1:7002" steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -382,7 +382,7 @@ jobs: CELERIS_REDIS_SENTINEL_ADDRS: "127.0.0.1:26379,127.0.0.1:26380,127.0.0.1:26381" CELERIS_REDIS_SENTINEL_MASTER: mymaster steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 690f35a2..0a87ec26 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -31,7 +31,7 @@ jobs: needs: [validate-tag, ci] runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 with: fetch-depth: 0 - name: Create sub-module tags @@ -70,7 +70,7 @@ jobs: needs: [validate-tag, ci, tag-submodules] runs-on: ubuntu-latest steps: - - uses: 
actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" diff --git a/middleware/metrics/go.mod b/middleware/metrics/go.mod index 98f40569..dd707a33 100644 --- a/middleware/metrics/go.mod +++ b/middleware/metrics/go.mod @@ -6,7 +6,7 @@ require ( github.com/goceleris/celeris v1.5.3 github.com/prometheus/client_golang v1.23.2 github.com/prometheus/client_model v0.6.2 - github.com/prometheus/common v0.68.1 + github.com/prometheus/common v0.69.0 ) require ( diff --git a/middleware/metrics/go.sum b/middleware/metrics/go.sum index b7ee233c..4baf42e7 100644 --- a/middleware/metrics/go.sum +++ b/middleware/metrics/go.sum @@ -14,8 +14,8 @@ github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.68.1 h1:omjRRl4QP4komogpXuhfeOiisQg7xdy8VM1UY+pStaY= -github.com/prometheus/common v0.68.1/go.mod h1:ZzL3f6u94qUxh9p+tJTrF+FvBS1XXbbRAZCQkytAL0Y= +github.com/prometheus/common v0.69.0 h1:OA85nJQS/T/MaYh/Q2CcgDKSGWqNIgrBDvDH85CuiNk= +github.com/prometheus/common v0.69.0/go.mod h1:ZzL3f6u94qUxh9p+tJTrF+FvBS1XXbbRAZCQkytAL0Y= github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= diff --git a/test/drivercmp/memcached/go.mod b/test/drivercmp/memcached/go.mod index 144cb857..df597ee0 100644 --- a/test/drivercmp/memcached/go.mod +++ b/test/drivercmp/memcached/go.mod @@ -3,7 +3,7 @@ module github.com/goceleris/celeris/test/drivercmp/memcached go 1.26.4 require ( - github.com/bradfitz/gomemcache 
v0.0.0-20250403215159-8d39553ac7cf + github.com/bradfitz/gomemcache v0.0.0-20260422231931-4d751bb6e37c github.com/goceleris/celeris v0.0.0 ) diff --git a/test/drivercmp/memcached/go.sum b/test/drivercmp/memcached/go.sum index 581e931c..0f9dd4b5 100644 --- a/test/drivercmp/memcached/go.sum +++ b/test/drivercmp/memcached/go.sum @@ -1,4 +1,4 @@ -github.com/bradfitz/gomemcache v0.0.0-20250403215159-8d39553ac7cf h1:TqhNAT4zKbTdLa62d2HDBFdvgSbIGB3eJE8HqhgiL9I= -github.com/bradfitz/gomemcache v0.0.0-20250403215159-8d39553ac7cf/go.mod h1:r5xuitiExdLAJ09PR7vBVENGvp4ZuTBeWTGtxuX3K+c= +github.com/bradfitz/gomemcache v0.0.0-20260422231931-4d751bb6e37c h1:6Gpm9YYUEQx2T9zMsYolQhr6sjwwGtFitSA0pQsa7a8= +github.com/bradfitz/gomemcache v0.0.0-20260422231931-4d751bb6e37c/go.mod h1:r5xuitiExdLAJ09PR7vBVENGvp4ZuTBeWTGtxuX3K+c= golang.org/x/sys v0.46.0 h1:noSf2Fq6F8DBgS+LysIkx7rIExoNHJsxOAtPp4rthXw= golang.org/x/sys v0.46.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= diff --git a/test/perfmatrix/go.mod b/test/perfmatrix/go.mod index 954c9d03..2caad41f 100644 --- a/test/perfmatrix/go.mod +++ b/test/perfmatrix/go.mod @@ -4,12 +4,12 @@ go 1.26.4 require ( github.com/HdrHistogram/hdrhistogram-go v1.2.0 - github.com/bradfitz/gomemcache v0.0.0-20250403215159-8d39553ac7cf + github.com/bradfitz/gomemcache v0.0.0-20260422231931-4d751bb6e37c github.com/cloudwego/hertz v0.10.5 github.com/gin-gonic/gin v1.12.0 github.com/go-chi/chi/v5 v5.3.0 github.com/goceleris/celeris v1.4.15 - github.com/goceleris/loadgen v1.4.8 + github.com/goceleris/loadgen v1.4.9 github.com/gofiber/fiber/v3 v3.3.0 github.com/google/uuid v1.6.0 github.com/gorilla/sessions v1.4.0 diff --git a/test/perfmatrix/go.sum b/test/perfmatrix/go.sum index efbc701a..b04f85db 100644 --- a/test/perfmatrix/go.sum +++ b/test/perfmatrix/go.sum @@ -18,8 +18,8 @@ github.com/andybalholm/brotli v1.2.1 h1:R+f5xP285VArJDRgowrfb9DqL18yVK0gKAW/F+eT github.com/andybalholm/brotli v1.2.1/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY= 
github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk= github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4= -github.com/bradfitz/gomemcache v0.0.0-20250403215159-8d39553ac7cf h1:TqhNAT4zKbTdLa62d2HDBFdvgSbIGB3eJE8HqhgiL9I= -github.com/bradfitz/gomemcache v0.0.0-20250403215159-8d39553ac7cf/go.mod h1:r5xuitiExdLAJ09PR7vBVENGvp4ZuTBeWTGtxuX3K+c= +github.com/bradfitz/gomemcache v0.0.0-20260422231931-4d751bb6e37c h1:6Gpm9YYUEQx2T9zMsYolQhr6sjwwGtFitSA0pQsa7a8= +github.com/bradfitz/gomemcache v0.0.0-20260422231931-4d751bb6e37c/go.mod h1:r5xuitiExdLAJ09PR7vBVENGvp4ZuTBeWTGtxuX3K+c= github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= @@ -76,8 +76,8 @@ github.com/goccy/go-json v0.10.5 h1:Fq85nIqj+gXn/S5ahsiTlK3TmC85qgirsdTP/+DeaC4= github.com/goccy/go-json v0.10.5/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M= github.com/goccy/go-yaml v1.19.2 h1:PmFC1S6h8ljIz6gMRBopkjP1TVT7xuwrButHID66PoM= github.com/goccy/go-yaml v1.19.2/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= -github.com/goceleris/loadgen v1.4.8 h1:r162NrVoLxuCQ3IlhWp+nqAgeFikHImlUSagJbP98m8= -github.com/goceleris/loadgen v1.4.8/go.mod h1:BtjUHc0ULnqa2LsSoJNzDdBt05xUx5jajeF6XnJfFJA= +github.com/goceleris/loadgen v1.4.9 h1:Kd/AmLHP520Su3azQ9tCNoc6tsaeEf7Nx8ECr4AdYfg= +github.com/goceleris/loadgen v1.4.9/go.mod h1:Olg2awQufUnRemRlCvFPFL6Ww3byUd+UvZYQAMJm6Co= github.com/gofiber/fiber/v3 v3.3.0 h1:QBd3sYCqdy6Qs5gJYzSw4I4SbqL204jPqpdub/ueiw8= github.com/gofiber/fiber/v3 v3.3.0/go.mod h1:YH7/TAoRaU4kF8slDCtQuFJ1NzC+3MtxUI4KfvQtaIA= github.com/gofiber/schema v1.7.1 h1:oSJBKdgP8JeIME4TQSAqlNKTU2iBB+2RNmKi8Nsc+TI= From 63870b0d8c7d72029d4b57a8a9a60ac040d31302 Mon Sep 17 00:00:00 2001 From: Albert Bausili Date: Sun, 21 Jun 2026 16:30:50 +0200 Subject: 
[PATCH 27/27] lint(adaptive): fix gofmt + ineffassign + revive (linux-only, pre-existing) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The adaptive package is linux-tagged, so golangci-lint only sees it on the linux CI runner — and milestone/v1.5.3 never had a prior CI run (ci.yml gates on PR/push to main), so four findings slipped in: - controller.go:92 + engine.go:137: gofmt formatting. - engine.go:256: ineffectual local 'startType = engine.Epoll' (the struct field e.startType is the one actually read at activeIsPrimary); drop it. - start_test.go:26: withMemlock param 'max' shadowed the builtin (revive redefines-builtin-id); rename to maxWorkers. Verified: GOOS=linux golangci-lint ./... = 0 issues, cross-compile build+vet clean. --- adaptive/controller.go | 10 +++++----- adaptive/engine.go | 21 ++++++++++----------- adaptive/start_test.go | 4 ++-- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/adaptive/controller.go b/adaptive/controller.go index d1a1aef6..145d1a0d 100644 --- a/adaptive/controller.go +++ b/adaptive/controller.go @@ -89,11 +89,11 @@ type controller struct { func newController(primary, secondary engine.Engine, sampler TelemetrySampler, logger *slog.Logger) *controller { return &controller{ - primary: primary, - secondary: secondary, - sampler: sampler, - evalInterval: 1 * time.Second, - cooldown: 30 * time.Second, + primary: primary, + secondary: secondary, + sampler: sampler, + evalInterval: 1 * time.Second, + cooldown: 30 * time.Second, // Thresholds from the epoll-vs-io_uring sweep: io_uring overtakes epoll // at ~24 conns/worker for h1 small payloads (was 20); the heavy-load // fast-path snaps only well past the crossover (48); large payloads are diff --git a/adaptive/engine.go b/adaptive/engine.go index 3c83a34f..f12ca2cf 100644 --- a/adaptive/engine.go +++ b/adaptive/engine.go @@ -134,16 +134,16 @@ var maxWorkersForMemlock = iouring.MaxWorkersForMemlock // unknowable here (no 
connections exist yet). So the start decision is gated // only on t0-knowable disqualifiers, with a safe default: // -// 1. env override CELERIS_ADAPTIVE_START=iouring|epoll (operator escape hatch). -// 2. io_uring not viable (old kernel / missing fast tier / memlock too low) → epoll. -// 3. configured Protocol == H2C → epoll (io_uring's win is h1-small-payload only; -// h2c never benefits — its framing/HPACK cost dwarfs the engine delta). -// 4. explicit operator WorkloadHint == HighConcurrency → io_uring (the ONLY -// input that can express a high-concurrency expectation up front). -// 5. DEFAULT → epoll. Every server ramps from zero connections, i.e. the -// low-concurrency regime where epoll wins on throughput AND tail latency; -// the runtime switch then promotes new conns to io_uring if sustained -// high load develops. +// 1. env override CELERIS_ADAPTIVE_START=iouring|epoll (operator escape hatch). +// 2. io_uring not viable (old kernel / missing fast tier / memlock too low) → epoll. +// 3. configured Protocol == H2C → epoll (io_uring's win is h1-small-payload only; +// h2c never benefits — its framing/HPACK cost dwarfs the engine delta). +// 4. explicit operator WorkloadHint == HighConcurrency → io_uring (the ONLY +// input that can express a high-concurrency expectation up front). +// 5. DEFAULT → epoll. Every server ramps from zero connections, i.e. the +// low-concurrency regime where epoll wins on throughput AND tail latency; +// the runtime switch then promotes new conns to io_uring if sustained +// high load develops. 
// // This flips the previous default (io_uring on modern kernels): io_uring now // wins the start only on an explicit high-concurrency hint, because the @@ -253,7 +253,6 @@ func New(cfg resource.Config, handler stream.Handler, cpuMon engine.CPUMonitor) eng, err := buildIOUring() if err != nil { logger.Warn("io_uring start engine unavailable, falling back to epoll start", "error", err) - startType = engine.Epoll e.startType = engine.Epoll eng, err = buildEpoll() if err != nil { diff --git a/adaptive/start_test.go b/adaptive/start_test.go index ac278191..71aaa498 100644 --- a/adaptive/start_test.go +++ b/adaptive/start_test.go @@ -23,10 +23,10 @@ func oldProfile() engine.CapabilityProfile { } // withMemlock overrides the memlock probe for the duration of a test. -func withMemlock(t *testing.T, max int) { +func withMemlock(t *testing.T, maxWorkers int) { t.Helper() prev := maxWorkersForMemlock - maxWorkersForMemlock = func() int { return max } + maxWorkersForMemlock = func() int { return maxWorkers } t.Cleanup(func() { maxWorkersForMemlock = prev }) }