diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index addbc320..02cb57c4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -24,7 +24,7 @@ jobs:
     name: Lint
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
       - uses: actions/setup-go@v6
         with:
           go-version: "1.26.4"
@@ -64,7 +64,7 @@ jobs:
     name: Unit (root + middleware sub-modules)
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
       - uses: actions/setup-go@v6
         with:
           go-version: "1.26.4"
@@ -107,7 +107,7 @@ jobs:
     name: Conformance
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
       - uses: actions/setup-go@v6
         with:
           go-version: "1.26.4"
@@ -144,7 +144,7 @@ jobs:
       CELERIS_PG_DSN: postgres://celeris:celeris@127.0.0.1:5432/celeristest?sslmode=disable
       CELERIS_REDIS_ADDR: 127.0.0.1:6379
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
       - uses: actions/setup-go@v6
         with:
           go-version: "1.26.4"
@@ -164,7 +164,7 @@ jobs:
       matrix:
         os: [ubuntu-latest, macos-latest]
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
       - uses: actions/setup-go@v6
         with:
           go-version: "1.26.4"
@@ -187,7 +187,7 @@ jobs:
     name: Vulnerability Check
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
       - uses: actions/setup-go@v6
         with:
           go-version: "1.26.4"
diff --git a/.github/workflows/drivers.yml b/.github/workflows/drivers.yml
index 9f4b9d5c..fc48462a 100644
--- a/.github/workflows/drivers.yml
+++ b/.github/workflows/drivers.yml
@@ -91,7 +91,7 @@ jobs:
     env:
       CELERIS_PG_DSN: postgres://celeris:celeris@127.0.0.1:5432/celeristest?sslmode=disable
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
       - uses: actions/setup-go@v6
         with:
           go-version: "1.26.4"
@@ -132,7 +132,7 @@ jobs:
     env:
       CELERIS_REDIS_ADDR: 127.0.0.1:6379
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
       - uses: actions/setup-go@v6
         with:
           go-version: "1.26.4"
@@ -173,7 +173,7 @@ jobs:
     env:
       CELERIS_REDIS_ADDR: 127.0.0.1:6379
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
       - uses: actions/setup-go@v6
         with:
           go-version: "1.26.4"
@@ -210,7 +210,7 @@ jobs:
     env:
       CELERIS_MEMCACHED_ADDR: 127.0.0.1:11211
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
       - uses: actions/setup-go@v6
         with:
           go-version: "1.26.4"
@@ -263,7 +263,7 @@ jobs:
           --health-timeout 5s
           --health-retries 10
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
       - uses: actions/setup-go@v6
         with:
           go-version: "1.26.4"
@@ -296,7 +296,7 @@ jobs:
       REDIS_IMAGE: redis:${{ matrix.redis }}
       CELERIS_REDIS_CLUSTER_ADDRS: "127.0.0.1:7000,127.0.0.1:7001,127.0.0.1:7002"
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
       - uses: actions/setup-go@v6
         with:
           go-version: "1.26.4"
@@ -382,7 +382,7 @@ jobs:
       CELERIS_REDIS_SENTINEL_ADDRS: "127.0.0.1:26379,127.0.0.1:26380,127.0.0.1:26381"
       CELERIS_REDIS_SENTINEL_MASTER: mymaster
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
       - uses: actions/setup-go@v6
         with:
           go-version: "1.26.4"
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 690f35a2..0a87ec26 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -31,7 +31,7 @@ jobs:
     needs: [validate-tag, ci]
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
         with:
           fetch-depth: 0
       - name: Create sub-module tags
@@ -70,7 +70,7 @@ jobs:
     needs: [validate-tag, ci, tag-submodules]
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v7
       - uses: actions/setup-go@v6
         with:
           go-version: "1.26.4"
diff --git a/SECURITY.md b/SECURITY.md
index 29345ae4..0b103a0f 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -2,14 +2,58 @@
 
 ## Supported Versions
 
-| Version       | Supported |
-|---------------|-----------|
-| >= 1.4.0      | Yes       |
-| < 1.4.0       | No        |
-
-Security updates are issued only for the 1.4.x line. Earlier versions
-(1.3.x and below) no longer receive fixes, including critical ones —
-upgrade to the latest 1.4.x to remain covered.
+| Version  | Supported |
+|----------|-----------|
+| >= 1.5.0 | Yes       |
+| 1.4.x    | No        |
+| < 1.4.0  | No        |
+
+Security updates are issued only for the 1.5.x line. The 1.4.x line and
+earlier (1.3.x and below) no longer receive fixes, including critical
+ones — upgrade to the latest 1.5.x to remain covered.
+
+### v1.5.x Security Improvements
+
+The 1.5.x line is the core-engine performance milestone (io_uring / epoll /
+adaptive). Several of its changes are memory-safety or DoS-posture
+relevant:
+
+- **io_uring use-after-free hardening under churn**: the io_uring engine
+  serializes close behind in-flight completions (cancel-then-release) and
+  tags each completion's `user_data` with a 16-bit per-connection
+  generation counter. A stale CQE arriving after a socket's slot has been
+  recycled is now detected by the generation mismatch and dropped instead
+  of being misrouted to the new connection occupying that slot — closing a
+  gen-collision window that could cross-wire two connections' buffers or
+  corrupt the heap under sustained POST / connection-churn load.
+
+- **H1 parser request-smuggling hardening**: the H1 request parser was
+  hardened against request-smuggling vectors and RFC 9110/9112 framing
+  violations (conflicting / duplicate `Content-Length`, `Transfer-Encoding`
+  vs `Content-Length` ambiguity, malformed chunk framing), so a
+  front-end / back-end desync cannot be induced through celeris.
+
+- **Latent data-race / memory-safety fixes**: two races that are
+  memory-safety bugs under the Go memory model were closed — a
+  `Start` / `Shutdown` race on the server CPU monitor, and the epoll
+  async-detach path's `h1State.Detached` flag, now an `atomic.Bool`
+  published with a release barrier so a detaching connection cannot be
+  observed half-initialized by the engine loop.
+
+- **`middleware/secure` default hardening (behavior change)**:
+  `Cross-Origin-Embedder-Policy` (`require-corp`) and `X-Download-Options`
+  (`noopen`) are now **off by default** and opt-in, matching Helmet's
+  posture. The previous COEP default silently broke cross-origin resources
+  without a corresponding security win for most apps; set
+  `CrossOriginEmbedderPolicy: "require-corp"` (or `"credentialless"`)
+  explicitly where cross-origin isolation is required. The default
+  secure-header count drops from 11 to 9 — **audit your deployment if you
+  relied on the implicit COEP header**.
+
+- **Go toolchain bump (1.26.3 → 1.26.4)**: every `go.mod` in the repo
+  (and the loadgen sub-module) moves to `go 1.26.4` for the stdlib
+  security fixes in that patch release; CI pins the explicit patch version
+  so a stale runner cache cannot regress.
 
 ### v1.4.2 Security Improvements
 
@@ -164,7 +208,7 @@ posture is conservative:
 
 ## Historical (unsupported)
 
-Per-version security notes for the 1.3.x line and earlier are preserved in the git history of this file (`git log SECURITY.md`). Those releases no longer receive fixes — upgrade to the latest 1.4.x to remain covered.
+Per-version security notes for the 1.3.x line and earlier are preserved in the git history of this file (`git log SECURITY.md`). Those releases no longer receive fixes — upgrade to the latest 1.5.x to remain covered.
 
 ## Reporting a Vulnerability
 
diff --git a/adaptive/controller.go b/adaptive/controller.go
index 1687795a..145d1a0d 100644
--- a/adaptive/controller.go
+++ b/adaptive/controller.go
@@ -9,6 +9,30 @@ import (
 	"github.com/goceleris/celeris/engine"
 )
 
+// The adaptive engine starts on epoll and switches to io_uring under load
+// using a DIRECT conns-per-worker policy (no benchmark fingerprinting, no
+// CPU monitor required). The thresholds come from the empirical crossover on
+// this hardware: epoll and io_uring tie up to ~16 conns/worker; io_uring
+// pulls ahead above ~20/worker and keeps scaling while epoll plateaus
+// (io_uring ~+14% at 64 conns/worker); epoll wins at ~1 conn (lower latency).
+//
+// Policy:
+//   - On epoll, switch UP to io_uring when conns/worker sustains the up
+//     threshold for sustainTicks consecutive ticks, OR snap immediately when
+//     conns/worker crosses the heavy-load high-watermark (the fast path).
+//   - On io_uring, revert DOWN to epoll when conns/worker sustains BELOW the
+//     down threshold for sustainTicks ticks. The down threshold sits well
+//     under the up threshold so the band between them is a hysteresis zone
+//     that prevents flapping.
+//   - Large-payload workloads (avg bytes/req above largePayloadBytes) are
+//     link-bound — the engines tie — so an io_uring switch is suppressed to
+//     avoid pointless churn.
+//   - A safety revert fires if io_uring is active and the error rate climbs
+//     above errorRevertRate, regardless of load.
+//
+// The oscillation lock (3 switches in 5 min → 5 min lock) and the post-switch
+// cooldown bound any residual thrash and hold io_uring after the fast snap.
+
 type controllerState struct {
 	activeIsPrimary bool
 	lastSwitch      time.Time
@@ -17,21 +41,50 @@ type controllerState struct {
 	switchCount     int
 	locked          bool
 	lockUntil       time.Time
-	lastActiveScore map[engine.EngineType]float64
-	lastActiveTime  map[engine.EngineType]time.Time
+
+	// upTicks / downTicks count consecutive evaluations that satisfy the
+	// switch-up / switch-down condition; a normal switch needs sustainTicks
+	// of them, the heavy-load fast path needs only one. Reset on a switch
+	// (recordSwitch) and whenever the condition lapses.
+	upTicks   int
+	downTicks int
 }
 
 type controller struct {
-	primary   engine.Engine
-	secondary engine.Engine
+	primary   engine.Engine // epoll  (low-conns winner / starting engine)
+	secondary engine.Engine // io_uring (high-conns winner)
 	sampler   TelemetrySampler
-	weights   ScoreWeights
 	state     controllerState
 
 	evalInterval time.Duration
 	cooldown     time.Duration
-	threshold    float64
-	logger       *slog.Logger
+
+	upThreshold       float64 // conns/worker: epoll → io_uring
+	downThreshold     float64 // conns/worker: io_uring → epoll (hysteresis low edge)
+	highWatermark     float64 // conns/worker: heavy-load fast-path snap
+	largePayloadBytes float64 // avg bytes/req above which io_uring is suppressed
+	errorRevertRate   float64 // io_uring error rate above which we revert to epoll
+	sustainTicks      int     // consecutive ticks required for a normal switch
+
+	// connSwitchEnabled gates the conns-per-worker UP switch (epoll→io_uring).
+	// Production sets it true ONLY on the epoll-start path with io_uring viable
+	// and a non-h2c protocol: there, a sustained high-concurrency ramp should
+	// promote NEW connections to io_uring (it wins ≥~24 conns/worker for h1
+	// small payloads). It is false when io_uring is the start engine (nothing
+	// better to switch up to), when io_uring is unviable, or for h2c. The
+	// always-on error-revert below is independent of this flag. The engine sets
+	// it directly from the profile in New().
+	connSwitchEnabled bool
+
+	// loadDownRevert gates the LOAD-driven io_uring→epoll revert (evaluateDown).
+	// It is OFF in production: because pinned conns never migrate, reverting on
+	// a load dip strands established io_uring keep-alives and routes new conns
+	// back to epoll mid-ramp — pure harm. The always-on error-revert is
+	// independent of this flag. Defaults true in newController so the
+	// conns-per-worker unit tests exercise evaluateDown; New() sets it false.
+	loadDownRevert bool
+
+	logger *slog.Logger
 }
 
 func newController(primary, secondary engine.Engine, sampler TelemetrySampler, logger *slog.Logger) *controller {
@@ -39,20 +92,34 @@ func newController(primary, secondary engine.Engine, sampler TelemetrySampler, l
 		primary:      primary,
 		secondary:    secondary,
 		sampler:      sampler,
-		weights:      DefaultWeights(),
-		evalInterval: 5 * time.Second,
+		evalInterval: 1 * time.Second,
 		cooldown:     30 * time.Second,
-		threshold:    0.15,
-		logger:       logger,
+		// Thresholds from the epoll-vs-io_uring sweep: io_uring overtakes epoll
+		// at ~24 conns/worker for h1 small payloads (was 20); the heavy-load
+		// fast-path snaps only well past the crossover (48); large payloads are
+		// link-bound (engines tie) above 8 KB so suppress the switch there.
+		upThreshold:       24.0,
+		downThreshold:     12.0,
+		highWatermark:     48.0,
+		largePayloadBytes: 8192.0,
+		errorRevertRate:   0.05,
+		sustainTicks:      2,
+		// Both default ON so the conns-per-worker unit tests exercise the policy;
+		// the production New() path sets connSwitchEnabled from the kernel/feature
+		// profile and loadDownRevert=false (pinning makes load-revert harmful).
+		connSwitchEnabled: true,
+		loadDownRevert:    true,
+		logger:            logger,
 		state: controllerState{
 			activeIsPrimary: true,
-			lastActiveScore: make(map[engine.EngineType]float64),
-			lastActiveTime:  make(map[engine.EngineType]time.Time),
 		},
 	}
 }
 
-// evaluate checks whether a switch is warranted. Returns true if a switch should occur.
+// evaluate decides whether a switch is warranted given the current load. It
+// returns true when the engine should switch to its standby. The decision is
+// driven entirely by conns-per-worker (with payload-size and error-rate
+// refinements); the frozen check, oscillation lock and cooldown gate it.
 func (c *controller) evaluate(now time.Time, frozen bool) bool {
 	if frozen {
 		return false
@@ -70,112 +137,111 @@ func (c *controller) evaluate(now time.Time, frozen bool) bool {
 		return false
 	}
 
-	var active, standby engine.Engine
-	if c.state.activeIsPrimary {
-		active = c.primary
-		standby = c.secondary
-	} else {
-		active = c.secondary
-		standby = c.primary
-	}
-
-	activeSnap := c.sampler.Sample(active)
-	baselineActiveScore := computeScore(activeSnap, c.weights)
+	active := c.activeEngine()
+	snap := c.sampler.Sample(active)
+	cpw := snap.ConnsPerWorker
 
-	// io_uring bias: the modeled io_uring advantage given the current workload
-	// (connection count + CPU pressure). Zero outside the empirical sweet spot.
-	bias := ioUringBias(activeSnap)
-
-	// Apply the bias to the ACTIVE score: reinforce io_uring when it is already
-	// active (resist leaving it), lightly penalise epoll when conditions favor
-	// io_uring (encourage leaving it).
-	activeScore := baselineActiveScore
 	if active.Type() == engine.IOUring {
-		activeScore *= (1.0 + bias) // Bonus: io_uring already active, reinforce
-	} else if bias > 0 {
-		activeScore *= (1.0 - bias*0.5) // Penalty: epoll active but conditions favor io_uring
+		// Safety error-revert is ALWAYS active, independent of connSwitchEnabled:
+		// if io_uring starts erroring on this deployment, fall back to epoll.
+		if snap.ErrorRate > c.errorRevertRate {
+			c.state.downTicks = 0
+			c.state.upTicks = 0
+			c.logSwitch("io_uring", "epoll", "error-rate safety revert", cpw, snap)
+			return true
+		}
+		if !c.connSwitchEnabled || !c.loadDownRevert {
+			// Load-driven down-revert is disabled in production: pinned conns
+			// never migrate, so reverting only strands io_uring keep-alives and
+			// routes new conns to epoll mid-ramp. Only the error-revert above moves us.
+			return false
+		}
+		return c.evaluateDown(snap, cpw)
 	}
-
-	// Store active's (biased) score for historical reference.
-	c.state.lastActiveScore[active.Type()] = activeScore
-	c.state.lastActiveTime[active.Type()] = now
-
-	// Seed standby with 80% of active if no history exists.
-	if _, ok := c.state.lastActiveScore[standby.Type()]; !ok {
-		c.state.lastActiveScore[standby.Type()] = activeScore * 0.80
-		c.state.lastActiveTime[standby.Type()] = now
+	// epoll active.
+	if !c.connSwitchEnabled {
+		return false
 	}
+	return c.evaluateUp(snap, cpw)
+}
 
-	// Standby estimate. Two independent signals, combined by max:
-	//
-	//  1. Historical: the last directly-observed score for the standby engine,
-	//     decayed at 1%/sec. This drives switching when the ACTIVE engine
-	//     genuinely degrades below a previously-measured standby.
-	//
-	//  2. Bias-modeled: the standby score implied by the io_uring bias for the
-	//     CURRENT workload, recomputed EACH tick. When io_uring is the standby
-	//     and conditions favor it, the standby is modeled as the unbiased active
-	//     baseline scaled up by the bias; when epoll is the standby (io_uring
-	//     active and favored) it is scaled DOWN so we do not switch back. This
-	//     is what makes an organic epoll→io_uring switch reachable — the
-	//     historical-only path could never exceed the threshold (max attainable
-	//     ratio ~0.70 < 1+threshold).
-	standbyScore := c.historicalScore(standby.Type(), now)
-	if modeled := c.biasModeledStandbyScore(standby.Type(), baselineActiveScore, bias); modeled > standbyScore {
-		standbyScore = modeled
-	}
+// evaluateUp runs while epoll is active and considers a switch UP to io_uring.
+func (c *controller) evaluateUp(snap TelemetrySnapshot, cpw float64) bool {
+	// Large payloads are link-bound: the engines tie, so never switch up.
+	// Keep upTicks pinned at zero so a later small-payload burst restarts the
+	// sustain count from scratch.
+	largePayload := snap.BytesPerReq >= c.largePayloadBytes
 
-	if standbyScore > activeScore*(1.0+c.threshold) {
-		c.logger.Info("switch recommended",
-			"active", active.Type().String(),
-			"standby", standby.Type().String(),
-			"active_score", activeScore,
-			"standby_score", standbyScore,
-		)
+	switch {
+	case !largePayload && cpw >= c.highWatermark:
+		// Heavy-load fast path: snap immediately on a single tick.
+		c.state.upTicks++
+		c.state.downTicks = 0
+		c.logSwitch("epoll", "io_uring", "heavy-load fast path", cpw, snap)
 		return true
+	case !largePayload && cpw >= c.upThreshold:
+		c.state.upTicks++
+		c.state.downTicks = 0
+		if c.state.upTicks >= c.sustainTicks {
+			c.logSwitch("epoll", "io_uring", "sustained high load", cpw, snap)
+			return true
+		}
+	default:
+		c.state.upTicks = 0
 	}
-
+	c.state.downTicks = 0
 	return false
 }
 
-// biasModeledStandbyScore models the standby engine's score for the CURRENT
-// workload from the io_uring bias, using the unbiased active baseline as the
-// reference point. The sign of the adjustment depends on which engine is the
-// standby:
-//
-//   - io_uring standby: when conditions favor io_uring (bias>0) it is modeled
-//     as bias-better than the active baseline → standby = baseline*(1+bias).
-//   - epoll standby (io_uring active and favored): epoll is modeled as
-//     bias-worse → standby = baseline*(1-bias), so a favorable-for-io_uring
-//     workload never recommends switching back to epoll.
-//
-// Recomputed every tick so the estimate tracks live conditions rather than a
-// stale seed.
-func (c *controller) biasModeledStandbyScore(standby engine.EngineType, baselineActiveScore, bias float64) float64 {
-	if bias <= 0 {
-		return baselineActiveScore
+// evaluateDown runs while io_uring is active and considers a revert to epoll.
+func (c *controller) evaluateDown(snap TelemetrySnapshot, cpw float64) bool {
+	// Safety revert: an error storm on io_uring beats any load consideration.
+	if snap.ErrorRate > c.errorRevertRate {
+		c.state.downTicks = 0
+		c.state.upTicks = 0
+		c.logSwitch("io_uring", "epoll", "error-rate safety revert", cpw, snap)
+		return true
 	}
-	if standby == engine.IOUring {
-		return baselineActiveScore * (1.0 + bias)
+
+	if cpw < c.downThreshold {
+		c.state.downTicks++
+		if c.state.downTicks >= c.sustainTicks {
+			c.state.upTicks = 0
+			c.logSwitch("io_uring", "epoll", "sustained low load", cpw, snap)
+			return true
+		}
+	} else {
+		c.state.downTicks = 0
 	}
-	return baselineActiveScore * (1.0 - bias)
+	c.state.upTicks = 0
+	return false
 }
 
-// historicalScore returns the last known score for an engine type, decayed
-// at 1% per second since the score was recorded.
-func (c *controller) historicalScore(et engine.EngineType, now time.Time) float64 {
-	score, ok := c.state.lastActiveScore[et]
-	if !ok {
-		return 0
+func (c *controller) activeEngine() engine.Engine {
+	if c.state.activeIsPrimary {
+		return c.primary
 	}
-	elapsed := now.Sub(c.state.lastActiveTime[et]).Seconds()
-	return score * max(0, 1.0-0.01*elapsed)
+	return c.secondary
+}
+
+func (c *controller) logSwitch(from, to, reason string, cpw float64, snap TelemetrySnapshot) {
+	c.logger.Info("engine switch recommended",
+		"from", from,
+		"to", to,
+		"reason", reason,
+		"conns_per_worker", cpw,
+		"bytes_per_req", snap.BytesPerReq,
+		"error_rate", snap.ErrorRate,
+		"active_connections", snap.ActiveConnections,
+	)
 }
 
 // recordSwitch updates controller state after a switch has been performed.
 func (c *controller) recordSwitch(now time.Time) {
 	c.state.activeIsPrimary = !c.state.activeIsPrimary
 	c.state.lastSwitch = now
+	c.state.upTicks = 0
+	c.state.downTicks = 0
 
 	c.state.switchTimes[c.state.switchIdx%len(c.state.switchTimes)] = now
 	c.state.switchIdx++
diff --git a/adaptive/controller_test.go b/adaptive/controller_test.go
index 8d216e5e..beaf4c6d 100644
--- a/adaptive/controller_test.go
+++ b/adaptive/controller_test.go
@@ -3,92 +3,287 @@
 package adaptive
 
 import (
+	"log/slog"
 	"testing"
 	"time"
 
 	"github.com/goceleris/celeris/engine"
-	"github.com/goceleris/celeris/resource"
 )
 
-// TestControllerOrganicSwitch verifies that, in the io_uring sweet spot
-// (high connection count + high CPU), the controller eventually recommends an
-// epoll→io_uring switch driven purely by the io_uring bias — no pre-seeded
-// standby history, no active degradation.
-//
-// This FAILS against the pre-fix logic: the standby was seeded at
-// activeScore*0.80 and only decayed, so standby/active maxed out at ~0.70 and
-// could never clear the 1+threshold (1.15) bar. The bias-modeled standby
-// estimate makes the switch reachable.
-func TestControllerOrganicSwitch(t *testing.T) {
-	primary := newMockEngine(engine.Epoll)     // active
-	secondary := newMockEngine(engine.IOUring) // standby
+// newCtrlEpollActive returns a controller with epoll active (primary) plus the
+// synthetic sampler driving its telemetry, cooldown disabled for unit testing.
+func newCtrlEpollActive() (*controller, *syntheticSampler) {
 	sampler := newSyntheticSampler()
+	c := newController(newMockEngine(engine.Epoll), newMockEngine(engine.IOUring), sampler, testLogger())
+	c.cooldown = 0
+	return c, sampler
+}
+
+// newCtrlIOUringActive returns a controller with io_uring active (primary).
+func newCtrlIOUringActive() (*controller, *syntheticSampler) {
+	sampler := newSyntheticSampler()
+	c := newController(newMockEngine(engine.IOUring), newMockEngine(engine.Epoll), sampler, testLogger())
+	c.cooldown = 0
+	return c, sampler
+}
+
+func testLogger() *slog.Logger { return slog.New(slog.DiscardHandler) }
+
+// TestConnSwitchDisabledGate verifies the kernel-aware gate: with conns-per-worker
+// switching OFF (the production default — the feature-gated start engine is
+// authoritative on io_uring-best/epoll-best kernels), neither the epoll up-switch
+// nor the io_uring down-revert fires regardless of load, but the io_uring
+// error-rate safety revert STILL fires. Guards the warmup-dip revert bug where an
+// idle/ramp dip on a 7.0 box reverted io_uring→epoll and stranded the load.
+func TestConnSwitchDisabledGate(t *testing.T) {
+	now := time.Now()
+
+	c, sampler := newCtrlEpollActive()
+	c.connSwitchEnabled = false
+	sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 64})
+	for i := 0; i < 5; i++ {
+		if c.evaluate(now.Add(time.Duration(i)*time.Second), false) {
+			t.Fatalf("epoll up-switch fired with switching disabled (tick %d)", i)
+		}
+	}
+
+	c2, s2 := newCtrlIOUringActive()
+	c2.connSwitchEnabled = false
+	s2.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 0})
+	for i := 0; i < 5; i++ {
+		if c2.evaluate(now.Add(time.Duration(i)*time.Second), false) {
+			t.Fatalf("io_uring down-revert fired with switching disabled (tick %d)", i)
+		}
+	}
+
+	c3, s3 := newCtrlIOUringActive()
+	c3.connSwitchEnabled = false
+	s3.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 64, ErrorRate: 0.5})
+	if !c3.evaluate(now, false) {
+		t.Fatal("error-rate safety revert did NOT fire with switching disabled")
+	}
+}
+
+// TestUpSwitchRequiresSustain verifies the normal epoll→io_uring switch needs
+// sustainTicks (2) consecutive ticks at or above the up threshold (24).
+func TestUpSwitchRequiresSustain(t *testing.T) {
+	c, sampler := newCtrlEpollActive()
+	sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 24})
+
+	now := time.Now()
+	if c.evaluate(now, false) {
+		t.Fatal("one tick at the up threshold must not switch (sustainTicks=2)")
+	}
+	if c.state.upTicks != 1 {
+		t.Fatalf("upTicks = %d, want 1 after first qualifying tick", c.state.upTicks)
+	}
+	if !c.evaluate(now.Add(time.Second), false) {
+		t.Fatal("second consecutive tick at the up threshold must switch")
+	}
+}
 
-	cfg := resource.Config{Protocol: engine.HTTP1}
-	e := newFromEngines(primary, secondary, sampler, cfg)
-	e.ctrl.cooldown = 0
+// TestUpSwitchSustainResetsOnDip verifies a dip below the up threshold resets
+// the sustain counter, so the count must restart.
+func TestUpSwitchSustainResetsOnDip(t *testing.T) {
+	c, sampler := newCtrlEpollActive()
+	now := time.Now()
 
-	// Active epoll snapshot lands squarely in io_uring's empirical sweet spot.
-	sampler.Set(engine.Epoll, TelemetrySnapshot{
-		ThroughputRPS:     1000,
-		ActiveConnections: 2048,
-		CPUUtilization:    0.9,
-	})
+	sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 26})
+	if c.evaluate(now, false) {
+		t.Fatal("first tick must not switch")
+	}
+	// Dip into the hysteresis band — resets upTicks.
+	sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 15})
+	if c.evaluate(now.Add(time.Second), false) {
+		t.Fatal("dip must not switch")
+	}
+	if c.state.upTicks != 0 {
+		t.Fatalf("upTicks = %d, want 0 after a dip below the up threshold", c.state.upTicks)
+	}
+	// Back above threshold: needs two more ticks now.
+	sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 26})
+	if c.evaluate(now.Add(2*time.Second), false) {
+		t.Fatal("first tick after reset must not switch")
+	}
+	if !c.evaluate(now.Add(3*time.Second), false) {
+		t.Fatal("second tick after reset must switch")
+	}
+}
+
+// TestFastPathSnapsImmediately verifies conns/worker at/above the high
+// watermark (48) switches on a single tick.
+func TestFastPathSnapsImmediately(t *testing.T) {
+	c, sampler := newCtrlEpollActive()
+	sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 48})
+
+	if !c.evaluate(time.Now(), false) {
+		t.Fatal("conns/worker at the high watermark must snap on one tick")
+	}
+}
 
-	if e.ActiveEngine().Type() != engine.Epoll {
-		t.Fatal("expected epoll active initially")
+// TestHysteresisBandNoFlap verifies the 12–24 band switches neither way.
+func TestHysteresisBandNoFlap(t *testing.T) {
+	for _, cpw := range []float64{12, 14, 16, 18, 19.9} {
+		c, sampler := newCtrlEpollActive()
+		sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: cpw})
+		now := time.Now()
+		for i := range 4 {
+			if c.evaluate(now.Add(time.Duration(i)*time.Second), false) {
+				t.Fatalf("cpw=%.1f inside hysteresis band must not switch up", cpw)
+			}
+		}
 	}
+}
+
+// TestLargePayloadSuppression verifies a large average payload suppresses the
+// switch even well above the high watermark.
+func TestLargePayloadSuppression(t *testing.T) {
+	c, sampler := newCtrlEpollActive()
+	// Exactly at the threshold counts as large (>=).
+	sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 64, BytesPerReq: 8192})
 
 	now := time.Now()
-	switched := false
-	// Advance well past any cooldown/observation window and let the estimate
-	// settle over a few ticks.
 	for i := range 5 {
-		if e.ctrl.evaluate(now.Add(time.Duration(i+1)*time.Minute), false) {
-			switched = true
-			break
+		if c.evaluate(now.Add(time.Duration(i)*time.Second), false) {
+			t.Fatal("large payload must suppress io_uring switch")
 		}
 	}
-	if !switched {
-		t.Fatal("expected organic epoll→io_uring switch in the io_uring sweet spot")
-	}
-}
-
-// TestControllerNoSwitchOutsideSweetSpot is the inverse: low CPU or too few
-// connections yields zero bias, so the controller must NOT recommend a switch
-// (no degradation, no favorable conditions).
-func TestControllerNoSwitchOutsideSweetSpot(t *testing.T) {
-	cases := []struct {
-		name string
-		snap TelemetrySnapshot
-	}{
-		{
-			name: "low CPU",
-			snap: TelemetrySnapshot{ThroughputRPS: 1000, ActiveConnections: 2048, CPUUtilization: 0.10},
-		},
-		{
-			name: "too few connections",
-			snap: TelemetrySnapshot{ThroughputRPS: 1000, ActiveConnections: 32, CPUUtilization: 0.9},
-		},
-	}
-	for _, tc := range cases {
-		t.Run(tc.name, func(t *testing.T) {
-			primary := newMockEngine(engine.Epoll)
-			secondary := newMockEngine(engine.IOUring)
-			sampler := newSyntheticSampler()
-
-			cfg := resource.Config{Protocol: engine.HTTP1}
-			e := newFromEngines(primary, secondary, sampler, cfg)
-			e.ctrl.cooldown = 0
-
-			sampler.Set(engine.Epoll, tc.snap)
-
-			now := time.Now()
-			for i := range 5 {
-				if e.ctrl.evaluate(now.Add(time.Duration(i+1)*time.Minute), false) {
-					t.Fatalf("unexpected switch outside io_uring sweet spot (%s)", tc.name)
-				}
-			}
-		})
+
+	// Drop below the large-payload threshold → fast path fires.
+	sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 64, BytesPerReq: 8191})
+	if !c.evaluate(now.Add(10*time.Second), false) {
+		t.Fatal("small payload above high watermark must switch")
+	}
+}
+
+// TestRevertRequiresSustain verifies the io_uring→epoll revert needs
+// sustainTicks below the down threshold (12).
+func TestRevertRequiresSustain(t *testing.T) {
+	c, sampler := newCtrlIOUringActive()
+	sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 8})
+
+	now := time.Now()
+	if c.evaluate(now, false) {
+		t.Fatal("one low tick must not revert (sustainTicks=2)")
+	}
+	if !c.evaluate(now.Add(time.Second), false) {
+		t.Fatal("second consecutive low tick must revert")
+	}
+}
+
+// TestErrorRevertImmediate verifies an error rate above errorRevertRate (0.05)
+// reverts io_uring→epoll immediately regardless of load.
+func TestErrorRevertImmediate(t *testing.T) {
+	c, sampler := newCtrlIOUringActive()
+	sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 64, ErrorRate: 0.06})
+
+	if !c.evaluate(time.Now(), false) {
+		t.Fatal("error rate above errorRevertRate must revert on one tick")
+	}
+}
+
+// TestNoRevertWhenBusy verifies io_uring stays put above the down threshold
+// with a clean error rate.
+func TestNoRevertWhenBusy(t *testing.T) {
+	c, sampler := newCtrlIOUringActive()
+	sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 16, ErrorRate: 0.0})
+
+	now := time.Now()
+	for i := range 5 {
+		if c.evaluate(now.Add(time.Duration(i)*time.Second), false) {
+			t.Fatal("must not revert while busy with a clean error rate")
+		}
+	}
+}
+
+// TestCooldownGatesRevert verifies the cooldown blocks a revert immediately
+// after a switch, but the first switch (no prior switch) is not gated.
+func TestCooldownGatesRevert(t *testing.T) {
+	c, sampler := newCtrlEpollActive()
+	c.cooldown = 1 * time.Hour
+
+	now := time.Now()
+	// First switch is not gated (lastSwitch is zero).
+	sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50})
+	if !c.evaluate(now, false) {
+		t.Fatal("first switch must not be cooldown-gated")
+	}
+	c.recordSwitch(now)
+
+	// io_uring now active and idle, but the revert is cooldown-gated.
+	sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 0})
+	if c.evaluate(now.Add(time.Minute), false) {
+		t.Fatal("revert within cooldown must be blocked")
+	}
+}
+
+// TestFrozenBlocksSwitch verifies the frozen flag short-circuits evaluate.
+func TestFrozenBlocksSwitch(t *testing.T) {
+	c, sampler := newCtrlEpollActive()
+	sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 100})
+	if c.evaluate(time.Now(), true) {
+		t.Fatal("frozen must block all switches")
+	}
+}
+
+// TestOscillationLockExpiry verifies 3 switches in 5 minutes locks the
+// controller, and the lock expires after 5 minutes.
+func TestOscillationLockExpiry(t *testing.T) {
+	sampler := newSyntheticSampler()
+	c := newController(newMockEngine(engine.Epoll), newMockEngine(engine.IOUring), sampler, testLogger())
+	c.cooldown = 0
+
+	now := time.Now()
+	for range 3 {
+		c.recordSwitch(now)
+		now = now.Add(time.Second)
+	}
+	if !c.state.locked {
+		t.Fatal("expected lock after 3 switches within 5 minutes")
+	}
+
+	// Locked: a strong load must not switch.
+	sampler.Set(c.activeEngine().Type(), TelemetrySnapshot{ConnsPerWorker: 100})
+	if c.evaluate(now, false) {
+		t.Fatal("locked controller must not switch")
+	}
+
+	// After lock expiry, evaluation resumes (lock cleared on the next eval).
+	after := c.state.lockUntil.Add(time.Second)
+	_ = c.evaluate(after, false)
+	if c.state.locked {
+		t.Fatal("lock should clear once lockUntil has passed")
+	}
+}
+
+// TestRecordSwitchResetsTicks verifies recordSwitch zeroes both tick counters.
+func TestRecordSwitchResetsTicks(t *testing.T) {
+	c, _ := newCtrlEpollActive()
+	c.state.upTicks = 5
+	c.state.downTicks = 3
+	c.recordSwitch(time.Now())
+	if c.state.upTicks != 0 || c.state.downTicks != 0 {
+		t.Fatalf("recordSwitch must reset ticks, got up=%d down=%d", c.state.upTicks, c.state.downTicks)
+	}
+	if c.state.activeIsPrimary {
+		t.Fatal("recordSwitch must toggle activeIsPrimary")
+	}
+}
+
+// guards against a config drift in the documented thresholds.
+func TestDefaultThresholds(t *testing.T) {
+	c, _ := newCtrlEpollActive()
+	if c.upThreshold != 24 || c.downThreshold != 12 || c.highWatermark != 48 {
+		t.Fatalf("threshold drift: up=%.0f down=%.0f hwm=%.0f", c.upThreshold, c.downThreshold, c.highWatermark)
+	}
+	if c.largePayloadBytes != 8192 || c.errorRevertRate != 0.05 || c.sustainTicks != 2 {
+		t.Fatalf("policy drift: largePayload=%.0f errRevert=%.3f sustain=%d",
+			c.largePayloadBytes, c.errorRevertRate, c.sustainTicks)
+	}
+	if c.evalInterval != time.Second {
+		t.Fatalf("evalInterval = %v, want 1s", c.evalInterval)
+	}
+	if c.cooldown != 0 { // overridden to 0 by the helper
+		t.Fatalf("test helper should zero cooldown, got %v", c.cooldown)
 	}
 }
diff --git a/adaptive/driver_provider_test.go b/adaptive/driver_provider_test.go
index dddffd18..510c01b8 100644
--- a/adaptive/driver_provider_test.go
+++ b/adaptive/driver_provider_test.go
@@ -12,6 +12,7 @@ import (
 	"golang.org/x/sys/unix"
 
 	"github.com/goceleris/celeris/engine"
+	"github.com/goceleris/celeris/probe"
 	"github.com/goceleris/celeris/protocol/h2/stream"
 	"github.com/goceleris/celeris/resource"
 )
@@ -29,6 +30,14 @@ var _ stream.Handler = noopHandler{}
 // (we need both primary + secondary up for a meaningful switch test).
 func newBoundAdaptive(t *testing.T) (*Engine, func()) {
 	t.Helper()
+	// These tests need BOTH sub-engines for a meaningful switch. Since New()
+	// now LAZILY builds the standby (and falls back to an epoll start when
+	// io_uring is unavailable), a no-io_uring host would let New() succeed yet
+	// fail the first switch when the io_uring standby cannot be built. Skip up
+	// front when io_uring is genuinely unavailable.
+	if !probe.Probe().IOUringTier.Available() {
+		t.Skip("io_uring unavailable: switch test needs both sub-engines")
+	}
 	cfg := resource.Config{
 		Addr:     "127.0.0.1:0",
 		Protocol: engine.HTTP1,
@@ -347,13 +356,8 @@ func TestAdaptiveSwitchAfterDriverQuiescence(t *testing.T) {
 	e, stop := newBoundAdaptive(t)
 	defer stop()
 
-	// Seed historical scores so ForceSwitch actually flips active.
-	now := time.Now()
-	e.ctrl.state.lastActiveScore[engine.Epoll] = 100
-	e.ctrl.state.lastActiveScore[engine.IOUring] = 500
-	e.ctrl.state.lastActiveTime[engine.Epoll] = now
-	e.ctrl.state.lastActiveTime[engine.IOUring] = now
-
+	// ForceSwitch bypasses the decision policy and flips the active engine
+	// directly, so no telemetry seeding is needed here.
 	local, peer := socketpairNonblocking(t)
 	defer func() { _ = unix.Close(peer) }()
 	defer func() { _ = unix.Close(local) }()
diff --git a/adaptive/engine.go b/adaptive/engine.go
index 10fe6644..f12ca2cf 100644
--- a/adaptive/engine.go
+++ b/adaptive/engine.go
@@ -8,6 +8,7 @@ import (
 	"fmt"
 	"log/slog"
 	"net"
+	"os"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -15,7 +16,7 @@ import (
 	"github.com/goceleris/celeris/engine"
 	"github.com/goceleris/celeris/engine/epoll"
 	"github.com/goceleris/celeris/engine/iouring"
-	"github.com/goceleris/celeris/engine/scaler"
+	"github.com/goceleris/celeris/probe"
 	"github.com/goceleris/celeris/protocol/h2/stream"
 	"github.com/goceleris/celeris/resource"
 )
@@ -26,9 +27,21 @@ var (
 )
 
 // Engine is an adaptive meta-engine that switches between io_uring and epoll.
+//
+// The two sub-engine slots map to a fixed protocol direction the controller
+// keys off: primary is ALWAYS the epoll engine (the controller's
+// activeIsPrimary==true means epoll is active) and secondary is ALWAYS the
+// io_uring engine. On the public New() path only the START engine is built
+// eagerly; the other slot stays nil until the first switch actually needs it
+// (see buildStandby + performSwitch). Under the default policy the start engine
+// is epoll, so the io_uring standby is built lazily — and only if a sustained
+// high-concurrency ramp promotes new conns to it; an engine that never switches
+// never constructs its standby, so that heap never exists. newFromEngines
+// (tests) populates BOTH slots eagerly, exercising the standby-already-exists
+// switch path.
 type Engine struct {
-	primary   engine.Engine // io_uring
-	secondary engine.Engine // epoll
+	primary   engine.Engine // epoll  (nil until built when it is the lazy standby)
+	secondary engine.Engine // io_uring (nil until built when it is the lazy standby)
 	active    atomic.Pointer[engine.Engine]
 	ctrl      *controller
 	cfg       resource.Config
@@ -39,12 +52,30 @@ type Engine struct {
 	frozen    atomic.Bool
 	logger    *slog.Logger
 
+	// startType is the engine type chosen for the eager start engine. The
+	// standby is the other type; buildStandby constructs it on demand.
+	startType engine.EngineType
+
+	// buildStandby constructs the LAZY standby sub-engine on first switch.
+	// It captures cfg + handler (+ cpuMon for the sampler symmetry) and is
+	// nil on the newFromEngines (tests) path where both engines are eager.
+	buildStandby func() (engine.Engine, error)
+
+	// listenCtx / listenWG are captured by Listen so performSwitch can start a
+	// freshly-built standby's Listen goroutine under the SAME context and wait
+	// group as the active engine. Shutdown then joins it implicitly via the
+	// wait group (wg.Wait in Listen) — a never-built standby added nothing to
+	// the group, so there is nothing to join. Guarded by mu (performSwitch
+	// holds mu across the whole switch; Listen sets these once under mu).
+	listenCtx context.Context
+	listenWG  *sync.WaitGroup
+
 	// freezeCooldown is the duration to suppress further switches after a switch.
 	// Zero means no cooldown (default).
 	freezeCooldown time.Duration
 
 	// listenMu guards listenCancel/listenDone, which let Shutdown deterministically
-	// stop and JOIN the evaluation/scaler goroutines started by Listen. Without
+	// stop and JOIN the evaluation-loop goroutine started by Listen. Without
 	// this, Shutdown could return (sub-engines stopped) while the eval loop is
 	// still mid-Sample on the CPU monitor the server is about to close.
 	listenMu     sync.Mutex
@@ -63,12 +94,96 @@ type Engine struct {
 	switchRejected  atomic.Uint64 // telemetry: how many switches were blocked by driver FDs
 }
 
-// New creates a new adaptive engine with epoll as primary and io_uring as secondary.
-// Epoll starts first because it has lower H2 latency on current kernels (single-pass
-// read→process→write vs io_uring's two-iteration CQE model). The controller may
-// switch to io_uring if telemetry indicates it would perform better for the workload.
-// Both sub-engines get the full resource config. This is safe because standby
-// workers are fully suspended (zero CPU, zero connections, listen sockets closed).
+// ioUringViable reports whether io_uring is worth running at all on this host:
+// the kernel must expose the fast tier AND RLIMIT_MEMLOCK must be able to fund
+// the requested worker count. These are the two t0-knowable disqualifiers from
+// the epoll-vs-io_uring sweep:
+//
+//   - Kernel/feature: io_uring loses to epoll on old kernels (missing the
+//     fast-path setup flags); require the "bundles" era (>6.10) OR the 6.1+
+//     fast tier (DEFER_TASKRUN + SINGLE_ISSUER + MULTISHOT_RECV + PROVIDED_BUFFERS).
+//   - Memlock: io_uring's provided-buffer rings need locked pages per worker
+//     (minMemlockPerWorker). If RLIMIT_MEMLOCK can't fund the requested workers,
+//     io_uring caps to a fraction of them and its throughput collapses; epoll
+//     does not memlock buffer rings, so it keeps all workers. In that case
+//     io_uring is never the right engine.
+func ioUringViable(p engine.CapabilityProfile, cfg resource.Config) bool {
+	bundlesEra := p.KernelMajor > 6 || (p.KernelMajor == 6 && p.KernelMinor >= 10)
+	fastTier := p.DeferTaskrun && p.SingleIssuer && p.MultishotRecv && p.ProvidedBuffers
+	if !bundlesEra && !fastTier {
+		return false
+	}
+	wantWorkers := cfg.Resources.Resolve().Workers
+	if maxW := maxWorkersForMemlock(); maxW != -1 && maxW < wantWorkers {
+		return false
+	}
+	return true
+}
+
+// maxWorkersForMemlock is the io_uring memlock worker-ceiling probe behind a var
+// so tests can inject a low cap without mutating the process RLIMIT_MEMLOCK.
+var maxWorkersForMemlock = iouring.MaxWorkersForMemlock
+
+// chooseStartEngine selects which sub-engine the adaptive meta-engine starts
+// (and builds eagerly), from facts knowable at Listen() time only.
+//
+// THE PINNING CONSTRAINT: an established connection cannot migrate between
+// epoll and io_uring, so the START engine decides keep-alive throughput; the
+// runtime switch can only route NEW connections. And the workload's
+// concurrency — the thing that actually decides which engine wins — is
+// unknowable here (no connections exist yet). So the start decision is gated
+// only on t0-knowable disqualifiers, with a safe default:
+//
+//  1. env override CELERIS_ADAPTIVE_START=iouring|epoll (operator escape hatch).
+//  2. io_uring not viable (old kernel / missing fast tier / memlock too low) → epoll.
+//  3. configured Protocol == H2C → epoll (io_uring's win is h1-small-payload only;
+//     h2c never benefits — its framing/HPACK cost dwarfs the engine delta).
+//  4. explicit operator WorkloadHint == HighConcurrency → io_uring (the ONLY
+//     input that can express a high-concurrency expectation up front).
+//  5. DEFAULT → epoll. Every server ramps from zero connections, i.e. the
+//     low-concurrency regime where epoll wins on throughput AND tail latency;
+//     the runtime switch then promotes new conns to io_uring if sustained
+//     high load develops.
+//
+// This flips the previous default (io_uring on modern kernels): io_uring now
+// wins the start only on an explicit high-concurrency hint, because the
+// benchmark-shaped "saturating burst at t0" is the only case where defaulting
+// io_uring helps, and it costs the common low/mid-conc + latency cases.
+func chooseStartEngine(p engine.CapabilityProfile, cfg resource.Config) engine.EngineType {
+	switch os.Getenv("CELERIS_ADAPTIVE_START") {
+	case "iouring":
+		return engine.IOUring
+	case "epoll":
+		return engine.Epoll
+	case "auto", "":
+		// fall through to the policy below
+	default:
+		// Unknown value: fall through to auto rather than fail hard.
+	}
+
+	if !ioUringViable(p, cfg) {
+		return engine.Epoll
+	}
+	if cfg.Protocol == engine.H2C {
+		return engine.Epoll
+	}
+	if cfg.Resources.WorkloadHint == resource.WorkloadHighConcurrency {
+		return engine.IOUring
+	}
+	return engine.Epoll
+}
+
+// New creates a new adaptive engine. Only the START engine is built and
+// Listen'd eagerly; the other engine (the standby) is constructed lazily on the
+// first switch that actually needs it. The start engine is chosen by
+// chooseStartEngine from the probed io_uring capabilities (feature-gated, with a
+// CELERIS_ADAPTIVE_START env override).
+//
+// Both sub-engines bind the SAME SO_REUSEPORT port so the adaptive switch is
+// transparent: resolvePort pins a concrete port up front, and the lazily-built
+// standby reuses it. Building only the start engine eliminates the parked
+// standby's GC-rooted heap — on a modern kernel that starts on io_uring and
+// never reverts, the epoll standby is never constructed (≈0 standby tax).
 //
 // cpuMon is an engine.CPUMonitor (the public interface); when non-nil it
 // supplies the live sampler with CPU utilization data so the io_uring bias can
@@ -92,47 +207,110 @@ func New(cfg resource.Config, handler stream.Handler, cpuMon engine.CPUMonitor)
 		}
 	}
 
-	// Suppress the per-engine built-in scaler in both sub-engines —
-	// adaptive runs ONE higher-level scaler that delegates to whichever
-	// sub-engine is currently active. Two scalers fighting over the same
-	// worker pool produced -54 % to +118 % variance on pinning tests
-	// during the spike-B exploration; gating this way eliminates that.
-	subCfg := cfg
-	subCfg.SkipBuiltinScaler = true
+	// probe.Probe() reads kernel version + io_uring setup feature bits WITHOUT
+	// constructing an engine, so it is cheap enough for the start decision.
+	profile := probe.Probe()
+	startType := chooseStartEngine(profile, cfg)
 
-	primary, err := epoll.New(subCfg, handler)
-	if err != nil {
-		return nil, fmt.Errorf("epoll sub-engine: %w", err)
+	sampler := newLiveSampler(cpuMon)
+	logger := cfg.Logger
+	if logger == nil {
+		logger = slog.Default()
 	}
 
-	secondary, err := iouring.New(subCfg, handler)
-	if err != nil {
-		return nil, fmt.Errorf("io_uring sub-engine: %w", err)
+	// Constructors for each slot. The standby's constructor is stored on the
+	// Engine and only invoked on the first switch. The io_uring constructor
+	// does not take cpuMon (iouring.New has no such parameter); cpuMon already
+	// feeds the shared sampler via newLiveSampler above.
+	buildEpoll := func() (engine.Engine, error) {
+		eng, err := epoll.New(cfg, handler)
+		if err != nil {
+			return nil, fmt.Errorf("epoll sub-engine: %w", err)
+		}
+		return eng, nil
+	}
+	buildIOUring := func() (engine.Engine, error) {
+		eng, err := iouring.New(cfg, handler)
+		if err != nil {
+			return nil, fmt.Errorf("io_uring sub-engine: %w", err)
+		}
+		return eng, nil
 	}
-
-	sampler := newLiveSampler(cpuMon)
-	logger := cfg.Logger
 
 	e := &Engine{
-		primary:   primary,
-		secondary: secondary,
 		cfg:       cfg,
 		handler:   handler,
 		logger:    logger,
+		startType: startType,
 	}
 
-	e.ctrl = newController(primary, secondary, sampler, logger)
-
-	// Start with primary (epoll) for all protocols. Epoll has better H2
-	// throughput on current kernels and matches H1 performance.
-	var initialActive engine.Engine = primary
-	e.ctrl.state.activeIsPrimary = true
-	e.active.Store(&initialActive)
+	var startEngine engine.Engine
+	if startType == engine.IOUring {
+		// io_uring is the eager start; epoll is the lazy standby.
+		// io_uring construction can fail on a kernel that probed as capable
+		// but cannot actually set up the ring (e.g. low RLIMIT_MEMLOCK). Fall
+		// back to starting on epoll rather than failing New outright.
+		eng, err := buildIOUring()
+		if err != nil {
+			logger.Warn("io_uring start engine unavailable, falling back to epoll start", "error", err)
+			e.startType = engine.Epoll
+			eng, err = buildEpoll()
+			if err != nil {
+				return nil, err
+			}
+			startEngine = eng
+			e.primary = eng
+			e.buildStandby = buildIOUring
+		} else {
+			startEngine = eng
+			e.secondary = eng
+			e.buildStandby = buildEpoll
+		}
+	} else {
+		// epoll is the eager start; io_uring is the lazy standby.
+		eng, err := buildEpoll()
+		if err != nil {
+			return nil, err
+		}
+		startEngine = eng
+		e.primary = eng
+		e.buildStandby = buildIOUring
+	}
 
+	// The controller needs BOTH engine TYPES to decide switch direction even
+	// while the standby engine is nil, but it only ever dereferences the
+	// ACTIVE engine (activeEngine()). Pass the start engine for the active slot
+	// and nil for the lazy standby slot — newController stores them; activeIsPrimary
+	// records which slot the start engine occupies (primary==epoll).
+	e.ctrl = newController(e.primary, e.secondary, sampler, logger)
+	e.ctrl.state.activeIsPrimary = e.startType == engine.Epoll
+	// Re-enable the conns-per-worker UP switch ONLY on the epoll-start path with
+	// io_uring viable and a non-h2c protocol. Rationale from the sweep:
+	//   - When we START on epoll (the new default), a sustained high-concurrency
+	//     ramp should promote NEW connections to io_uring (it wins ≥~24 conns/
+	//     worker for h1 small payloads). The switch routes new SYNs only —
+	//     pinned conns stay on epoll — so it helps ramps/churn, and is inert for
+	//     a pure keep-alive burst (which is fine; that case wants WorkloadHint).
+	//   - When we START on io_uring there is nothing better to switch UP to, and
+	//     a load-driven DOWN-revert would only strand pinned io_uring conns — so
+	//     leave switching OFF there (the always-on error-revert still applies).
+	//   - h2c never benefits from io_uring, so never switch up for it.
+	// The controller's load-driven DOWN-revert is disabled regardless (pinning);
+	// only the always-on io_uring error-revert can move us back to epoll.
+	e.ctrl.connSwitchEnabled = e.startType == engine.Epoll &&
+		ioUringViable(profile, cfg) &&
+		cfg.Protocol != engine.H2C
+	// Load-driven down-revert is always off in production (pinning makes it
+	// harmful); only the always-on io_uring error-revert can return us to epoll.
+	e.ctrl.loadDownRevert = false
+
+	e.active.Store(&startEngine)
 	return e, nil
 }
 
-// newFromEngines creates an adaptive engine from pre-built engines (for testing).
+// newFromEngines creates an adaptive engine from pre-built engines (for
+// testing). BOTH slots are populated eagerly and buildStandby is left nil, so
+// performSwitch exercises the "standby already exists" path (no lazy build).
 func newFromEngines(primary, secondary engine.Engine, sampler TelemetrySampler, cfg resource.Config) *Engine {
 	logger := cfg.Logger
 	if logger == nil {
@@ -144,6 +322,7 @@ func newFromEngines(primary, secondary engine.Engine, sampler TelemetrySampler,
 		secondary: secondary,
 		cfg:       cfg,
 		logger:    logger,
+		startType: engine.Epoll,
 	}
 
 	e.ctrl = newController(primary, secondary, sampler, logger)
@@ -155,13 +334,15 @@ func newFromEngines(primary, secondary engine.Engine, sampler TelemetrySampler,
 	return e
 }
 
-// Listen starts both sub-engines and the evaluation loop.
+// Listen starts ONLY the active sub-engine and the evaluation loop. The standby
+// is built and Listen'd lazily by performSwitch on the first switch (joined
+// under the same ctx + wait group captured here).
 func (e *Engine) Listen(ctx context.Context) error {
 	innerCtx, innerCancel := context.WithCancel(ctx)
 	defer innerCancel()
 
 	// Publish the cancel + a done channel so Shutdown can stop and join the
-	// goroutines this Listen owns (eval loop, scaler) before the server closes
+	// goroutine this Listen owns (the eval loop) before the server closes
 	// shared resources such as the CPU monitor.
 	done := make(chan struct{})
 	e.listenMu.Lock()
@@ -172,23 +353,25 @@ func (e *Engine) Listen(ctx context.Context) error {
 
 	var wg sync.WaitGroup
 
-	errCh := make(chan error, 2)
+	// Publish ctx + wg so performSwitch can launch the lazily-built standby's
+	// Listen goroutine under the same lifetime (Shutdown joins it via wg.Wait).
+	e.mu.Lock()
+	e.listenCtx = innerCtx
+	e.listenWG = &wg
+	e.mu.Unlock()
 
-	wg.Go(func() {
-		if err := e.primary.Listen(innerCtx); err != nil {
-			errCh <- fmt.Errorf("primary (epoll): %w", err)
-		}
-	})
+	errCh := make(chan error, 2)
 
+	active := *e.active.Load()
 	wg.Go(func() {
-		if err := e.secondary.Listen(innerCtx); err != nil {
-			errCh <- fmt.Errorf("secondary (io_uring): %w", err)
+		if err := active.Listen(innerCtx); err != nil {
+			errCh <- fmt.Errorf("active (%s): %w", active.Type().String(), err)
 		}
 	})
 
-	// Wait for both engines to bind their addresses.
+	// Wait for the ACTIVE engine to bind its address.
 	// io_uring may need multiple tier fallback attempts, so allow ample time —
-	// but if either sub-engine has already returned an error to errCh
+	// but if the active sub-engine has already returned an error to errCh
 	// (e.g. ENOMEM at io_uring_setup under low RLIMIT_MEMLOCK), surface it
 	// immediately instead of waiting out the deadline.
 	deadline := time.Now().Add(20 * time.Second)
@@ -198,7 +381,7 @@ func (e *Engine) Listen(ctx context.Context) error {
 	defer bindWait.Stop()
 	var startErr error
 bindLoop:
-	for e.primary.Addr() == nil || e.secondary.Addr() == nil {
+	for active.Addr() == nil {
 		select {
 		case startErr = <-errCh:
 			break bindLoop
@@ -213,50 +396,22 @@ bindLoop:
 		wg.Wait()
 		return fmt.Errorf("sub-engine startup failed: %w", startErr)
 	}
-	if e.primary.Addr() == nil || e.secondary.Addr() == nil {
+	if active.Addr() == nil {
 		innerCancel()
 		wg.Wait()
-		return fmt.Errorf("sub-engines failed to initialize within 20s deadline")
-	}
-
-	// Pause standby engine's accept BEFORE publishing Addr so the
-	// SO_REUSEPORT group only contains the active engine by the time
-	// callers start dialing. Without this gate, a burst of incoming
-	// connections in the window between secondary.Listen() succeeding
-	// and PauseAccept taking effect lands some dials on the standby's
-	// accept queue; closing that queue then RSTs them. H1 clients
-	// reconnect transparently but H2 prior-knowledge clients see the
-	// dial fail mid-handshake. Read c.state.activeIsPrimary under
-	// switchMu to avoid racing with performSwitch → recordSwitch (a
-	// concurrent ForceSwitch or controller tick can fire before Listen
-	// finishes its own setup).
-	e.switchMu.Lock()
-	activeIsPrimary := e.ctrl.state.activeIsPrimary
-	e.switchMu.Unlock()
-	if activeIsPrimary {
-		if ac, ok := e.secondary.(engine.AcceptController); ok {
-			if err := ac.PauseAccept(); err != nil {
-				innerCancel()
-				wg.Wait()
-				return fmt.Errorf("pause secondary: %w", err)
-			}
-		}
-	} else {
-		if ac, ok := e.primary.(engine.AcceptController); ok {
-			if err := ac.PauseAccept(); err != nil {
-				innerCancel()
-				wg.Wait()
-				return fmt.Errorf("pause primary: %w", err)
-			}
-		}
+		return fmt.Errorf("active sub-engine failed to initialize within 20s deadline")
 	}
 
-	addr := e.primary.Addr()
+	// No standby to pause: only the active engine is in the SO_REUSEPORT group,
+	// so publishing Addr cannot expose a dial to a phantom standby listener.
+	// (The original pause-standby-before-publish-Addr step guarded that window;
+	// with a lazy standby there is no standby listening here.)
+	addr := active.Addr()
 	e.addr.Store(&addr)
 
 	e.logger.Info("adaptive engine listening",
 		"addr", e.cfg.Addr,
-		"active", (*e.active.Load()).Type().String(),
+		"active", active.Type().String(),
 	)
 
 	// Start evaluation loop.
@@ -264,22 +419,10 @@ bindLoop:
 		e.runEvalLoop(innerCtx)
 	})
 
-	// Start the higher-level dynamic worker scaler. This is the only
-	// worker scaler that runs in an adaptive setup — sub-engines have
-	// their built-in scalers suppressed via Config.SkipBuiltinScaler.
-	// Typed cfg.WorkerScaling takes precedence over env vars. The
-	// algorithm lives in engine/scaler; adaptive provides a switch-aware
-	// Source via adaptive/scaler.go.
-	if scalerCfg := scaler.Resolve(e.cfg, e.cfg.Resources.Resolve().Workers); scalerCfg.Enabled {
-		wg.Go(func() {
-			e.runScaler(innerCtx, scalerCfg)
-		})
-	}
-
 	select {
 	case <-innerCtx.Done():
 		// Parent context cancelled, or Shutdown cancelled innerCtx directly to
-		// stop and join the eval/scaler goroutines.
+		// stop and join the eval-loop goroutine.
 	case err := <-errCh:
 		innerCancel()
 		wg.Wait()
@@ -310,6 +453,47 @@ func (e *Engine) runEvalLoop(ctx context.Context) {
 	}
 }
 
+// buildAndStartStandby constructs the lazy standby sub-engine, launches its
+// Listen goroutine under the same ctx + wait group Listen captured (so Shutdown
+// joins it), and waits — bounded — for it to bind the shared SO_REUSEPORT port.
+// The caller holds e.mu. wantType is purely for error messages. On any failure
+// it returns an error and the engine state is left untouched (no slot stored),
+// so the current active keeps serving.
+func (e *Engine) buildAndStartStandby(wantType engine.EngineType) (engine.Engine, error) {
+	if e.buildStandby == nil {
+		return nil, fmt.Errorf("no standby builder for %s", wantType.String())
+	}
+	if e.listenCtx == nil || e.listenWG == nil {
+		return nil, fmt.Errorf("cannot build standby before Listen has started")
+	}
+
+	built, err := e.buildStandby()
+	if err != nil {
+		return nil, fmt.Errorf("build %s standby: %w", wantType.String(), err)
+	}
+
+	ctx := e.listenCtx
+	wg := e.listenWG
+	wg.Go(func() {
+		if lerr := built.Listen(ctx); lerr != nil {
+			e.logger.Warn("lazy standby Listen returned error",
+				"standby", built.Type().String(), "error", lerr)
+		}
+	})
+
+	// Wait (bounded) for the standby to bind the shared port — once Addr() is
+	// non-nil it has joined the SO_REUSEPORT group and is accepting, so the
+	// resume-before-pause overlap is real and connections are never dropped.
+	deadline := time.Now().Add(5 * time.Second)
+	for built.Addr() == nil {
+		if time.Now().After(deadline) || ctx.Err() != nil {
+			return nil, fmt.Errorf("%s standby failed to bind within 5s", wantType.String())
+		}
+		time.Sleep(5 * time.Millisecond)
+	}
+	return built, nil
+}
+
 func (e *Engine) performSwitch() {
 	e.mu.Lock()
 	defer e.mu.Unlock()
@@ -339,21 +523,92 @@ func (e *Engine) performSwitch() {
 		e.freezeState.Unlock()
 		return
 	}
+	// Release freezeState across the (possibly slow) lazy standby build +
+	// Listen + bind-wait below; re-acquired before the active.Store commit.
+	// Holding it across a multi-second build would block driver
+	// register/unregister flows (same reasoning as the PauseAccept release
+	// at the end of this function). e.mu (held for the whole function)
+	// already serialises performSwitch against itself, so no other switch
+	// can race the build.
+	e.freezeState.Unlock()
 
 	now := time.Now()
 
+	// Determine the direction. activeIsPrimary toggles on recordSwitch, so it
+	// always reflects the engine we are switching AWAY from.
 	e.switchMu.Lock()
+	switchingFromPrimary := e.ctrl.state.activeIsPrimary
+	e.switchMu.Unlock()
+
+	// Resolve the standby slot for this direction. On the lazy New() path the
+	// target slot may be nil and must be built + Listen'd now (it binds the
+	// shared SO_REUSEPORT port and joins the accept pool). On the
+	// newFromEngines (tests) path both slots are pre-populated and buildStandby
+	// is nil, so the build is skipped.
+	freshlyBuilt := false
+	if switchingFromPrimary {
+		// primary (active) → secondary (standby).
+		if e.secondary == nil {
+			built, err := e.buildAndStartStandby(engine.IOUring)
+			if err != nil {
+				e.logger.Warn("aborting switch: lazy standby build failed; staying on current active",
+					"standby", engine.IOUring.String(), "error", err)
+				return
+			}
+			// Publish the built engine to both the Engine and controller
+			// slots under switchMu so a concurrent evaluate (ForceSwitch
+			// racing the eval loop) never reads a torn controller slot.
+			e.switchMu.Lock()
+			e.secondary = built
+			e.ctrl.secondary = built
+			e.switchMu.Unlock()
+			freshlyBuilt = true
+		}
+	} else {
+		// secondary (active) → primary (standby).
+		if e.primary == nil {
+			built, err := e.buildAndStartStandby(engine.Epoll)
+			if err != nil {
+				e.logger.Warn("aborting switch: lazy standby build failed; staying on current active",
+					"standby", engine.Epoll.String(), "error", err)
+				return
+			}
+			e.switchMu.Lock()
+			e.primary = built
+			e.ctrl.primary = built
+			e.switchMu.Unlock()
+			freshlyBuilt = true
+		}
+	}
+
 	var newActive, newStandby engine.Engine
-	if e.ctrl.state.activeIsPrimary {
-		// Switching: primary → secondary.
+	if switchingFromPrimary {
 		newActive = e.secondary
 		newStandby = e.primary
 	} else {
-		// Switching: secondary → primary.
 		newActive = e.primary
 		newStandby = e.secondary
 	}
-	e.switchMu.Unlock()
+
+	// Re-acquire freezeState for the commit and RE-CHECK driverFDs: a driver
+	// may have registered during the build window above. If so, abort — but the
+	// freshly-built standby stays cached for the next attempt. Pause its accept
+	// first so it does not sit in the SO_REUSEPORT pool alongside the (still
+	// active) old engine; the next switch ResumeAccepts it.
+	e.freezeState.Lock()
+	if e.driverFDs.Load() > 0 {
+		e.switchRejected.Add(1)
+		e.logger.Warn("refusing engine switch: driver FDs registered during standby build",
+			"driver_fds", e.driverFDs.Load(),
+		)
+		e.freezeState.Unlock()
+		if freshlyBuilt {
+			if ac, ok := newActive.(engine.AcceptController); ok {
+				_ = ac.PauseAccept()
+			}
+		}
+		return
+	}
 
 	// Resume new active BEFORE pausing old — this creates a brief overlap
 	// where both engines listen (via SO_REUSEPORT), which is correct. The
@@ -423,8 +678,8 @@ func (e *Engine) maybeThawLocked() {
 
 // Shutdown gracefully shuts down both sub-engines.
 //
-// It first cancels and JOINS the goroutines started by Listen (the evaluation
-// loop and worker scaler), so that no controller tick can still be sampling
+// It first cancels and JOINS the goroutine started by Listen (the evaluation
+// loop), so that no controller tick can still be sampling
 // telemetry — including the CPU monitor the server closes immediately after
 // Shutdown returns — by the time this function completes. Only then are the
 // sub-engines shut down. This is purely a join/sequencing concern; it does not
@@ -447,16 +702,40 @@ func (e *Engine) Shutdown(ctx context.Context) error {
 		}
 	}
 
-	return errors.Join(
-		e.primary.Shutdown(ctx),
-		e.secondary.Shutdown(ctx),
-	)
+	// Only shut down engines that exist. On the lazy New() path the standby
+	// slot is nil if no switch ever built it; cancelling listenCtx (above)
+	// already unwound the active engine's Listen goroutine and any lazily
+	// started standby Listen goroutine (both share that ctx + wait group).
+	e.mu.Lock()
+	primary := e.primary
+	secondary := e.secondary
+	e.mu.Unlock()
+
+	var errs []error
+	if primary != nil {
+		errs = append(errs, primary.Shutdown(ctx))
+	}
+	if secondary != nil {
+		errs = append(errs, secondary.Shutdown(ctx))
+	}
+	return errors.Join(errs...)
 }
 
-// Metrics aggregates metrics from both sub-engines.
+// Metrics aggregates metrics from whichever sub-engines exist. On the lazy
+// New() path a never-built standby is nil and contributes nothing.
 func (e *Engine) Metrics() engine.EngineMetrics {
-	pm := e.primary.Metrics()
-	sm := e.secondary.Metrics()
+	e.mu.Lock()
+	primary := e.primary
+	secondary := e.secondary
+	e.mu.Unlock()
+
+	var pm, sm engine.EngineMetrics
+	if primary != nil {
+		pm = primary.Metrics()
+	}
+	if secondary != nil {
+		sm = secondary.Metrics()
+	}
 	// Both sub-engines were built from the same handler + cfg, so their
 	// AsyncRoutes counts are identical; take one (not the sum) for the
 	// adaptive view. AsyncPromotedConns IS additive — promotions on the
diff --git a/adaptive/engine_test.go b/adaptive/engine_test.go
index 8bd0df0f..cf277ba0 100644
--- a/adaptive/engine_test.go
+++ b/adaptive/engine_test.go
@@ -4,6 +4,7 @@ package adaptive
 
 import (
 	"context"
+	"errors"
 	"net"
 	"sync"
 	"sync/atomic"
@@ -99,92 +100,168 @@ func TestInitialBias(t *testing.T) {
 	}
 }
 
-func TestSwitchTrigger(t *testing.T) {
+// newAdaptiveStartingOnEpoll builds an adaptive engine whose active engine is
+// epoll (primary), the production starting configuration. The synthetic
+// sampler returned lets the test drive conns-per-worker for the active engine.
+func newAdaptiveStartingOnEpoll(t *testing.T) (*Engine, *syntheticSampler) {
+	t.Helper()
+	primary := newMockEngine(engine.Epoll)     // active
+	secondary := newMockEngine(engine.IOUring) // standby
+	sampler := newSyntheticSampler()
+	e := newFromEngines(primary, secondary, sampler, resource.Config{Protocol: engine.HTTP1})
+	e.ctrl.cooldown = 0
+	return e, sampler
+}
+
+// newAdaptiveStartingOnIOUring builds an adaptive engine whose active engine is
+// io_uring (primary), for exercising the revert-to-epoll path.
+func newAdaptiveStartingOnIOUring(t *testing.T) (*Engine, *syntheticSampler) {
+	t.Helper()
 	primary := newMockEngine(engine.IOUring)
 	secondary := newMockEngine(engine.Epoll)
 	sampler := newSyntheticSampler()
-
-	cfg := resource.Config{Protocol: engine.HTTP1}
-	e := newFromEngines(primary, secondary, sampler, cfg)
-	e.ctrl.evalInterval = 1 * time.Millisecond
+	e := newFromEngines(primary, secondary, sampler, resource.Config{Protocol: engine.HTTP1})
 	e.ctrl.cooldown = 0
+	return e, sampler
+}
 
-	// Make active (io_uring) score 100.
-	sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: 100, ErrorRate: 0.01})
+// TestSwitchTrigger: sustained high conns/worker on the active epoll engine
+// triggers an epoll→io_uring switch after sustainTicks ticks.
+func TestSwitchTrigger(t *testing.T) {
+	e, sampler := newAdaptiveStartingOnEpoll(t)
+	sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 24})
 
-	// Pre-seed standby historical score (epoll was previously active with score 150).
-	e.ctrl.state.lastActiveScore[engine.Epoll] = 150
-	e.ctrl.state.lastActiveTime[engine.Epoll] = time.Now()
+	if e.ActiveEngine().Type() != engine.Epoll {
+		t.Fatal("expected epoll initially")
+	}
+
+	now := time.Now()
+	// First tick arms the sustain counter but must not switch yet.
+	if e.ctrl.evaluate(now, false) {
+		t.Fatal("switch must require sustainTicks consecutive ticks, not one")
+	}
+	// Second tick crosses sustainTicks → switch.
+	if !e.ctrl.evaluate(now.Add(time.Second), false) {
+		t.Fatal("expected switch after sustained high conns/worker")
+	}
 
+	e.performSwitch()
 	if e.ActiveEngine().Type() != engine.IOUring {
-		t.Fatal("expected io_uring initially")
+		t.Errorf("expected io_uring after switch, got %v", e.ActiveEngine().Type())
+	}
+}
+
+// TestSwitchFastPath: conns/worker above the high watermark snaps to io_uring
+// on a single tick (no sustain wait).
+func TestSwitchFastPath(t *testing.T) {
+	e, sampler := newAdaptiveStartingOnEpoll(t)
+	sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50})
+
+	if !e.ctrl.evaluate(time.Now(), false) {
+		t.Fatal("expected heavy-load fast-path snap on a single tick")
 	}
+}
+
+// TestNoSwitchInHysteresisBand: conns/worker between the down and up
+// thresholds (12–20) must not switch in either direction.
+func TestNoSwitchInHysteresisBand(t *testing.T) {
+	e, sampler := newAdaptiveStartingOnEpoll(t)
+	sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 16})
 
-	// Run evaluation — should trigger switch (150 > 100*1.15).
 	now := time.Now()
-	switched := e.ctrl.evaluate(now, false)
-	if !switched {
-		t.Fatal("expected switch to be recommended")
+	for i := range 5 {
+		if e.ctrl.evaluate(now.Add(time.Duration(i)*time.Second), false) {
+			t.Fatal("must not switch up inside the 12–20 hysteresis band")
+		}
 	}
+}
 
-	e.performSwitch()
+// TestLargePayloadSuppressesSwitch: even far above the high watermark, a
+// large average payload (link-bound) must suppress the io_uring switch.
+func TestLargePayloadSuppressesSwitch(t *testing.T) {
+	e, sampler := newAdaptiveStartingOnEpoll(t)
+	sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 64, BytesPerReq: 32768})
 
-	if e.ActiveEngine().Type() != engine.Epoll {
-		t.Errorf("expected epoll after switch, got %v", e.ActiveEngine().Type())
+	now := time.Now()
+	for i := range 5 {
+		if e.ctrl.evaluate(now.Add(time.Duration(i)*time.Second), false) {
+			t.Fatal("large payload (link-bound) must suppress io_uring switch")
+		}
 	}
 }
 
-func TestHysteresis(t *testing.T) {
-	primary := newMockEngine(engine.IOUring)
-	secondary := newMockEngine(engine.Epoll)
-	sampler := newSyntheticSampler()
+// TestRevertOnLowLoad: io_uring active, conns/worker drops below the down
+// threshold for sustainTicks → revert to epoll.
+func TestRevertOnLowLoad(t *testing.T) {
+	e, sampler := newAdaptiveStartingOnIOUring(t)
+	sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 4})
+
+	now := time.Now()
+	if e.ctrl.evaluate(now, false) {
+		t.Fatal("revert must require sustainTicks below the down threshold")
+	}
+	if !e.ctrl.evaluate(now.Add(time.Second), false) {
+		t.Fatal("expected revert to epoll after sustained low load")
+	}
+}
 
-	cfg := resource.Config{Protocol: engine.HTTP1}
-	e := newFromEngines(primary, secondary, sampler, cfg)
-	e.ctrl.cooldown = 1 * time.Hour // Very long cooldown to test blocking.
+// TestErrorRevert: an error rate above errorRevertRate forces a revert from
+// io_uring to epoll regardless of load (even at high conns/worker).
+func TestErrorRevert(t *testing.T) {
+	e, sampler := newAdaptiveStartingOnIOUring(t)
+	sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 40, ErrorRate: 0.10})
+
+	if !e.ctrl.evaluate(time.Now(), false) {
+		t.Fatal("expected error-rate safety revert from io_uring")
+	}
+}
+
+// TestNoRevertWhenIOUringBusy: io_uring active with conns/worker above the
+// down threshold and no errors must stay on io_uring.
+func TestNoRevertWhenIOUringBusy(t *testing.T) {
+	e, sampler := newAdaptiveStartingOnIOUring(t)
+	sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 24})
 
-	// Pre-seed standby historical score so initial switch triggers.
 	now := time.Now()
-	e.ctrl.state.lastActiveScore[engine.Epoll] = 200
-	e.ctrl.state.lastActiveTime[engine.Epoll] = now
+	for i := range 5 {
+		if e.ctrl.evaluate(now.Add(time.Duration(i)*time.Second), false) {
+			t.Fatal("must not revert while io_uring is still busy")
+		}
+	}
+}
 
-	// Trigger initial switch.
-	sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: 100})
+// TestCooldownGate: after a switch, a revert is blocked until the cooldown
+// window elapses, even when the revert condition holds.
+func TestCooldownGate(t *testing.T) {
+	e, sampler := newAdaptiveStartingOnEpoll(t)
+	e.ctrl.cooldown = 1 * time.Hour
 
+	now := time.Now()
+	// Fast-path switch epoll→io_uring.
+	sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50})
 	if !e.ctrl.evaluate(now, false) {
-		t.Fatal("expected initial switch")
+		t.Fatal("expected initial fast-path switch")
 	}
 	e.performSwitch()
 
-	// Immediately try to switch back — should be blocked by cooldown.
-	sampler.Set(engine.Epoll, TelemetrySnapshot{ThroughputRPS: 100})
-
+	// Now io_uring active and idle — but cooldown blocks the revert.
+	sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 0})
 	if e.ctrl.evaluate(now.Add(1*time.Second), false) {
-		t.Error("switch should be blocked by cooldown")
+		t.Error("revert should be blocked by cooldown")
 	}
 }
 
 func TestConnectionDraining(t *testing.T) {
-	primary := newMockEngine(engine.IOUring)
-	secondary := newMockEngine(engine.Epoll)
-	sampler := newSyntheticSampler()
+	e, sampler := newAdaptiveStartingOnEpoll(t)
+	primary := e.primary.(*mockEngine)
+	secondary := e.secondary.(*mockEngine)
 
-	cfg := resource.Config{Protocol: engine.HTTP1}
-	e := newFromEngines(primary, secondary, sampler, cfg)
-	e.ctrl.cooldown = 0
-
-	// Initial state: primary active, secondary should be paused.
-	// Simulate the initial pause that Listen() would do.
+	// Simulate the initial pause that Listen() would do on the standby.
 	_ = secondary.PauseAccept()
 	secondary.pauseCalls.Store(0) // reset counter
 
-	// Pre-seed standby historical score.
 	now := time.Now()
-	e.ctrl.state.lastActiveScore[engine.Epoll] = 200
-	e.ctrl.state.lastActiveTime[engine.Epoll] = now
-
-	// Trigger switch.
-	sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: 100})
+	sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50}) // fast-path
 
 	if !e.ctrl.evaluate(now, false) {
 		t.Fatal("expected switch")
@@ -202,28 +279,19 @@ func TestConnectionDraining(t *testing.T) {
 }
 
 func TestOscillationLock(t *testing.T) {
-	primary := newMockEngine(engine.IOUring)
-	secondary := newMockEngine(engine.Epoll)
-	sampler := newSyntheticSampler()
-
-	cfg := resource.Config{Protocol: engine.HTTP1}
-	e := newFromEngines(primary, secondary, sampler, cfg)
-	e.ctrl.cooldown = 0
+	e, sampler := newAdaptiveStartingOnEpoll(t)
 
 	now := time.Now()
 
-	// Perform 3 switches rapidly.
+	// Perform 3 switches rapidly via the fast path; each iteration sets a
+	// load that triggers the active engine's switch direction.
 	for range 3 {
-		activeType := e.ActiveEngine().Type()
-		sampler.Set(activeType, TelemetrySnapshot{ThroughputRPS: 50})
-		other := engine.Epoll
-		if activeType == engine.Epoll {
-			other = engine.IOUring
+		switch e.ActiveEngine().Type() {
+		case engine.Epoll:
+			sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50}) // switch up
+		case engine.IOUring:
+			sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 40, ErrorRate: 0.5}) // error revert
 		}
-		// Pre-seed standby historical score for each iteration.
-		e.ctrl.state.lastActiveScore[other] = 200
-		e.ctrl.state.lastActiveTime[other] = now
-
 		if !e.ctrl.evaluate(now, false) {
 			t.Fatal("expected switch before lock")
 		}
@@ -231,16 +299,13 @@ func TestOscillationLock(t *testing.T) {
 		now = now.Add(10 * time.Second)
 	}
 
-	// Fourth switch should be locked.
-	activeType := e.ActiveEngine().Type()
-	sampler.Set(activeType, TelemetrySnapshot{ThroughputRPS: 50})
-	other := engine.Epoll
-	if activeType == engine.Epoll {
-		other = engine.IOUring
+	// Fourth switch should be locked (3 switches within 5 minutes).
+	switch e.ActiveEngine().Type() {
+	case engine.Epoll:
+		sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50})
+	case engine.IOUring:
+		sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 40, ErrorRate: 0.5})
 	}
-	e.ctrl.state.lastActiveScore[other] = 200
-	e.ctrl.state.lastActiveTime[other] = now
-
 	if e.ctrl.evaluate(now, false) {
 		t.Error("expected oscillation lock to prevent switch")
 	}
@@ -250,123 +315,164 @@ func TestOscillationLock(t *testing.T) {
 }
 
 func TestOverloadFreeze(t *testing.T) {
-	primary := newMockEngine(engine.IOUring)
-	secondary := newMockEngine(engine.Epoll)
-	sampler := newSyntheticSampler()
-
-	cfg := resource.Config{Protocol: engine.HTTP1}
-	e := newFromEngines(primary, secondary, sampler, cfg)
-	e.ctrl.cooldown = 0
+	e, sampler := newAdaptiveStartingOnEpoll(t)
 
 	now := time.Now()
-	sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: 50})
-
-	// Pre-seed standby historical score.
-	e.ctrl.state.lastActiveScore[engine.Epoll] = 200
-	e.ctrl.state.lastActiveTime[engine.Epoll] = now
+	sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50}) // fast-path load
 
 	// Freeze switching.
 	e.FreezeSwitching()
-
 	if e.ctrl.evaluate(now, e.frozen.Load()) {
 		t.Error("expected freeze to block evaluation")
 	}
 
 	// Unfreeze.
 	e.UnfreezeSwitching()
-
 	if !e.ctrl.evaluate(now, e.frozen.Load()) {
 		t.Error("expected evaluation to proceed after unfreeze")
 	}
 }
 
-func TestHistoricalScoreDecay(t *testing.T) {
-	primary := newMockEngine(engine.IOUring)
-	secondary := newMockEngine(engine.Epoll)
-	sampler := newSyntheticSampler()
-	cfg := resource.Config{Protocol: engine.HTTP1}
-	e := newFromEngines(primary, secondary, sampler, cfg)
+// TestSwitchTriggersFreezeSuppression is deferred until SetFreezeSuppressor
+// is implemented on *Engine (post-v1.0.0).
 
-	base := time.Now()
-	e.ctrl.state.lastActiveScore[engine.Epoll] = 100.0
-	e.ctrl.state.lastActiveTime[engine.Epoll] = base
+// --- Part A: feature-gated start-engine selection ------------------------
 
-	// At t=0, score should be 100.
-	got := e.ctrl.historicalScore(engine.Epoll, base)
-	if got != 100.0 {
-		t.Errorf("at t=0: score = %f, want 100.0", got)
-	}
+// chooseStartEngine + env-override + ioUringViable coverage now lives in
+// start_test.go (the policy was redesigned: default epoll, io_uring only on an
+// explicit high-concurrency hint with a viable kernel + memlock + non-h2c).
 
-	// At t=50s, score should be 50 (50% decay).
-	got = e.ctrl.historicalScore(engine.Epoll, base.Add(50*time.Second))
-	if got != 50.0 {
-		t.Errorf("at t=50s: score = %f, want 50.0", got)
-	}
+// --- Part B: lazy standby build on first switch + reuse ------------------
 
-	// At t=100s, score should be 0 (100% decay).
-	got = e.ctrl.historicalScore(engine.Epoll, base.Add(100*time.Second))
-	if got != 0.0 {
-		t.Errorf("at t=100s: score = %f, want 0.0", got)
-	}
+// newLazyAdaptive builds an adaptive engine in the LAZY shape: epoll is the
+// eager active (primary), io_uring is nil and built on demand by a counting
+// buildStandby. listenCtx/listenWG are populated as Listen would, so
+// performSwitch can launch + bind-wait the lazily-built standby. The returned
+// counter pointer records how many times buildStandby ran.
+func newLazyAdaptive(t *testing.T) (*Engine, *syntheticSampler, *mockEngine, *int) {
+	t.Helper()
+	active := newMockEngine(engine.Epoll)
+	sampler := newSyntheticSampler()
 
-	// At t=200s, score should still be 0 (clamped).
-	got = e.ctrl.historicalScore(engine.Epoll, base.Add(200*time.Second))
-	if got != 0.0 {
-		t.Errorf("at t=200s: score = %f, want 0.0", got)
+	var builtCount int
+	lazyStandby := newMockEngine(engine.IOUring)
+	e := &Engine{
+		primary:   active, // epoll active
+		secondary: nil,    // io_uring lazy standby
+		cfg:       resource.Config{Protocol: engine.HTTP1},
+		logger:    testLogger(),
+		startType: engine.Epoll,
 	}
+	e.buildStandby = func() (engine.Engine, error) {
+		builtCount++
+		return lazyStandby, nil
+	}
+	e.ctrl = newController(e.primary, e.secondary, sampler, e.logger)
+	e.ctrl.state.activeIsPrimary = true
+	e.ctrl.cooldown = 0
+	var ap engine.Engine = active
+	e.active.Store(&ap)
+
+	// Wire the Listen-owned ctx + wait group so performSwitch can start the
+	// lazily-built standby's Listen goroutine.
+	ctx, cancel := context.WithCancel(t.Context())
+	t.Cleanup(cancel)
+	var wg sync.WaitGroup
+	// Hold the wait group above zero for the test's lifetime (mirrors the
+	// eval-loop goroutine being a wg member in production), so wg.Go from
+	// performSwitch is always called while the counter is positive.
+	wg.Add(1)
+	t.Cleanup(func() { wg.Done() })
+	e.listenCtx = ctx
+	e.listenWG = &wg
+
+	return e, sampler, lazyStandby, &builtCount
 }
 
-func TestHistoricalScoreSeeding(t *testing.T) {
-	primary := newMockEngine(engine.IOUring)
-	secondary := newMockEngine(engine.Epoll)
-	sampler := newSyntheticSampler()
-
-	cfg := resource.Config{Protocol: engine.HTTP1}
-	e := newFromEngines(primary, secondary, sampler, cfg)
-	e.ctrl.cooldown = 0
+// TestLazyStandbyBuiltOnFirstSwitchAndReused verifies the lazy New() path
+// builds the standby exactly once (on the first switch) and reuses it on a
+// subsequent switch back.
+func TestLazyStandbyBuiltOnFirstSwitchAndReused(t *testing.T) {
+	e, _, lazyStandby, builtCount := newLazyAdaptive(t)
 
-	// Active (io_uring) has score 100. No standby history yet.
-	sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: 100})
+	if e.secondary != nil {
+		t.Fatal("standby must be nil before the first switch")
+	}
+	if *builtCount != 0 {
+		t.Fatalf("buildStandby ran %d times before any switch, want 0", *builtCount)
+	}
 
-	now := time.Now()
-	e.ctrl.evaluate(now, false)
+	// First switch: epoll -> io_uring. Must build + cache + activate the standby.
+	e.performSwitch()
 
-	// Standby should be seeded at 80% of active.
-	standbyScore, ok := e.ctrl.state.lastActiveScore[engine.Epoll]
-	if !ok {
-		t.Fatal("expected standby score to be seeded")
+	if *builtCount != 1 {
+		t.Fatalf("buildStandby ran %d times after first switch, want 1", *builtCount)
 	}
-
-	// Active score = 1.0*100 - 2.0*0 = 100. 80% = 80.
-	if standbyScore != 80.0 {
-		t.Errorf("standby seed score = %f, want 80.0", standbyScore)
+	if e.secondary == nil {
+		t.Fatal("standby must be cached after the first switch")
+	}
+	if e.ActiveEngine().Type() != engine.IOUring {
+		t.Fatalf("active = %v after first switch, want io_uring", e.ActiveEngine().Type())
+	}
+	// The lazily-built standby must have been Listen'd (its goroutine ran).
+	select {
+	case <-lazyStandby.listenStarted:
+	case <-time.After(2 * time.Second):
+		t.Fatal("lazy standby Listen was never started")
+	}
+	// New active (io_uring) must have been resumed; old active (epoll) paused.
+	if lazyStandby.resumeCalls.Load() == 0 {
+		t.Error("expected ResumeAccept on the freshly-built (now active) standby")
+	}
+	if e.primary.(*mockEngine).pauseCalls.Load() == 0 {
+		t.Error("expected PauseAccept on the old active (epoll)")
 	}
-}
 
-func TestSwitchAfterActiveDegrades(t *testing.T) {
-	primary := newMockEngine(engine.IOUring)
-	secondary := newMockEngine(engine.Epoll)
-	sampler := newSyntheticSampler()
+	// Second switch: io_uring -> epoll. The standby slot for THIS direction is
+	// epoll (e.primary, already non-nil), so no new build happens. The io_uring
+	// engine stays cached too.
+	e.performSwitch()
 
-	cfg := resource.Config{Protocol: engine.HTTP1}
-	e := newFromEngines(primary, secondary, sampler, cfg)
-	e.ctrl.cooldown = 0
+	if *builtCount != 1 {
+		t.Fatalf("buildStandby ran %d times after second switch, want 1 (reuse)", *builtCount)
+	}
+	if e.ActiveEngine().Type() != engine.Epoll {
+		t.Fatalf("active = %v after second switch, want epoll", e.ActiveEngine().Type())
+	}
+	if e.secondary != lazyStandby {
+		t.Error("io_uring standby must remain cached (same instance) after switching back")
+	}
 
-	now := time.Now()
+	// Third switch: epoll -> io_uring again. Reuses the cached io_uring standby.
+	e.performSwitch()
+	if *builtCount != 1 {
+		t.Fatalf("buildStandby ran %d times after third switch, want 1 (reuse)", *builtCount)
+	}
+	if e.ActiveEngine().Type() != engine.IOUring {
+		t.Fatalf("active = %v after third switch, want io_uring", e.ActiveEngine().Type())
+	}
+}
 
-	// Pre-seed standby (epoll) with a strong historical score.
-	e.ctrl.state.lastActiveScore[engine.Epoll] = 100.0
-	e.ctrl.state.lastActiveTime[engine.Epoll] = now
+// TestLazyStandbyBuildFailureAbortsSwitch verifies a buildStandby error aborts
+// the switch cleanly: the active engine is unchanged and the standby slot stays
+// nil.
+func TestLazyStandbyBuildFailureAbortsSwitch(t *testing.T) {
+	e, _, _, _ := newLazyAdaptive(t)
+	e.buildStandby = func() (engine.Engine, error) {
+		return nil, errBuildFailed
+	}
 
-	// Active (io_uring) degrades to 50 RPS.
-	sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: 50})
+	e.performSwitch()
 
-	// Standby historical = 100 * (1 - 0.01*0) = 100.
-	// Active score = 50. 100 > 50*1.15 = 57.5 → switch.
-	if !e.ctrl.evaluate(now, false) {
-		t.Error("expected switch when active degrades below standby historical")
+	if e.ActiveEngine().Type() != engine.Epoll {
+		t.Fatalf("active changed to %v after a failed build; must stay epoll", e.ActiveEngine().Type())
+	}
+	if e.secondary != nil {
+		t.Fatal("standby slot must stay nil after a failed build")
+	}
+	if e.ctrl.state.activeIsPrimary != true {
+		t.Fatal("controller direction must be unchanged after a failed build")
 	}
 }
 
-// TestSwitchTriggersFreezeSuppression is deferred until SetFreezeSuppressor
-// is implemented on *Engine (post-v1.0.0).
+var errBuildFailed = errors.New("build failed")
diff --git a/adaptive/scaler_h2_dial_race_test.go b/adaptive/pauseaccept_h2_dial_race_test.go
similarity index 72%
rename from adaptive/scaler_h2_dial_race_test.go
rename to adaptive/pauseaccept_h2_dial_race_test.go
index 5dfaafaa..28e0ecb0 100644
--- a/adaptive/scaler_h2_dial_race_test.go
+++ b/adaptive/pauseaccept_h2_dial_race_test.go
@@ -16,23 +16,27 @@ import (
 	"github.com/goceleris/celeris/resource"
 )
 
-// TestAdaptiveScaler_H2DialNoRSTRace mirrors TestAdaptiveH2DialNoRSTRace
-// with the dynamic worker scaler enabled. Locks in that the scaler's
-// start-high default does not pause workers before the listen FDs
-// settle in the SO_REUSEPORT group — which would RST in-flight H2
-// prior-knowledge handshakes mid-flush.
+// TestAdaptivePauseAccept_H2DialNoRSTRace is a second angle on the
+// H2-dial-RST race that TestAdaptiveH2DialNoRSTRace guards, run with a
+// larger (4) worker pool so the standby engine has more listen FDs to
+// evict from the SO_REUSEPORT group before Addr() is published. The
+// race is the same: if PauseAccept on the standby has not synchronously
+// removed every listen FD from the routing pool by the time adaptive
+// exposes Addr, a burst of dials gets split across active and standby,
+// and the standby's FD close RSTs the conns that landed on it — fatal
+// for H2 prior-knowledge handshakes mid-flush.
 //
 // Iterations bound at 3: the race only fires on engine spin-up, so we
-// just need "scaler + H2 dial burst" coverage in addition to the
-// no-scaler test's larger budget.
-func TestAdaptiveScaler_H2DialNoRSTRace(t *testing.T) {
+// just need an additional burst against a wider pool in addition to the
+// main test's larger budget.
+func TestAdaptivePauseAccept_H2DialNoRSTRace(t *testing.T) {
 	const iterations = 3
 	for i := 0; i < iterations; i++ {
-		runScalerH2Once(t, i)
+		runPauseAcceptH2Once(t, i)
 	}
 }
 
-func runScalerH2Once(t *testing.T, iter int) {
+func runPauseAcceptH2Once(t *testing.T, iter int) {
 	t.Helper()
 	ln, err := net.Listen("tcp", "127.0.0.1:0")
 	if err != nil {
@@ -47,9 +51,8 @@ func runScalerH2Once(t *testing.T, iter int) {
 		Protocol:        engine.Auto,
 		EnableH2Upgrade: true,
 		Resources: resource.Resources{
-			Workers: 4, // 4 workers so MinActive=2 has 2 paused at start
+			Workers: 4,
 		},
-		WorkerScaling: &resource.WorkerScalingConfig{}, // zero value → start-high
 	}
 	e, err := New(cfg, &h2PrefaceHandler{}, nil)
 	if err != nil {
diff --git a/adaptive/scaler.go b/adaptive/scaler.go
deleted file mode 100644
index b136d923..00000000
--- a/adaptive/scaler.go
+++ /dev/null
@@ -1,114 +0,0 @@
-//go:build linux
-
-package adaptive
-
-import (
-	"context"
-	"log/slog"
-	"sync/atomic"
-
-	"github.com/goceleris/celeris/engine"
-	"github.com/goceleris/celeris/engine/scaler"
-)
-
-// adaptiveScalerSource adapts the adaptive Engine to the scaler.Source
-// interface. The active sub-engine can change at runtime
-// (performSwitch); this source watches for that and routes
-// PauseWorker / ResumeWorker calls to whichever engine is active right
-// now. Generation increments on each switch so the shared scaler loop
-// can re-baseline the new active engine's worker pause state.
-//
-// Both sub-engines must implement engine.WorkerScaler — verified via a
-// type-assertion in the constructor. Both must have the same NumWorkers;
-// if they ever diverge (defensive), the smaller pool is used as the cap.
-type adaptiveScalerSource struct {
-	e       *Engine
-	primary engine.WorkerScaler
-	standby engine.WorkerScaler
-	gen     atomic.Uint64
-
-	// lastActive tracks which sub-engine was active on the previous
-	// scaler-source method call. When activeFor() observes a different
-	// engine, gen is incremented so the scaler.Run loop notices and
-	// re-baselines.
-	lastActive atomic.Pointer[engine.Engine]
-}
-
-// newAdaptiveScalerSource attempts to construct a Source. Returns nil if
-// either sub-engine fails the engine.WorkerScaler assertion (defensive
-// — should not happen in production since both iouring and epoll
-// implement it).
-func newAdaptiveScalerSource(e *Engine) *adaptiveScalerSource {
-	primary, ok1 := e.primary.(engine.WorkerScaler)
-	standby, ok2 := e.secondary.(engine.WorkerScaler)
-	if !ok1 || !ok2 {
-		return nil
-	}
-	return &adaptiveScalerSource{e: e, primary: primary, standby: standby}
-}
-
-// activeFor returns the WorkerScaler view of the engine that is
-// currently active in adaptive. Bumps Generation if the active engine
-// changed since the previous call, so scaler.Run re-baselines.
-func (s *adaptiveScalerSource) activeFor() engine.WorkerScaler {
-	cur := *s.e.active.Load()
-	prev := s.lastActive.Load()
-	if prev == nil || *prev != cur {
-		s.lastActive.Store(&cur)
-		s.gen.Add(1)
-	}
-	if cur == s.e.secondary {
-		return s.standby
-	}
-	return s.primary
-}
-
-func (s *adaptiveScalerSource) NumWorkers() int {
-	a, b := s.primary.NumWorkers(), s.standby.NumWorkers()
-	if a < b {
-		return a
-	}
-	return b
-}
-
-func (s *adaptiveScalerSource) ActiveConns() int64 {
-	// Sum BOTH sub-engines' active conns, not just the currently-active
-	// one. During an adaptive switch, in-flight conns on the OLD
-	// sub-engine continue to be served until their clients close — the
-	// scaler must count them so the desired-worker calculation doesn't
-	// undershoot the real CPU load (#300 G1).
-	return s.e.primary.Metrics().ActiveConnections +
-		s.e.secondary.Metrics().ActiveConnections
-}
-
-func (s *adaptiveScalerSource) PauseWorker(i int) {
-	s.activeFor().PauseWorker(i)
-}
-
-func (s *adaptiveScalerSource) ResumeWorker(i int) {
-	s.activeFor().ResumeWorker(i)
-}
-
-func (s *adaptiveScalerSource) Generation() uint64 {
-	// Refresh: a tick reading Generation should also drive the
-	// active-engine detection so we don't miss a switch that happens
-	// between two ticks where neither Pause/Resume nor ActiveConns ran.
-	s.activeFor()
-	return s.gen.Load()
-}
-
-func (s *adaptiveScalerSource) Logger() *slog.Logger { return s.e.logger }
-
-// runScaler is started by Engine.Listen when the scaler is enabled. The
-// algorithm itself lives in engine/scaler; this just wires up the
-// adaptive-flavoured source.
-func (e *Engine) runScaler(ctx context.Context, cfg scaler.Config) {
-	src := newAdaptiveScalerSource(e)
-	if src == nil {
-		if e.logger != nil {
-			e.logger.Warn("adaptive scaler disabled: sub-engine does not implement WorkerScaler")
-		}
-		return
-	}
-	scaler.Run(ctx, src, cfg)
-}
diff --git a/adaptive/scaler_test.go b/adaptive/scaler_test.go
deleted file mode 100644
index 5d5bec6b..00000000
--- a/adaptive/scaler_test.go
+++ /dev/null
@@ -1,116 +0,0 @@
-//go:build linux
-
-package adaptive
-
-import (
-	"context"
-	"sync"
-	"testing"
-	"time"
-
-	"github.com/goceleris/celeris/engine"
-	"github.com/goceleris/celeris/engine/scaler"
-	"github.com/goceleris/celeris/resource"
-)
-
-// scalableMockEngine extends mockEngine with the engine.WorkerScaler
-// interface so the adaptive scaler can exercise its
-// PauseWorker / ResumeWorker delegation logic against a controllable
-// fake. Pause/Resume calls are tracked per-worker and per-direction so
-// tests can assert the scaler routed work to the active engine and
-// not the standby.
-type scalableMockEngine struct {
-	*mockEngine
-	numWorkers int
-	pauseLog   []int
-	resumeLog  []int
-	logMu      sync.Mutex
-}
-
-func newScalableMockEngine(et engine.EngineType, n int) *scalableMockEngine {
-	return &scalableMockEngine{mockEngine: newMockEngine(et), numWorkers: n}
-}
-
-func (m *scalableMockEngine) NumWorkers() int { return m.numWorkers }
-
-func (m *scalableMockEngine) PauseWorker(i int) {
-	m.logMu.Lock()
-	m.pauseLog = append(m.pauseLog, i)
-	m.logMu.Unlock()
-}
-
-func (m *scalableMockEngine) ResumeWorker(i int) {
-	m.logMu.Lock()
-	m.resumeLog = append(m.resumeLog, i)
-	m.logMu.Unlock()
-}
-
-func (m *scalableMockEngine) snapshotLog() (pause, resume []int) {
-	m.logMu.Lock()
-	defer m.logMu.Unlock()
-	pause = append(pause, m.pauseLog...)
-	resume = append(resume, m.resumeLog...)
-	return
-}
-
-// TestAdaptiveScaler_DelegatesToActiveEngine spins the higher-level
-// scaler with two mock engines, sets connection counts on the active
-// one, and verifies the scaler calls PauseWorker / ResumeWorker on
-// the active engine — never on the standby. This is the architectural
-// invariant that the v1.4.1 adaptive scaler refactor exists to enforce
-// (the pre-refactor design ran two scalers, producing -54 % to +118 %
-// pinning-test variance — see PR #257).
-func TestAdaptiveScaler_DelegatesToActiveEngine(t *testing.T) {
-	primary := newScalableMockEngine(engine.Epoll, 8)
-	secondary := newScalableMockEngine(engine.IOUring, 8)
-	sampler := newSyntheticSampler()
-	cfg := resource.Config{}
-	e := newFromEngines(primary, secondary, sampler, cfg)
-
-	// Active = primary (matches newFromEngines default).
-	// Set activeConns high enough that desired = 8 (max), so the scaler
-	// will call ResumeWorker on workers 4..7 of the active engine.
-	primary.SetMetrics(engine.EngineMetrics{ActiveConnections: 200})
-	secondary.SetMetrics(engine.EngineMetrics{ActiveConnections: 0})
-
-	scfg := scaler.Config{
-		Enabled:              true,
-		MinActive:            4,
-		TargetConnsPerWorker: 20,
-		Interval:             10 * time.Millisecond,
-		ScaleUpStep:          4, // burst-resume to active=8 in one tick
-		ScaleDownStep:        1,
-		ScaleDownHysteresis:  1,
-		ScaleDownIdleTicks:   100, // never scale-down within the test budget
-	}
-
-	ctx, cancel := context.WithCancel(t.Context())
-	defer cancel()
-	go e.runScaler(ctx, scfg)
-
-	// Give the scaler a few ticks to ramp up.
-	time.Sleep(100 * time.Millisecond)
-	cancel()
-	// Tiny grace for the scaler goroutine to observe the cancel.
-	time.Sleep(20 * time.Millisecond)
-
-	pPause, pResume := primary.snapshotLog()
-	sPause, sResume := secondary.snapshotLog()
-
-	// Initial state pauses workers 4..7 on the active engine. Then
-	// scaler resumes 4..7 on the ACTIVE (primary) engine to reach
-	// desired=8.
-	if len(pPause) < 4 {
-		t.Errorf("primary expected ≥4 PauseWorker calls (init), got %d: %v", len(pPause), pPause)
-	}
-	if len(pResume) == 0 {
-		t.Errorf("primary expected ResumeWorker calls (active scaling up), got 0")
-	}
-	// Standby (secondary) should NEVER see ResumeWorker — that's the
-	// invariant. The scaler routes scale-up to the active engine only.
-	if len(sResume) != 0 {
-		t.Errorf("secondary should NOT see ResumeWorker calls; got %v", sResume)
-	}
-	// And no spurious calls on the secondary either (it's the standby).
-	_ = sPause
-}
diff --git a/adaptive/score.go b/adaptive/score.go
deleted file mode 100644
index 85034d55..00000000
--- a/adaptive/score.go
+++ /dev/null
@@ -1,83 +0,0 @@
-//go:build linux
-
-package adaptive
-
-// ScoreWeights defines the weighting for each telemetry signal in score
-// computation. Higher throughput weight favors faster engines; higher error
-// weight penalizes unreliable ones.
-type ScoreWeights struct {
-	// Throughput is the weight applied to requests-per-second in the score.
-	Throughput float64
-	// ErrorRate is the penalty weight applied to the error fraction.
-	ErrorRate float64
-}
-
-// DefaultWeights returns the default score weights.
-func DefaultWeights() ScoreWeights {
-	return ScoreWeights{
-		Throughput: 1.0,
-		ErrorRate:  2.0,
-	}
-}
-
-// computeScore produces a weighted score from a telemetry snapshot.
-// Higher is better.
-func computeScore(snap TelemetrySnapshot, w ScoreWeights) float64 {
-	return w.Throughput*snap.ThroughputRPS - w.ErrorRate*snap.ErrorRate
-}
-
-// ioUringBias returns a positive value when conditions favor io_uring over
-// epoll, based on benchmark data. The bias is added to io_uring's score (or
-// subtracted from epoll's) to enable proactive switching without requiring
-// a throughput advantage.
-//
-// Signals from 3-run median benchmarks (2026-03-27):
-//   - io_uring CPU efficiency: 2-8% less CPU at 256-4096 connections
-//   - io_uring p99 tail: 17-39% better at 1024-4096 connections
-//   - Throughput: equivalent (±2%) at 256-4096 connections
-//   - io_uring loses at <64 connections (fixed overhead) and >8192 on x86
-//
-// Pre-v1.5.0 this function was effectively dead: liveSampler.cpuMon was
-// never assigned (celeris#316), so snap.CPUUtilization was always 0 and
-// cpuFactor was always 0, making the function return 0 unconditionally.
-// v1.5.0 wires the CPU monitor at engine construction; the bias now fires
-// when CPU is above 30% and conns land in the 128-8192 sweet spot.
-//
-// The conn-factor falloff above 8192 is the empirical cost structure on
-// x86 — see the bench data above — not a bug. If #318 identifies a
-// different cause (e.g. a PbufRing bottleneck, fixed by #322), update
-// this comment to point at the new finding.
-func ioUringBias(snap TelemetrySnapshot) float64 {
-	conns := snap.ActiveConnections
-	cpu := snap.CPUUtilization
-
-	// No bias at very low or very high connection counts.
-	if conns < 128 || conns > 8192 {
-		return 0
-	}
-
-	// Base bias: proportional to connection count in the sweet spot.
-	// Peaks at 1024-4096 connections where io_uring's advantages are strongest.
-	var connFactor float64
-	switch {
-	case conns < 256:
-		connFactor = 0.1
-	case conns < 1024:
-		connFactor = 0.3
-	case conns <= 4096:
-		connFactor = 0.5 // Peak: io_uring's best range
-	default:
-		connFactor = 0.2 // 4096-8192: still beneficial but declining
-	}
-
-	// CPU factor: bias increases when CPU-bound (io_uring's efficiency advantage
-	// matters most when cores are saturated). No bias below 30% CPU.
-	cpuFactor := 0.0
-	if cpu > 0.3 {
-		cpuFactor = (cpu - 0.3) / 0.7 // 0.0 at 30% CPU, 1.0 at 100%
-	}
-
-	// Combined bias: connection suitability × CPU pressure.
-	// Maximum bias is 0.5 (50% of throughput score).
-	return connFactor * cpuFactor
-}
diff --git a/adaptive/score_test.go b/adaptive/score_test.go
deleted file mode 100644
index 599c89da..00000000
--- a/adaptive/score_test.go
+++ /dev/null
@@ -1,55 +0,0 @@
-//go:build linux
-
-package adaptive
-
-import "testing"
-
-func TestIoUringBiasConnFactorFalloff(t *testing.T) {
-	tests := []struct {
-		name      string
-		conns     int64
-		cpu       float64
-		wantBias  float64
-		tolerance float64
-	}{
-		{name: "below 128: zero", conns: 64, cpu: 0.9, wantBias: 0.0},
-		{name: "at 128 boundary: connFactor 0.1", conns: 128, cpu: 1.0, wantBias: 0.1 * (1.0 - 0.3) / 0.7},
-		{name: "256 conns peak: 0.5", conns: 2048, cpu: 1.0, wantBias: 0.5 * (1.0 - 0.3) / 0.7},
-		{name: "4096 boundary: 0.5 still", conns: 4096, cpu: 1.0, wantBias: 0.5 * (1.0 - 0.3) / 0.7},
-		{name: "4097: drops to 0.2", conns: 4097, cpu: 1.0, wantBias: 0.2 * (1.0 - 0.3) / 0.7},
-		{name: "above 8192: zero", conns: 9000, cpu: 0.9, wantBias: 0.0},
-		{name: "low CPU: zero cpuFactor", conns: 2048, cpu: 0.20, wantBias: 0.5 * 0.0},
-		{name: "exactly 30% CPU: zero cpuFactor", conns: 2048, cpu: 0.30, wantBias: 0.5 * 0.0},
-	}
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			snap := TelemetrySnapshot{
-				ActiveConnections: tt.conns,
-				CPUUtilization:    tt.cpu,
-			}
-			got := ioUringBias(snap)
-			if absDiff(got, tt.wantBias) > tt.tolerance {
-				t.Errorf("ioUringBias(conns=%d, cpu=%.2f) = %v, want %v ± %v",
-					tt.conns, tt.cpu, got, tt.wantBias, tt.tolerance)
-			}
-		})
-	}
-}
-
-func TestComputeScoreTwoTerm(t *testing.T) {
-	w := DefaultWeights()
-	snap := TelemetrySnapshot{ThroughputRPS: 1000, ErrorRate: 0.05}
-	got := computeScore(snap, w)
-	want := 1.0*1000.0 - 2.0*0.05
-	if got != want {
-		t.Errorf("computeScore = %v, want %v", got, want)
-	}
-}
-
-func absDiff(a, b float64) float64 {
-	d := a - b
-	if d < 0 {
-		return -d
-	}
-	return d
-}
diff --git a/adaptive/start_test.go b/adaptive/start_test.go
new file mode 100644
index 00000000..71aaa498
--- /dev/null
+++ b/adaptive/start_test.go
@@ -0,0 +1,92 @@
+//go:build linux
+
+package adaptive
+
+import (
+	"testing"
+
+	"github.com/goceleris/celeris/engine"
+	"github.com/goceleris/celeris/resource"
+)
+
+// viableProfile is a modern kernel with the full io_uring fast tier.
+func viableProfile() engine.CapabilityProfile {
+	return engine.CapabilityProfile{
+		KernelMajor: 6, KernelMinor: 12,
+		DeferTaskrun: true, SingleIssuer: true, MultishotRecv: true, ProvidedBuffers: true,
+	}
+}
+
+// oldProfile is a pre-fast-tier kernel (io_uring not worth running).
+func oldProfile() engine.CapabilityProfile {
+	return engine.CapabilityProfile{KernelMajor: 5, KernelMinor: 15}
+}
+
+// withMemlock overrides the memlock probe for the duration of a test.
+func withMemlock(t *testing.T, maxWorkers int) {
+	t.Helper()
+	prev := maxWorkersForMemlock
+	maxWorkersForMemlock = func() int { return maxWorkers }
+	t.Cleanup(func() { maxWorkersForMemlock = prev })
+}
+
+// TestChooseStartEngine_GateOrder asserts the new start-engine policy: the
+// DEFAULT is epoll (the flipped default), io_uring only on an explicit
+// high-concurrency hint with a viable kernel + memlock + non-h2c protocol.
+func TestChooseStartEngine_GateOrder(t *testing.T) {
+	cfg := func(proto engine.Protocol, hint resource.WorkloadHint) resource.Config {
+		return resource.Config{Protocol: proto, Resources: resource.Resources{WorkloadHint: hint}}
+	}
+	tests := []struct {
+		name    string
+		profile engine.CapabilityProfile
+		cfg     resource.Config
+		memlock int // -1 = no cap
+		want    engine.EngineType
+	}{
+		{"default-is-epoll (viable, no hint)", viableProfile(), cfg(engine.HTTP1, resource.WorkloadUnspecified), -1, engine.Epoll},
+		{"low-conc hint -> epoll", viableProfile(), cfg(engine.HTTP1, resource.WorkloadLowConcurrency), -1, engine.Epoll},
+		{"high-conc hint + viable -> iouring", viableProfile(), cfg(engine.HTTP1, resource.WorkloadHighConcurrency), -1, engine.IOUring},
+		{"high-conc hint but old kernel -> epoll", oldProfile(), cfg(engine.HTTP1, resource.WorkloadHighConcurrency), -1, engine.Epoll},
+		{"high-conc hint but h2c -> epoll", viableProfile(), cfg(engine.H2C, resource.WorkloadHighConcurrency), -1, engine.Epoll},
+		{"high-conc hint but memlock-starved -> epoll", viableProfile(), cfg(engine.HTTP1, resource.WorkloadHighConcurrency), 1, engine.Epoll},
+		{"auto protocol + high-conc hint -> iouring", viableProfile(), cfg(engine.Auto, resource.WorkloadHighConcurrency), -1, engine.IOUring},
+	}
+	t.Setenv("CELERIS_ADAPTIVE_START", "") // neutralize any ambient override
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			withMemlock(t, tc.memlock)
+			if got := chooseStartEngine(tc.profile, tc.cfg); got != tc.want {
+				t.Fatalf("chooseStartEngine = %v, want %v", got, tc.want)
+			}
+		})
+	}
+}
+
+// TestChooseStartEngine_EnvOverride asserts the env escape hatch wins over policy.
+func TestChooseStartEngine_EnvOverride(t *testing.T) {
+	withMemlock(t, -1)
+	t.Setenv("CELERIS_ADAPTIVE_START", "iouring")
+	if got := chooseStartEngine(oldProfile(), resource.Config{Protocol: engine.H2C}); got != engine.IOUring {
+		t.Fatalf("env=iouring should force IOUring even on old/h2c, got %v", got)
+	}
+	t.Setenv("CELERIS_ADAPTIVE_START", "epoll")
+	if got := chooseStartEngine(viableProfile(), resource.Config{Resources: resource.Resources{WorkloadHint: resource.WorkloadHighConcurrency}}); got != engine.Epoll {
+		t.Fatalf("env=epoll should force Epoll even with high-conc hint, got %v", got)
+	}
+}
+
+// TestIOUringViable covers the two t0 disqualifiers.
+func TestIOUringViable(t *testing.T) {
+	withMemlock(t, -1)
+	if !ioUringViable(viableProfile(), resource.Config{}) {
+		t.Fatal("viable profile + unlimited memlock should be viable")
+	}
+	if ioUringViable(oldProfile(), resource.Config{}) {
+		t.Fatal("old kernel should be non-viable")
+	}
+	withMemlock(t, 1) // 1 worker < resolved workers -> non-viable
+	if ioUringViable(viableProfile(), resource.Config{}) {
+		t.Fatal("memlock-starved should be non-viable")
+	}
+}
diff --git a/adaptive/telemetry.go b/adaptive/telemetry.go
index 992d74d2..5b52d1a1 100644
--- a/adaptive/telemetry.go
+++ b/adaptive/telemetry.go
@@ -22,6 +22,20 @@ type TelemetrySnapshot struct {
 	// CPUUtilization is the estimated CPU usage fraction (0.0-1.0). Read from
 	// the live sampler's CPUMonitor; zero when no monitor is wired.
 	CPUUtilization float64
+	// ConnsPerWorker is ActiveConnections divided by the engine's worker
+	// count. This is the PRIMARY load signal driving engine selection:
+	// epoll and io_uring tie at low conns/worker, but io_uring pulls ahead
+	// and keeps scaling above ~20/worker while epoll plateaus.
+	ConnsPerWorker float64
+	// AcceptRate is the new-connection arrival rate (accepts/sec) over the
+	// last sampling interval, derived like ThroughputRPS. A secondary load
+	// signal: a high accept rate indicates connection churn.
+	AcceptRate float64
+	// BytesPerReq is the average payload size (read+written bytes per
+	// request) over the last interval. When this exceeds the controller's
+	// large-payload threshold the workload is link-bound — the engines tie
+	// — so the controller suppresses an io_uring switch to avoid churn.
+	BytesPerReq float64
 }
 
 // TelemetrySampler produces telemetry snapshots from an engine.
@@ -54,6 +68,11 @@ func (s *liveSampler) Sample(e engine.Engine) TelemetrySnapshot {
 		ActiveConnections: m.ActiveConnections,
 	}
 
+	// ConnsPerWorker is a point-in-time ratio (not a delta), so it needs no
+	// prior sample. max(Workers, 1) guards the pre-Listen window where the
+	// worker count is still zero.
+	snap.ConnsPerWorker = float64(m.ActiveConnections) / float64(max(m.Workers, 1))
+
 	prev, hasPrev := s.prevMetrics[et]
 	prevT, hasT := s.prevTime[et]
 	if hasPrev && hasT {
@@ -61,9 +80,13 @@ func (s *liveSampler) Sample(e engine.Engine) TelemetrySnapshot {
 		if elapsed > 0 {
 			deltaReqs := m.RequestCount - prev.RequestCount
 			deltaErrs := m.ErrorCount - prev.ErrorCount
+			deltaAccepts := m.AcceptCount - prev.AcceptCount
+			deltaBytes := (m.BytesRead + m.BytesWritten) - (prev.BytesRead + prev.BytesWritten)
 			snap.ThroughputRPS = float64(deltaReqs) / elapsed
+			snap.AcceptRate = float64(deltaAccepts) / elapsed
 			if deltaReqs > 0 {
 				snap.ErrorRate = float64(deltaErrs) / float64(deltaReqs)
+				snap.BytesPerReq = float64(deltaBytes) / float64(deltaReqs)
 			}
 		}
 	}
diff --git a/async_improvements_test.go b/async_improvements_test.go
new file mode 100644
index 00000000..e72a2a5c
--- /dev/null
+++ b/async_improvements_test.go
@@ -0,0 +1,132 @@
+package celeris
+
+import (
+	"sync/atomic"
+	"testing"
+)
+
+// --- #1: Server.AsyncHandlers() reports the EFFECTIVE async state ---------
+
+// TestAsyncHandlersEffective_PerRouteAsync verifies the driver-footgun fix: a
+// server with Config.AsyncHandlers=false but a .Async() route reports
+// AsyncHandlers()==true, so a WithEngine driver picks its netpoll fast path
+// under the recommended per-route-async idiom (instead of the slow mini-loop).
+func TestAsyncHandlersEffective_PerRouteAsync(t *testing.T) {
+	s := New(Config{Engine: Std, AsyncHandlers: false})
+	if s.AsyncHandlers() {
+		t.Fatal("no async routes + AsyncHandlers=false must report false")
+	}
+	s.GET("/db", noopHandler).Async()
+	if !s.AsyncHandlers() {
+		t.Fatal("a .Async() route must make AsyncHandlers() effective-true (driver fast path)")
+	}
+}
+
+// TestAsyncHandlersEffective_ServerDefault verifies the server-level flag alone
+// still reports true (unchanged behavior).
+func TestAsyncHandlersEffective_ServerDefault(t *testing.T) {
+	s := New(Config{Engine: Std, AsyncHandlers: true})
+	if !s.AsyncHandlers() {
+		t.Fatal("AsyncHandlers=true must report true")
+	}
+}
+
+// TestAsyncHandlersEffective_GroupAsync verifies a group-level .Async() route
+// also flips the effective state.
+func TestAsyncHandlersEffective_GroupAsync(t *testing.T) {
+	s := New(Config{Engine: Std, AsyncHandlers: false})
+	api := s.Group("/api").Async()
+	api.GET("/x", noopHandler)
+	if !s.AsyncHandlers() {
+		t.Fatal("a group .Async() route must make AsyncHandlers() effective-true")
+	}
+}
+
+// --- #4: .UsesDriver() is an intent-revealing alias for .Async() ----------
+
+func TestUsesDriver(t *testing.T) {
+	s := New(Config{Engine: Std, AsyncHandlers: false})
+	s.GET("/cache/:key", noopHandler).UsesDriver()
+	if !s.router.routeAsync("GET", "/cache/anything") {
+		t.Fatal(".UsesDriver() must resolve hard-async")
+	}
+	if s.router.adaptiveRoutes["/cache/:key"] {
+		t.Fatal(".UsesDriver() (== .Async()) must not be adaptive")
+	}
+	if !s.AsyncHandlers() {
+		t.Fatal(".UsesDriver() must make AsyncHandlers() effective-true")
+	}
+}
+
+// --- #3: immediate promotion of an unambiguously-blocking inline run -------
+
+// TestAdaptiveBlockingThresholdConst guards the threshold ordering: the
+// immediate-promote bar must sit above the slow bar so the streak path still
+// owns the borderline band.
+func TestAdaptiveBlockingThresholdConst(t *testing.T) {
+	if adaptiveBlockingThreshold <= adaptivePromoteThreshold {
+		t.Fatalf("blocking threshold %v must exceed slow threshold %v",
+			adaptiveBlockingThreshold, adaptivePromoteThreshold)
+	}
+}
+
+// TestPromoteRouteImmediate verifies a single unambiguously-blocking inline run
+// promotes the route immediately (no adaptivePromoteStreak wait), so a
+// forgotten-async blocking handler stalls a worker for at most one request.
+func TestPromoteRouteImmediate(t *testing.T) {
+	s := New(Config{AsyncHandlers: true})
+	s.GET("/slow", noopHandler) // inherits default → adaptive
+	r := s.router
+	if !r.adaptiveRoutes["/slow"] {
+		t.Fatal("/slow should be adaptive under AsyncHandlers=true")
+	}
+	if r.isPromoted("/slow") {
+		t.Fatal("must not be promoted before any blocking run")
+	}
+	r.promoteRouteImmediate("/slow") // one >adaptiveBlockingThreshold run
+	if !r.isPromoted("/slow") {
+		t.Fatal("a single blocking inline run must promote immediately")
+	}
+	if !r.routeAsync("GET", "/slow") {
+		t.Fatal("a promoted route must resolve async")
+	}
+}
+
+// TestPromoteImmediate_ClearsFastStreak verifies the immediate promotion clears
+// any accumulated fast streak so the route cannot wrongly settle on stale fast
+// runs after the promotion TTL expires.
+func TestPromoteImmediate_ClearsFastStreak(t *testing.T) {
+	s := New(Config{AsyncHandlers: true})
+	s.GET("/slow", noopHandler)
+	r := s.router
+	for i := 0; i < 10; i++ {
+		r.recordInlineRun("/slow", false) // accumulate fast runs
+	}
+	r.promoteRouteImmediate("/slow")
+	if !r.isPromoted("/slow") {
+		t.Fatal("must be promoted")
+	}
+	if v, ok := r.fastStreak.Load("/slow"); ok {
+		if got := v.(*atomic.Int32).Load(); got != 0 {
+			t.Fatalf("fast streak must be cleared on immediate promote, got %d", got)
+		}
+	}
+}
+
+// TestStreakStillRequiresEight verifies the slow-streak path is unchanged: a
+// borderline-slow (300µs–2ms) route still needs adaptivePromoteStreak runs.
+func TestStreakStillRequiresEight(t *testing.T) {
+	s := New(Config{AsyncHandlers: true})
+	s.GET("/borderline", noopHandler)
+	r := s.router
+	for i := 0; i < adaptivePromoteStreak-1; i++ {
+		r.recordInlineRun("/borderline", true)
+	}
+	if r.isPromoted("/borderline") {
+		t.Fatalf("must not promote before %d consecutive slow runs", adaptivePromoteStreak)
+	}
+	r.recordInlineRun("/borderline", true) // the 8th
+	if !r.isPromoted("/borderline") {
+		t.Fatalf("must promote on the %dth consecutive slow run", adaptivePromoteStreak)
+	}
+}
diff --git a/async_promote_integration_linux_test.go b/async_promote_integration_linux_test.go
new file mode 100644
index 00000000..736c844a
--- /dev/null
+++ b/async_promote_integration_linux_test.go
@@ -0,0 +1,75 @@
+//go:build linux
+
+package celeris
+
+import (
+	"context"
+	"net"
+	"net/http"
+	"testing"
+	"time"
+)
+
+// TestAdaptiveImmediatePromote_Epoll verifies improvement #3 end-to-end on the
+// real epoll engine (which runs HandleStream's inline timing): under
+// AsyncHandlers=true, an UNMARKED route whose handler blocks for longer than
+// adaptiveBlockingThreshold is promoted to async dispatch on the FIRST request,
+// not after adaptivePromoteStreak (8) of them. A fast route used only for
+// readiness must NOT promote.
+func TestAdaptiveImmediatePromote_Epoll(t *testing.T) {
+	s := New(Config{Engine: Epoll, AsyncHandlers: true})
+	s.GET("/ping", func(c *Context) error { return c.String(http.StatusOK, "ok") })
+	s.GET("/slow", func(c *Context) error {
+		time.Sleep(3 * time.Millisecond) // > adaptiveBlockingThreshold (2ms)
+		return c.String(http.StatusOK, "slow")
+	})
+	if !s.router.adaptiveRoutes["/slow"] || !s.router.adaptiveRoutes["/ping"] {
+		t.Fatal("both routes must be adaptive under AsyncHandlers=true")
+	}
+
+	ln, err := net.Listen("tcp", "127.0.0.1:0")
+	if err != nil {
+		t.Fatalf("listen: %v", err)
+	}
+	go func() { _ = s.StartWithListener(ln) }()
+	defer func() {
+		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		defer cancel()
+		_ = s.Shutdown(ctx)
+	}()
+
+	base := "http://" + ln.Addr().String()
+	client := &http.Client{}
+	// Readiness on the FAST route only (so /slow is untouched until our 1 probe).
+	deadline := time.Now().Add(5 * time.Second)
+	ready := false
+	for time.Now().Before(deadline) {
+		if resp, err := client.Get(base + "/ping"); err == nil {
+			_ = resp.Body.Close()
+			ready = true
+			break
+		}
+		time.Sleep(10 * time.Millisecond)
+	}
+	if !ready {
+		t.Fatal("server did not become ready")
+	}
+
+	if s.router.isPromoted("/slow") {
+		t.Fatal("/slow must not be promoted before any request to it")
+	}
+	// Exactly ONE request to the blocking route.
+	resp, err := client.Get(base + "/slow")
+	if err != nil {
+		t.Fatalf("GET /slow: %v", err)
+	}
+	_ = resp.Body.Close()
+
+	if !s.router.isPromoted("/slow") {
+		t.Fatal("#3: a single >2ms inline run must promote /slow immediately (got not-promoted)")
+	}
+	// The fast route hammered for readiness must never promote.
+	if s.router.isPromoted("/ping") {
+		t.Fatal("/ping (fast) must not be promoted")
+	}
+}
diff --git a/config.go b/config.go
index ecb84547..e69a25d9 100644
--- a/config.go
+++ b/config.go
@@ -40,6 +40,25 @@ const (
 // String returns the engine type name.
 func (t EngineType) String() string { return engine.EngineType(t).String() }
 
+// WorkloadHint is an OPTIONAL declaration of expected steady-state concurrency,
+// used only by the Adaptive engine's start-engine decision. Because connections
+// cannot migrate between epoll and io_uring, the START engine decides keep-alive
+// throughput — and the concurrency is unknowable when the server binds. This
+// hint is the ONLY way to make Adaptive START on io_uring; without it Adaptive
+// starts on epoll (best for the ramp-from-zero / low-concurrency / latency case)
+// and promotes new connections to io_uring under sustained high load.
+type WorkloadHint resource.WorkloadHint
+
+const (
+	// WorkloadUnspecified (default) → start epoll, promote under load.
+	WorkloadUnspecified WorkloadHint = WorkloadHint(resource.WorkloadUnspecified)
+	// WorkloadLowConcurrency → thin/latency-sensitive traffic; stay epoll.
+	WorkloadLowConcurrency WorkloadHint = WorkloadHint(resource.WorkloadLowConcurrency)
+	// WorkloadHighConcurrency → many h1 keep-alive conns/worker; start io_uring
+	// (when the kernel and RLIMIT_MEMLOCK allow it).
+	WorkloadHighConcurrency WorkloadHint = WorkloadHint(resource.WorkloadHighConcurrency)
+)
+
 // Config holds the public server configuration.
 type Config struct {
 	// Addr is the TCP address to listen on (e.g. ":8080").
@@ -52,6 +71,12 @@ type Config struct {
 	// Workers is the number of I/O worker goroutines (default GOMAXPROCS).
 	Workers int
 
+	// WorkloadHint optionally declares the expected steady-state concurrency.
+	// It only affects the Adaptive engine's start-engine choice (see WorkloadHint):
+	// the zero value starts on epoll; WorkloadHighConcurrency starts on io_uring
+	// when the kernel + memlock allow.
+	WorkloadHint WorkloadHint
+
 	// ReadTimeout is the max duration for reading the entire request.
 	// Zero uses the default (60s). Set to -1 for no timeout.
 	ReadTimeout time.Duration
@@ -133,13 +158,20 @@ type Config struct {
 	// groups can override it per handler with [Route.Async] /
 	// [RouteGroup.Async] (most-specific wins: route > group > this
 	// default), so a server with mostly CPU routes + a few DB routes can
-	// keep this false and mark just the DB routes .Async(), or set this
-	// true and mark hot CPU routes .Async(false). On HTTP/2 the override
-	// is honored per-stream (sync routes run inline on the event loop,
-	// async routes dispatch to the worker pool). Note: celeris drivers
-	// opened WithEngine(srv) consult the server-level AsyncHandlers (not
-	// per-route overrides) for their auto-async path selection — set this
-	// true when using WithEngine drivers under per-route async.
+	// keep this false and mark just the DB routes .Async() / .UsesDriver(),
+	// or set this true and mark hot CPU routes .Async(false). On HTTP/2 the
+	// override is honored per-stream (sync routes run inline on the event
+	// loop, async routes dispatch to the worker pool).
+	//
+	// DRIVERS: celeris drivers opened WithEngine(srv) pick their netpoll-park
+	// fast path from the server's EFFECTIVE async state — true when this flag
+	// is set OR any route is .Async(). So "keep this false + mark DB routes
+	// .Async()/.UsesDriver()" selects the fast driver path too, PROVIDED the
+	// driver is opened AFTER those routes are registered (the effective state
+	// is read at driver construction); otherwise set this true. Setting this
+	// true also enables the adaptive safety net that auto-promotes any unmarked
+	// handler slower than ~300µs, at a small learning-phase cost that settles to
+	// zero for static routes.
 	//
 	// Default: false.
 	AsyncHandlers bool
@@ -177,34 +209,6 @@ type Config struct {
 	//   - non-nil false: force disabled, even on Protocol=Auto. Useful when
 	//     the engine intentionally only serves HTTP/1.
 	EnableH2Upgrade *bool
-
-	// WorkerScaling configures the dynamic worker scaler. As of v1.4.6
-	// the scaler is DEFAULT-ON — leaving this field nil resolves to a
-	// zero-value [resource.WorkerScalingConfig], which activates the
-	// scaler with the data-validated defaults (Strategy=StartHigh,
-	// MinActive=max(2, NumCPU/2), TargetConnsPerWorker=20,
-	// Interval=250ms, ScaleUpStep=2, ScaleDownStep=1,
-	// ScaleDownHysteresis=1, ScaleDownIdleTicks=4). This matches the
-	// "just-works" public design intent already in place for the
-	// Engine=Adaptive and Protocol=Auto defaults.
-	//
-	// Pre-v1.4.6 behaviour (scaler always disabled unless explicitly
-	// configured) is achievable by passing a non-nil struct that
-	// effectively makes the scaler a no-op:
-	//
-	//	WorkerScaling: &resource.WorkerScalingConfig{
-	//	    MinActive: runtime.GOMAXPROCS(0), // pin at NumCPU; never scale down
-	//	}
-	//
-	// Set to a non-nil pointer with custom values to override one or
-	// more defaults. See [resource.WorkerScalingConfig] for tuning.
-	// The scaler keeps connections-per-active-worker around the target
-	// ratio by pausing/resuming workers; this dramatically improves
-	// CQE/event batching at low/mid concurrency where the static
-	// numCPU default would otherwise lose 30-90 % CPU/req to under-
-	// batched syscalls. See PR #257 / issue #281 for the full
-	// rationale and benchmark data.
-	WorkerScaling *resource.WorkerScalingConfig
 }
 
 // EngineMetrics is a point-in-time snapshot of engine-level performance counters.
@@ -250,23 +254,13 @@ func (c Config) toResourceConfig() resource.Config {
 	if c.MaxConns > 0 {
 		rc.Resources.MaxConns = c.MaxConns
 	}
+	rc.Resources.WorkloadHint = resource.WorkloadHint(c.WorkloadHint)
 
 	rc.MaxRequestBodySize = c.MaxRequestBodySize
 	rc.AsyncHandlers = c.AsyncHandlers
 	rc.OnExpectContinue = c.OnExpectContinue
 	rc.OnConnect = c.OnConnect
 	rc.OnDisconnect = c.OnDisconnect
-	// Dynamic worker scaler default-on (issue #281). A nil
-	// WorkerScaling field — i.e. the user did not configure it — now
-	// resolves to a zero-value struct so the scaler activates with
-	// the data-validated defaults. Opt-out is documented as setting
-	// MinActive=NumCPU (no-op scaler), which preserves backward
-	// compatibility for users who really do want pre-v1.4.6 behaviour.
-	if c.WorkerScaling != nil {
-		rc.WorkerScaling = c.WorkerScaling
-	} else {
-		rc.WorkerScaling = &resource.WorkerScalingConfig{}
-	}
 
 	// h2c upgrade resolution. Nil → protocol-dependent default (Auto → true,
 	// HTTP1/H2C → false). Non-nil → user override honored verbatim.
diff --git a/context.go b/context.go
index b0736478..c3f142aa 100644
--- a/context.go
+++ b/context.go
@@ -168,6 +168,19 @@ type Context struct {
 	hostOverride     string
 
 	respHdrBuf [16][2]string // reusable buffer for response headers (avoids heap escape)
+	// respHdrScratch retains the heap backing array the moment a middleware
+	// stack pushes respHeaders past respHdrBuf's 16 slots (e.g. chain-fullstack:
+	// 15 user headers + content-type + content-length = 17). Reused across
+	// requests so a >16-header route reallocates once per pooled Context, not
+	// once per request. nil until the first overflow.
+	respHdrScratch [][2]string
+	// blobHdrScratch is the reused assembly buffer for Blob's response header
+	// list (content-type + content-length + user headers) when the total exceeds
+	// respHdrBuf's 16 slots. Without it, Blob allocates make([][2]string,0,total)
+	// on EVERY many-header response (chain-fullstack: 18 headers ⇒ the dominant
+	// per-request alloc + GC pressure). Reused so it reallocates once per pooled
+	// Context, not once per request. nil until the first many-header Blob.
+	blobHdrScratch [][2]string
 
 	trustedNets []*net.IPNet
 
@@ -545,26 +558,27 @@ func (c *Context) reset() {
 	c.fullPath = ""
 	c.statusCode = 200
 	if n := len(c.respHeaders); n > 0 {
-		// respHeaders shares the respHdrBuf backing array up to its
-		// fixed capacity (16). Middleware stacks that emit more than 16
-		// response headers (kitchen_sink with secure + cache + ratelimit
-		// + etag etc. easily exceeds it) trigger append() to allocate a
-		// new backing array — respHeaders now has > 16 entries but
-		// respHdrBuf is still just 16. Without this clamp,
-		// `clear(c.respHdrBuf[:n])` panics with "slice bounds out of
-		// range [:N] with length 16" — the panic propagates through
-		// recoverAndRelease and aborts the iouring/epoll async handler
-		// AFTER WriteResponse has queued bytes into the per-conn
-		// writeBuf but BEFORE flushSend pushes them to the socket, so
-		// the client sees an empty response. std-engine escapes the
-		// damage because Go's net/http writes the response inline
-		// before the cleanup panic.
-		if n > len(c.respHdrBuf) {
-			n = len(c.respHdrBuf)
+		// When a middleware stack emits >16 response headers (secure + cache
+		// + ratelimit + etag + ... easily exceeds it), append() has moved
+		// respHeaders onto a heap backing array. Retain it as respHdrScratch
+		// and reuse it next request rather than dropping it (which forced a
+		// fresh ~576B alloc on every header-heavy request). The cap check is
+		// the correct discriminator: cap<=16 means respHeaders is still the
+		// inline respHdrBuf, so clear only its used prefix; cap>16 means a
+		// heap array, clear its full length. (This also removes the old clamp
+		// that papered over a clear(respHdrBuf[:n>16]) bounds panic.)
+		if cap(c.respHeaders) > len(c.respHdrBuf) {
+			clear(c.respHeaders)
+			c.respHdrScratch = c.respHeaders
+		} else {
+			clear(c.respHdrBuf[:n])
 		}
-		clear(c.respHdrBuf[:n])
 	}
-	c.respHeaders = c.respHdrBuf[:0]
+	if c.respHdrScratch != nil {
+		c.respHeaders = c.respHdrScratch[:0]
+	} else {
+		c.respHeaders = c.respHdrBuf[:0]
+	}
 	c.written = false
 	c.aborted = false
 	c.bytesWritten = 0
diff --git a/context_response.go b/context_response.go
index ffe673c6..7a5d749e 100644
--- a/context_response.go
+++ b/context_response.go
@@ -589,7 +589,14 @@ func (c *Context) Blob(code int, contentType string, data []byte) error {
 		headers = append(headers, [2]string{"content-length", itoa(len(data))})
 		headers = append(headers, tmp[:nUser]...)
 	} else {
-		headers = make([][2]string, 0, total)
+		// Reuse a per-Context scratch instead of allocating per request — the
+		// dominant chain-fullstack alloc (18 headers > respHdrBuf's 16) lived
+		// here. respHeaders never aliases blobHdrScratch (separate buffers; the
+		// append below copies the [2]string values).
+		if cap(c.blobHdrScratch) < total {
+			c.blobHdrScratch = make([][2]string, 0, total)
+		}
+		headers = c.blobHdrScratch[:0]
 		headers = append(headers, [2]string{"content-type", ct})
 		headers = append(headers, [2]string{"content-length", itoa(len(data))})
 		headers = append(headers, c.respHeaders...)
@@ -761,9 +768,13 @@ func (c *Context) SetResponseHeaders(headers [][2]string) {
 	if len(headers) <= len(c.respHdrBuf) {
 		copy(c.respHdrBuf[:len(headers)], headers)
 		c.respHeaders = c.respHdrBuf[:len(headers)]
+	} else if cap(c.respHdrScratch) >= len(headers) {
+		c.respHeaders = c.respHdrScratch[:len(headers)]
+		copy(c.respHeaders, headers)
 	} else {
 		c.respHeaders = make([][2]string, len(headers))
 		copy(c.respHeaders, headers)
+		c.respHdrScratch = c.respHeaders
 	}
 }
 
diff --git a/context_test.go b/context_test.go
index 98429d31..8f2be7a6 100644
--- a/context_test.go
+++ b/context_test.go
@@ -249,6 +249,73 @@ func TestContextResetWithOverflowedRespHeaders(t *testing.T) {
 	}
 }
 
+// TestContextRespHeaderOverflowReuseZeroAlloc locks in celeris#360: once a
+// middleware stack pushes respHeaders past the inline 16-slot respHdrBuf, the
+// grown heap backing array is retained as respHdrScratch and reused on every
+// subsequent request — so a >16-header route allocates the backing array ONCE
+// per pooled Context, not once per request (chain-fullstack: 17 headers).
+func TestContextRespHeaderOverflowReuseZeroAlloc(t *testing.T) {
+	s, _ := newTestStream("GET", "/test")
+	defer s.Release()
+	c := acquireContext(s)
+	defer releaseContext(c)
+
+	// 17 clean lowercase headers > respHdrBuf's 16 slots → forces the heap path.
+	hdrs := [][2]string{
+		{"h00", "v"}, {"h01", "v"}, {"h02", "v"}, {"h03", "v"}, {"h04", "v"},
+		{"h05", "v"}, {"h06", "v"}, {"h07", "v"}, {"h08", "v"}, {"h09", "v"},
+		{"h10", "v"}, {"h11", "v"}, {"h12", "v"}, {"h13", "v"}, {"h14", "v"},
+		{"h15", "v"}, {"h16", "v"},
+	}
+	avg := testing.AllocsPerRun(500, func() {
+		for _, h := range hdrs {
+			c.SetHeader(h[0], h[1])
+		}
+		c.reset()
+	})
+	if avg != 0 {
+		t.Fatalf("overflowed respHeaders must reuse the scratch backing array: got %.2f allocs/op, want 0", avg)
+	}
+	if cap(c.respHdrScratch) < len(hdrs) {
+		t.Fatalf("respHdrScratch should retain a >=%d cap backing array, got cap=%d", len(hdrs), cap(c.respHdrScratch))
+	}
+}
+
+// nopRW is a no-op ResponseWriter so an alloc test can isolate the Context path
+// from the mock writer's own header/body copies.
+type nopRW struct{}
+
+func (nopRW) WriteResponse(_ *stream.Stream, _ int, _ [][2]string, _ []byte) error { return nil }
+
+// TestContextBlobManyHeadersZeroAlloc locks in the chain-fullstack fix: when a
+// response carries more than the inline-buffer's headers (15 user + content-type
+// + content-length = 17 > 16), Blob must reuse blobHdrScratch instead of
+// allocating make([][2]string,0,total) every request — that was the dominant
+// per-request allocation (≈77% of chain-fullstack allocs → GC pressure).
+func TestContextBlobManyHeadersZeroAlloc(t *testing.T) {
+	s, _ := newTestStream("GET", "/test")
+	defer s.Release()
+	s.ResponseWriter = nopRW{} // the default mock writer allocates; isolate Blob
+	c := acquireContext(s)
+	defer releaseContext(c)
+
+	// 15 user headers ⇒ total 17 > respHdrBuf's 16 ⇒ Blob's many-header path.
+	for i := 0; i < 15; i++ {
+		c.SetHeader("h"+strconv.Itoa(i), "v")
+	}
+	body := []byte("hello")
+	avg := testing.AllocsPerRun(500, func() {
+		c.written = false
+		_ = c.Blob(200, "application/json", body)
+	})
+	if avg != 0 {
+		t.Fatalf("Blob many-header path must reuse blobHdrScratch: got %.2f allocs/op, want 0", avg)
+	}
+	if cap(c.blobHdrScratch) < 17 {
+		t.Fatalf("blobHdrScratch should retain a >=17 cap backing array, got cap=%d", cap(c.blobHdrScratch))
+	}
+}
+
 func TestContextFullPathResetOnRelease(t *testing.T) {
 	s, _ := newTestStream("GET", "/test")
 	defer s.Release()
diff --git a/engine/engine.go b/engine/engine.go
index 295e375a..aeeaf6b2 100644
--- a/engine/engine.go
+++ b/engine/engine.go
@@ -43,24 +43,6 @@ type SwitchFreezer interface {
 	UnfreezeSwitching()
 }
 
-// WorkerScaler is implemented by engines that support per-worker pause/resume
-// for dynamic capacity adjustment based on load. Used by the higher-level
-// scaler in the adaptive engine to delegate worker activation to whichever
-// sub-engine is currently active. Per-worker pause is asynchronous — the
-// worker drains in-flight connections before going SUSPENDED. Resume wakes a
-// suspended worker so it re-creates its listen socket and rejoins the
-// SO_REUSEPORT group.
-type WorkerScaler interface {
-	// NumWorkers returns the total worker pool size (max active count).
-	NumWorkers() int
-	// PauseWorker deactivates worker i. Asynchronous; returns immediately.
-	// Idempotent — pausing an already-paused worker is a no-op.
-	PauseWorker(i int)
-	// ResumeWorker reactivates worker i. Wakes the worker if SUSPENDED.
-	// Idempotent — resuming an active worker is a no-op.
-	ResumeWorker(i int)
-}
-
 // SendfileCapable is an optional interface implemented by engines that
 // support zero-copy file responses via sendfile(2). The H1 static-file
 // response path type-asserts the engine for it; engines that do not
@@ -124,4 +106,27 @@ type EngineMetrics struct { //nolint:revive // user-approved name
 	// how often the inline → goroutine handoff fires vs the pure-sync
 	// inline fast path.
 	AsyncPromotedConns uint64
+	// Workers is the number of I/O workers (io_uring) or event loops
+	// (epoll) the engine is running. Static after Listen. The adaptive
+	// controller divides ActiveConnections by it to derive the
+	// conns-per-worker load signal that drives engine selection.
+	Workers int
+	// AcceptCount is the cumulative number of connections accepted by this
+	// engine since start. Together with elapsed time it yields the accept
+	// rate (new-connection arrival rate) used as a secondary load signal.
+	AcceptCount uint64
+	// CloseCount is the cumulative number of connections closed by this
+	// engine since start. AcceptCount - CloseCount tracks the live count;
+	// a high close rate relative to accepts indicates short-lived
+	// churn-style connections.
+	CloseCount uint64
+	// BytesRead is the cumulative number of payload bytes received from
+	// the network across all connections. Used with BytesWritten and
+	// RequestCount to derive the average bytes-per-request signal that
+	// suppresses io_uring selection for link-bound (large-payload)
+	// workloads where the engines tie.
+	BytesRead uint64
+	// BytesWritten is the cumulative number of payload bytes sent to the
+	// network across all connections. See BytesRead.
+	BytesWritten uint64
 }
diff --git a/engine/epoll/engine.go b/engine/epoll/engine.go
index 8096b699..a1f49d14 100644
--- a/engine/epoll/engine.go
+++ b/engine/epoll/engine.go
@@ -13,7 +13,6 @@ import (
 	"time"
 
 	"github.com/goceleris/celeris/engine"
-	"github.com/goceleris/celeris/engine/scaler"
 	"github.com/goceleris/celeris/internal/platform"
 	"github.com/goceleris/celeris/protocol/h2/stream"
 	"github.com/goceleris/celeris/resource"
@@ -34,6 +33,17 @@ type Engine struct {
 		// asyncPromoted counts inline → dispatch-goroutine promotions
 		// across all loops (celeris #300).
 		asyncPromoted atomic.Uint64
+		// acceptCount / closeCount track cumulative connection lifecycle
+		// events; bytesRead / bytesWritten track cumulative payload bytes.
+		// All four feed the adaptive controller's load signals. Bytes are
+		// batched per-loop and flushed once per event-loop iteration
+		// (mirroring reqCount) to avoid hot-path cache-line bouncing;
+		// accepts/closes are infrequent so they increment directly like
+		// activeConns.
+		acceptCount  atomic.Uint64
+		closeCount   atomic.Uint64
+		bytesRead    atomic.Uint64
+		bytesWritten atomic.Uint64
 	}
 	// asyncRoutes is the static AsyncRoutes count snapshotted at
 	// construction from the handler's AsyncRouteCount (#300 G3).
@@ -91,7 +101,9 @@ func (e *Engine) Listen(ctx context.Context) error {
 		l := newLoop(i, cpus[i], e.handler,
 			resolved, e.cfg,
 			&e.metrics.reqCount, &e.metrics.activeConns, &e.metrics.errCount,
-			&e.metrics.asyncPromoted, &e.acceptPaused)
+			&e.metrics.asyncPromoted, &e.acceptPaused,
+			&e.metrics.acceptCount, &e.metrics.closeCount,
+			&e.metrics.bytesRead, &e.metrics.bytesWritten)
 		e.loops[i] = l
 	}
 	e.mu.Unlock()
@@ -126,16 +138,6 @@ func (e *Engine) Listen(ctx context.Context) error {
 		)
 	}
 
-	// Dynamic loop scaler. Typed cfg.WorkerScaling takes precedence over
-	// env vars. Suppressed when wrapped by adaptive — adaptive runs ONE
-	// higher-level scaler that delegates to the active sub-engine. The
-	// algorithm itself lives in engine/scaler.
-	if !e.cfg.SkipBuiltinScaler {
-		if scalerCfg := scaler.Resolve(e.cfg, len(e.loops)); scalerCfg.Enabled {
-			go e.runScaler(innerCtx, scalerCfg, &e.metrics.activeConns)
-		}
-	}
-
 	<-ctx.Done()
 	wg.Wait()
 	return nil
@@ -176,6 +178,11 @@ func (e *Engine) Metrics() engine.EngineMetrics {
 		ErrorCount:         e.metrics.errCount.Load(),
 		AsyncRoutes:        e.asyncRoutes,
 		AsyncPromotedConns: e.metrics.asyncPromoted.Load(),
+		Workers:            len(e.loops),
+		AcceptCount:        e.metrics.acceptCount.Load(),
+		CloseCount:         e.metrics.closeCount.Load(),
+		BytesRead:          e.metrics.bytesRead.Load(),
+		BytesWritten:       e.metrics.bytesWritten.Load(),
 	}
 }
 
@@ -250,7 +257,6 @@ func (e *Engine) ResumeAccept() error {
 var (
 	_ engine.Engine           = (*Engine)(nil)
 	_ engine.AcceptController = (*Engine)(nil)
-	_ engine.WorkerScaler     = (*Engine)(nil)
 )
 
 // Addr returns the bound listener address.
diff --git a/engine/epoll/loop.go b/engine/epoll/loop.go
index 72c8e591..58369767 100644
--- a/engine/epoll/loop.go
+++ b/engine/epoll/loop.go
@@ -77,11 +77,6 @@ type Loop struct {
 	wake         chan struct{}
 	wakeMu       sync.Mutex
 	suspended    atomic.Bool
-	// inactive is the per-worker pause flag used by the dynamic worker
-	// scaler. ORed with acceptPaused (which is engine-wide) when computing
-	// effective paused state. The scaler flips this to deactivate idle
-	// loops under low load and reactivate them under burst load.
-	inactive atomic.Bool
 	// listenFDClosed signals that the loop has closed its listen FD in
 	// response to acceptPaused=true. PauseAccept polls this so it only
 	// returns once the SO_REUSEPORT group has actually shed this listener
@@ -93,14 +88,20 @@ type Loop struct {
 	// re-arms the signal.
 	listenFDClosed atomic.Bool
 
-	reqCount         *atomic.Uint64
-	activeConns      *atomic.Int64
-	errCount         *atomic.Uint64
-	asyncPromoted    *atomic.Uint64 // cumulative inline → dispatch promotions (#300)
-	reqBatch         uint64         // batched request count, flushed to reqCount per iteration
-	tickCounter      uint32
-	consecutiveEmpty uint32 // consecutive iterations with no events (for adaptive timeout)
-	cachedNow        int64  // cached time.Now().UnixNano(), refreshed once per events return
+	reqCount          *atomic.Uint64
+	activeConns       *atomic.Int64
+	errCount          *atomic.Uint64
+	asyncPromoted     *atomic.Uint64 // cumulative inline → dispatch promotions (#300)
+	acceptCount       *atomic.Uint64 // cumulative accepts (engine-wide, shared)
+	closeCount        *atomic.Uint64 // cumulative closes (engine-wide, shared)
+	bytesRead         *atomic.Uint64 // cumulative recv payload bytes (engine-wide, shared)
+	bytesWritten      *atomic.Uint64 // cumulative send payload bytes (engine-wide, shared)
+	reqBatch          uint64         // batched request count, flushed to reqCount per iteration
+	bytesReadBatch    uint64         // batched recv bytes, flushed to bytesRead per iteration
+	bytesWrittenBatch uint64         // batched send bytes, flushed to bytesWritten per iteration
+	tickCounter       uint32
+	consecutiveEmpty  uint32 // consecutive iterations with no events (for adaptive timeout)
+	cachedNow         int64  // cached time.Now().UnixNano(), refreshed once per events return
 
 	// fdCapDrops counts accepted fds that fell outside the l.conns table
 	// (fd >= connTableSize) and were force-closed in acceptAll. Worker-
@@ -148,7 +149,8 @@ type Loop struct {
 func newLoop(id, cpuID int, handler stream.Handler,
 	resolved resource.ResolvedResources,
 	cfg resource.Config, reqCount *atomic.Uint64, activeConns *atomic.Int64, errCount *atomic.Uint64,
-	asyncPromoted *atomic.Uint64, acceptPaused *atomic.Bool) *Loop {
+	asyncPromoted *atomic.Uint64, acceptPaused *atomic.Bool,
+	acceptCount, closeCount, bytesRead, bytesWritten *atomic.Uint64) *Loop {
 
 	return &Loop{
 		id:           id,
@@ -167,8 +169,10 @@ func newLoop(id, cpuID int, handler stream.Handler,
 		acceptPaused: acceptPaused,
 		wake:         make(chan struct{}),
 		ready:        make(chan error, 1),
+		// TCPNoDelay is omitted here: accepted sockets inherit TCP_NODELAY
+		// from the listen socket (set in createListenSocket), so per-accept
+		// ApplyFD doesn't need to re-issue it.
 		sockOpts: sockopts.Options{
-			TCPNoDelay:  true,
 			TCPQuickAck: true,
 			SOBusyPoll:  50 * time.Microsecond,
 			RecvBuf:     resolved.SocketRecv,
@@ -178,6 +182,10 @@ func newLoop(id, cpuID int, handler stream.Handler,
 		activeConns:   activeConns,
 		errCount:      errCount,
 		asyncPromoted: asyncPromoted,
+		acceptCount:   acceptCount,
+		closeCount:    closeCount,
+		bytesRead:     bytesRead,
+		bytesWritten:  bytesWritten,
 		h2cfg: conn.H2Config{
 			MaxConcurrentStreams: cfg.MaxConcurrentStreams,
 			InitialWindowSize:    cfg.InitialWindowSize,
@@ -282,8 +290,7 @@ func (l *Loop) run(ctx context.Context) {
 		// Cache the atomic load: ACTIVE→DRAINING and SUSPENDED→ACTIVE
 		// branches both read it. Saves 1 atomic load per event-loop
 		// iteration on the steady-state hot path.
-		// OR with the per-loop inactive flag (dynamic scaler).
-		paused := l.acceptPaused.Load() || l.inactive.Load()
+		paused := l.acceptPaused.Load()
 		if l.listenFD >= 0 && paused {
 			// Drain any pending accepts from the kernel's listen queue so
 			// they get a clean shutdown (FIN) rather than the RST that
@@ -465,7 +472,7 @@ func (l *Loop) run(ctx context.Context) {
 			if mu := cs.detachMu; mu != nil {
 				mu.Lock()
 			}
-			err := flushWrites(cs)
+			err := l.flushWrites(cs, true)
 			if err != nil {
 				// Surface I/O failure to detached middleware before closing.
 				if cs.h1State != nil && cs.h1State.OnError != nil {
@@ -510,7 +517,7 @@ func (l *Loop) run(ctx context.Context) {
 			if cs != nil && cs.h2State != nil && cs.h2State.WriteQueuePending() {
 				cs.h2State.DrainWriteQueue(cs.writeFn)
 				if cs.writePos < len(cs.writeBuf) {
-					if fErr := flushWrites(cs); fErr != nil {
+					if fErr := l.flushWrites(cs, true); fErr != nil {
 						l.removeDirty(cs)
 						l.closeConn(fd)
 					} else if cs.writePos < len(cs.writeBuf) {
@@ -531,6 +538,17 @@ func (l *Loop) run(ctx context.Context) {
 			l.reqBatch = 0
 		}
 
+		// Flush batched payload-byte counters with the same per-iteration
+		// cadence as reqCount, for the same cache-line-contention reason.
+		if l.bytesReadBatch > 0 {
+			l.bytesRead.Add(l.bytesReadBatch)
+			l.bytesReadBatch = 0
+		}
+		if l.bytesWrittenBatch > 0 {
+			l.bytesWritten.Add(l.bytesWrittenBatch)
+			l.bytesWrittenBatch = 0
+		}
+
 		// Check connection timeouts. Default cadence is every 1024 iterations
 		// (~100ms under load); when detached conns exist with idle deadlines
 		// the gate tightens to every 32 iterations (~50ms idle wall time)
@@ -549,10 +567,9 @@ func (l *Loop) run(ctx context.Context) {
 		}
 
 		// DRAINING → SUSPENDED: no listen socket, no connections, events processed.
-		// Combined paused: engine-wide OR per-loop (dynamic scaler).
-		if l.listenFD < 0 && l.connCount == 0 && (l.acceptPaused.Load() || l.inactive.Load()) {
+		if l.listenFD < 0 && l.connCount == 0 && l.acceptPaused.Load() {
 			l.wakeMu.Lock()
-			if !l.acceptPaused.Load() && !l.inactive.Load() {
+			if !l.acceptPaused.Load() {
 				l.wakeMu.Unlock()
 				continue
 			}
@@ -661,6 +678,7 @@ func (l *Loop) acceptAll(ctx context.Context, now int64) {
 		}
 		cs.writeFn = l.makeWriteFn(cs)
 		l.activeConns.Add(1)
+		l.acceptCount.Add(1)
 
 		if l.cfg.OnConnect != nil {
 			l.cfg.OnConnect(cs.remoteAddr)
@@ -719,7 +737,7 @@ func (l *Loop) drainRead(fd int, now int64) {
 			if mu := cs.detachMu; mu != nil {
 				mu.Lock()
 			}
-			_ = flushWrites(cs)
+			_ = l.flushWrites(cs, true)
 			// Surface read failure to detached middleware (e.g. WS).
 			if cs.h1State != nil && cs.h1State.OnError != nil {
 				cs.h1State.OnError(err)
@@ -734,7 +752,7 @@ func (l *Loop) drainRead(fd int, now int64) {
 			if mu := cs.detachMu; mu != nil {
 				mu.Lock()
 			}
-			_ = flushWrites(cs)
+			_ = l.flushWrites(cs, true)
 			// Surface peer-close (EOF) to detached middleware.
 			if cs.h1State != nil && cs.h1State.OnError != nil {
 				cs.h1State.OnError(errPeerClosed)
@@ -747,6 +765,9 @@ func (l *Loop) drainRead(fd int, now int64) {
 		}
 
 		cs.lastActivity = now
+		// n > 0 here (the err and EOF cases returned above): bytes read on
+		// this iteration, into either cs.buf or the zero-copy bodyBuf.
+		l.bytesReadBatch += uint64(n)
 
 		// Direct-into-bodyBuf completion path: we read straight into
 		// H1State.bodyBuf; dispatch the handler if the body is full,
@@ -769,7 +790,7 @@ func (l *Loop) drainRead(fd int, now int64) {
 				if mu := cs.detachMu; mu != nil {
 					mu.Lock()
 				}
-				_ = flushWrites(cs)
+				_ = l.flushWrites(cs, true)
 				if mu := cs.detachMu; mu != nil {
 					mu.Unlock()
 				}
@@ -786,7 +807,7 @@ func (l *Loop) drainRead(fd int, now int64) {
 					if mu := cs.detachMu; mu != nil {
 						mu.Lock()
 					}
-					_ = flushWrites(cs)
+					_ = l.flushWrites(cs, true)
 					if mu := cs.detachMu; mu != nil {
 						mu.Unlock()
 					}
@@ -797,7 +818,7 @@ func (l *Loop) drainRead(fd int, now int64) {
 			if mu := cs.detachMu; mu != nil {
 				mu.Lock()
 			}
-			if err := flushWrites(cs); err != nil {
+			if err := l.flushWrites(cs, true); err != nil {
 				if mu := cs.detachMu; mu != nil {
 					mu.Unlock()
 				}
@@ -947,7 +968,7 @@ func (l *Loop) drainRead(fd int, now int64) {
 				// skipped by the `continue` so we must flush explicitly here,
 				// otherwise the client blocks forever waiting for the 101.
 				if cs.writePos < len(cs.writeBuf) {
-					if fErr := flushWrites(cs); fErr != nil {
+					if fErr := l.flushWrites(cs, true); fErr != nil {
 						l.closeConn(fd)
 						return
 					}
@@ -1008,7 +1029,7 @@ func (l *Loop) drainRead(fd int, now int64) {
 			if mu := cs.detachMu; mu != nil {
 				mu.Lock()
 			}
-			_ = flushWrites(cs)
+			_ = l.flushWrites(cs, true)
 			cs.pendingBytes = 0
 			if cs.h1State != nil && cs.h1State.OnError != nil {
 				cs.h1State.OnError(processErr)
@@ -1029,7 +1050,7 @@ func (l *Loop) drainRead(fd int, now int64) {
 			mu.Lock()
 		}
 		if csWritePending(cs) {
-			if fErr := flushWrites(cs); fErr != nil {
+			if fErr := l.flushWrites(cs, true); fErr != nil {
 				if cs.h1State != nil && cs.h1State.OnError != nil {
 					cs.h1State.OnError(fErr)
 				}
@@ -1123,6 +1144,7 @@ func (l *Loop) hijackConn(fd int) (net.Conn, error) {
 	l.conns[fd] = nil
 	l.connCount--
 	l.activeConns.Add(-1)
+	l.closeCount.Add(1)
 
 	f := os.NewFile(uintptr(fd), "tcp")
 	c, err := net.FileConn(f)
@@ -1655,7 +1677,10 @@ func (l *Loop) runAsyncHandler(cs *connState) {
 				// flushWrites may partially complete; residual bytes
 				// stay in cs.writeBuf and the worker retries via
 				// markDirty once drainDetachQueue picks us up.
-				if err := flushWrites(cs); err != nil {
+				// Dispatch-goroutine call site: not the loop thread, so
+				// byte accounting must hit the shared atomic, not the
+				// per-loop batch (false → atomic path).
+				if err := l.flushWrites(cs, false); err != nil {
 					promoteErr = err
 				}
 			}
@@ -1738,7 +1763,8 @@ func (l *Loop) runAsyncHandler(cs *connState) {
 		// race the middleware's own write path.
 		if !errors.Is(processErr, conn.ErrHijacked) &&
 			(cs.writePos < len(cs.writeBuf) || len(cs.bodyBuf) > 0) {
-			flushErr = flushWrites(cs)
+			// Dispatch-goroutine call site (false → shared-atomic byte path).
+			flushErr = l.flushWrites(cs, false)
 		}
 		// Resync pendingBytes with actual buffer state. makeWriteFn uses
 		// pendingBytes to enforce writeCap backpressure; without this
@@ -1945,7 +1971,7 @@ func (l *Loop) handleWritable(cs *connState) {
 	if mu := cs.detachMu; mu != nil {
 		mu.Lock()
 	}
-	err := flushWrites(cs)
+	err := l.flushWrites(cs, true)
 	drained := err == nil && !csWritePending(cs)
 	if err == nil {
 		if drained {
@@ -2238,6 +2264,7 @@ func (l *Loop) closeConn(fd int) {
 	l.conns[fd] = nil
 	l.connCount--
 	l.activeConns.Add(-1)
+	l.closeCount.Add(1)
 
 	if l.cfg.OnDisconnect != nil {
 		l.cfg.OnDisconnect(cs.remoteAddr)
@@ -2379,6 +2406,10 @@ func createListenSocket(addr string) (int, error) {
 		return -1, err
 	}
 
+	// TCP_NODELAY on the listen socket: Linux copies it onto every accepted
+	// socket at SYN time, so per-accept ApplyFD can skip its own NODELAY
+	// setsockopt (one fewer syscall per accept on the hot path).
+	_ = unix.SetsockoptInt(fd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
 	// TCP_DEFER_ACCEPT: kernel holds connections until data arrives,
 	// eliminating wasted accept+wait cycles for idle connections.
 	_ = unix.SetsockoptInt(fd, unix.IPPROTO_TCP, unix.TCP_DEFER_ACCEPT, 1)
diff --git a/engine/epoll/nodelay_inherit_linux_test.go b/engine/epoll/nodelay_inherit_linux_test.go
new file mode 100644
index 00000000..b6d7bcdf
--- /dev/null
+++ b/engine/epoll/nodelay_inherit_linux_test.go
@@ -0,0 +1,92 @@
+//go:build linux
+
+package epoll
+
+import (
+	"net"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+)
+
+// TestListenSocketHasNoDelay asserts createListenSocket enables TCP_NODELAY on
+// the listen socket itself — the precondition for accepted sockets inheriting
+// it (#337).
+func TestListenSocketHasNoDelay(t *testing.T) {
+	lfd, err := createListenSocket("127.0.0.1:0")
+	if err != nil {
+		t.Skipf("listen socket unavailable: %v", err)
+	}
+	t.Cleanup(func() { _ = unix.Close(lfd) })
+
+	v, err := unix.GetsockoptInt(lfd, unix.IPPROTO_TCP, unix.TCP_NODELAY)
+	if err != nil {
+		t.Fatalf("getsockopt TCP_NODELAY: %v", err)
+	}
+	if v == 0 {
+		t.Fatal("listen socket TCP_NODELAY not set")
+	}
+}
+
+// TestAcceptedConnInheritsNoDelay verifies accepted sockets carry TCP_NODELAY
+// without acceptAll re-issuing the per-accept setsockopt. The kernel copies
+// TCP_NODELAY from the listen socket at SYN time; this guards that the
+// per-accept NODELAY removal (#337) leaves accepted conns nagle-disabled.
+func TestAcceptedConnInheritsNoDelay(t *testing.T) {
+	lfd, err := createListenSocket("127.0.0.1:0")
+	if err != nil {
+		t.Skipf("listen socket unavailable: %v", err)
+	}
+	t.Cleanup(func() { _ = unix.Close(lfd) })
+
+	la := boundAddr(lfd)
+	if la == nil {
+		t.Skip("could not resolve bound addr")
+	}
+	tcpAddr, ok := la.(*net.TCPAddr)
+	if !ok {
+		t.Skipf("unexpected addr type %T", la)
+	}
+
+	cfd, err := unix.Socket(unix.AF_INET, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
+	if err != nil {
+		t.Skipf("client socket unavailable: %v", err)
+	}
+	t.Cleanup(func() { _ = unix.Close(cfd) })
+
+	var sa unix.SockaddrInet4
+	sa.Port = tcpAddr.Port
+	copy(sa.Addr[:], tcpAddr.IP.To4())
+	if err := unix.Connect(cfd, &sa); err != nil {
+		t.Fatalf("connect: %v", err)
+	}
+
+	// TCP_DEFER_ACCEPT is set on the listener, so the connection only becomes
+	// acceptable once data arrives — send a byte before accepting.
+	if _, err := unix.Write(cfd, []byte{'x'}); err != nil {
+		t.Fatalf("write: %v", err)
+	}
+
+	var afd int
+	for i := 0; i < 200; i++ {
+		afd, _, err = unix.Accept4(lfd, unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC)
+		if err == unix.EAGAIN {
+			time.Sleep(time.Millisecond)
+			continue
+		}
+		break
+	}
+	if err != nil {
+		t.Fatalf("accept: %v", err)
+	}
+	t.Cleanup(func() { _ = unix.Close(afd) })
+
+	v, err := unix.GetsockoptInt(afd, unix.IPPROTO_TCP, unix.TCP_NODELAY)
+	if err != nil {
+		t.Fatalf("getsockopt TCP_NODELAY on accepted fd: %v", err)
+	}
+	if v == 0 {
+		t.Fatal("accepted socket did not inherit TCP_NODELAY from listener")
+	}
+}
diff --git a/engine/epoll/review_v150_test.go b/engine/epoll/review_v150_test.go
index 64538b57..745c2b0a 100644
--- a/engine/epoll/review_v150_test.go
+++ b/engine/epoll/review_v150_test.go
@@ -80,11 +80,15 @@ func TestCheckTimeoutsClosesAllExpired(t *testing.T) {
 	t.Cleanup(func() { _ = unix.Close(epfd) })
 
 	l := &Loop{
-		epollFD:     epfd,
-		conns:       make([]*connState, 1024),
-		liveConns:   make([]int, 0, 8),
-		activeConns: &atomic.Int64{},
-		cfg:         resource.Config{IdleTimeout: time.Second},
+		epollFD:      epfd,
+		conns:        make([]*connState, 1024),
+		liveConns:    make([]int, 0, 8),
+		activeConns:  &atomic.Int64{},
+		closeCount:   &atomic.Uint64{},
+		acceptCount:  &atomic.Uint64{},
+		bytesRead:    &atomic.Uint64{},
+		bytesWritten: &atomic.Uint64{},
+		cfg:          resource.Config{IdleTimeout: time.Second},
 	}
 	past := time.Now().Add(-time.Hour).UnixNano()
 
@@ -168,17 +172,21 @@ func TestAcceptAllDrainsBacklogNoStrand(t *testing.T) {
 	}
 
 	l := &Loop{
-		epollFD:     epfd,
-		listenFD:    lfd,
-		conns:       make([]*connState, connTableSize),
-		liveConns:   make([]int, 0, 256),
-		activeConns: &atomic.Int64{},
-		errCount:    &atomic.Uint64{},
-		reqCount:    &atomic.Uint64{},
-		eventFD:     -1,
-		timerFD:     -1,
-		resolved:    resource.ResolvedResources{BufferSize: 8192},
-		cfg:         resource.Config{},
+		epollFD:      epfd,
+		listenFD:     lfd,
+		conns:        make([]*connState, connTableSize),
+		liveConns:    make([]int, 0, 256),
+		activeConns:  &atomic.Int64{},
+		errCount:     &atomic.Uint64{},
+		reqCount:     &atomic.Uint64{},
+		acceptCount:  &atomic.Uint64{},
+		closeCount:   &atomic.Uint64{},
+		bytesRead:    &atomic.Uint64{},
+		bytesWritten: &atomic.Uint64{},
+		eventFD:      -1,
+		timerFD:      -1,
+		resolved:     resource.ResolvedResources{BufferSize: 8192},
+		cfg:          resource.Config{},
 	}
 
 	const want = 200
@@ -261,12 +269,16 @@ func TestHijackDefersReleaseWhileAsyncGoroutineActive(t *testing.T) {
 	}
 
 	l := &Loop{
-		epollFD:     epfd,
-		conns:       make([]*connState, connTableSize),
-		liveConns:   make([]int, 0, 8),
-		activeConns: &atomic.Int64{},
-		eventFD:     -1,
-		cfg:         resource.Config{},
+		epollFD:      epfd,
+		conns:        make([]*connState, connTableSize),
+		liveConns:    make([]int, 0, 8),
+		activeConns:  &atomic.Int64{},
+		closeCount:   &atomic.Uint64{},
+		acceptCount:  &atomic.Uint64{},
+		bytesRead:    &atomic.Uint64{},
+		bytesWritten: &atomic.Uint64{},
+		eventFD:      -1,
+		cfg:          resource.Config{},
 	}
 
 	// Async-mode conn: detachMu set, dispatch goroutine "alive" (asyncRun).
@@ -345,12 +357,16 @@ func TestHijackSyncReleasesImmediately(t *testing.T) {
 	}
 
 	l := &Loop{
-		epollFD:     epfd,
-		conns:       make([]*connState, connTableSize),
-		liveConns:   make([]int, 0, 8),
-		activeConns: &atomic.Int64{},
-		eventFD:     -1,
-		cfg:         resource.Config{},
+		epollFD:      epfd,
+		conns:        make([]*connState, connTableSize),
+		liveConns:    make([]int, 0, 8),
+		activeConns:  &atomic.Int64{},
+		closeCount:   &atomic.Uint64{},
+		acceptCount:  &atomic.Uint64{},
+		bytesRead:    &atomic.Uint64{},
+		bytesWritten: &atomic.Uint64{},
+		eventFD:      -1,
+		cfg:          resource.Config{},
 	}
 	// Sync mode: detachMu nil, no dispatch goroutine.
 	cs := &connState{fd: local, liveIdx: -1}
diff --git a/engine/epoll/scaler.go b/engine/epoll/scaler.go
deleted file mode 100644
index f8e948ad..00000000
--- a/engine/epoll/scaler.go
+++ /dev/null
@@ -1,62 +0,0 @@
-//go:build linux
-
-package epoll
-
-import (
-	"context"
-	"log/slog"
-	"sync/atomic"
-
-	"github.com/goceleris/celeris/engine/scaler"
-)
-
-// epollScalerSource adapts the epoll Engine to the scaler.Source
-// interface. Generation always returns 0 — epoll isn't a meta-engine.
-type epollScalerSource struct {
-	e           *Engine
-	activeConns *atomic.Int64
-}
-
-func (s *epollScalerSource) NumWorkers() int      { return s.e.NumWorkers() }
-func (s *epollScalerSource) ActiveConns() int64   { return s.activeConns.Load() }
-func (s *epollScalerSource) PauseWorker(i int)    { s.e.PauseWorker(i) }
-func (s *epollScalerSource) ResumeWorker(i int)   { s.e.ResumeWorker(i) }
-func (s *epollScalerSource) Generation() uint64   { return 0 }
-func (s *epollScalerSource) Logger() *slog.Logger { return s.e.cfg.Logger }
-
-// runScaler is started by Engine.Listen when scaler config is enabled.
-// All algorithm logic lives in engine/scaler.
-func (e *Engine) runScaler(ctx context.Context, cfg scaler.Config, activeConns *atomic.Int64) {
-	scaler.Run(ctx, &epollScalerSource{e: e, activeConns: activeConns}, cfg)
-}
-
-// PauseWorker deactivates loop i. The loop drains in-flight conns and
-// goes SUSPENDED. Asynchronous; returns immediately.
-func (e *Engine) PauseWorker(i int) {
-	e.mu.Lock()
-	defer e.mu.Unlock()
-	if i < 0 || i >= len(e.loops) {
-		return
-	}
-	e.loops[i].inactive.Store(true)
-}
-
-// ResumeWorker reactivates loop i. Wakes the loop from SUSPENDED if it was
-// already idle.
-func (e *Engine) ResumeWorker(i int) {
-	e.mu.Lock()
-	defer e.mu.Unlock()
-	if i < 0 || i >= len(e.loops) {
-		return
-	}
-	l := e.loops[i]
-	l.inactive.Store(false)
-	l.listenFDClosed.Store(false)
-	l.wakeMu.Lock()
-	if l.suspended.Load() {
-		close(l.wake)
-		l.wake = make(chan struct{})
-		l.suspended.Store(false)
-	}
-	l.wakeMu.Unlock()
-}
diff --git a/engine/epoll/scaler_test.go b/engine/epoll/scaler_test.go
deleted file mode 100644
index ee7507ac..00000000
--- a/engine/epoll/scaler_test.go
+++ /dev/null
@@ -1,8 +0,0 @@
-//go:build linux
-
-package epoll
-
-// The scaler algorithm itself lives in engine/scaler and has its own
-// test suite there. The epoll engine's scaler.go is just the
-// engine.WorkerScaler adapter (PauseWorker / ResumeWorker on loops
-// slice).
diff --git a/engine/epoll/writer.go b/engine/epoll/writer.go
index 75f58dc6..7f4526f4 100644
--- a/engine/epoll/writer.go
+++ b/engine/epoll/writer.go
@@ -16,16 +16,33 @@ import (
 // writev(2) with iovec = [writeBuf[writePos:], bodyBuf]. Saves one
 // full body-sized memcpy per request compared to appending the body
 // into writeBuf first.
-func flushWrites(cs *connState) error {
+// onLoopThread distinguishes the worker (event-loop) call sites from the
+// async-dispatch-goroutine ones. Loop-thread bytes accumulate in the
+// non-atomic per-loop batch (flushed once per iteration, like reqBatch);
+// goroutine bytes go straight to the shared atomic. Keeping the two paths
+// apart is what makes the batch field worker-thread-only and therefore
+// race-free — the dispatch goroutine never touches bytesWrittenBatch.
+func (l *Loop) addWrittenBytes(n int, onLoopThread bool) {
+	if n <= 0 {
+		return
+	}
+	if onLoopThread {
+		l.bytesWrittenBatch += uint64(n)
+		return
+	}
+	l.bytesWritten.Add(uint64(n))
+}
+
+func (l *Loop) flushWrites(cs *connState, onLoopThread bool) error {
 	if len(cs.bodyBuf) > 0 {
-		return flushWritesV(cs)
+		return l.flushWritesV(cs, onLoopThread)
 	}
 	if cs.writePos >= len(cs.writeBuf) {
 		cs.writeBuf = cs.writeBuf[:0]
 		cs.writePos = 0
 		// writeBuf is empty — if a zero-copy sendfile response is pending,
 		// drive it now (after any prior pipelined writeBuf bytes flushed).
-		return flushSendfile(cs)
+		return l.flushSendfile(cs, onLoopThread)
 	}
 	n, err := unix.Write(cs.fd, cs.writeBuf[cs.writePos:])
 	if err != nil {
@@ -36,6 +53,7 @@ func flushWrites(cs *connState) error {
 		cs.writePos = 0
 		return err
 	}
+	l.addWrittenBytes(n, onLoopThread)
 	cs.writePos += n
 	if cs.writePos >= len(cs.writeBuf) {
 		// Fully flushed
@@ -43,7 +61,7 @@ func flushWrites(cs *connState) error {
 		cs.writePos = 0
 		// writeBuf drained this call — continue into the sendfile body if a
 		// sendfile response is pending. Keeps header→body ordering correct.
-		return flushSendfile(cs)
+		return l.flushSendfile(cs, onLoopThread)
 	} else if cs.writePos > cap(cs.writeBuf)/2 {
 		// Amortized compaction: only copy when more than half the buffer is consumed.
 		// This reduces the average cost from O(n) per partial write to O(1) amortized.
@@ -66,12 +84,13 @@ func flushWrites(cs *connState) error {
 // the writeBuf path above. On a real I/O error the file is released and
 // the error is surfaced (the caller closes the conn). On completion the
 // dup'd file is closed and cs.sendfile cleared.
-func flushSendfile(cs *connState) error {
+func (l *Loop) flushSendfile(cs *connState, onLoopThread bool) error {
 	st := cs.sendfile
 	if st == nil {
 		return nil
 	}
-	_, err := st.advance(cs.fd)
+	sent, err := st.advance(cs.fd)
+	l.addWrittenBytes(int(sent), onLoopThread)
 	if err != nil {
 		if err == unix.EAGAIN || err == unix.EWOULDBLOCK {
 			return nil // send buffer full; resume on next writable edge
@@ -120,7 +139,7 @@ func csPendingBytes(cs *connState) int {
 // is non-nil; clears bodyBuf once fully sent. Partial writev collapses
 // the remainder into writeBuf so the next flushWrites call uses the
 // plain write path.
-func flushWritesV(cs *connState) error {
+func (l *Loop) flushWritesV(cs *connState, onLoopThread bool) error {
 	headerRem := cs.writeBuf[cs.writePos:]
 	iovs := [2][]byte{headerRem, cs.bodyBuf}
 	n, err := unix.Writev(cs.fd, iovs[:])
@@ -133,6 +152,7 @@ func flushWritesV(cs *connState) error {
 		cs.bodyBuf = nil
 		return err
 	}
+	l.addWrittenBytes(n, onLoopThread)
 	total := len(headerRem) + len(cs.bodyBuf)
 	switch {
 	case n >= total:
diff --git a/engine/iouring/async_revert_test.go b/engine/iouring/async_revert_test.go
new file mode 100644
index 00000000..c298e2cd
--- /dev/null
+++ b/engine/iouring/async_revert_test.go
@@ -0,0 +1,243 @@
+//go:build linux
+
+package iouring
+
+import (
+	"bufio"
+	"context"
+	"fmt"
+	"io"
+	"net"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/goceleris/celeris/engine"
+	"github.com/goceleris/celeris/protocol/h2/stream"
+	"github.com/goceleris/celeris/resource"
+)
+
+// revertResolverHandler is an AsyncRouteResolver whose async classification is
+// flipped at runtime, simulating celeris#356 route promotion (true) and the
+// celeris#364 TTL de-promotion (false) without waiting on the real router TTL.
+type revertResolverHandler struct{ async atomic.Bool }
+
+func (h *revertResolverHandler) HandleStream(_ context.Context, s *stream.Stream) error {
+	if s.ResponseWriter == nil {
+		return nil
+	}
+	return s.ResponseWriter.WriteResponse(s, 200,
+		[][2]string{{"content-type", "text/plain"}, {"content-length", "2"}}, []byte("ok"))
+}
+func (h *revertResolverHandler) RouteAsync(_, _ string) bool { return h.async.Load() }
+func (h *revertResolverHandler) HasAsyncRoutes() bool        { return true }
+
+func startRevertEngine(t *testing.T, h stream.Handler) *Engine {
+	t.Helper()
+	ln, err := net.Listen("tcp", "127.0.0.1:0")
+	if err != nil {
+		t.Fatalf("pick port: %v", err)
+	}
+	addr := ln.Addr().String()
+	_ = ln.Close()
+
+	e, err := New(resource.Config{
+		Addr:          addr,
+		Protocol:      engine.HTTP1,
+		Resources:     resource.Resources{Workers: 4},
+		AsyncHandlers: true,
+	}, h)
+	if err != nil {
+		t.Skipf("iouring engine unavailable: %v", err)
+	}
+	ctx, cancel := context.WithCancel(t.Context())
+	errCh := make(chan error, 1)
+	go func() { errCh <- e.Listen(ctx) }()
+	t.Cleanup(func() {
+		cancel()
+		select {
+		case <-errCh:
+		case <-time.After(3 * time.Second):
+		}
+	})
+	dl := time.Now().Add(30 * time.Second)
+	for time.Now().Before(dl) && e.Addr() == nil {
+		select {
+		case err := <-errCh:
+			if err != nil && (strings.Contains(err.Error(), "cannot allocate memory") ||
+				strings.Contains(err.Error(), "io_uring_setup") || strings.Contains(err.Error(), "tier")) {
+				t.Skipf("io_uring unavailable on this runner: %v", err)
+			}
+			t.Fatalf("engine.Listen returned early: %v", err)
+		default:
+		}
+		time.Sleep(10 * time.Millisecond)
+	}
+	if e.Addr() == nil {
+		t.Fatal("engine did not bind in time")
+	}
+	return e
+}
+
+// sendKeepAlive issues one keep-alive GET on an existing conn and reads the
+// full 2-byte "ok" response, leaving the conn open for the next request.
+func sendKeepAlive(c net.Conn, br *bufio.Reader, timeout time.Duration) error {
+	_ = c.SetDeadline(time.Now().Add(timeout))
+	if _, err := c.Write([]byte("GET /x HTTP/1.1\r\nHost: x\r\n\r\n")); err != nil {
+		return fmt.Errorf("write: %w", err)
+	}
+	statusLine, err := br.ReadString('\n')
+	if err != nil {
+		return fmt.Errorf("read status: %w", err)
+	}
+	if len(statusLine) < 12 || statusLine[9:12] != "200" {
+		return fmt.Errorf("bad status: %q", statusLine)
+	}
+	for {
+		line, err := br.ReadString('\n')
+		if err != nil {
+			return fmt.Errorf("read header: %w", err)
+		}
+		if line == "\r\n" || line == "\n" {
+			break
+		}
+	}
+	body := make([]byte, 2)
+	if _, err := io.ReadFull(br, body); err != nil {
+		return fmt.Errorf("read body: %w", err)
+	}
+	if string(body) != "ok" {
+		return fmt.Errorf("bad body: %q", body)
+	}
+	return nil
+}
+
+// TestAsyncConnRevertsOnRouteDepromotion is the celeris#364 conn-level revert
+// regression: a keep-alive conn promoted to async dispatch must return to the
+// inline fast path once its route de-promotes (RouteAsync flips false), proven
+// by its ability to RE-promote afterwards (a still-async conn cannot re-promote
+// — it never re-runs the inline ErrAsyncDispatch gate).
+func TestAsyncConnRevertsOnRouteDepromotion(t *testing.T) {
+	h := &revertResolverHandler{}
+	h.async.Store(true)
+	e := startRevertEngine(t, h)
+	target := e.Addr().String()
+
+	c, err := net.DialTimeout("tcp", target, 2*time.Second)
+	if err != nil {
+		t.Fatalf("dial: %v", err)
+	}
+	defer func() { _ = c.Close() }()
+	br := bufio.NewReader(c)
+	send := func(phase string) {
+		if err := sendKeepAlive(c, br, 2*time.Second); err != nil {
+			t.Fatalf("%s request failed: %v", phase, err)
+		}
+	}
+
+	// Phase A — RouteAsync=true: the conn promotes to async dispatch.
+	for i := 0; i < 20; i++ {
+		send("promote")
+	}
+	p1 := e.Metrics().AsyncPromotedConns
+	if p1 == 0 {
+		t.Fatal("conn never promoted under RouteAsync=true")
+	}
+
+	// Phase B — RouteAsync=false: the dispatch goroutine must revert the conn
+	// to inline at its next idle point. Requests must keep succeeding.
+	h.async.Store(false)
+	for i := 0; i < 30; i++ {
+		send("revert")
+		time.Sleep(time.Millisecond)
+	}
+
+	// Phase C — RouteAsync=true again: a reverted (now inline) conn re-promotes;
+	// a conn still stuck async would NOT, so AsyncPromotedConns must increase.
+	h.async.Store(true)
+	for i := 0; i < 30; i++ {
+		send("repromote")
+	}
+	p2 := e.Metrics().AsyncPromotedConns
+	if p2 <= p1 {
+		t.Fatalf("conn did not revert+re-promote (still stuck async): promoted before=%d after=%d", p1, p2)
+	}
+	t.Logf("promotions: after-A=%d after-C=%d (revert confirmed by re-promotion)", p1, p2)
+}
+
+// TestAsyncConnRevertRace hammers the promote/feed/revert/re-promote interaction
+// across many keep-alive conns with RouteAsync flipping continuously. Run under
+// -race it validates that the worker recv path and the dispatch goroutine never
+// race on cs.asyncPromoted / asyncInBuf during a revert. Gated on -short.
+func TestAsyncConnRevertRace(t *testing.T) {
+	if testing.Short() {
+		t.Skip("revert race test needs sustained toggling load; -short skips it")
+	}
+	h := &revertResolverHandler{}
+	h.async.Store(true)
+	e := startRevertEngine(t, h)
+	target := e.Addr().String()
+
+	const (
+		concurrency = 64
+		duration    = 10 * time.Second
+	)
+	stop := make(chan struct{})
+	// Toggler: flip the route's async classification fast enough that conns are
+	// constantly promoting and reverting.
+	go func() {
+		tk := time.NewTicker(3 * time.Millisecond)
+		defer tk.Stop()
+		for {
+			select {
+			case <-stop:
+				return
+			case <-tk.C:
+				h.async.Store(!h.async.Load())
+			}
+		}
+	}()
+
+	deadline := time.Now().Add(duration)
+	var ok, failed atomic.Int64
+	var wg sync.WaitGroup
+	for i := 0; i < concurrency; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			c, err := net.DialTimeout("tcp", target, 2*time.Second)
+			if err != nil {
+				failed.Add(1)
+				return
+			}
+			defer func() { _ = c.Close() }()
+			br := bufio.NewReader(c)
+			for time.Now().Before(deadline) {
+				if err := sendKeepAlive(c, br, 2*time.Second); err != nil {
+					failed.Add(1)
+					return // conn broken; a corrupted response would surface here
+				}
+				ok.Add(1)
+			}
+		}()
+	}
+	wg.Wait()
+	close(stop)
+
+	if ok.Load() < 1000 {
+		t.Fatalf("too few successful requests (ok=%d failed=%d) — server may have stalled", ok.Load(), failed.Load())
+	}
+	// Liveness epilogue.
+	h.async.Store(false)
+	c, err := net.DialTimeout("tcp", target, 2*time.Second)
+	if err != nil {
+		t.Fatalf("engine not serving after revert churn: %v", err)
+	}
+	defer func() { _ = c.Close() }()
+	if err := sendKeepAlive(c, bufio.NewReader(c), 2*time.Second); err != nil {
+		t.Fatalf("engine not serving after revert churn: %v", err)
+	}
+	t.Logf("ok=%d failed=%d over %s", ok.Load(), failed.Load(), duration)
+}
diff --git a/engine/iouring/buf_ring_scale_test.go b/engine/iouring/buf_ring_scale_test.go
index e535d782..6aa5a5fe 100644
--- a/engine/iouring/buf_ring_scale_test.go
+++ b/engine/iouring/buf_ring_scale_test.go
@@ -16,7 +16,7 @@ func TestResolveBufRingCountDefaults(t *testing.T) {
 		targetConns int
 		want        int
 	}{
-		// Formula: 2 * TargetConnsPerWorker, rounded up to a power of 2,
+		// Formula: 2 * connsPerWorker, rounded up to a power of 2,
 		// clamped to [bufRingCountMin=1024, bufRingCountMax]. The ring is
 		// PER-WORKER, so the result MUST NOT depend on Workers (celeris#322
 		// follow-up — the prior 2*Workers*target over-sized every worker's
diff --git a/engine/iouring/check_timeouts_test.go b/engine/iouring/check_timeouts_test.go
index 50178e81..8d0c6512 100644
--- a/engine/iouring/check_timeouts_test.go
+++ b/engine/iouring/check_timeouts_test.go
@@ -65,6 +65,7 @@ func TestCheckTimeoutsSwapRemoveNoSkip(t *testing.T) {
 		liveConns:   make([]int, 0, n),
 		errCount:    &atomic.Uint64{},
 		activeConns: &atomic.Int64{},
+		closeCount:  &atomic.Uint64{},
 		cfg:         resource.Config{IdleTimeout: time.Second},
 	}
 	past := time.Now().Add(-time.Hour).UnixNano()
diff --git a/engine/iouring/conn.go b/engine/iouring/conn.go
index 06a8476f..4521400d 100644
--- a/engine/iouring/conn.go
+++ b/engine/iouring/conn.go
@@ -149,10 +149,20 @@ type connState struct {
 	asyncClosed atomic.Bool
 	// asyncPromoted is set once an async-marked route has been observed
 	// on this conn while it ran inline on the worker (per-handler async,
-	// celeris #300). Sticky: once promoted every subsequent recv goes
-	// straight to the dispatch goroutine instead of retrying the inline
-	// fast path. Reset on release.
-	asyncPromoted bool
+	// celeris #300). Once promoted, recv goes to the dispatch goroutine.
+	// REVERSIBLE (celeris#364): the dispatch goroutine clears it to revert
+	// the conn to inline when the promoting route de-promotes. Atomic because
+	// the worker reads it on the recv hot path while the goroutine may clear
+	// it; the worker re-reads it under asyncInMu before feeding to close the
+	// feed-vs-revert race. Reset on release.
+	asyncPromoted atomic.Bool
+	// promotedMethod/promotedPath record the route that forced this conn's
+	// promotion (celeris#364). Written by the worker before the dispatch
+	// goroutine starts (happens-before), read by the goroutine to decide
+	// revert. Empty => not revert-eligible (e.g. promoted for a buffered
+	// partial-header / chunked continuation, where no full route is known).
+	promotedMethod string
+	promotedPath   string
 	// asyncH2Promoted signals the worker that runAsyncHandler observed
 	// ErrUpgradeH2C and completed the cs-local H1→H2 state swap under
 	// detachMu. drainDetachQueue finishes the promotion by appending
@@ -304,7 +314,9 @@ func releaseConnState(cs *connState) {
 	cs.asyncOutBuf = cs.asyncOutBuf[:0]
 	cs.asyncRun = false
 	cs.asyncClosed.Store(false)
-	cs.asyncPromoted = false
+	cs.asyncPromoted.Store(false)
+	cs.promotedMethod = ""
+	cs.promotedPath = ""
 	cs.asyncDetachUnlocked = false
 	cs.asyncDetachPending = false
 	cs.bodyBuf = nil
diff --git a/engine/iouring/consts.go b/engine/iouring/consts.go
index 3ff11afd..a8e24182 100644
--- a/engine/iouring/consts.go
+++ b/engine/iouring/consts.go
@@ -41,8 +41,8 @@ const (
 	// opPROVIDEBUFFERS = 31 removed in v1.5.0 (celeris#320) — the legacy
 	// BufferGroup / PROVIDE_BUFFERS SQE was never wired into the engine.
 	// The supported path is IORING_REGISTER_PBUF_RING (BufferRing above).
-	opSHUTDOWN = 52 // IORING_OP_SHUTDOWN (kernel 5.11+)
-	opSENDZC   = 53 // IORING_OP_SEND_ZC (kernel 6.0+)
+	opSHUTDOWN = 34 // IORING_OP_SHUTDOWN (kernel 5.11+)
+	opSENDZC   = 47 // IORING_OP_SEND_ZC (kernel 6.0+)
 )
 
 // SQE flags.
@@ -57,7 +57,7 @@ const (
 const (
 	cqeFBuffer = 1 << 0
 	cqeFMore   = 1 << 1
-	cqeFNotif  = 1 << 2 // IORING_CQE_F_NOTIF: zero-copy send notification
+	cqeFNotif  = 1 << 3 // IORING_CQE_F_NOTIF: zero-copy send notification
 )
 
 // Accept flags.
@@ -104,3 +104,11 @@ const (
 	sqeSize = 64
 	cqeSize = 16
 )
+
+// sendZCMinBytes is the payload threshold below which SEND_ZC is not worth its
+// cost. Zero-copy adds a second (NOTIF) CQE per send and holds zcNotifPending
+// across the buffer's DMA lifetime, stalling the next flush; it also forfeits
+// the SEND→RECV link on the keep-alive fast path. Only above this size does the
+// avoided memcpy outweigh the extra completion, so small responses use a plain
+// linked SEND (1 CQE, immediate buffer reuse).
+const sendZCMinBytes = 4096
diff --git a/engine/iouring/engine.go b/engine/iouring/engine.go
index 722c64bf..2b4f67e2 100644
--- a/engine/iouring/engine.go
+++ b/engine/iouring/engine.go
@@ -12,7 +12,6 @@ import (
 	"time"
 
 	"github.com/goceleris/celeris/engine"
-	"github.com/goceleris/celeris/engine/scaler"
 	"github.com/goceleris/celeris/internal/platform"
 	"github.com/goceleris/celeris/probe"
 	"github.com/goceleris/celeris/protocol/h2/stream"
@@ -36,6 +35,17 @@ type Engine struct {
 		// asyncPromoted counts the cumulative inline → dispatch-goroutine
 		// promotions across all workers (celeris #300).
 		asyncPromoted atomic.Uint64
+		// acceptCount / closeCount track cumulative connection lifecycle
+		// events; bytesRead / bytesWritten track cumulative payload bytes.
+		// All four feed the adaptive controller's load signals. Bytes are
+		// batched per-worker and flushed once per event-loop iteration
+		// (mirroring reqCount) to avoid hot-path cache-line bouncing;
+		// accepts/closes are infrequent so they increment directly like
+		// activeConns.
+		acceptCount  atomic.Uint64
+		closeCount   atomic.Uint64
+		bytesRead    atomic.Uint64
+		bytesWritten atomic.Uint64
 	}
 	// asyncRoutes is cached from the handler's HasAsyncRoutes/route count
 	// at construction so Metrics() doesn't pay the type-assertion per
@@ -235,16 +245,6 @@ func (e *Engine) Listen(ctx context.Context) error {
 		)
 	}
 
-	// Dynamic worker scaler. Typed cfg.WorkerScaling takes precedence over
-	// env vars. Suppressed when wrapped by adaptive — adaptive runs ONE
-	// higher-level scaler that delegates to the active sub-engine. The
-	// algorithm itself lives in engine/scaler; this is just the call site.
-	if !e.cfg.SkipBuiltinScaler {
-		if scalerCfg := scaler.Resolve(e.cfg, len(workers)); scalerCfg.Enabled {
-			go e.runScaler(innerCtx, scalerCfg, &e.metrics.activeConns)
-		}
-	}
-
 	<-ctx.Done()
 	// Workers use SubmitAndWaitTimeout and check ctx.Err() on each iteration,
 	// so they will exit within ~100ms of context cancellation.
@@ -259,7 +259,9 @@ func (e *Engine) createWorkers(tier TierStrategy, cpus []int,
 		w, err := newWorker(i, cpus[i], tier, e.handler,
 			resolved, e.cfg,
 			&e.metrics.reqCount, &e.metrics.activeConns, &e.metrics.errCount,
-			&e.metrics.asyncPromoted, &e.acceptPaused)
+			&e.metrics.asyncPromoted, &e.acceptPaused,
+			&e.metrics.acceptCount, &e.metrics.closeCount,
+			&e.metrics.bytesRead, &e.metrics.bytesWritten)
 		if err != nil {
 			// Clean up already-created workers.
 			for _, prev := range workers[:i] {
@@ -314,6 +316,11 @@ func (e *Engine) Metrics() engine.EngineMetrics {
 		ErrorCount:         e.metrics.errCount.Load(),
 		AsyncRoutes:        e.asyncRoutes,
 		AsyncPromotedConns: e.metrics.asyncPromoted.Load(),
+		Workers:            len(e.workers),
+		AcceptCount:        e.metrics.acceptCount.Load(),
+		CloseCount:         e.metrics.closeCount.Load(),
+		BytesRead:          e.metrics.bytesRead.Load(),
+		BytesWritten:       e.metrics.bytesWritten.Load(),
 	}
 }
 
@@ -388,7 +395,6 @@ var (
 	_ engine.Engine            = (*Engine)(nil)
 	_ engine.AcceptController  = (*Engine)(nil)
 	_ engine.EventLoopProvider = (*Engine)(nil)
-	_ engine.WorkerScaler      = (*Engine)(nil)
 )
 
 // NumWorkers returns the number of worker event loops available for
diff --git a/engine/iouring/engine_test.go b/engine/iouring/engine_test.go
index cdee385a..f44acdf8 100644
--- a/engine/iouring/engine_test.go
+++ b/engine/iouring/engine_test.go
@@ -197,9 +197,11 @@ func TestSelectTierOptionalWithoutSendZC(t *testing.T) {
 	}
 }
 
-func TestSelectTierOptionalWithDeferTaskrun(t *testing.T) {
-	// SQPOLL is incompatible with both DEFER_TASKRUN and COOP_TASKRUN.
-	// SQPOLL path must use neither IPI-related flag.
+func TestSelectTierOptionalUsesTaskrunNotSQPoll(t *testing.T) {
+	// #377: SQPOLL is never selected. Per-worker rings would spawn one kernel
+	// poll thread per worker (N spinning cores), and the dormant SQPOLL submit
+	// path has a latent SQ-tail-publish race. The optional tier uses the
+	// task-run completion model like the high tier instead.
 	profile := engine.CapabilityProfile{
 		IOUringTier:  engine.Optional,
 		CoopTaskrun:  true,
@@ -212,18 +214,36 @@ func TestSelectTierOptionalWithDeferTaskrun(t *testing.T) {
 		t.Fatal("expected non-nil tier")
 	}
 	flags := tier.SetupFlags()
-	if flags&setupDeferTaskrun != 0 {
-		t.Error("DEFER_TASKRUN must not be set with SQPOLL (incompatible)")
+	if flags&setupSQPoll != 0 {
+		t.Error("SQPOLL must not be selected (#377)")
 	}
-	if flags&setupCoopTaskrun != 0 {
-		t.Error("COOP_TASKRUN must not be set with SQPOLL (incompatible)")
+	if flags&setupDeferTaskrun == 0 {
+		t.Error("expected DEFER_TASKRUN when available")
+	}
+	if flags&setupSingleIssuer == 0 {
+		t.Error("expected SINGLE_ISSUER")
 	}
-	if flags&setupSQPoll == 0 {
-		t.Error("expected SQPOLL in setup flags")
+	if tier.SQPollIdle() != 0 {
+		t.Error("SQPollIdle must be 0 when SQPOLL is disabled")
 	}
 	if !tier.SupportsFixedFiles() {
 		t.Error("fixed files should be enabled when profile.FixedFiles is true")
 	}
+
+	// Without DEFER_TASKRUN (e.g. a 6.0 kernel), fall back to COOP_TASKRUN —
+	// still never SQPOLL.
+	coopProfile := engine.CapabilityProfile{
+		IOUringTier: engine.Optional,
+		CoopTaskrun: true,
+		SQPoll:      true,
+	}
+	coopFlags := SelectTier(coopProfile, 0).SetupFlags()
+	if coopFlags&setupSQPoll != 0 {
+		t.Error("SQPOLL must not be selected without DeferTaskrun either (#377)")
+	}
+	if coopFlags&setupCoopTaskrun == 0 {
+		t.Error("expected COOP_TASKRUN when DEFER_TASKRUN unavailable")
+	}
 }
 
 func TestSelectTierNone(t *testing.T) {
diff --git a/engine/iouring/probe.go b/engine/iouring/probe.go
index 19875881..3a9ce69e 100644
--- a/engine/iouring/probe.go
+++ b/engine/iouring/probe.go
@@ -201,7 +201,7 @@ func probeSendZC() (SendZCProbeResult, string) {
 
 	entry = ring.cqeAt(cqHead)
 	notifFlags := entry.Flags
-	isNotif := notifFlags&0x04 != 0 // CQE_F_NOTIF
+	isNotif := notifFlags&cqeFNotif != 0 // CQE_F_NOTIF (1<<3); 0x04 is SOCK_NONEMPTY
 	notifRes := entry.Res
 	ring.EndCQ(cqHead + 1)
 
diff --git a/engine/iouring/ring.go b/engine/iouring/ring.go
index f2d3ed87..d4f54d75 100644
--- a/engine/iouring/ring.go
+++ b/engine/iouring/ring.go
@@ -19,6 +19,29 @@ import (
 // we already promised it'd fit.
 const minMemlockPerWorker = 12 * 1024 * 1024
 
+// MaxWorkersForMemlock returns the io_uring worker ceiling imposed by
+// RLIMIT_MEMLOCK, or -1 when the soft limit is unlimited (RLIM_INFINITY) or
+// cannot be read — i.e. "no cap". It performs no syscall beyond Getrlimit and
+// constructs no ring, so it is safe to call from the adaptive start decision
+// BEFORE building the io_uring engine. The returned ceiling is at least 1 when
+// a finite limit is present (createWorkers/NewRing will surface a precise
+// ENOMEM if even one worker doesn't fit). This is the single source of truth
+// for the rlim.Cur / minMemlockPerWorker math (capWorkersToMemlock calls it).
+func MaxWorkersForMemlock() int {
+	var rlim unix.Rlimit
+	if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil {
+		return -1
+	}
+	if rlim.Cur == ^uint64(0) {
+		return -1
+	}
+	maxByMemlock := int(rlim.Cur / minMemlockPerWorker)
+	if maxByMemlock < 1 {
+		maxByMemlock = 1
+	}
+	return maxByMemlock
+}
+
 // capWorkersToMemlock returns min(want, RLIMIT_MEMLOCK / minMemlockPerWorker).
 // When the soft limit is unlimited (RLIM_INFINITY), the request is honoured
 // as-is. When the cap forces a reduction, the chosen logger is informed so
@@ -30,20 +53,15 @@ func capWorkersToMemlock(want int, logger *slog.Logger) int {
 	if want <= 1 {
 		return want
 	}
-	var rlim unix.Rlimit
-	if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil {
-		return want
-	}
-	if rlim.Cur == ^uint64(0) {
+	maxByMemlock := MaxWorkersForMemlock()
+	if maxByMemlock == -1 {
 		return want
 	}
-	maxByMemlock := int(rlim.Cur / minMemlockPerWorker)
-	if maxByMemlock < 1 {
-		maxByMemlock = 1
-	}
 	if maxByMemlock >= want {
 		return want
 	}
+	var rlim unix.Rlimit
+	_ = unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlim)
 	if logger != nil {
 		logger.Warn("io_uring workers capped by RLIMIT_MEMLOCK",
 			"requested", want,
@@ -231,6 +249,14 @@ func (r *Ring) mmap() error {
 }
 
 // GetSQE returns a pointer to the next available SQE, or nil if the ring is full.
+//
+// NOT SQPOLL-safe: this advances the SHARED SQ tail before the caller writes
+// the SQE payload, which is correct only because io_uring_enter is the kernel
+// sync point on the non-SQPOLL submit path. Under IORING_SETUP_SQPOLL the
+// kernel poll thread reads the tail continuously and could consume a
+// half-written SQE — so SQPOLL must not be enabled without first switching to
+// a deferred local tail + a release-store publish after fill. No tier selects
+// SQPOLL (see optionalTier / #377), so this path is never exercised today.
 func (r *Ring) GetSQE() unsafe.Pointer {
 	var tail, head uint32
 	if r.singleIssuer {
diff --git a/engine/iouring/scaler.go b/engine/iouring/scaler.go
deleted file mode 100644
index d4ef07fd..00000000
--- a/engine/iouring/scaler.go
+++ /dev/null
@@ -1,65 +0,0 @@
-//go:build linux
-
-package iouring
-
-import (
-	"context"
-	"log/slog"
-	"sync/atomic"
-
-	"github.com/goceleris/celeris/engine/scaler"
-)
-
-// iouringScalerSource adapts the iouring Engine to the scaler.Source
-// interface. Generation always returns 0 — iouring isn't a meta-engine
-// and doesn't switch identities at runtime; only the adaptive engine's
-// source returns non-zero generations.
-type iouringScalerSource struct {
-	e           *Engine
-	activeConns *atomic.Int64
-}
-
-func (s *iouringScalerSource) NumWorkers() int      { return s.e.NumWorkers() }
-func (s *iouringScalerSource) ActiveConns() int64   { return s.activeConns.Load() }
-func (s *iouringScalerSource) PauseWorker(i int)    { s.e.PauseWorker(i) }
-func (s *iouringScalerSource) ResumeWorker(i int)   { s.e.ResumeWorker(i) }
-func (s *iouringScalerSource) Generation() uint64   { return 0 }
-func (s *iouringScalerSource) Logger() *slog.Logger { return s.e.cfg.Logger }
-
-// runScaler is started by Engine.Listen when scaler config is enabled.
-// All algorithm logic lives in engine/scaler; this is a thin source
-// adapter so the iouring engine plugs into the shared loop.
-func (e *Engine) runScaler(ctx context.Context, cfg scaler.Config, activeConns *atomic.Int64) {
-	scaler.Run(ctx, &iouringScalerSource{e: e, activeConns: activeConns}, cfg)
-}
-
-// PauseWorker deactivates worker i. The worker drains in-flight conns
-// and goes SUSPENDED. Asynchronous; returns immediately.
-func (e *Engine) PauseWorker(i int) {
-	e.mu.Lock()
-	defer e.mu.Unlock()
-	if i < 0 || i >= len(e.workers) {
-		return
-	}
-	e.workers[i].inactive.Store(true)
-}
-
-// ResumeWorker reactivates worker i. Wakes the worker from SUSPENDED if
-// it was already idle.
-func (e *Engine) ResumeWorker(i int) {
-	e.mu.Lock()
-	defer e.mu.Unlock()
-	if i < 0 || i >= len(e.workers) {
-		return
-	}
-	w := e.workers[i]
-	w.inactive.Store(false)
-	w.listenFDClosed.Store(false)
-	w.wakeMu.Lock()
-	if w.suspended.Load() {
-		close(w.wake)
-		w.wake = make(chan struct{})
-		w.suspended.Store(false)
-	}
-	w.wakeMu.Unlock()
-}
diff --git a/engine/iouring/scaler_test.go b/engine/iouring/scaler_test.go
deleted file mode 100644
index 91819e3d..00000000
--- a/engine/iouring/scaler_test.go
+++ /dev/null
@@ -1,10 +0,0 @@
-//go:build linux
-
-package iouring
-
-// The scaler algorithm itself lives in engine/scaler and has its own
-// test suite there. The iouring engine's scaler.go is just the
-// engine.WorkerScaler adapter (PauseWorker / ResumeWorker on workers
-// slice). The runtime exercises this adapter via the existing engine
-// integration tests (e.g., async_churn_uaf_test.go) and the spike-B
-// strict-matrix runs that ran with CELERIS_DYN_WORKERS=1.
diff --git a/engine/iouring/send_zc_gate_test.go b/engine/iouring/send_zc_gate_test.go
new file mode 100644
index 00000000..dc99302f
--- /dev/null
+++ b/engine/iouring/send_zc_gate_test.go
@@ -0,0 +1,81 @@
+//go:build linux
+
+package iouring
+
+import (
+	"testing"
+	"unsafe"
+)
+
+// TestUseSendZC pins the zero-copy gating policy (celeris#332): ZC is only
+// chosen for unlinked sends whose payload is at least sendZCMinBytes. Below the
+// threshold, or for any linked send, or when the capability is absent, the plain
+// SEND path must win.
+func TestUseSendZC(t *testing.T) {
+	cases := []struct {
+		name   string
+		sendZC bool
+		linked bool
+		n      int
+		want   bool
+	}{
+		{"large-unlinked-capable", true, false, sendZCMinBytes, true},
+		{"above-threshold", true, false, sendZCMinBytes + 1, true},
+		{"just-below-threshold", true, false, sendZCMinBytes - 1, false},
+		{"small-unlinked", true, false, 100, false},
+		{"empty", true, false, 0, false},
+		{"large-linked", true, true, sendZCMinBytes * 4, false},
+		{"large-no-capability", false, false, sendZCMinBytes * 4, false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := useSendZC(tc.sendZC, tc.linked, tc.n); got != tc.want {
+				t.Errorf("useSendZC(%v, %v, %d) = %v, want %v",
+					tc.sendZC, tc.linked, tc.n, got, tc.want)
+			}
+		})
+	}
+}
+
+// prepSendSQEOpcode runs prepSendSQE against a freshly zeroed SQE for a worker
+// with ZC enabled and an unlinked send of payload size n, returning the chosen
+// opcode byte.
+func prepSendSQEOpcode(t *testing.T, n int) byte {
+	t.Helper()
+	var sqe [sqeSize]byte
+	w := &Worker{sendZC: true}
+	cs := &connState{fd: 7, sendBuf: make([]byte, n)}
+	w.prepSendSQE(unsafe.Pointer(&sqe[0]), cs, false)
+	return sqe[0]
+}
+
+// TestPrepSendSQEGatesBySize verifies the worker async flush path emits a plain
+// SEND for sub-threshold responses (1 CQE, no NOTIF stall) and SEND_ZC only once
+// the payload reaches sendZCMinBytes.
+func TestPrepSendSQEGatesBySize(t *testing.T) {
+	if op := prepSendSQEOpcode(t, 100); op != opSEND {
+		t.Errorf("small response opcode = %d, want opSEND(%d)", op, opSEND)
+	}
+	if op := prepSendSQEOpcode(t, sendZCMinBytes-1); op != opSEND {
+		t.Errorf("just-below-threshold opcode = %d, want opSEND(%d)", op, opSEND)
+	}
+	if op := prepSendSQEOpcode(t, sendZCMinBytes); op != opSENDZC {
+		t.Errorf("at-threshold opcode = %d, want opSENDZC(%d)", op, opSENDZC)
+	}
+}
+
+// TestPrepSendSQELinkedNeverZC guards the link invariant: a linked send must
+// never use SEND_ZC (the NOTIF CQE would break the SEND→RECV chain), regardless
+// of payload size.
+func TestPrepSendSQELinkedNeverZC(t *testing.T) {
+	var sqe [sqeSize]byte
+	w := &Worker{sendZC: true}
+	cs := &connState{fd: 7, sendBuf: make([]byte, sendZCMinBytes*4)}
+	w.prepSendSQE(unsafe.Pointer(&sqe[0]), cs, true)
+	if sqe[0] != opSEND {
+		t.Errorf("linked large send opcode = %d, want opSEND(%d)", sqe[0], opSEND)
+	}
+	if sqe[1]&sqeIOLink == 0 {
+		t.Errorf("linked send missing IOSQE_IO_LINK in flags 0x%02x", sqe[1])
+	}
+}
diff --git a/engine/iouring/tier.go b/engine/iouring/tier.go
index 7860c302..ad07d406 100644
--- a/engine/iouring/tier.go
+++ b/engine/iouring/tier.go
@@ -24,9 +24,9 @@ type TierStrategy interface {
 }
 
 // SelectTier returns the highest available tier strategy for the given profile.
-// sqPollIdle is the objective-specific SQPOLL thread idle timeout; if zero,
-// defaults to 2000ms.
-func SelectTier(profile engine.CapabilityProfile, sqPollIdle time.Duration) TierStrategy {
+// The sqPollIdle parameter is retained for signature stability but no longer
+// used: SQPOLL is not selected by any tier (see optionalTier doc / #377).
+func SelectTier(profile engine.CapabilityProfile, _ time.Duration) TierStrategy {
 	switch {
 	// DEFER_TASKRUN: completions run in worker's context (no extra kernel thread).
 	// Preferred over SQPOLL because the SQPOLL kernel thread steals CPU from workers.
@@ -40,12 +40,7 @@ func SelectTier(profile engine.CapabilityProfile, sqPollIdle time.Duration) Tier
 			multishotRecv:   profile.MultishotRecv,
 		}
 	case profile.IOUringTier >= engine.Optional && profile.SQPoll:
-		idle := uint32(sqPollIdle.Milliseconds())
-		if idle == 0 {
-			idle = 2000
-		}
 		return &optionalTier{
-			sqPollIdle:      idle,
 			deferTaskrun:    profile.DeferTaskrun,
 			fixedFiles:      profile.FixedFiles,
 			sendZC:          profile.SendZC,
@@ -156,7 +151,7 @@ func (t *highTier) PrepareSend(ring *Ring, fd int, buf []byte, linked bool) {
 	if sqe == nil {
 		return
 	}
-	if t.sendZC && !linked {
+	if useSendZC(t.sendZC, linked, len(buf)) {
 		if t.fixedFiles {
 			prepSendZCFixed(sqe, fd, buf, false)
 		} else {
@@ -171,9 +166,18 @@ func (t *highTier) PrepareSend(ring *Ring, fd int, buf []byte, linked bool) {
 	setSQEUserData(sqe, encodeUserData(udSend, fd))
 }
 
-// optionalTier: kernel 6.0+, adds SQPOLL, SEND_ZC. With 6.1+: DEFER_TASKRUN, fixed files.
+// optionalTier: kernel 6.0+ with provided buffers but below the High tier
+// (e.g. a 6.x kernel where the provided-buffers probe disabled the High path).
+// Uses the task-run completion model, NOT SQPOLL.
+//
+// SQPOLL is deliberately never used here: celeris runs one io_uring ring per
+// worker, so SQPOLL would spawn one kernel poll thread PER worker (N spinning
+// cores — measured -83% throughput / 75% idle CPU on a 16-worker box), and the
+// dormant SQPOLL submit path has a latent SQ-tail-publish race in GetSQE
+// (the shared tail is advanced before the SQE payload is written, which is
+// safe only because io_uring_enter is the sync point on the non-SQPOLL path).
+// See #377. SQPollIdle returns 0 so the worker never enters the SQPOLL branch.
 type optionalTier struct {
-	sqPollIdle      uint32
 	deferTaskrun    bool
 	fixedFiles      bool
 	sendZC          bool
@@ -183,16 +187,20 @@ type optionalTier struct {
 
 func (t *optionalTier) Tier() engine.Tier { return engine.Optional }
 func (t *optionalTier) SetupFlags() uint32 {
-	// SQPOLL is incompatible with both DEFER_TASKRUN and COOP_TASKRUN.
-	// SINGLE_ISSUER is compatible and enables SQ head optimization.
-	return setupSQPoll | setupSingleIssuer
+	// Mirror highTier: task-run completions in the worker's own context, no
+	// extra kernel thread. (Was setupSQPoll|setupSingleIssuer — see the type
+	// doc and #377 for why SQPOLL is not used.)
+	if t.deferTaskrun {
+		return setupDeferTaskrun | setupSingleIssuer
+	}
+	return setupCoopTaskrun | setupSingleIssuer
 }
 func (t *optionalTier) SupportsProvidedBuffers() bool { return true }
 func (t *optionalTier) SupportsMultishotAccept() bool { return t.multishotAccept }
 func (t *optionalTier) SupportsMultishotRecv() bool   { return t.multishotRecv }
 func (t *optionalTier) SupportsFixedFiles() bool      { return t.fixedFiles }
 func (t *optionalTier) SupportsSendZC() bool          { return t.sendZC }
-func (t *optionalTier) SQPollIdle() uint32            { return t.sqPollIdle }
+func (t *optionalTier) SQPollIdle() uint32            { return 0 } // SQPOLL disabled — see type doc / #377
 
 func (t *optionalTier) PrepareAccept(ring *Ring, listenFD int) {
 	sqe := ring.GetSQE()
@@ -224,7 +232,7 @@ func (t *optionalTier) PrepareSend(ring *Ring, fd int, buf []byte, linked bool)
 	if sqe == nil {
 		return
 	}
-	if t.sendZC && !linked {
+	if useSendZC(t.sendZC, linked, len(buf)) {
 		// SEND_ZC cannot be linked (the notification CQE would break
 		// the link chain), so fall back to regular SEND for linked ops.
 		if t.fixedFiles {
diff --git a/engine/iouring/worker.go b/engine/iouring/worker.go
index c673c2d4..d7f31de9 100644
--- a/engine/iouring/worker.go
+++ b/engine/iouring/worker.go
@@ -71,17 +71,24 @@ const bufRingCountMax = 1 << 15 // 32768 entries × 8 KiB = 256 MiB worst case (
 // the case for very-high-concurrency benchmarks (16k+ connections) where
 // each worker may have more in-flight multishot recvs than the formula
 // anticipates. Setting 0 (or leaving the env var unset) reverts to
-// auto-scaling from Workers × TargetConnsPerWorker.
+// auto-scaling from the per-worker conn target.
 const envPbufCount = "CELERIS_IOURING_PBUF_COUNT"
 
+// defaultConnsPerWorker is the per-worker connection target used to size
+// the provided-buffer ring. The ring is sized at 2 buffers per conn at
+// this target, giving comfortable headroom so the kernel rarely stalls
+// waiting for buffer returns. Operators override the resulting ring size
+// directly via CELERIS_IOURING_PBUF_COUNT.
+const defaultConnsPerWorker = 20
+
 // resolveBufRingCount picks the provided-buffer-ring size for a worker.
 // The default formula is `nextPowerOf2(max(bufRingCountMin, 2 *
-// TargetConnsPerWorker))`, i.e. two buffers per conn in the scaler's
-// steady-state target — enough headroom that the kernel rarely stalls
-// waiting for buffer returns. Above 1024 conns the previous hard-coded
-// 1024 was too small: buffers were reused aggressively, the kernel
-// stalled, and the very behaviour the ring is designed to optimise
-// (multishot recv CQE batching) collapsed into CQE storms (celeris#322).
+// connsPerWorker))`, i.e. two buffers per conn at the per-worker conn
+// target — enough headroom that the kernel rarely stalls waiting for
+// buffer returns. Above 1024 conns the previous hard-coded 1024 was too
+// small: buffers were reused aggressively, the kernel stalled, and the
+// very behaviour the ring is designed to optimise (multishot recv CQE
+// batching) collapsed into CQE storms (celeris#322).
 //
 // The ring is PER-WORKER: NewBufferRing is created once per Worker on its
 // own ring, so the scaling MUST be driven by the per-worker conn target,
@@ -90,7 +97,7 @@ const envPbufCount = "CELERIS_IOURING_PBUF_COUNT"
 // mmap'd RSS per worker and risking the kernel cap on large boxes
 // (celeris#322 follow-up).
 // Operators can override via CELERIS_IOURING_PBUF_COUNT.
-func resolveBufRingCount(_ resource.ResolvedResources, scalerTargetConnsPerWorker int) int {
+func resolveBufRingCount(_ resource.ResolvedResources, connsPerWorker int) int {
 	if v := os.Getenv(envPbufCount); v != "" {
 		if n, err := strconv.Atoi(v); err == nil && n > 0 {
 			if n&(n-1) != 0 {
@@ -99,9 +106,9 @@ func resolveBufRingCount(_ resource.ResolvedResources, scalerTargetConnsPerWorke
 			return clampBufRingCount(n)
 		}
 	}
-	target := scalerTargetConnsPerWorker
+	target := connsPerWorker
 	if target <= 0 {
-		target = 20 // mirrors scaler.Resolve's default
+		target = defaultConnsPerWorker
 	}
 	scaled := 2 * target
 	if scaled < bufRingCountMin {
@@ -224,11 +231,6 @@ type Worker struct {
 	wake         chan struct{}
 	wakeMu       sync.Mutex
 	suspended    atomic.Bool
-	// inactive is the per-worker pause flag used by the dynamic worker
-	// scaler. ORed with acceptPaused (which is engine-wide) when computing
-	// effective paused state. The scaler flips this to deactivate idle
-	// workers under low load and reactivate them under burst load.
-	inactive atomic.Bool
 	// listenFDClosed signals that the worker has cancelled in-flight
 	// accept SQEs and closed its listen FD in response to acceptPaused
 	// being set. PauseAccept polls this so it only returns once the
@@ -239,11 +241,17 @@ type Worker struct {
 	// RST as it pauses).
 	listenFDClosed atomic.Bool
 
-	reqCount      *atomic.Uint64
-	activeConns   *atomic.Int64
-	errCount      *atomic.Uint64
-	asyncPromoted *atomic.Uint64 // cumulative inline → dispatch promotions (#300)
-	reqBatch      uint64         // batched request count, flushed to reqCount per iteration
+	reqCount          *atomic.Uint64
+	activeConns       *atomic.Int64
+	errCount          *atomic.Uint64
+	asyncPromoted     *atomic.Uint64 // cumulative inline → dispatch promotions (#300)
+	acceptCount       *atomic.Uint64 // cumulative accepts (engine-wide, shared)
+	closeCount        *atomic.Uint64 // cumulative closes (engine-wide, shared)
+	bytesRead         *atomic.Uint64 // cumulative recv payload bytes (engine-wide, shared)
+	bytesWritten      *atomic.Uint64 // cumulative send payload bytes (engine-wide, shared)
+	reqBatch          uint64         // batched request count, flushed to reqCount per iteration
+	bytesReadBatch    uint64         // batched recv bytes, flushed to bytesRead per iteration
+	bytesWrittenBatch uint64         // batched send bytes, flushed to bytesWritten per iteration
 
 	tickCounter uint32
 	cachedNow   int64  // cached time.Now().UnixNano(), refreshed every 64 iterations
@@ -315,7 +323,8 @@ type Worker struct {
 func newWorker(id, cpuID int, tier TierStrategy, handler stream.Handler,
 	resolved resource.ResolvedResources,
 	cfg resource.Config, reqCount *atomic.Uint64, activeConns *atomic.Int64, errCount *atomic.Uint64,
-	asyncPromoted *atomic.Uint64, acceptPaused *atomic.Bool) (*Worker, error) { //nolint:unparam // error return used by callers for future fallible init
+	asyncPromoted *atomic.Uint64, acceptPaused *atomic.Bool,
+	acceptCount, closeCount, bytesRead, bytesWritten *atomic.Uint64) (*Worker, error) { //nolint:unparam // error return used by callers for future fallible init
 
 	// Listen socket creation is deferred to run() (after CPU pinning and NUMA
 	// binding) so that the kernel allocates socket internal buffers on the
@@ -342,6 +351,10 @@ func newWorker(id, cpuID int, tier TierStrategy, handler stream.Handler,
 		activeConns:   activeConns,
 		errCount:      errCount,
 		asyncPromoted: asyncPromoted,
+		acceptCount:   acceptCount,
+		closeCount:    closeCount,
+		bytesRead:     bytesRead,
+		bytesWritten:  bytesWritten,
 		acceptPaused:  acceptPaused,
 		wake:          make(chan struct{}),
 		ready:         make(chan error, 1),
@@ -431,15 +444,11 @@ func (w *Worker) run(ctx context.Context) {
 	// The ring size scales with the worker's per-conn target (celeris#322):
 	// the previous hard-coded 1024 entries was undersized above ~1024 conns
 	// and produced CQE storms as the kernel stalled waiting for buffer
-	// returns. The formula gives 2 buffers per conn at the scaler's
-	// steady-state target — comfortable headroom without runaway RSS.
+	// returns. The formula gives 2 buffers per conn at the per-worker conn
+	// target — comfortable headroom without runaway RSS.
 	// CELERIS_IOURING_PBUF_COUNT overrides the auto-scaled value.
 	if w.tier.SupportsMultishotRecv() && os.Getenv("CELERIS_IOURING_MULTISHOT_RECV") == "1" {
-		var targetConns int
-		if w.cfg.WorkerScaling != nil {
-			targetConns = w.cfg.WorkerScaling.TargetConnsPerWorker
-		}
-		bufRingCount := resolveBufRingCount(w.resolved, targetConns)
+		bufRingCount := resolveBufRingCount(w.resolved, defaultConnsPerWorker)
 		br, err := NewBufferRing(w.ring, bufRingGroupID, bufRingCount, w.resolved.BufferSize)
 		if err != nil {
 			w.logger.Warn("ring-mapped buffer registration failed, using per-connection buffers",
@@ -481,8 +490,7 @@ func (w *Worker) run(ctx context.Context) {
 		// Cache the atomic load: same value used by the two branches
 		// below and (further down) the SUSPENDED check. Saves 2 atomic
 		// loads per event-loop iteration on the steady-state hot path.
-		// OR with the per-worker inactive flag (dynamic scaler).
-		paused := w.acceptPaused.Load() || w.inactive.Load()
+		paused := w.acceptPaused.Load()
 		if w.listenFD >= 0 && paused {
 			if sqe := w.ring.GetSQE(); sqe != nil {
 				prepCancelFDSkipSuccess(sqe, w.listenFD)
@@ -674,6 +682,17 @@ func (w *Worker) run(ctx context.Context) {
 			w.reqBatch = 0
 		}
 
+		// Flush batched payload-byte counters with the same per-iteration
+		// cadence as reqCount, for the same cache-line-contention reason.
+		if w.bytesReadBatch > 0 {
+			w.bytesRead.Add(w.bytesReadBatch)
+			w.bytesReadBatch = 0
+		}
+		if w.bytesWrittenBatch > 0 {
+			w.bytesWritten.Add(w.bytesWrittenBatch)
+			w.bytesWrittenBatch = 0
+		}
+
 		// Single atomic publish for all batched buffer returns (P0).
 		if w.hasBufReturns {
 			w.bufRing.PublishBuffers()
@@ -782,7 +801,6 @@ func (w *Worker) run(ctx context.Context) {
 		// DRAINING → SUSPENDED: no listen socket, no connections, CQEs processed.
 		// Checked after CQE processing so accept CQEs for connections that
 		// completed before the listen socket close are served, not leaked.
-		// Combined paused: engine-wide OR per-worker (dynamic scaler).
 		//
 		// hasDriverConns gate: connCount only counts HTTP conns. An
 		// EventLoopProvider driver may still have live conns in
@@ -790,9 +808,9 @@ func (w *Worker) run(ctx context.Context) {
 		// park its event loop and starve those driver conns of CQE
 		// servicing. Stay active while any driver conn is registered (v1.5.0
 		// review 2.10).
-		if w.listenFD < 0 && w.connCount == 0 && !w.hasDriverConns.Load() && (w.acceptPaused.Load() || w.inactive.Load()) {
+		if w.listenFD < 0 && w.connCount == 0 && !w.hasDriverConns.Load() && w.acceptPaused.Load() {
 			w.wakeMu.Lock()
-			if !w.acceptPaused.Load() && !w.inactive.Load() {
+			if !w.acceptPaused.Load() {
 				w.wakeMu.Unlock()
 				continue
 			}
@@ -1184,6 +1202,7 @@ func (w *Worker) onAcceptedFD(ctx context.Context, newFD int, now int64, isFixed
 	}
 	cs.writeFn = w.makeWriteFn(cs)
 	w.activeConns.Add(1)
+	w.acceptCount.Add(1)
 
 	if w.cfg.OnConnect != nil {
 		w.cfg.OnConnect(cs.remoteAddr)
@@ -1225,6 +1244,7 @@ func (w *Worker) hijackConn(fd int) (net.Conn, error) {
 	w.conns[fd] = nil
 	w.connCount--
 	w.activeConns.Add(-1)
+	w.closeCount.Add(1)
 	// Cancel-then-release discipline, hijack variant: a single-shot
 	// recv SQE is virtually always still armed on cs.buf here. The fd
 	// lives on under the caller's net.Conn, so an uncancelled recv would
@@ -1424,7 +1444,7 @@ func (w *Worker) initProtocol(cs *connState) {
 			// process on the first /ws or /events request. The
 			// asyncPromoted gate restricts the Unlock to the dispatch-
 			// goroutine path that actually holds the lock. See celeris#309.
-			if w.async && cs.asyncPromoted && cs.detachMu != nil && !cs.asyncDetachUnlocked {
+			if w.async && cs.asyncPromoted.Load() && cs.detachMu != nil && !cs.asyncDetachUnlocked {
 				cs.asyncDetachUnlocked = true
 				cs.detachMu.Unlock()
 			}
@@ -1573,6 +1593,9 @@ func (w *Worker) handleRecv(c *completionEntry, fd int, now int64) {
 	}
 
 	cs.lastActivity = now
+	// c.Res > 0 here (the c.Res <= 0 cases returned above): bytes received
+	// on this recv CQE, regardless of which buffer they landed in.
+	w.bytesReadBatch += uint64(c.Res)
 
 	// Direct-into-bodyBuf path: the previous recv SQE targeted
 	// H1State.bodyBuf (NextRecvBuf). The CQE's Res applies to bodyBuf,
@@ -1725,8 +1748,20 @@ func (w *Worker) handleRecv(c *completionEntry, fd int, now int64) {
 	// which point the conn is promoted and its stashed request handed to
 	// the goroutine. This lets sync routes run inline on the worker
 	// (no handoff) on a server that mixes sync + async handlers.
-	if w.async && cs.asyncPromoted && (w.h1Only || engine.Protocol(cs.protocol.Load()) == engine.HTTP1) {
+	asyncFeed := false
+	if w.async && cs.asyncPromoted.Load() && (w.h1Only || engine.Protocol(cs.protocol.Load()) == engine.HTTP1) {
 		cs.asyncInMu.Lock()
+		// celeris#364: re-check under asyncInMu — the dispatch goroutine clears
+		// asyncPromoted (reverting the conn to inline) under this same lock. If
+		// it won the race, do NOT feed asyncInBuf (the goroutine is exiting);
+		// fall through to inline with the unconsumed `data` instead.
+		if cs.asyncPromoted.Load() {
+			asyncFeed = true
+		} else {
+			cs.asyncInMu.Unlock()
+		}
+	}
+	if asyncFeed {
 		// Backpressure: drop the conn if the dispatch goroutine is
 		// falling behind. Prevents a pipelining client from ballooning
 		// asyncInBuf without bound.
@@ -1793,7 +1828,7 @@ func (w *Worker) handleRecv(c *completionEntry, fd int, now int64) {
 	// hasn't been promoted yet, run ProcessH1 inline in InlineMode so it
 	// bails (ErrAsyncDispatch) when it hits an async route. The flag is
 	// set only around the ProcessH1 call(s) below.
-	tryInline := w.async && !cs.asyncPromoted && cs.h1State != nil &&
+	tryInline := w.async && !cs.asyncPromoted.Load() && cs.h1State != nil &&
 		(w.h1Only || engine.Protocol(cs.protocol.Load()) == engine.HTTP1)
 	// h1Only mode (Protocol=HTTP1 + EnableH2Upgrade=false): no atomic
 	// Load, no switch dispatch, no upgrade-handling block — ProcessH1
@@ -1891,7 +1926,15 @@ func (w *Worker) handleRecv(c *completionEntry, fd int, now int64) {
 	// goes straight to the dispatch path (asyncPromoted guard above).
 	if tryInline {
 		if errors.Is(processErr, conn.ErrAsyncDispatch) {
-			cs.asyncPromoted = true
+			// celeris#364: record the route that forced promotion (single-shot
+			// recv only — the revert path assumes the worker-owned cs.buf recv
+			// model). Set BEFORE asyncPromoted/goroutine start so the dispatch
+			// goroutine observes it (happens-before). Empty path => the
+			// goroutine treats the conn as not revert-eligible.
+			if w.bufRing == nil {
+				cs.promotedMethod, cs.promotedPath = cs.h1State.CurrentRoute()
+			}
+			cs.asyncPromoted.Store(true)
 			w.asyncPromoted.Add(1)
 			stashed := cs.h1State.TakeBufferedBytes()
 			// Flush any inline-handled response (pipelined sync request
@@ -1918,8 +1961,10 @@ func (w *Worker) handleRecv(c *completionEntry, fd int, now int64) {
 		// continuation runs on the dispatch goroutine — the partial-state
 		// parse paths must not run inline (only the fresh-parse site
 		// honors the async check).
-		if processErr == nil && cs.h1State.HasPendingData() {
-			cs.asyncPromoted = true
+		if processErr == nil && cs.h1State.HasPendingDispatchState() {
+			// No complete request parsed yet (buffered headers / chunked), so
+			// no route to record — leave promotedPath empty: not revert-eligible.
+			cs.asyncPromoted.Store(true)
 			w.asyncPromoted.Add(1)
 		}
 	}
@@ -2119,6 +2164,11 @@ func (w *Worker) completeSend(cs *connState, fd int, sent int, now int64) {
 		return
 	}
 
+	// sent >= 0 here: payload bytes flushed by this send completion
+	// (covers regular SEND and the SEND_ZC NOTIF path, both of which
+	// reach completeSend with the byte count).
+	w.bytesWrittenBatch += uint64(sent)
+
 	// Partial-send handling, split by whether we issued a plain SEND
 	// (sendBuf only) or a WRITEV (sendBuf + sendBody). Partial WRITEV
 	// responses collapse the remainder into sendBuf so the retry path
@@ -2470,6 +2520,7 @@ func (w *Worker) finishClose(fd int) {
 	w.conns[fd] = nil
 	w.connCount--
 	w.activeConns.Add(-1)
+	w.closeCount.Add(1)
 
 	if w.cfg.OnDisconnect != nil && cs != nil {
 		w.cfg.OnDisconnect(cs.remoteAddr)
@@ -2570,6 +2621,7 @@ func (w *Worker) finishCloseDetached(fd int, cs *connState) {
 	w.conns[fd] = nil
 	w.connCount--
 	w.activeConns.Add(-1)
+	w.closeCount.Add(1)
 
 	if w.cfg.OnDisconnect != nil {
 		w.cfg.OnDisconnect(cs.remoteAddr)
@@ -2688,6 +2740,21 @@ func (w *Worker) promoteConnToAsync(cs *connState, _ int, stashed []byte, c *com
 	}
 }
 
+// canRevertToInline reports whether a promoted conn should be reverted to the
+// inline fast path (celeris#364). True when single-shot recv is in use, the
+// conn recorded the route that promoted it, and that route's promotion has
+// since expired (RouteAsync now false — the route-level TTL de-promotion).
+// Called by runAsyncHandler ONLY while holding asyncInMu with asyncInBuf empty.
+func (w *Worker) canRevertToInline(cs *connState) bool {
+	return w.bufRing == nil && cs.promotedPath != "" && cs.h1State != nil &&
+		cs.h1State.RouteAsync != nil &&
+		// Clean request boundary only: never revert mid-request (a partial body
+		// or buffered headers still accumulating), so h1State ownership flips
+		// back to the worker between requests, exactly like a fresh inline conn.
+		!cs.h1State.HasPendingData() &&
+		!cs.h1State.RouteAsync(cs.promotedMethod, cs.promotedPath)
+}
+
 func (w *Worker) runAsyncHandler(cs *connState) {
 	defer w.asyncWG.Done()
 	defer func() {
@@ -2720,6 +2787,19 @@ func (w *Worker) runAsyncHandler(cs *connState) {
 	for {
 		cs.asyncInMu.Lock()
 		for len(cs.asyncInBuf) == 0 && !cs.asyncClosed.Load() {
+			// celeris#364: revert this conn to inline when the route that
+			// promoted it has de-promoted (its TTL expired). Safe ONLY here:
+			// asyncInBuf is empty (no in-flight input, last response already
+			// written) and we hold asyncInMu, which the worker's feed path
+			// re-acquires and re-checks asyncPromoted against — so clearing it
+			// here cannot race a concurrent feed. The worker owns recv and
+			// resumes the inline fast path on the next CQE.
+			if w.canRevertToInline(cs) {
+				cs.asyncPromoted.Store(false)
+				cs.asyncRun = false
+				cs.asyncInMu.Unlock()
+				return
+			}
 			cs.asyncCond.Wait()
 		}
 		if cs.asyncClosed.Load() {
@@ -3052,10 +3132,13 @@ func (w *Worker) pickRecvTarget(cs *connState) []byte {
 	// completed, so no kernel SQE still targets the old bodyBuf array. The
 	// body path below re-sets the pin when it arms into bodyBuf again.
 	cs.bodyRecvPin = nil
-	// Async mode: the dispatch goroutine owns h1State; the worker cannot
-	// safely observe NextRecvBuf without synchronization. Always use
-	// cs.buf so the goroutine handles body accumulation on its side.
-	if w.async || w.bufRing != nil || cs.h1State == nil {
+	// Async mode: only a PROMOTED conn hands h1State to the dispatch
+	// goroutine, which the worker cannot safely observe NextRecvBuf against.
+	// A non-promoted conn (celeris#356 inline-first) runs ProcessH1 on the
+	// worker itself (tryInline), so the worker owns h1State exactly as the
+	// sync path does and the zero-copy direct-into-bodyBuf recv is safe —
+	// gate the bail on cs.asyncPromoted, not blanket w.async.
+	if (w.async && cs.asyncPromoted.Load()) || w.bufRing != nil || cs.h1State == nil {
 		return cs.buf
 	}
 	if !w.h1Only && engine.Protocol(cs.protocol.Load()) != engine.HTTP1 {
@@ -3289,10 +3372,12 @@ func (w *Worker) flushSend(cs *connState) bool {
 }
 
 // prepSendSQE prepares a SEND or SEND_ZC SQE based on worker capabilities.
-// SEND_ZC is only used for unlinked sends (the notification CQE would break
-// the link chain). Linked sends always use regular SEND.
+// SEND_ZC is only used for unlinked sends at or above sendZCMinBytes (the
+// notification CQE would break the link chain, and on small payloads its extra
+// completion costs more than the avoided memcpy). Smaller and linked sends use
+// regular SEND.
 func (w *Worker) prepSendSQE(sqe unsafe.Pointer, cs *connState, linked bool) {
-	if w.sendZC && !linked {
+	if useSendZC(w.sendZC, linked, len(cs.sendBuf)) {
 		if cs.fixedFile {
 			prepSendZCFixed(sqe, cs.fd, cs.sendBuf, false)
 		} else {
@@ -3305,6 +3390,13 @@ func (w *Worker) prepSendSQE(sqe unsafe.Pointer, cs *connState, linked bool) {
 	}
 }
 
+// useSendZC decides whether a send should use zero-copy. ZC is only viable for
+// unlinked sends whose payload is large enough that the saved memcpy outweighs
+// the extra NOTIF CQE (see sendZCMinBytes).
+func useSendZC(sendZC, linked bool, n int) bool {
+	return sendZC && !linked && n >= sendZCMinBytes
+}
+
 // flushSendLink is like flushSend but links a RECV SQE after the SEND using
 // IOSQE_IO_LINK. The kernel chains the operations: when SEND completes, RECV
 // starts automatically without another io_uring_enter. This eliminates one
diff --git a/engine/scaler/doc.go b/engine/scaler/doc.go
deleted file mode 100644
index c5ebe959..00000000
--- a/engine/scaler/doc.go
+++ /dev/null
@@ -1,16 +0,0 @@
-// Package scaler implements the dynamic worker scaler shared by the
-// iouring, epoll, and adaptive engines. The scaler tracks the active
-// connection count and adjusts how many workers participate in the
-// SO_REUSEPORT group so connections-per-active-worker stays near the
-// configured target — this keeps CQE/event batching density in the
-// sweet spot.
-//
-// Engines plug in via the [Source] interface. Per-engine sources
-// (engine/iouring/scaler.go, engine/epoll/scaler.go, adaptive/scaler.go)
-// adapt the engine to this contract; the algorithm in [Run] is shared.
-//
-// Implementation is Linux-only because the underlying engines that
-// surface a [Source] are themselves Linux-only. This package compiles
-// to an empty stub on non-Linux GOOS so godoc and reflection-based
-// tooling still see the public types and contract.
-package scaler
diff --git a/engine/scaler/scaler.go b/engine/scaler/scaler.go
deleted file mode 100644
index ebd06ea2..00000000
--- a/engine/scaler/scaler.go
+++ /dev/null
@@ -1,285 +0,0 @@
-//go:build linux
-
-// Package-level documentation lives in doc.go (build-unconstrained so
-// pkg.go.dev can render it on non-Linux). Per-engine [Source] sources:
-// iouring + epoll report Generation()=0 always; the adaptive engine's
-// source increments it on each engine switch so the scaler can
-// re-baseline the new active engine's worker pause state.
-
-package scaler
-
-import (
-	"context"
-	"log/slog"
-	"os"
-	"strconv"
-	"time"
-
-	"github.com/goceleris/celeris/resource"
-)
-
-// Source is the engine-side interface the scaler uses to read load and
-// adjust worker activation. Implementations must be safe for concurrent
-// use; the scaler calls these from its own goroutine.
-type Source interface {
-	// NumWorkers returns the total worker pool size (= max active count).
-	// Must be stable across the lifetime of a scaler run.
-	NumWorkers() int
-	// ActiveConns returns the current active connection count. Read every tick.
-	ActiveConns() int64
-	// PauseWorker deactivates worker i. Idempotent.
-	PauseWorker(i int)
-	// ResumeWorker reactivates worker i. Idempotent.
-	ResumeWorker(i int)
-	// Generation returns a counter that increments whenever the
-	// underlying engine identity changes (e.g., adaptive engine switch).
-	// Per-engine sources return 0 always. The scaler uses this to detect
-	// switches and re-baseline the active count on the new engine.
-	Generation() uint64
-	// Logger returns the slog.Logger used for scaler diagnostics. May
-	// return nil to suppress log output.
-	Logger() *slog.Logger
-}
-
-// Config holds the resolved scaler parameters. All fields are mutable
-// before Run starts; the loop reads them once at startup. Use
-// [Resolve] to derive a Config from a resource.Config (handles both
-// the typed WorkerScaling field and the env-var fallback).
-type Config struct {
-	// Enabled gates the entire scaler. Resolve sets this to true when
-	// either the typed config or the CELERIS_DYN_WORKERS env is set.
-	Enabled bool
-	// StartHigh seeds the scaler at NumWorkers active and scales down
-	// on idle. Default true (data-validated; preserves SO_REUSEPORT
-	// distribution at startup, +34-78% over start-low on ramp scenarios).
-	StartHigh bool
-	// MinActive is the floor on active worker count.
-	MinActive int
-	// TargetConnsPerWorker is the scaling target. desired = ceil(conns/Target).
-	TargetConnsPerWorker int
-	// Interval is the tick cadence. Default 250ms.
-	Interval time.Duration
-	// ScaleUpStep is the max workers added per tick (burst limit).
-	ScaleUpStep int
-	// ScaleDownStep is the max workers removed per tick.
-	ScaleDownStep int
-	// ScaleDownHysteresis: scale-down requires desired ≤ active - hyst - 1.
-	ScaleDownHysteresis int
-	// ScaleDownIdleTicks: consecutive sub-threshold ticks needed to scale down.
-	ScaleDownIdleTicks int
-	// Trace logs every scaler decision when true.
-	Trace bool
-}
-
-// Resolve produces a Config from the engine's resource.Config. Typed
-// cfg.WorkerScaling takes precedence; env vars are the legacy fallback.
-// Returns Enabled=false when neither configures the scaler.
-func Resolve(cfg resource.Config, numWorkers int) Config {
-	if cfg.WorkerScaling != nil {
-		return fromTyped(cfg.WorkerScaling, numWorkers)
-	}
-	return fromEnv(numWorkers)
-}
-
-func fromTyped(c *resource.WorkerScalingConfig, numWorkers int) Config {
-	minActive := c.MinActive
-	if minActive == 0 {
-		minActive = numWorkers / 2
-		if minActive < 2 {
-			minActive = 2
-		}
-	}
-	if minActive > numWorkers {
-		minActive = numWorkers
-	}
-	target := c.TargetConnsPerWorker
-	if target == 0 {
-		target = 20
-	}
-	interval := c.Interval
-	if interval == 0 {
-		interval = 250 * time.Millisecond
-	}
-	upStep := c.ScaleUpStep
-	if upStep == 0 {
-		upStep = 2
-	}
-	downStep := c.ScaleDownStep
-	if downStep == 0 {
-		downStep = 1
-	}
-	hyst := c.ScaleDownHysteresis
-	if hyst == 0 {
-		hyst = 1
-	}
-	idleTicks := c.ScaleDownIdleTicks
-	if idleTicks == 0 {
-		idleTicks = 4
-	}
-	return Config{
-		Enabled:              true,
-		StartHigh:            c.Strategy != resource.ScalingStrategyStartLow,
-		MinActive:            minActive,
-		TargetConnsPerWorker: target,
-		Interval:             interval,
-		ScaleUpStep:          upStep,
-		ScaleDownStep:        downStep,
-		ScaleDownHysteresis:  hyst,
-		ScaleDownIdleTicks:   idleTicks,
-		Trace:                c.Trace,
-	}
-}
-
-func fromEnv(numWorkers int) Config {
-	// The CELERIS_DYN_* env vars are the legacy fallback for the typed
-	// [resource.WorkerScalingConfig]. They remain active so deployed
-	// manifests that pre-date v1.4.6 (when the typed config shipped)
-	// keep working without an edit. New code should use
-	// celeris.Config.WorkerScaling directly.
-	getInt := func(k string, def int) int {
-		if v := os.Getenv(k); v != "" {
-			if n, err := strconv.Atoi(v); err == nil {
-				return n
-			}
-		}
-		return def
-	}
-	enabled := getInt("CELERIS_DYN_WORKERS", 0) != 0
-	minActive := getInt("CELERIS_DYN_MIN", numWorkers/2)
-	if minActive < 1 {
-		minActive = 1
-	}
-	if minActive > numWorkers {
-		minActive = numWorkers
-	}
-	return Config{
-		Enabled: enabled,
-		// Defaults to true (data-validated; matches the typed-config
-		// Strategy=ScalingStrategyStartHigh default). start-low pauses
-		// workers BEFORE the engine has finished settling its listen
-		// FDs in the SO_REUSEPORT group, which races against incoming
-		// H2 prior-knowledge dials and produces RST mid-handshake (seen
-		// in the v1.4.1 strict-matrix run on get-json-64k-h2/adaptive
-		// before this change). Users who explicitly want start-low can
-		// set CELERIS_DYN_START_HIGH=0.
-		StartHigh:            os.Getenv("CELERIS_DYN_START_HIGH") != "0",
-		MinActive:            minActive,
-		TargetConnsPerWorker: getInt("CELERIS_DYN_TARGET", 20),
-		Interval:             time.Duration(getInt("CELERIS_DYN_INTERVAL", 250)) * time.Millisecond,
-		ScaleUpStep:          getInt("CELERIS_DYN_UPSTEP", 2),
-		ScaleDownStep:        getInt("CELERIS_DYN_DOWNSTEP", 1),
-		ScaleDownHysteresis:  getInt("CELERIS_DYN_DOWNHYST", 1),
-		ScaleDownIdleTicks:   getInt("CELERIS_DYN_DOWNIDLE", 4),
-		Trace:                os.Getenv("CELERIS_DYN_TRACE") == "1",
-	}
-}
-
-// Run executes the scaler loop until ctx is canceled. Reads load from
-// src.ActiveConns(), pauses/resumes workers via src.PauseWorker /
-// src.ResumeWorker, and re-baselines on Generation changes.
-//
-// Caller must check cfg.Enabled before invoking Run; Run does not
-// short-circuit when disabled.
-func Run(ctx context.Context, src Source, cfg Config) {
-	totalWorkers := src.NumWorkers()
-	var active int
-	if cfg.StartHigh {
-		active = totalWorkers
-	} else {
-		active = cfg.MinActive
-		for i := cfg.MinActive; i < totalWorkers; i++ {
-			src.PauseWorker(i)
-		}
-	}
-
-	if log := src.Logger(); log != nil {
-		log.Info("dynamic worker scaler started",
-			"min_active", cfg.MinActive,
-			"max", totalWorkers,
-			"target_conns_per_worker", cfg.TargetConnsPerWorker,
-			"interval_ms", int(cfg.Interval/time.Millisecond),
-			"start_high", cfg.StartHigh,
-			"up_step", cfg.ScaleUpStep,
-			"down_step", cfg.ScaleDownStep,
-			"down_hyst", cfg.ScaleDownHysteresis,
-			"down_idle_ticks", cfg.ScaleDownIdleTicks)
-	}
-
-	ticker := time.NewTicker(cfg.Interval)
-	defer ticker.Stop()
-	idleTicks := 0
-	lastGen := src.Generation()
-	for {
-		select {
-		case <-ctx.Done():
-			return
-		case <-ticker.C:
-			if gen := src.Generation(); gen != lastGen {
-				// Engine switched (only meaningful for adaptive). Re-baseline
-				// the new engine to the current `active` count: 0..active-1
-				// resumed, active..total-1 paused. This is idempotent against
-				// the new engine's existing per-worker state.
-				for i := 0; i < totalWorkers; i++ {
-					if i < active {
-						src.ResumeWorker(i)
-					} else {
-						src.PauseWorker(i)
-					}
-				}
-				lastGen = gen
-				idleTicks = 0
-				continue
-			}
-			active, idleTicks = tick(src, cfg, active, idleTicks)
-		}
-	}
-}
-
-// tick is a single iteration of the scaler decision loop. Split out
-// for unit-testing the algorithm without spinning up a real engine.
-func tick(src Source, cfg Config, active, idleTicks int) (int, int) {
-	totalWorkers := src.NumWorkers()
-	conns := src.ActiveConns()
-	desired := int((conns + int64(cfg.TargetConnsPerWorker) - 1) / int64(cfg.TargetConnsPerWorker))
-	if desired < cfg.MinActive {
-		desired = cfg.MinActive
-	}
-	if desired > totalWorkers {
-		desired = totalWorkers
-	}
-	if cfg.Trace {
-		if log := src.Logger(); log != nil {
-			log.Info("scaler tick",
-				"conns", conns, "desired", desired, "active", active, "idle_ticks", idleTicks)
-		}
-	}
-
-	switch {
-	case desired > active:
-		step := desired - active
-		if step > cfg.ScaleUpStep {
-			step = cfg.ScaleUpStep
-		}
-		for n := 0; n < step; n++ {
-			src.ResumeWorker(active)
-			active++
-		}
-		idleTicks = 0
-	case desired <= active-cfg.ScaleDownHysteresis-1:
-		idleTicks++
-		if idleTicks >= cfg.ScaleDownIdleTicks {
-			step := active - desired
-			if step > cfg.ScaleDownStep {
-				step = cfg.ScaleDownStep
-			}
-			for n := 0; n < step && active > cfg.MinActive; n++ {
-				active--
-				src.PauseWorker(active)
-			}
-			idleTicks = 0
-		}
-	default:
-		idleTicks = 0
-	}
-	return active, idleTicks
-}
diff --git a/engine/scaler/scaler_test.go b/engine/scaler/scaler_test.go
deleted file mode 100644
index bcce2b7d..00000000
--- a/engine/scaler/scaler_test.go
+++ /dev/null
@@ -1,327 +0,0 @@
-//go:build linux
-
-package scaler
-
-import (
-	"context"
-	"log/slog"
-	"sync"
-	"testing"
-	"time"
-
-	"github.com/goceleris/celeris/resource"
-)
-
-// TestResolve_Defaults locks in the data-validated defaults captured in
-// the v1.4.1 spike-B sweep: start-high, min=numCPU/2, target=20,
-// interval=250ms, upStep=2, downStep=1, hyst=1, idleTicks=4.
-func TestResolve_Defaults(t *testing.T) {
-	t.Parallel()
-	rcfg := resource.Config{WorkerScaling: &resource.WorkerScalingConfig{}}
-	c := Resolve(rcfg, 8)
-	if !c.Enabled {
-		t.Fatal("typed config (non-nil) must enable the scaler")
-	}
-	if !c.StartHigh {
-		t.Errorf("StartHigh: expected true (zero value Strategy = StartHigh), got false")
-	}
-	if c.MinActive != 4 {
-		t.Errorf("MinActive: expected 4 (numCPU/2 with numCPU=8), got %d", c.MinActive)
-	}
-	if c.TargetConnsPerWorker != 20 {
-		t.Errorf("TargetConnsPerWorker: expected 20, got %d", c.TargetConnsPerWorker)
-	}
-	if c.Interval != 250*time.Millisecond {
-		t.Errorf("Interval: expected 250ms, got %v", c.Interval)
-	}
-	if c.ScaleUpStep != 2 {
-		t.Errorf("ScaleUpStep: expected 2, got %d", c.ScaleUpStep)
-	}
-	if c.ScaleDownStep != 1 {
-		t.Errorf("ScaleDownStep: expected 1, got %d", c.ScaleDownStep)
-	}
-	if c.ScaleDownHysteresis != 1 {
-		t.Errorf("ScaleDownHysteresis: expected 1, got %d", c.ScaleDownHysteresis)
-	}
-	if c.ScaleDownIdleTicks != 4 {
-		t.Errorf("ScaleDownIdleTicks: expected 4, got %d", c.ScaleDownIdleTicks)
-	}
-}
-
-// TestResolve_StartLow verifies opt-in to start-low.
-func TestResolve_StartLow(t *testing.T) {
-	t.Parallel()
-	rcfg := resource.Config{WorkerScaling: &resource.WorkerScalingConfig{
-		Strategy: resource.ScalingStrategyStartLow,
-	}}
-	c := Resolve(rcfg, 8)
-	if c.StartHigh {
-		t.Errorf("StartHigh: expected false (Strategy=StartLow), got true")
-	}
-}
-
-// TestResolve_MinActiveClamping covers the floor + cap on MinActive.
-func TestResolve_MinActiveClamping(t *testing.T) {
-	t.Parallel()
-	cases := []struct {
-		name       string
-		minActive  int
-		numWorkers int
-		want       int
-	}{
-		{"zero defaults to numCPU/2", 0, 8, 4},
-		{"zero with small pool floors to 2", 0, 2, 2},
-		{"explicit 1 respected", 1, 8, 1},
-		{"above pool clamped to pool", 16, 8, 8},
-		{"explicit 3", 3, 8, 3},
-	}
-	for _, tc := range cases {
-		tc := tc
-		t.Run(tc.name, func(t *testing.T) {
-			t.Parallel()
-			c := Resolve(resource.Config{WorkerScaling: &resource.WorkerScalingConfig{MinActive: tc.minActive}}, tc.numWorkers)
-			if c.MinActive != tc.want {
-				t.Errorf("MinActive: got %d, want %d", c.MinActive, tc.want)
-			}
-		})
-	}
-}
-
-// TestResolve_ConfigBeatsEnv verifies typed config wins over env vars.
-func TestResolve_ConfigBeatsEnv(t *testing.T) {
-	t.Setenv("CELERIS_DYN_TARGET", "999")
-	rcfg := resource.Config{WorkerScaling: &resource.WorkerScalingConfig{TargetConnsPerWorker: 25}}
-	c := Resolve(rcfg, 4)
-	if c.TargetConnsPerWorker != 25 {
-		t.Errorf("typed config did not take precedence over env: got %d, want 25", c.TargetConnsPerWorker)
-	}
-	if !c.Enabled {
-		t.Error("typed config should produce Enabled=true")
-	}
-}
-
-// TestResolve_EnvFallback verifies the env-var path is the fallback.
-func TestResolve_EnvFallback(t *testing.T) {
-	t.Setenv("CELERIS_DYN_WORKERS", "1")
-	t.Setenv("CELERIS_DYN_TARGET", "33")
-	c := Resolve(resource.Config{}, 4)
-	if !c.Enabled {
-		t.Fatal("CELERIS_DYN_WORKERS=1 should enable the scaler")
-	}
-	if c.TargetConnsPerWorker != 33 {
-		t.Errorf("env target read incorrectly: got %d, want 33", c.TargetConnsPerWorker)
-	}
-}
-
-// TestResolve_DisabledByDefault verifies the legacy zero-config path.
-func TestResolve_DisabledByDefault(t *testing.T) {
-	t.Setenv("CELERIS_DYN_WORKERS", "")
-	c := Resolve(resource.Config{}, 4)
-	if c.Enabled {
-		t.Errorf("scaler should be disabled when neither env nor config provides it")
-	}
-}
-
-// fakeSource implements Source for unit-testing the algorithm without
-// spinning up an engine. PauseWorker / ResumeWorker calls are tracked
-// per worker.
-type fakeSource struct {
-	mu         sync.Mutex
-	numWorkers int
-	conns      int64
-	gen        uint64
-	pauseLog   []int
-	resumeLog  []int
-	paused     []bool
-}
-
-func newFake(n int) *fakeSource { return &fakeSource{numWorkers: n, paused: make([]bool, n)} }
-
-func (s *fakeSource) NumWorkers() int { return s.numWorkers }
-func (s *fakeSource) ActiveConns() int64 {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	return s.conns
-}
-func (s *fakeSource) PauseWorker(i int) {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	s.pauseLog = append(s.pauseLog, i)
-	if i >= 0 && i < len(s.paused) {
-		s.paused[i] = true
-	}
-}
-func (s *fakeSource) ResumeWorker(i int) {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	s.resumeLog = append(s.resumeLog, i)
-	if i >= 0 && i < len(s.paused) {
-		s.paused[i] = false
-	}
-}
-func (s *fakeSource) Generation() uint64 {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	return s.gen
-}
-func (s *fakeSource) Logger() *slog.Logger { return nil }
-
-func (s *fakeSource) setConns(n int64) {
-	s.mu.Lock()
-	s.conns = n
-	s.mu.Unlock()
-}
-func (s *fakeSource) bumpGen() {
-	s.mu.Lock()
-	s.gen++
-	s.mu.Unlock()
-}
-func (s *fakeSource) numActive() int {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	n := 0
-	for _, p := range s.paused {
-		if !p {
-			n++
-		}
-	}
-	return n
-}
-
-// runAsync starts Run in a goroutine and returns a stop func plus a
-// Done chan so callers can poll the source state without depending on
-// the run-loop's wall-clock progress. Older versions of these tests
-// used a fixed-duration ctx and asserted at the end; under -race or
-// on slow CPUs the time.Ticker can fire fewer than expected times in
-// a 200ms window, leading to flakes (msa2-client hit "got 7" once).
-func runAsync(t *testing.T, src *fakeSource, cfg Config) (stop func(), done <-chan struct{}) {
-	t.Helper()
-	ctx, cancel := context.WithCancel(context.Background())
-	d := make(chan struct{})
-	go func() {
-		defer close(d)
-		Run(ctx, src, cfg)
-	}()
-	return cancel, d
-}
-
-// waitForActive polls src.numActive() until it matches want or
-// timeout elapses. Returns true on success. Takes the place of a
-// fixed-time sleep in scaler tests.
-func waitForActive(t *testing.T, src *fakeSource, want int, timeout time.Duration) bool {
-	t.Helper()
-	deadline := time.Now().Add(timeout)
-	for time.Now().Before(deadline) {
-		if src.numActive() == want {
-			return true
-		}
-		time.Sleep(2 * time.Millisecond)
-	}
-	t.Errorf("numActive: waited %v for %d, last=%d", timeout, want, src.numActive())
-	return false
-}
-
-// TestRun_StartHighScalesDownOnIdle verifies the start-high default
-// behaviour: starts at NumWorkers active, scales down to a floor of
-// MinActive when load is low.
-//
-// Floor: with ScaleDownHysteresis=0 the floor IS MinActive. With
-// hysteresis>0 the floor is MinActive+hysteresis to prevent flapping
-// near the boundary; that's tested in
-// TestRun_HysteresisFloorPreservedAboveMinActive.
-func TestRun_StartHighScalesDownOnIdle(t *testing.T) {
-	src := newFake(8)
-	src.setConns(0) // idle from the start
-	cfg := Config{
-		Enabled: true, StartHigh: true,
-		MinActive: 2, TargetConnsPerWorker: 20,
-		Interval:    5 * time.Millisecond,
-		ScaleUpStep: 2, ScaleDownStep: 1,
-		ScaleDownHysteresis: 0, ScaleDownIdleTicks: 2,
-	}
-	stop, done := runAsync(t, src, cfg)
-	waitForActive(t, src, 2, 2*time.Second)
-	stop()
-	<-done
-}
-
-// TestRun_HysteresisFloorPreservedAboveMinActive verifies that with the
-// default hysteresis=1 the algorithm stops scaling at MinActive+1, not
-// at MinActive. This prevents flapping near the boundary when conn
-// count oscillates ±1 around the threshold.
-func TestRun_HysteresisFloorPreservedAboveMinActive(t *testing.T) {
-	src := newFake(8)
-	src.setConns(0)
-	cfg := Config{
-		Enabled: true, StartHigh: true,
-		MinActive: 2, TargetConnsPerWorker: 20,
-		Interval:    5 * time.Millisecond,
-		ScaleUpStep: 2, ScaleDownStep: 1,
-		ScaleDownHysteresis: 1, ScaleDownIdleTicks: 2,
-	}
-	stop, done := runAsync(t, src, cfg)
-	// Floor is MinActive + hyst = 3.
-	waitForActive(t, src, 3, 2*time.Second)
-	stop()
-	<-done
-}
-
-// TestRun_StartLowScalesUpOnLoad verifies the start-low path: starts at
-// MinActive, scales up to ceil(conns/target) when load arrives.
-func TestRun_StartLowScalesUpOnLoad(t *testing.T) {
-	src := newFake(8)
-	cfg := Config{
-		Enabled: true, StartHigh: false,
-		MinActive: 2, TargetConnsPerWorker: 20,
-		Interval:    5 * time.Millisecond,
-		ScaleUpStep: 4, ScaleDownStep: 1,
-		ScaleDownHysteresis: 1, ScaleDownIdleTicks: 100,
-	}
-	// Set conns so desired = ceil(160/20) = 8 (full pool).
-	src.setConns(160)
-	stop, done := runAsync(t, src, cfg)
-	waitForActive(t, src, 8, 2*time.Second)
-	stop()
-	<-done
-}
-
-// TestRun_GenerationChangeRebaselines verifies that incrementing
-// Generation triggers re-baseline of the active count on the new
-// underlying engine. This is the adaptive switch-handling invariant.
-func TestRun_GenerationChangeRebaselines(t *testing.T) {
-	src := newFake(8)
-	src.setConns(60) // desired = 3, but with start-high we start at 8
-	cfg := Config{
-		Enabled: true, StartHigh: true,
-		MinActive: 2, TargetConnsPerWorker: 20,
-		Interval:    5 * time.Millisecond,
-		ScaleUpStep: 2, ScaleDownStep: 1,
-		ScaleDownHysteresis: 1, ScaleDownIdleTicks: 100,
-	}
-	// Run for a few ticks so the scaler settles. Then bump Generation
-	// — the next tick should re-baseline the (still-fake) source by
-	// pausing/resuming workers to enforce the current `active`.
-	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
-	defer cancel()
-	go Run(ctx, src, cfg)
-	time.Sleep(15 * time.Millisecond)
-
-	// Reset the logs; the next bump should write a full re-baseline burst.
-	src.mu.Lock()
-	src.pauseLog = nil
-	src.resumeLog = nil
-	src.mu.Unlock()
-	src.bumpGen()
-	time.Sleep(15 * time.Millisecond)
-	cancel()
-	time.Sleep(10 * time.Millisecond)
-
-	src.mu.Lock()
-	pauseLogLen := len(src.pauseLog)
-	resumeLogLen := len(src.resumeLog)
-	src.mu.Unlock()
-	if pauseLogLen+resumeLogLen < src.NumWorkers() {
-		t.Errorf("expected re-baseline to issue Pause+Resume calls covering all %d workers; got %d pauses + %d resumes",
-			src.NumWorkers(), pauseLogLen, resumeLogLen)
-	}
-}
diff --git a/handler.go b/handler.go
index 7c0e618b..1eb3b060 100644
--- a/handler.go
+++ b/handler.go
@@ -131,7 +131,28 @@ func (a *routerAdapter) HandleStream(ctx context.Context, s *stream.Stream) erro
 	c.handlers = handlers
 	c.fullPath = fullPath
 
-	if err := c.Next(); err != nil {
+	// celeris#356: an adaptive route (inherited the AsyncHandlers=true default)
+	// runs INLINE here until observed to block. Time this inline run; if the
+	// handler chain exceeds adaptivePromoteThreshold it is genuinely blocking,
+	// so promote the route to async dispatch — future requests then run on a
+	// goroutine instead of stalling the event-loop worker. Non-adaptive configs
+	// (no AsyncHandlers default) hit the empty-map fast path and skip timing.
+	rt := a.server.router
+	if rt.adaptiveRoutes[fullPath] && rt.adaptiveLearning(fullPath) {
+		start := time.Now()
+		err := c.Next()
+		dur := time.Since(start)
+		if dur > adaptiveBlockingThreshold {
+			// Unambiguously a blocking I/O round-trip: promote on the first such
+			// run rather than waiting for adaptivePromoteStreak slow runs.
+			rt.promoteRouteImmediate(fullPath)
+		} else {
+			rt.recordInlineRun(fullPath, dur > adaptivePromoteThreshold)
+		}
+		if err != nil {
+			a.handleError(c, s, err)
+		}
+	} else if err := c.Next(); err != nil {
 		a.handleError(c, s, err)
 	}
 	if c.buffered && !c.written {
@@ -141,6 +162,55 @@ func (a *routerAdapter) HandleStream(ctx context.Context, s *stream.Stream) erro
 	return nil
 }
 
+// adaptivePromoteThreshold is the inline handler duration that counts as a
+// "slow" run for adaptive promotion (celeris#356). A non-blocking handler —
+// even a heavy middleware chain — returns in tens of microseconds; a blocking
+// one (DB/cache round-trip) takes hundreds of µs to ms. The bar sits well above
+// the CPU-bound range so a transient GC/scheduling burst cannot push a
+// CPU-bound chain over it for adaptivePromoteStreak consecutive runs and
+// wrongly promote it to the slower async path — a 50µs bar did exactly that,
+// intermittently collapsing iouring-async chain-fullstack (celeris#364).
+// Genuinely-blocking routes are marked .Async() explicitly (opting out of
+// adaptive); auto-promotion is a safety net for an unmarked handler that blocks
+// on EVERY request.
+const adaptivePromoteThreshold = 300 * time.Microsecond
+
+// adaptiveBlockingThreshold is the inline duration that is UNAMBIGUOUSLY a
+// blocking I/O round-trip (not CPU work under contention). A single inline run
+// over this bar promotes the route IMMEDIATELY, skipping the
+// adaptivePromoteStreak hysteresis — a genuinely-blocking handler that an
+// operator forgot to mark .Async() then stalls a worker for at most one request
+// instead of adaptivePromoteStreak of them. The bar sits far above any CPU-bound
+// chain's wall-clock (even under GC/scheduling jitter), so it cannot misfire on
+// a CPU route; the 300µs/streak path still handles the borderline 300µs–2ms band.
+const adaptiveBlockingThreshold = 2 * time.Millisecond
+
+// adaptivePromoteStreak is how many CONSECUTIVE slow inline runs promote an
+// adaptive route to async. The consecutive requirement (a fast run resets the
+// streak) makes a one-off cold start / GC pause harmless, while a handler that
+// blocks on every request promotes within a handful of requests.
+const adaptivePromoteStreak = 8
+
+// adaptiveSettleStreak is how many CONSECUTIVE fast inline runs SETTLE an
+// adaptive route (celeris#361): proven non-blocking, it is removed from the
+// timed path so the hot loop stops paying two time.Now() vDSO calls per
+// request forever. High enough that only consistently-static routes settle (a
+// slow run resets the streak); at scale a static route settles in well under a
+// millisecond. A genuinely-blocking handler should be marked .Async() — it
+// promotes long before it could settle.
+const adaptiveSettleStreak = 256
+
+// adaptivePromoteTTL bounds how long a promotion lasts before the route is
+// re-evaluated inline (celeris#364). Promotion is otherwise terminal — a
+// promoted route runs async and is never re-timed — so a CPU-bound chain that
+// was falsely promoted by a transient load/jitter spike (inline wall-clock
+// crossing adaptivePromoteThreshold under worker contention, not actual
+// blocking) stayed on the ~32%-slower async path until restart. After the TTL
+// the route runs inline again and re-settles if fast, or re-promotes within
+// adaptivePromoteStreak runs if genuinely blocking. The clock is read only for
+// already-promoted routes, so the fast path is unaffected.
+const adaptivePromoteTTL = 5 * time.Second
+
 // recoverAndRelease handles panic recovery and context release. Extracted to a
 // separate noinline function so that HandleStream's stack frame is not inflated
 // by the deferred closure and debug.Stack() call (P5).
diff --git a/internal/conn/h1.go b/internal/conn/h1.go
index ea5b0d7d..415f6164 100644
--- a/internal/conn/h1.go
+++ b/internal/conn/h1.go
@@ -246,6 +246,29 @@ func (s *H1State) HasPendingData() bool {
 	return s.buffer.Len() > 0 || s.bodyNeeded > 0
 }
 
+// HasPendingDispatchState reports whether ProcessH1 left partial state that
+// MUST be continued on the dispatch goroutine (InlineMode=false): buffered
+// partial headers or a chunked body in progress, both of which resume via the
+// buffered parse path that does not re-run the per-route async check. A
+// fixed-length body in progress (bodyNeeded > 0) is deliberately EXCLUDED: its
+// continuation re-parses the already-async-checked (so provably non-async)
+// request and dispatches the handler inline on the worker — exactly as the
+// sync engine does — so it must NOT force the conn onto the slower async
+// dispatch path. Promoting on it permanently poisoned keep-alive conns that
+// hit even one split-across-recvs body (the post-4k regression).
+func (s *H1State) HasPendingDispatchState() bool {
+	return s.buffer.Len() > 0 && s.bodyNeeded <= 0
+}
+
+// CurrentRoute returns the method and path of the request currently parsed into
+// the H1 state. The engine records these when a route forces an async promotion
+// so the dispatch goroutine can revert the connection to inline once that
+// route's promotion expires (celeris#364). req.Method/req.Path are interned
+// (stable) strings, safe to retain past the recv buffer's lifetime.
+func (s *H1State) CurrentRoute() (method, path string) {
+	return s.req.Method, s.req.Path
+}
+
 // UpdateWriteFn replaces the response adapter's write function. Called by
 // OnDetach to route StreamWriter writes through the mutex-guarded writeFn.
 func (s *H1State) UpdateWriteFn(fn func([]byte)) {
diff --git a/middleware/compress/go.mod b/middleware/compress/go.mod
index d14eeb27..d68ab9da 100644
--- a/middleware/compress/go.mod
+++ b/middleware/compress/go.mod
@@ -4,7 +4,7 @@ go 1.26.4
 
 require (
 	github.com/andybalholm/brotli v1.2.1
-	github.com/goceleris/celeris v1.4.4
+	github.com/goceleris/celeris v1.5.3
 	github.com/klauspost/compress v1.18.6
 )
 
diff --git a/middleware/metrics/go.mod b/middleware/metrics/go.mod
index 33d52836..dd707a33 100644
--- a/middleware/metrics/go.mod
+++ b/middleware/metrics/go.mod
@@ -3,10 +3,10 @@ module github.com/goceleris/celeris/middleware/metrics
 go 1.26.4
 
 require (
-	github.com/goceleris/celeris v1.4.4
+	github.com/goceleris/celeris v1.5.3
 	github.com/prometheus/client_golang v1.23.2
 	github.com/prometheus/client_model v0.6.2
-	github.com/prometheus/common v0.68.1
+	github.com/prometheus/common v0.69.0
 )
 
 require (
diff --git a/middleware/metrics/go.sum b/middleware/metrics/go.sum
index b7ee233c..4baf42e7 100644
--- a/middleware/metrics/go.sum
+++ b/middleware/metrics/go.sum
@@ -14,8 +14,8 @@ github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h
 github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
 github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
 github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
-github.com/prometheus/common v0.68.1 h1:omjRRl4QP4komogpXuhfeOiisQg7xdy8VM1UY+pStaY=
-github.com/prometheus/common v0.68.1/go.mod h1:ZzL3f6u94qUxh9p+tJTrF+FvBS1XXbbRAZCQkytAL0Y=
+github.com/prometheus/common v0.69.0 h1:OA85nJQS/T/MaYh/Q2CcgDKSGWqNIgrBDvDH85CuiNk=
+github.com/prometheus/common v0.69.0/go.mod h1:ZzL3f6u94qUxh9p+tJTrF+FvBS1XXbbRAZCQkytAL0Y=
 github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
 github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
 github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
diff --git a/middleware/otel/go.mod b/middleware/otel/go.mod
index fc135432..61423c64 100644
--- a/middleware/otel/go.mod
+++ b/middleware/otel/go.mod
@@ -3,7 +3,7 @@ module github.com/goceleris/celeris/middleware/otel
 go 1.26.4
 
 require (
-	github.com/goceleris/celeris v1.4.4
+	github.com/goceleris/celeris v1.5.3
 	go.opentelemetry.io/otel v1.44.0
 	go.opentelemetry.io/otel/metric v1.44.0
 	go.opentelemetry.io/otel/sdk v1.44.0
diff --git a/middleware/protobuf/go.mod b/middleware/protobuf/go.mod
index 948f256a..bad3be60 100644
--- a/middleware/protobuf/go.mod
+++ b/middleware/protobuf/go.mod
@@ -3,7 +3,7 @@ module github.com/goceleris/celeris/middleware/protobuf
 go 1.26.4
 
 require (
-	github.com/goceleris/celeris v1.4.4
+	github.com/goceleris/celeris v1.5.3
 	google.golang.org/protobuf v1.36.11
 )
 
diff --git a/middleware/secure/config.go b/middleware/secure/config.go
index f6151f4c..c630a409 100644
--- a/middleware/secure/config.go
+++ b/middleware/secure/config.go
@@ -67,12 +67,13 @@ type Config struct {
 	CrossOriginResourcePolicy string `yaml:"cross_origin_resource_policy"`
 
 	// CrossOriginEmbedderPolicy sets the Cross-Origin-Embedder-Policy header.
-	// Default: "require-corp".
+	// Default: "" (NOT emitted — opt-in).
 	//
-	// WARNING: "require-corp" blocks cross-origin resources (images, scripts,
-	// etc.) that do not carry a Cross-Origin-Resource-Policy header or valid
-	// CORS headers. If your application loads third-party assets, use
-	// "credentialless" or [Suppress] to avoid breakage.
+	// COEP is off by default because "require-corp" blocks cross-origin
+	// resources (images, scripts, etc.) that lack a Cross-Origin-Resource-Policy
+	// or valid CORS headers — enabling it by default silently breaks many sites,
+	// so like helmet we leave it opt-in. Set "require-corp" or "credentialless"
+	// to enable.
 	CrossOriginEmbedderPolicy string `yaml:"cross_origin_embedder_policy"`
 
 	// XDNSPrefetchControl sets the X-DNS-Prefetch-Control header.
@@ -88,7 +89,9 @@ type Config struct {
 	OriginAgentCluster string `yaml:"origin_agent_cluster"`
 
 	// XDownloadOptions sets the X-Download-Options header.
-	// Default: "noopen".
+	// Default: "" (NOT emitted — opt-in). This header only affected legacy
+	// Internet Explorer downloads and is obsolete on modern browsers; set
+	// "noopen" to restore it.
 	XDownloadOptions string `yaml:"x_download_options"`
 }
 
@@ -103,11 +106,14 @@ var defaultConfig = Config{
 	ReferrerPolicy:            "strict-origin-when-cross-origin",
 	CrossOriginOpenerPolicy:   "same-origin",
 	CrossOriginResourcePolicy: "same-origin",
-	CrossOriginEmbedderPolicy: "require-corp",
+	// COEP off by default (opt-in): require-corp breaks cross-origin resource
+	// loads; enabling it by default is a footgun (matches helmet). #338.
+	CrossOriginEmbedderPolicy: "",
 	XDNSPrefetchControl:       "off",
 	XPermittedCrossDomain:     "none",
 	OriginAgentCluster:        "?1",
-	XDownloadOptions:          "noopen",
+	// X-Download-Options off by default (opt-in): legacy IE-only, obsolete.
+	XDownloadOptions: "",
 }
 
 func applyDefaults(cfg Config) Config {
diff --git a/middleware/secure/doc.go b/middleware/secure/doc.go
index 156acdb3..7c7025a4 100644
--- a/middleware/secure/doc.go
+++ b/middleware/secure/doc.go
@@ -14,14 +14,16 @@
 //   - Referrer-Policy: "strict-origin-when-cross-origin"
 //   - Cross-Origin-Opener-Policy: "same-origin"
 //   - Cross-Origin-Resource-Policy: "same-origin"
-//   - Cross-Origin-Embedder-Policy: "require-corp"
 //   - X-DNS-Prefetch-Control: "off"
 //   - X-Permitted-Cross-Domain-Policies: "none"
 //   - Origin-Agent-Cluster: "?1"
-//   - X-Download-Options: "noopen"
 //
-// Content-Security-Policy and Permissions-Policy are only included when
-// their respective [Config] fields are non-empty.
+// Cross-Origin-Embedder-Policy ("require-corp") and X-Download-Options
+// ("noopen") are OPT-IN (off by default, #338): COEP-by-default breaks
+// cross-origin resource loads (matching helmet, which leaves it off), and
+// X-Download-Options only affected legacy Internet Explorer. Content-Security-
+// Policy and Permissions-Policy are likewise only emitted when their [Config]
+// fields are non-empty.
 //
 // # Usage
 //
@@ -64,8 +66,8 @@
 // # CORS Interaction
 //
 // When using CORS middleware alongside secure, note that the default
-// CrossOriginEmbedderPolicy (require-corp) and CrossOriginResourcePolicy
-// (same-origin) block cross-origin resource loading. APIs serving
-// cross-origin requests should set CrossOriginResourcePolicy: "cross-origin"
-// and CrossOriginEmbedderPolicy: [Suppress].
+// CrossOriginResourcePolicy (same-origin) blocks cross-origin resource
+// loading; APIs serving cross-origin requests should set
+// CrossOriginResourcePolicy: "cross-origin". COEP is off by default so it no
+// longer needs suppressing; enable it explicitly only when isolating the page.
 package secure
diff --git a/middleware/secure/secure_test.go b/middleware/secure/secure_test.go
index 67beefd7..abadfee8 100644
--- a/middleware/secure/secure_test.go
+++ b/middleware/secure/secure_test.go
@@ -28,11 +28,13 @@ func TestDefaultConfigSetsAllHeaders(t *testing.T) {
 	testutil.AssertHeader(t, rec, "referrer-policy", "strict-origin-when-cross-origin")
 	testutil.AssertHeader(t, rec, "cross-origin-opener-policy", "same-origin")
 	testutil.AssertHeader(t, rec, "cross-origin-resource-policy", "same-origin")
-	testutil.AssertHeader(t, rec, "cross-origin-embedder-policy", "require-corp")
+	// COEP (require-corp) and X-Download-Options (legacy IE) are OFF by default
+	// — opt-in (#338); COEP-by-default breaks cross-origin loads.
+	testutil.AssertNoHeader(t, rec, "cross-origin-embedder-policy")
 	testutil.AssertHeader(t, rec, "x-dns-prefetch-control", "off")
 	testutil.AssertHeader(t, rec, "x-permitted-cross-domain-policies", "none")
 	testutil.AssertHeader(t, rec, "origin-agent-cluster", "?1")
-	testutil.AssertHeader(t, rec, "x-download-options", "noopen")
+	testutil.AssertNoHeader(t, rec, "x-download-options")
 }
 
 func TestDefaultConfigOmitsCSPAndPermissionsPolicy(t *testing.T) {
@@ -56,8 +58,8 @@ func TestHeaderDefaultsAndOverrides(t *testing.T) {
 		{"xss-protection custom", &Config{XSSProtection: "1; mode=block"}, "x-xss-protection", "1; mode=block"},
 		{"origin-agent-cluster default", nil, "origin-agent-cluster", "?1"},
 		{"origin-agent-cluster custom", &Config{OriginAgentCluster: "?0"}, "origin-agent-cluster", "?0"},
-		{"x-download-options default", nil, "x-download-options", "noopen"},
-		{"x-download-options custom", &Config{XDownloadOptions: "custom"}, "x-download-options", "custom"},
+		{"x-download-options custom (opt-in)", &Config{XDownloadOptions: "custom"}, "x-download-options", "custom"},
+		{"coep opt-in", &Config{CrossOriginEmbedderPolicy: "require-corp"}, "cross-origin-embedder-policy", "require-corp"},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
@@ -275,8 +277,8 @@ func TestApplyDefaultsFillsEmptyFields(t *testing.T) {
 	if cfg.CrossOriginResourcePolicy != "same-origin" {
 		t.Fatalf("CrossOriginResourcePolicy: got %q, want %q", cfg.CrossOriginResourcePolicy, "same-origin")
 	}
-	if cfg.CrossOriginEmbedderPolicy != "require-corp" {
-		t.Fatalf("CrossOriginEmbedderPolicy: got %q, want %q", cfg.CrossOriginEmbedderPolicy, "require-corp")
+	if cfg.CrossOriginEmbedderPolicy != "" {
+		t.Fatalf("CrossOriginEmbedderPolicy: got %q, want %q (opt-in default, #338)", cfg.CrossOriginEmbedderPolicy, "")
 	}
 	if cfg.XDNSPrefetchControl != "off" {
 		t.Fatalf("XDNSPrefetchControl: got %q, want %q", cfg.XDNSPrefetchControl, "off")
@@ -287,8 +289,8 @@ func TestApplyDefaultsFillsEmptyFields(t *testing.T) {
 	if cfg.OriginAgentCluster != "?1" {
 		t.Fatalf("OriginAgentCluster: got %q, want %q", cfg.OriginAgentCluster, "?1")
 	}
-	if cfg.XDownloadOptions != "noopen" {
-		t.Fatalf("XDownloadOptions: got %q, want %q", cfg.XDownloadOptions, "noopen")
+	if cfg.XDownloadOptions != "" {
+		t.Fatalf("XDownloadOptions: got %q, want %q (opt-in default, #338)", cfg.XDownloadOptions, "")
 	}
 }
 
@@ -342,14 +344,14 @@ func TestBuildHeadersSkipsEmptyStrings(t *testing.T) {
 func TestBuildHeadersDefaultCount(t *testing.T) {
 	cfg := applyDefaults(Config{})
 	headers := buildHeaders(cfg)
-	// Default config should produce 11 headers (no CSP, no PermissionsPolicy).
-	// x-content-type-options, x-frame-options, x-xss-protection,
-	// referrer-policy, cross-origin-opener-policy, cross-origin-resource-policy,
-	// cross-origin-embedder-policy, x-dns-prefetch-control, x-permitted-cross-domain-policies,
-	// origin-agent-cluster, x-download-options.
-	// Note: HSTS is not in buildHeaders (runtime check).
-	if len(headers) != 11 {
-		t.Fatalf("expected 11 headers from default config, got %d", len(headers))
+	// Default config produces 9 headers (no CSP, no PermissionsPolicy; COEP +
+	// X-Download-Options are opt-in, #338): x-content-type-options,
+	// x-frame-options, x-xss-protection, referrer-policy,
+	// cross-origin-opener-policy, cross-origin-resource-policy,
+	// x-dns-prefetch-control, x-permitted-cross-domain-policies,
+	// origin-agent-cluster. Note: HSTS is not in buildHeaders (runtime check).
+	if len(headers) != 9 {
+		t.Fatalf("expected 9 headers from default config, got %d", len(headers))
 	}
 }
 
@@ -359,9 +361,9 @@ func TestBuildHeadersWithCSPAndPermissions(t *testing.T) {
 		PermissionsPolicy:     "camera=()",
 	})
 	headers := buildHeaders(cfg)
-	// 11 defaults + CSP + PermissionsPolicy = 13.
-	if len(headers) != 13 {
-		t.Fatalf("expected 13 headers, got %d", len(headers))
+	// 9 defaults + CSP + PermissionsPolicy = 11.
+	if len(headers) != 11 {
+		t.Fatalf("expected 11 headers, got %d", len(headers))
 	}
 }
 
diff --git a/probe/probe_caps_linux_test.go b/probe/probe_caps_linux_test.go
index fcd2234e..84c949e7 100644
--- a/probe/probe_caps_linux_test.go
+++ b/probe/probe_caps_linux_test.go
@@ -16,3 +16,29 @@ func getNice(t *testing.T) (int, error) {
 	t.Helper()
 	return unix.Getpriority(unix.PRIO_PROCESS, 0)
 }
+
+// TestCheckCapSysNiceIsSideEffectFree guards against a regression of the
+// dangerous Setpriority-based probe (finding 5.2): the real read-only
+// implementation must not change the process scheduling priority. We
+// snapshot the current nice value, run the check, and assert it is
+// unchanged. The boolean result itself is environment-dependent and not
+// asserted. Linux-only (lives here with getNice so the cross-platform
+// probe_test.go compiles on darwin/non-linux).
+func TestCheckCapSysNiceIsSideEffectFree(t *testing.T) {
+	sp := defaultProber()
+	if sp.CheckCapSysNice == nil {
+		t.Skip("no CheckCapSysNice on this platform")
+	}
+	before, err := getNice(t)
+	if err != nil {
+		t.Skipf("cannot read nice value: %v", err)
+	}
+	_ = sp.CheckCapSysNice() // result is environment-dependent; we test for side effects
+	after, err := getNice(t)
+	if err != nil {
+		t.Fatalf("cannot read nice value after check: %v", err)
+	}
+	if before != after {
+		t.Fatalf("CheckCapSysNice mutated process nice: before=%d after=%d", before, after)
+	}
+}
diff --git a/probe/probe_test.go b/probe/probe_test.go
index 8debb548..60944acd 100644
--- a/probe/probe_test.go
+++ b/probe/probe_test.go
@@ -314,34 +314,6 @@ func TestSQPollGateAtOptionalTier(t *testing.T) {
 	}
 }
 
-// TestCheckCapSysNiceIsSideEffectFree guards against a regression of the
-// dangerous Setpriority-based probe (finding 5.2): the real read-only
-// implementation must not change the process scheduling priority. We
-// snapshot the current nice value, run the check, and assert it is
-// unchanged. The boolean result itself is environment-dependent and not
-// asserted.
-func TestCheckCapSysNiceIsSideEffectFree(t *testing.T) {
-	if runtime.GOOS != "linux" {
-		t.Skip("CAP_SYS_NICE check is linux-only")
-	}
-	sp := defaultProber()
-	if sp.CheckCapSysNice == nil {
-		t.Skip("no CheckCapSysNice on this platform")
-	}
-	before, err := getNice(t)
-	if err != nil {
-		t.Skipf("cannot read nice value: %v", err)
-	}
-	_ = sp.CheckCapSysNice() // result is environment-dependent; we test for side effects
-	after, err := getNice(t)
-	if err != nil {
-		t.Fatalf("cannot read nice value after check: %v", err)
-	}
-	if before != after {
-		t.Fatalf("CheckCapSysNice mutated process nice: before=%d after=%d", before, after)
-	}
-}
-
 // TestProbeSendfileAndZerocopy pins the sendfile / zerocopy capability
 // flags (celeris#317). Sendfile is unconditional on Linux (kernel
 // 2.6.23+, every distro past 2.6.33) and is now actually wired into the
diff --git a/protocol/h1/parser.go b/protocol/h1/parser.go
index 89b0058e..36e4bf53 100644
--- a/protocol/h1/parser.go
+++ b/protocol/h1/parser.go
@@ -86,15 +86,12 @@ func (p *Parser) ParseRequest(req *Request) (int, error) {
 		return 0, nil
 	}
 
-	remaining := p.buf[p.pos:]
-	// Quick check: if remaining starts with \r\n, headers are empty (no headers).
-	// Otherwise use SIMD-accelerated findHeaderEnd to verify \r\n\r\n is present.
-	if len(remaining) < 2 || remaining[0] != '\r' || remaining[1] != '\n' {
-		if findHeaderEnd(remaining) < 0 {
-			return 0, nil
-		}
-	}
-
+	// No upfront whole-block findHeaderEnd scan: parseHeaders detects an
+	// incomplete block itself (a final line with no CRLF yields lineEnd==-1 →
+	// (false,nil)), and every caller Reset()s parser+req before re-parsing, so
+	// a partial parse is always retried cleanly. Scanning here first would
+	// CRLF-walk the same bytes parseHeaders walks again — pure double work on
+	// the common single-read request.
 	if p.noStringHeaders {
 		req.Headers = req.Headers[:0]
 	} else {
diff --git a/protocol/h2/stream/stream.go b/protocol/h2/stream/stream.go
index 77a95e56..46cf7b9b 100644
--- a/protocol/h2/stream/stream.go
+++ b/protocol/h2/stream/stream.go
@@ -314,6 +314,12 @@ func (s *Stream) resetAndPool() {
 }
 
 // ResetH1Stream performs a lightweight per-request reset for H1 stream reuse.
+// It clears ONLY the fields populateCachedStream + handleH1Request do not
+// unconditionally overwrite on the next request. rawBody is load-bearing: the
+// caller installs it only when the request carries a body, so a bodyless GET
+// would otherwise inherit the previous request's body. Headers/Method/Path/
+// Scheme/Authority/LazyRawHeaders/IsHEAD/EndStream/ResponseWriter/state are all
+// re-set per request (h1.go:852-941), so re-zeroing them here is dead work.
 func ResetH1Stream(s *Stream) {
 	if s.Data != nil {
 		s.Data.Reset()
@@ -321,19 +327,9 @@ func ResetH1Stream(s *Stream) {
 		s.Data = nil
 	}
 	s.rawBody = nil
-	s.Headers = s.hdrBuf[:0]
-	s.LazyRawHeaders = nil
 	s.lazyHeadersBuilt = false
 	s.pseudoMaterialized = false
-	s.Method = ""
-	s.Path = ""
-	s.Scheme = ""
-	s.Authority = ""
 	s.headersSent.Store(false)
-	s.EndStream = false
-	s.ResponseWriter = nil
-	s.IsHEAD = false
-	s.state.Store(int32(StateIdle))
 }
 
 // MaterializeHeaders ensures that every header carried by this H1 stream
diff --git a/protocol/h2/stream/stream_test.go b/protocol/h2/stream/stream_test.go
new file mode 100644
index 00000000..854c6831
--- /dev/null
+++ b/protocol/h2/stream/stream_test.go
@@ -0,0 +1,48 @@
+package stream
+
+import "testing"
+
+// TestResetH1StreamClearsUniqueFields locks in the contract that ResetH1Stream
+// owns: the fields the per-request caller does NOT unconditionally overwrite.
+// rawBody is the load-bearing one — a bodyless GET that follows a request with
+// a body must not inherit the stale body (celeris#346).
+func TestResetH1StreamClearsUniqueFields(t *testing.T) {
+	s := NewH1Stream(1)
+
+	s.SetRawBody([]byte("previous request body"))
+	s.GetBuf().WriteString("buffered data")
+	s.lazyHeadersBuilt = true
+	s.pseudoMaterialized = true
+	s.headersSent.Store(true)
+
+	ResetH1Stream(s)
+
+	if s.rawBody != nil {
+		t.Fatalf("rawBody = %q, want nil (stale body would leak into next request)", s.rawBody)
+	}
+	if s.Data != nil {
+		t.Fatalf("Data = %v, want nil (buffer must be returned to the pool)", s.Data)
+	}
+	if s.lazyHeadersBuilt {
+		t.Fatal("lazyHeadersBuilt = true, want false")
+	}
+	if s.pseudoMaterialized {
+		t.Fatal("pseudoMaterialized = true, want false")
+	}
+	if s.headersSent.Load() {
+		t.Fatal("headersSent = true, want false")
+	}
+}
+
+func BenchmarkResetH1Stream(b *testing.B) {
+	s := NewH1Stream(1)
+	body := []byte("hello")
+	b.ReportAllocs()
+	for b.Loop() {
+		s.SetRawBody(body)
+		s.lazyHeadersBuilt = true
+		s.pseudoMaterialized = true
+		s.headersSent.Store(true)
+		ResetH1Stream(s)
+	}
+}
diff --git a/resource/config.go b/resource/config.go
index 25b0e2f1..32fffdef 100644
--- a/resource/config.go
+++ b/resource/config.go
@@ -83,103 +83,6 @@ type Config struct {
 	// from celeris.Config.EnableH2Upgrade (pointer, may be nil) and Protocol.
 	// Always a concrete value after WithDefaults.
 	EnableH2Upgrade bool
-	// SkipBuiltinScaler suppresses the per-engine dynamic worker scaler
-	// loop. Set by the adaptive engine when it constructs its sub-engines —
-	// adaptive runs ONE higher-level scaler that delegates to the active
-	// sub-engine, so the iouring + epoll built-in scalers must stay quiet
-	// to avoid two scalers fighting over the same worker pool.
-	SkipBuiltinScaler bool
-	// WorkerScaling configures the dynamic worker scaler. As of v1.4.6
-	// (issue #281), [celeris.Config.toResourceConfig] sets this to a
-	// zero-value struct whenever the user-facing field was nil — the
-	// scaler is therefore DEFAULT-ON for any user constructing the
-	// server via celeris.New. Direct callers of resource.Config (i.e.
-	// engine tests + the resource_test suite) still see nil-as-disabled
-	// to keep the legacy contract for that low-level surface.
-	//
-	// When set, the scaler activates and deactivates workers based on
-	// observed load to keep CQE/event batching density in the sweet
-	// spot. See WorkerScalingConfig for tuning details.
-	WorkerScaling *WorkerScalingConfig
-}
-
-// WorkerScalingStrategy selects the seed strategy for the dynamic
-// worker scaler. The zero value (ScalingStrategyStartHigh) is the
-// recommended default: start at NumWorkers active, scale down once
-// load is observably low. This preserves SO_REUSEPORT distribution at
-// startup, which the spike-B sweep showed is dramatically better on
-// ramp / oscil traffic patterns (+34-78 % across all three engines).
-type WorkerScalingStrategy int
-
-const (
-	// ScalingStrategyStartHigh seeds the scaler at NumWorkers active.
-	// Best for production where traffic ramps from idle and bursts.
-	// Zero value — selected when the field is unset.
-	ScalingStrategyStartHigh WorkerScalingStrategy = 0
-	// ScalingStrategyStartLow seeds the scaler at MinActive. Best when
-	// the application has a long idle warmup before any conns arrive
-	// and saving CPU during that idle period matters more than peak
-	// throughput on the first burst.
-	ScalingStrategyStartLow WorkerScalingStrategy = 1
-)
-
-// WorkerScalingConfig controls the dynamic worker scaler used by the
-// iouring, epoll, and adaptive engines. Zero values mean "use the
-// scaler's default" — see field comments for what those are. Pass via
-// celeris.Config.WorkerScaling to enable; nil disables the scaler
-// entirely (the engine runs all configured workers all the time, like
-// versions before the scaler was introduced).
-//
-// The scaler tracks the engine's activeConns counter and adjusts the
-// number of "active" workers (workers participating in the SO_REUSEPORT
-// group) so that conns / active is roughly TargetConnsPerWorker. Scale-up
-// is reactive (next tick after a load increase). Scale-down is hysteretic
-// — must observe ScaleDownIdleTicks consecutive ticks below the
-// hysteresis threshold before reducing one worker.
-type WorkerScalingConfig struct {
-	// Strategy picks the seed-state strategy. Zero value is
-	// ScalingStrategyStartHigh, which is the data-validated default for
-	// most production workloads. See WorkerScalingStrategy for tuning.
-	Strategy WorkerScalingStrategy
-	// MinActive is the floor on the active worker count. The scaler will
-	// never reduce active workers below this. Defaults to max(2, NumCPU/2).
-	// Set to NumCPU to force the scaler to always run at full capacity
-	// (effectively a static-w=NumCPU configuration).
-	MinActive int
-	// TargetConnsPerWorker is the active-worker scaling target. The scaler
-	// computes desired = ceil(activeConns / TargetConnsPerWorker), clamps
-	// to [MinActive, NumWorkers], and steers active toward that. Default 20.
-	// Higher values keep more conns per worker (better batching, less
-	// parallelism). Lower values prefer parallelism over batching.
-	TargetConnsPerWorker int
-	// Interval controls how often the scaler reevaluates active count.
-	// Default 250ms. Lower values respond to load changes faster but burn
-	// more CPU on the controller goroutine.
-	Interval time.Duration
-	// ScaleUpStep is the maximum number of workers the scaler will
-	// resume per tick. Default 2 — wider bursts disrupt SO_REUSEPORT
-	// load balancing more than they help. Bigger values are tempting on
-	// SPIKE workloads but produce worse throughput per the v1.4.1
-	// SPIKE-test sweep (upStep=4 and upStep=8 both lost to upStep=2).
-	ScaleUpStep int
-	// ScaleDownStep is the maximum number of workers the scaler will
-	// pause per tick when load drops. Default 1 — scale-down too quickly
-	// and you can't recover throughput when load comes back.
-	ScaleDownStep int
-	// ScaleDownHysteresis adds a buffer between desired and active
-	// before scale-down fires: scale-down only if desired ≤ active -
-	// ScaleDownHysteresis - 1. Default 1 (so a desired-of-N triggers
-	// scale-down only when active is N+2 or higher).
-	ScaleDownHysteresis int
-	// ScaleDownIdleTicks is how many consecutive sub-threshold ticks
-	// must pass before a single scale-down step fires. Default 4
-	// (= 1 second at the default 250ms interval). Tunes how patient
-	// the scaler is about temporary lulls: a request-rate dip of one
-	// tick will not trigger scale-down.
-	ScaleDownIdleTicks int
-	// Trace logs every scaler decision (active, desired, idle_ticks).
-	// Default false. Use when diagnosing scaling behaviour.
-	Trace bool
 }
 
 // Validate checks all config fields and returns any validation errors.
diff --git a/resource/resource.go b/resource/resource.go
index 756ed1d6..7325ef43 100644
--- a/resource/resource.go
+++ b/resource/resource.go
@@ -1,5 +1,29 @@
 package resource
 
+// WorkloadHint is an OPTIONAL operator declaration of the expected steady-state
+// concurrency, consumed only by the adaptive engine's start-engine decision.
+//
+// Because established connections cannot migrate between epoll and io_uring,
+// the START engine decides keep-alive throughput; and the expected concurrency
+// is unknowable at Listen() time (no connections exist yet). This hint is the
+// ONLY input that can make the adaptive engine START on io_uring. Absent it,
+// the engine defaults to epoll (every server ramps from zero connections — the
+// low-concurrency regime where epoll wins on both throughput and tail latency)
+// and lets the runtime switch route NEW connections up to io_uring under load.
+type WorkloadHint int
+
+const (
+	// WorkloadUnspecified (zero value) leaves the start-engine choice to the
+	// default policy: start epoll, promote new conns to io_uring under load.
+	WorkloadUnspecified WorkloadHint = iota
+	// WorkloadLowConcurrency explicitly declares thin/latency-sensitive traffic
+	// — start (and stay) on epoll.
+	WorkloadLowConcurrency
+	// WorkloadHighConcurrency explicitly declares many h1 keep-alive
+	// connections per worker — start on io_uring (when kernel + memlock allow).
+	WorkloadHighConcurrency
+)
+
 // Resources allows user overrides of default resource values.
 // Zero values mean "use engine default".
 type Resources struct {
@@ -13,6 +37,9 @@ type Resources struct {
 	SocketSend int
 	// MaxConns is the max simultaneous connections per worker (0 = unlimited).
 	MaxConns int
+	// WorkloadHint is an OPTIONAL operator concurrency declaration; the only
+	// input that can make the adaptive engine START on io_uring (see WorkloadHint).
+	WorkloadHint WorkloadHint
 }
 
 // ResolvedResources contains the final computed values after applying defaults,
diff --git a/router.go b/router.go
index a1f6b9e2..50697047 100644
--- a/router.go
+++ b/router.go
@@ -5,8 +5,15 @@ import (
 	"log/slog"
 	"slices"
 	"sync"
+	"sync/atomic"
+	"time"
 )
 
+// nowNano returns the current time in Unix nanoseconds. A package var so the
+// adaptive promotion TTL (celeris#364) can be exercised deterministically in
+// tests without sleeping.
+var nowNano = func() int64 { return time.Now().UnixNano() }
+
 // staticEntry holds the pre-composed handler chain and full path for a fully
 // static route, enabling O(1) map lookup instead of a trie walk.
 type staticEntry struct {
@@ -95,6 +102,35 @@ type router struct {
 	// decide whether the async dispatch infrastructure is needed at
 	// all (a server with zero async routes keeps the inline fast path).
 	asyncRouteCount int
+
+	// adaptiveRoutes (celeris#356) holds the fullPaths of routes that
+	// inherited the server-level AsyncHandlers=true default (rather than an
+	// explicit .Async()). They start INLINE (ring-batched send, the cheap
+	// path) and are promoted to async dispatch only when an inline run is
+	// observed to block — so trivial handlers keep io_uring's syscall
+	// batching while genuinely-blocking handlers still get goroutine
+	// isolation. Built at registration; read-only while serving.
+	adaptiveRoutes map[string]bool
+	// promoted records adaptive fullPaths that have been promoted to async
+	// after sustained blocking inline runs. sync.Map: concurrent reads on the
+	// hot path, rare writes at promotion time.
+	promoted sync.Map
+	// slowStreak tracks consecutive slow inline runs per adaptive fullPath
+	// (fullPath -> *atomic.Int32). Hysteresis: a one-off cold/GC outlier must
+	// not poison a fast route; only a handler that blocks on EVERY run (DB/
+	// cache) accumulates adaptivePromoteStreak slow runs and gets promoted. A
+	// fast run resets the streak.
+	slowStreak sync.Map
+	// fastStreak tracks consecutive FAST inline runs per adaptive fullPath
+	// (fullPath -> *atomic.Int32). Once a route is fast on adaptiveSettleStreak
+	// consecutive runs it is provably non-blocking and gets SETTLED — future
+	// requests skip the per-request inline timing (two time.Now() vDSO calls)
+	// entirely (celeris#361). A slow run resets it.
+	fastStreak sync.Map
+	// settled holds adaptive fullPaths proven non-blocking (see fastStreak).
+	// A settled route is no longer timed/promotable; it runs inline like a
+	// plain sync route. Explicit .Async()/.Sync() clears it (setAsync).
+	settled sync.Map
 }
 
 // Route is an opaque handle to a registered route. Use the Name method to
@@ -205,6 +241,22 @@ func (r *Route) Sync() *Route {
 	return r.setAsync(false)
 }
 
+// UsesDriver marks a route whose handler performs a blocking backend round-trip
+// via a celeris driver (postgres / redis / memcached) opened WithEngine(srv).
+// It is exactly equivalent to .Async() — the route is dispatched off the worker
+// so the blocking call parks the handler goroutine on netpoll instead of
+// stalling an I/O worker — but reads as intent at the call site and is the
+// recommended way to mark driver routes:
+//
+//	srv.GET("/users/:id", getUser).UsesDriver()   // == .Async(), clearer intent
+//
+// The adaptive safety net (Config.AsyncHandlers=true) only auto-promotes
+// handlers slower than ~300µs, so a fast localhost driver call (sub-300µs) would
+// otherwise keep blocking a worker every request — mark such routes explicitly.
+func (r *Route) UsesDriver() *Route {
+	return r.setAsync(true)
+}
+
 // setAsync is the shared implementation of [Route.Async] and [Route.Sync].
 // Updates the node, the router's asyncRouteCount, and the static-fast-path
 // map entry so all three observers see the same async flag.
@@ -212,11 +264,28 @@ func (r *Route) setAsync(want bool) *Route {
 	if r.node == nil {
 		return r
 	}
-	if r.node.async != want && r.router != nil {
-		if want {
-			r.router.asyncRouteCount++
-		} else if r.router.asyncRouteCount > 0 {
-			r.router.asyncRouteCount--
+	if r.router != nil {
+		// celeris#356: an explicit .Async()/.Sync() overrides the adaptive
+		// (server-default) classification — the route is no longer
+		// inline-first-with-promotion, so drop it from the adaptive set and
+		// clear any prior promotion.
+		if r.router.adaptiveRoutes[r.path] {
+			delete(r.router.adaptiveRoutes, r.path)
+			r.router.promoted.Delete(r.path)
+			r.router.settled.Delete(r.path)
+			r.router.fastStreak.Delete(r.path)
+			r.router.slowStreak.Delete(r.path)
+			// Adaptive routes were already counted (they may promote): keep the
+			// count when the explicit choice is async, drop it when sync.
+			if !want && r.router.asyncRouteCount > 0 {
+				r.router.asyncRouteCount--
+			}
+		} else if r.node.async != want {
+			if want {
+				r.router.asyncRouteCount++
+			} else if r.router.asyncRouteCount > 0 {
+				r.router.asyncRouteCount--
+			}
 		}
 	}
 	r.node.async = want
@@ -234,8 +303,102 @@ func (r *Route) setAsync(want bool) *Route {
 
 func newRouter() *router {
 	return &router{
-		namedRoutes: make(map[string]*Route),
+		namedRoutes:    make(map[string]*Route),
+		adaptiveRoutes: make(map[string]bool),
+	}
+}
+
+// isPromoted reports whether an adaptive route (celeris#356) is currently
+// promoted to async dispatch.
+//
+// Promotion is REVERSIBLE (celeris#364): it expires after adaptivePromoteTTL.
+// Once expired, the route is dropped from the promoted set and its slow streak
+// is cleared, so the next request runs INLINE again and is re-timed — both the
+// routing decision (adaptivePromoted) and the per-request timing decision
+// (adaptiveLearning) key off this method, so they flip back together. A route
+// that is genuinely blocking re-promotes within adaptivePromoteStreak runs; a
+// route that was promoted by a transient load/jitter spike (a CPU-bound chain
+// whose inline wall-clock briefly crossed the threshold under contention) stays
+// inline. Without this, a single spike pinned a route to the slower async path
+// until process restart (the intermittent ~32% chain collapse).
+//
+// The clock is read only for routes actually in the promoted set, so learning
+// and settled routes pay nothing here.
+func (r *router) isPromoted(fullPath string) bool {
+	v, ok := r.promoted.Load(fullPath)
+	if !ok {
+		return false
 	}
+	if nowNano()-v.(int64) > int64(adaptivePromoteTTL) {
+		// Expired: re-enter the learning/inline path to re-evaluate.
+		r.promoted.Delete(fullPath)
+		if sv, ok := r.slowStreak.Load(fullPath); ok {
+			sv.(*atomic.Int32).Store(0)
+		}
+		return false
+	}
+	return true
+}
+
+// promoteRoute marks an adaptive route as async after sustained blocking inline
+// runs, stamped with the promotion time so isPromoted can expire it
+// (celeris#364). Re-promotion refreshes the stamp.
+func (r *router) promoteRoute(fullPath string) {
+	r.promoted.Store(fullPath, nowNano())
+}
+
+// promoteRouteImmediate promotes an adaptive route after a single
+// unambiguously-blocking inline run (> adaptiveBlockingThreshold), skipping the
+// adaptivePromoteStreak hysteresis. The fast streak is cleared so a burst of
+// prior fast runs cannot settle the route once the promotion TTL expires and it
+// re-enters the learning path.
+func (r *router) promoteRouteImmediate(fullPath string) {
+	if fv, ok := r.fastStreak.Load(fullPath); ok {
+		fv.(*atomic.Int32).Store(0)
+	}
+	r.promoteRoute(fullPath)
+}
+
+// recordInlineRun feeds one inline-run observation into the adaptive
+// classifier (celeris#356). A fast run resets the slow streak; a slow run
+// increments it, and once a route is slow on adaptivePromoteStreak CONSECUTIVE
+// runs it is promoted to async. The consecutive requirement makes a single cold
+// start / GC pause harmless while a genuinely-blocking handler (slow on every
+// run) promotes within a handful of requests.
+func (r *router) recordInlineRun(fullPath string, slow bool) {
+	if !slow {
+		if v, ok := r.slowStreak.Load(fullPath); ok {
+			v.(*atomic.Int32).Store(0)
+		}
+		// celeris#361: a route fast on adaptiveSettleStreak CONSECUTIVE runs is
+		// provably non-blocking — settle it so adaptiveLearning short-circuits
+		// and handler.go stops timing every request forever.
+		fv, _ := r.fastStreak.LoadOrStore(fullPath, new(atomic.Int32))
+		if fv.(*atomic.Int32).Add(1) >= adaptiveSettleStreak {
+			r.settled.Store(fullPath, struct{}{})
+		}
+		return
+	}
+	if v, ok := r.fastStreak.Load(fullPath); ok {
+		v.(*atomic.Int32).Store(0)
+	}
+	v, _ := r.slowStreak.LoadOrStore(fullPath, new(atomic.Int32))
+	if v.(*atomic.Int32).Add(1) >= adaptivePromoteStreak {
+		r.promoteRoute(fullPath)
+	}
+}
+
+// adaptiveLearning reports whether an adaptive route is still being observed —
+// i.e. neither settled (proven non-blocking, celeris#361) nor promoted (proven
+// blocking, celeris#356). Only learning routes pay the per-request inline
+// timing in handler.go; once decided, the hot path skips it. Steady state is a
+// single sync.Map.Load (settled), the same lookup cost the prior isPromoted
+// check carried, but without the two time.Now() vDSO calls per request.
+func (r *router) adaptiveLearning(fullPath string) bool {
+	if _, ok := r.settled.Load(fullPath); ok {
+		return false
+	}
+	return !r.isPromoted(fullPath)
 }
 
 // addRoute registers a route inheriting the server-level async default.
@@ -253,6 +416,15 @@ func (r *router) addRouteWithAsync(method, path string, handlers []HandlerFunc,
 	validatePath(path)
 
 	async := r.resolveAsync(as)
+	// celeris#356: a route that inherits the server-level AsyncHandlers=true
+	// default (rather than an explicit .Async()) starts INLINE and is promoted
+	// to async only when an inline run is observed to block. Explicit
+	// .Async(true)/.Async(false) is honored verbatim.
+	adaptive := as == asyncDefault && r.defaultAsync
+	if adaptive {
+		async = false
+		r.adaptiveRoutes[path] = true
+	}
 
 	root := r.getTree(method)
 	if root == nil {
@@ -271,7 +443,7 @@ func (r *router) addRouteWithAsync(method, path string, handlers []HandlerFunc,
 		root.async = async
 		route.node = root
 		r.setStaticEntry(method, "/", staticEntry{handlers: handlers, fullPath: "/", async: async})
-		if async {
+		if async || adaptive {
 			r.asyncRouteCount++
 		}
 		return route
@@ -296,7 +468,7 @@ func (r *router) addRouteWithAsync(method, path string, handlers []HandlerFunc,
 		r.setStaticEntry(method, path, staticEntry{handlers: handlers, fullPath: path, async: async})
 	}
 
-	if async {
+	if async || adaptive {
 		r.asyncRouteCount++
 	}
 
@@ -321,19 +493,28 @@ func (r *router) routeAsync(method, path string) bool {
 	if idx >= 0 {
 		if m := r.staticRoutes[idx]; m != nil {
 			if e, ok := m[path]; ok {
-				return e.async
+				return e.async || r.adaptivePromoted(e.fullPath)
 			}
 		}
 	} else if r.customStatic != nil {
 		if m := r.customStatic[method]; m != nil {
 			if e, ok := m[path]; ok {
-				return e.async
+				return e.async || r.adaptivePromoted(e.fullPath)
 			}
 		}
 	}
 	var params Params
-	_, _, async := r.find(method, path, &params)
-	return async
+	_, fullPath, async := r.find(method, path, &params)
+	return async || r.adaptivePromoted(fullPath)
+}
+
+// adaptivePromoted reports whether an adaptive (server-default-async) route has
+// been promoted to async dispatch after a blocking inline run (celeris#356).
+// The adaptiveRoutes map is read-only while serving, so the lookup is a plain
+// (lock-free) map read followed by a sync.Map load only for adaptive routes;
+// non-adaptive configs (no AsyncHandlers default) keep the empty-map fast path.
+func (r *router) adaptivePromoted(fullPath string) bool {
+	return r.adaptiveRoutes[fullPath] && r.isPromoted(fullPath)
 }
 
 // warnDuplicateRoute emits a warning when a route is registered twice for
diff --git a/router_async_test.go b/router_async_test.go
index 87e03bd4..0185159a 100644
--- a/router_async_test.go
+++ b/router_async_test.go
@@ -21,16 +21,27 @@ func TestRouteAsync_ServerDefaultSync(t *testing.T) {
 	}
 }
 
-// TestRouteAsync_ServerDefaultAsync verifies routes inherit an async
-// server default (Config.AsyncHandlers=true) when not overridden.
-func TestRouteAsync_ServerDefaultAsync(t *testing.T) {
+// TestRouteAsync_ServerDefaultAdaptive verifies the celeris#356 contract: a
+// route that inherits the AsyncHandlers=true default (not an explicit .Async())
+// is ADAPTIVE — it starts INLINE (routeAsync=false, the ring-batched fast path)
+// and is promoted to async only after a blocking inline run. hasAsyncRoutes
+// stays true because adaptive routes may promote and need the async infra.
+func TestRouteAsync_ServerDefaultAdaptive(t *testing.T) {
 	s := New(Config{AsyncHandlers: true})
 	s.GET("/ping", noopHandler)
-	if !s.router.routeAsync("GET", "/ping") {
-		t.Fatal("route should inherit async server default")
+	if s.router.routeAsync("GET", "/ping") {
+		t.Fatal("adaptive route should start INLINE (not async) until it blocks")
+	}
+	if !s.router.adaptiveRoutes["/ping"] {
+		t.Fatal("/ping should be registered adaptive under AsyncHandlers=true")
 	}
 	if !s.router.hasAsyncRoutes() {
-		t.Fatal("hasAsyncRoutes should be true when default is async")
+		t.Fatal("hasAsyncRoutes should be true (adaptive routes may promote)")
+	}
+	// A blocking inline run promotes the route; it then resolves async.
+	s.router.promoteRoute("/ping")
+	if !s.router.routeAsync("GET", "/ping") {
+		t.Fatal("after promotion the adaptive route should resolve async")
 	}
 }
 
@@ -51,18 +62,47 @@ func TestRouteAsync_RouteOverrideOn(t *testing.T) {
 	}
 }
 
-// TestRouteAsync_RouteOverrideOff forces a single route sync on an
-// async-default server.
+// TestRouteAsync_RouteOverrideOff verifies that on an async-default server, an
+// inherited route is adaptive (inline-first, celeris#356) while an explicit
+// .Async(false) route is hard-sync and never adaptive.
 func TestRouteAsync_RouteOverrideOff(t *testing.T) {
 	s := New(Config{AsyncHandlers: true})
-	s.GET("/db", noopHandler)
-	s.GET("/cached", noopHandler).Async(false)
-	if !s.router.routeAsync("GET", "/db") {
-		t.Fatal("/db should inherit async default")
+	s.GET("/db", noopHandler)                  // inherits default → adaptive
+	s.GET("/cached", noopHandler).Async(false) // explicit sync → never adaptive
+	if s.router.routeAsync("GET", "/db") {
+		t.Fatal("/db inherits the async default → adaptive, starts inline")
+	}
+	if !s.router.adaptiveRoutes["/db"] {
+		t.Fatal("/db should be adaptive")
 	}
 	if s.router.routeAsync("GET", "/cached") {
 		t.Fatal("/cached should be forced sync via Async(false)")
 	}
+	if s.router.adaptiveRoutes["/cached"] {
+		t.Fatal("/cached is explicitly sync → must not be adaptive")
+	}
+}
+
+// TestRouteAsync_ExplicitAsyncOptsOutOfAdaptive is the celeris#356
+// no-regression guard for blocking handlers. A route explicitly marked
+// .Async() on an async-default server (exactly how the probatorium driver
+// routes — /cache, /db, /mc — register) must be hard-async, NOT adaptive:
+// it must never enter the inline-first window, so a blocking handler never
+// runs inline and stalls a worker. This is what makes #356 safe to ship
+// without a driver-backend regression: trivial routes inherit (adaptive →
+// inline win) while explicitly-async blocking routes stay always-async.
+func TestRouteAsync_ExplicitAsyncOptsOutOfAdaptive(t *testing.T) {
+	s := New(Config{AsyncHandlers: true})
+	s.GET("/cache/:key", noopHandler).Async() // blocking driver route
+	if s.router.adaptiveRoutes["/cache/:key"] {
+		t.Fatal("explicit .Async() route must be removed from the adaptive set")
+	}
+	if _, promoted := s.router.promoted.Load("/cache/:key"); promoted {
+		t.Fatal("explicit .Async() route must carry no promotion state")
+	}
+	if !s.router.routeAsync("GET", "/cache/42") {
+		t.Fatal("explicit .Async() route must resolve hard-async (never inline)")
+	}
 }
 
 // TestRouteAsync_GroupInherit verifies group-level Async applies to its
@@ -170,3 +210,177 @@ func TestRouteAsync_ResolverInterface(t *testing.T) {
 		t.Fatal("adapter RouteAsync should report unmatched sync")
 	}
 }
+
+// TestRouteAsync_AdaptiveHysteresis verifies celeris#356 promotion hysteresis:
+// a single slow inline run (cold start / GC) must NOT promote a route, but
+// adaptivePromoteStreak consecutive slow runs (a genuinely-blocking handler)
+// must. A fast run resets the streak.
+func TestRouteAsync_AdaptiveHysteresis(t *testing.T) {
+	s := New(Config{AsyncHandlers: true})
+	s.GET("/h", noopHandler)
+	rt := s.router
+	// One slow outlier, then a fast run → no promotion.
+	rt.recordInlineRun("/h", true)
+	rt.recordInlineRun("/h", false)
+	if rt.routeAsync("GET", "/h") {
+		t.Fatal("a single slow run must not promote (cold-start hysteresis)")
+	}
+	// Sustained slowness (blocking handler) → promotion after the streak.
+	for i := 0; i < adaptivePromoteStreak; i++ {
+		rt.recordInlineRun("/h", true)
+	}
+	if !rt.routeAsync("GET", "/h") {
+		t.Fatalf("%d consecutive slow runs must promote to async", adaptivePromoteStreak)
+	}
+}
+
+// TestRouteAsync_AdaptiveSettles verifies celeris#361: a route fast on
+// adaptiveSettleStreak CONSECUTIVE inline runs SETTLES (leaves the timed
+// learning path so the hot loop stops paying two time.Now() per request), and a
+// slow run before the streak completes resets it. A settled route still runs
+// inline (not async).
+func TestRouteAsync_AdaptiveSettles(t *testing.T) {
+	s := New(Config{AsyncHandlers: true})
+	route := s.GET("/s", noopHandler)
+	rt := s.router
+
+	// One short of the streak → still learning.
+	for i := 0; i < adaptiveSettleStreak-1; i++ {
+		rt.recordInlineRun("/s", false)
+	}
+	if !rt.adaptiveLearning("/s") {
+		t.Fatal("route must still be learning before the settle streak completes")
+	}
+
+	// A slow run resets the fast streak; the next near-full run must NOT settle.
+	rt.recordInlineRun("/s", true)
+	for i := 0; i < adaptiveSettleStreak-1; i++ {
+		rt.recordInlineRun("/s", false)
+	}
+	if !rt.adaptiveLearning("/s") {
+		t.Fatal("a slow run must reset the fast streak — route not settled yet")
+	}
+
+	// One more fast run completes the streak → settle.
+	rt.recordInlineRun("/s", false)
+	if rt.adaptiveLearning("/s") {
+		t.Fatalf("route must settle after %d consecutive fast runs", adaptiveSettleStreak)
+	}
+	if rt.routeAsync("GET", "/s") {
+		t.Fatal("a settled route runs INLINE, not async")
+	}
+
+	// An explicit override clears the settled state (setAsync).
+	route.Sync()
+	if _, settled := rt.settled.Load("/s"); settled {
+		t.Fatal("explicit .Sync() must clear the settled state")
+	}
+}
+
+// stubNowNano installs a controllable clock for the adaptive promotion TTL
+// (celeris#364) and returns a pointer to advance it plus a restore func.
+func stubNowNano() (clock *int64, restore func()) {
+	old := nowNano
+	var c int64
+	nowNano = func() int64 { return c }
+	return &c, func() { nowNano = old }
+}
+
+// TestRouteAsync_PromotionExpires verifies celeris#364: promotion is reversible.
+// A route promoted by a transient spike must de-promote after adaptivePromoteTTL
+// and run inline again, and the de-promotion must reset the slow streak so a
+// single later slow run does not immediately re-promote.
+func TestRouteAsync_PromotionExpires(t *testing.T) {
+	clock, restore := stubNowNano()
+	defer restore()
+
+	s := New(Config{AsyncHandlers: true})
+	s.GET("/d", noopHandler)
+	rt := s.router
+
+	for i := 0; i < adaptivePromoteStreak; i++ {
+		rt.recordInlineRun("/d", true)
+	}
+	if !rt.routeAsync("GET", "/d") {
+		t.Fatal("expected promotion after the slow streak")
+	}
+
+	// Within the TTL → still promoted.
+	*clock += int64(adaptivePromoteTTL) - 1
+	if !rt.routeAsync("GET", "/d") {
+		t.Fatal("promotion must persist within the TTL")
+	}
+
+	// Past the TTL → de-promoted, route runs inline again.
+	*clock += 2
+	if rt.routeAsync("GET", "/d") {
+		t.Fatal("promotion must expire after the TTL (route runs inline again)")
+	}
+	if _, ok := rt.promoted.Load("/d"); ok {
+		t.Fatal("expired promotion must be removed from the promoted set")
+	}
+
+	// The slow streak was reset: one slow run must NOT immediately re-promote.
+	rt.recordInlineRun("/d", true)
+	if rt.routeAsync("GET", "/d") {
+		t.Fatal("de-promotion must reset the slow streak (one slow run must not re-promote)")
+	}
+}
+
+// TestRouteAsync_DePromotedRouteCanSettle verifies that after a falsely-promoted
+// route de-promotes, sustained fast runs SETTLE it (proven non-blocking, inline
+// forever) — i.e. re-evaluation works end to end.
+func TestRouteAsync_DePromotedRouteCanSettle(t *testing.T) {
+	clock, restore := stubNowNano()
+	defer restore()
+
+	s := New(Config{AsyncHandlers: true})
+	s.GET("/d", noopHandler)
+	rt := s.router
+
+	for i := 0; i < adaptivePromoteStreak; i++ {
+		rt.recordInlineRun("/d", true)
+	}
+	*clock += int64(adaptivePromoteTTL) + 1
+	if rt.routeAsync("GET", "/d") {
+		t.Fatal("expected de-promotion at expiry")
+	}
+
+	for i := 0; i < adaptiveSettleStreak; i++ {
+		rt.recordInlineRun("/d", false)
+	}
+	if rt.adaptiveLearning("/d") {
+		t.Fatal("a de-promoted route that runs fast must settle (inline forever)")
+	}
+	if rt.routeAsync("GET", "/d") {
+		t.Fatal("a settled route runs inline, not async")
+	}
+}
+
+// TestRouteAsync_RePromotesAfterExpiryWhenBlocking verifies a genuinely-blocking
+// adaptive route re-promotes after the TTL re-evaluation — de-promotion must not
+// pin a blocking handler to the inline path.
+func TestRouteAsync_RePromotesAfterExpiryWhenBlocking(t *testing.T) {
+	clock, restore := stubNowNano()
+	defer restore()
+
+	s := New(Config{AsyncHandlers: true})
+	s.GET("/b", noopHandler)
+	rt := s.router
+
+	for i := 0; i < adaptivePromoteStreak; i++ {
+		rt.recordInlineRun("/b", true)
+	}
+	*clock += int64(adaptivePromoteTTL) + 1
+	if rt.routeAsync("GET", "/b") {
+		t.Fatal("expected de-promotion at expiry")
+	}
+
+	// Still blocking → re-promotes after a fresh streak.
+	for i := 0; i < adaptivePromoteStreak; i++ {
+		rt.recordInlineRun("/b", true)
+	}
+	if !rt.routeAsync("GET", "/b") {
+		t.Fatal("a still-blocking route must re-promote after expiry")
+	}
+}
diff --git a/server.go b/server.go
index 6cf14e09..515f1e99 100644
--- a/server.go
+++ b/server.go
@@ -21,7 +21,7 @@ import (
 )
 
 // Version is the semantic version of the celeris module.
-const Version = "1.5.0"
+const Version = "1.5.3"
 
 // ErrAlreadyStarted is returned when Start or StartWithContext is called on a
 // server that is already running.
@@ -476,7 +476,16 @@ func (s *Server) EventLoopProvider() engine.EventLoopProvider {
 //     (their net.TCPConn goroutines keep running regardless of which
 //     sub-engine is active).
 func (s *Server) AsyncHandlers() bool {
-	if !s.config.AsyncHandlers {
+	// Report the EFFECTIVE async state, not the raw Config flag: the server
+	// stands up async-dispatch infrastructure when the server-level default is
+	// set OR any route opted in via .Async() (doPrepare flips the engine cfg the
+	// same way via hasAsyncRoutes). WithEngine drivers consult this to pick their
+	// netpoll-park fast path; keying on the raw config alone put a driver called
+	// from a per-route-.Async() handler on the slower mini-loop path — the
+	// documented footgun. NOTE: hasAsyncRoutes reflects routes registered so far,
+	// so open WithEngine drivers AFTER registering .Async() routes (or set
+	// Config.AsyncHandlers=true) to be order-independent.
+	if !s.config.AsyncHandlers && !s.router.hasAsyncRoutes() {
 		return false
 	}
 	switch s.config.Engine {
diff --git a/server_test.go b/server_test.go
index 73fa0538..aa98294e 100644
--- a/server_test.go
+++ b/server_test.go
@@ -11,7 +11,6 @@ import (
 
 	"github.com/goceleris/celeris/engine"
 	"github.com/goceleris/celeris/protocol/h2/stream"
-	"github.com/goceleris/celeris/resource"
 )
 
 func TestServerEngineInfo(t *testing.T) {
@@ -1009,43 +1008,6 @@ func TestConfigZeroValuesNotMapped(t *testing.T) {
 	}
 }
 
-// TestConfigWorkerScalingDefaultOn pins the v1.4.6 default-on contract
-// for issue #281: nil Config.WorkerScaling resolves to a non-nil,
-// zero-value resource.WorkerScalingConfig — the scaler activates
-// automatically with the data-validated defaults
-// (Strategy=StartHigh, MinActive=max(2,NumCPU/2),
-// TargetConnsPerWorker=20, Interval=250ms, ScaleUpStep=2,
-// ScaleDownStep=1, ScaleDownHysteresis=1, ScaleDownIdleTicks=4).
-//
-// This brings the "just-works" public design intent (Adaptive engine,
-// Auto protocol) into alignment for the third major default.
-func TestConfigWorkerScalingDefaultOn(t *testing.T) {
-	cfg := Config{Addr: ":8080"} // no WorkerScaling field set
-	rc := cfg.toResourceConfig()
-	if rc.WorkerScaling == nil {
-		t.Fatal("nil Config.WorkerScaling did not resolve to a non-nil resource.WorkerScalingConfig — scaler default-on regressed")
-	}
-	if *rc.WorkerScaling != (resource.WorkerScalingConfig{}) {
-		t.Fatalf("expected zero-value WorkerScalingConfig (so scaler picks data-validated defaults), got %+v", *rc.WorkerScaling)
-	}
-}
-
-// TestConfigWorkerScalingExplicitPreserved pins that user-supplied
-// values are passed through verbatim: the default-on path only fires
-// when the user did NOT configure the scaler.
-func TestConfigWorkerScalingExplicitPreserved(t *testing.T) {
-	want := &resource.WorkerScalingConfig{
-		Strategy:             resource.ScalingStrategyStartLow,
-		MinActive:            7,
-		TargetConnsPerWorker: 99,
-	}
-	cfg := Config{Addr: ":8080", WorkerScaling: want}
-	rc := cfg.toResourceConfig()
-	if rc.WorkerScaling != want {
-		t.Fatalf("explicit WorkerScaling pointer was not preserved: got %p, want %p", rc.WorkerScaling, want)
-	}
-}
-
 // Phase 2.1 tests
 
 func TestRouteUseMiddleware(t *testing.T) {
diff --git a/test/drivercmp/memcached/go.mod b/test/drivercmp/memcached/go.mod
index 144cb857..df597ee0 100644
--- a/test/drivercmp/memcached/go.mod
+++ b/test/drivercmp/memcached/go.mod
@@ -3,7 +3,7 @@ module github.com/goceleris/celeris/test/drivercmp/memcached
 go 1.26.4
 
 require (
-	github.com/bradfitz/gomemcache v0.0.0-20250403215159-8d39553ac7cf
+	github.com/bradfitz/gomemcache v0.0.0-20260422231931-4d751bb6e37c
 	github.com/goceleris/celeris v0.0.0
 )
 
diff --git a/test/drivercmp/memcached/go.sum b/test/drivercmp/memcached/go.sum
index 581e931c..0f9dd4b5 100644
--- a/test/drivercmp/memcached/go.sum
+++ b/test/drivercmp/memcached/go.sum
@@ -1,4 +1,4 @@
-github.com/bradfitz/gomemcache v0.0.0-20250403215159-8d39553ac7cf h1:TqhNAT4zKbTdLa62d2HDBFdvgSbIGB3eJE8HqhgiL9I=
-github.com/bradfitz/gomemcache v0.0.0-20250403215159-8d39553ac7cf/go.mod h1:r5xuitiExdLAJ09PR7vBVENGvp4ZuTBeWTGtxuX3K+c=
+github.com/bradfitz/gomemcache v0.0.0-20260422231931-4d751bb6e37c h1:6Gpm9YYUEQx2T9zMsYolQhr6sjwwGtFitSA0pQsa7a8=
+github.com/bradfitz/gomemcache v0.0.0-20260422231931-4d751bb6e37c/go.mod h1:r5xuitiExdLAJ09PR7vBVENGvp4ZuTBeWTGtxuX3K+c=
 golang.org/x/sys v0.46.0 h1:noSf2Fq6F8DBgS+LysIkx7rIExoNHJsxOAtPp4rthXw=
 golang.org/x/sys v0.46.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
diff --git a/test/perfmatrix/go.mod b/test/perfmatrix/go.mod
index 954c9d03..2caad41f 100644
--- a/test/perfmatrix/go.mod
+++ b/test/perfmatrix/go.mod
@@ -4,12 +4,12 @@ go 1.26.4
 
 require (
 	github.com/HdrHistogram/hdrhistogram-go v1.2.0
-	github.com/bradfitz/gomemcache v0.0.0-20250403215159-8d39553ac7cf
+	github.com/bradfitz/gomemcache v0.0.0-20260422231931-4d751bb6e37c
 	github.com/cloudwego/hertz v0.10.5
 	github.com/gin-gonic/gin v1.12.0
 	github.com/go-chi/chi/v5 v5.3.0
 	github.com/goceleris/celeris v1.4.15
-	github.com/goceleris/loadgen v1.4.8
+	github.com/goceleris/loadgen v1.4.9
 	github.com/gofiber/fiber/v3 v3.3.0
 	github.com/google/uuid v1.6.0
 	github.com/gorilla/sessions v1.4.0
diff --git a/test/perfmatrix/go.sum b/test/perfmatrix/go.sum
index efbc701a..b04f85db 100644
--- a/test/perfmatrix/go.sum
+++ b/test/perfmatrix/go.sum
@@ -18,8 +18,8 @@ github.com/andybalholm/brotli v1.2.1 h1:R+f5xP285VArJDRgowrfb9DqL18yVK0gKAW/F+eT
 github.com/andybalholm/brotli v1.2.1/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
 github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk=
 github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4=
-github.com/bradfitz/gomemcache v0.0.0-20250403215159-8d39553ac7cf h1:TqhNAT4zKbTdLa62d2HDBFdvgSbIGB3eJE8HqhgiL9I=
-github.com/bradfitz/gomemcache v0.0.0-20250403215159-8d39553ac7cf/go.mod h1:r5xuitiExdLAJ09PR7vBVENGvp4ZuTBeWTGtxuX3K+c=
+github.com/bradfitz/gomemcache v0.0.0-20260422231931-4d751bb6e37c h1:6Gpm9YYUEQx2T9zMsYolQhr6sjwwGtFitSA0pQsa7a8=
+github.com/bradfitz/gomemcache v0.0.0-20260422231931-4d751bb6e37c/go.mod h1:r5xuitiExdLAJ09PR7vBVENGvp4ZuTBeWTGtxuX3K+c=
 github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs=
 github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c=
 github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
@@ -76,8 +76,8 @@ github.com/goccy/go-json v0.10.5 h1:Fq85nIqj+gXn/S5ahsiTlK3TmC85qgirsdTP/+DeaC4=
 github.com/goccy/go-json v0.10.5/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M=
 github.com/goccy/go-yaml v1.19.2 h1:PmFC1S6h8ljIz6gMRBopkjP1TVT7xuwrButHID66PoM=
 github.com/goccy/go-yaml v1.19.2/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA=
-github.com/goceleris/loadgen v1.4.8 h1:r162NrVoLxuCQ3IlhWp+nqAgeFikHImlUSagJbP98m8=
-github.com/goceleris/loadgen v1.4.8/go.mod h1:BtjUHc0ULnqa2LsSoJNzDdBt05xUx5jajeF6XnJfFJA=
+github.com/goceleris/loadgen v1.4.9 h1:Kd/AmLHP520Su3azQ9tCNoc6tsaeEf7Nx8ECr4AdYfg=
+github.com/goceleris/loadgen v1.4.9/go.mod h1:Olg2awQufUnRemRlCvFPFL6Ww3byUd+UvZYQAMJm6Co=
 github.com/gofiber/fiber/v3 v3.3.0 h1:QBd3sYCqdy6Qs5gJYzSw4I4SbqL204jPqpdub/ueiw8=
 github.com/gofiber/fiber/v3 v3.3.0/go.mod h1:YH7/TAoRaU4kF8slDCtQuFJ1NzC+3MtxUI4KfvQtaIA=
 github.com/gofiber/schema v1.7.1 h1:oSJBKdgP8JeIME4TQSAqlNKTU2iBB+2RNmKi8Nsc+TI=