diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index addbc320..02cb57c4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,7 +24,7 @@ jobs: name: Lint runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -64,7 +64,7 @@ jobs: name: Unit (root + middleware sub-modules) runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -107,7 +107,7 @@ jobs: name: Conformance runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -144,7 +144,7 @@ jobs: CELERIS_PG_DSN: postgres://celeris:celeris@127.0.0.1:5432/celeristest?sslmode=disable CELERIS_REDIS_ADDR: 127.0.0.1:6379 steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -164,7 +164,7 @@ jobs: matrix: os: [ubuntu-latest, macos-latest] steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -187,7 +187,7 @@ jobs: name: Vulnerability Check runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" diff --git a/.github/workflows/drivers.yml b/.github/workflows/drivers.yml index 9f4b9d5c..fc48462a 100644 --- a/.github/workflows/drivers.yml +++ b/.github/workflows/drivers.yml @@ -91,7 +91,7 @@ jobs: env: CELERIS_PG_DSN: postgres://celeris:celeris@127.0.0.1:5432/celeristest?sslmode=disable steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -132,7 +132,7 @@ jobs: env: CELERIS_REDIS_ADDR: 127.0.0.1:6379 steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -173,7 +173,7 @@ jobs: env: CELERIS_REDIS_ADDR: 127.0.0.1:6379 steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -210,7 +210,7 @@ jobs: env: CELERIS_MEMCACHED_ADDR: 127.0.0.1:11211 steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -263,7 +263,7 @@ jobs: --health-timeout 5s --health-retries 10 steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -296,7 +296,7 @@ jobs: REDIS_IMAGE: redis:${{ matrix.redis }} CELERIS_REDIS_CLUSTER_ADDRS: "127.0.0.1:7000,127.0.0.1:7001,127.0.0.1:7002" steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" @@ -382,7 +382,7 @@ jobs: CELERIS_REDIS_SENTINEL_ADDRS: "127.0.0.1:26379,127.0.0.1:26380,127.0.0.1:26381" CELERIS_REDIS_SENTINEL_MASTER: mymaster steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 690f35a2..0a87ec26 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -31,7 +31,7 @@ jobs: needs: [validate-tag, ci] runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 with: fetch-depth: 0 - name: Create sub-module tags @@ -70,7 +70,7 @@ jobs: needs: [validate-tag, ci, tag-submodules] runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v7 - uses: actions/setup-go@v6 with: go-version: "1.26.4" diff --git a/SECURITY.md b/SECURITY.md index 29345ae4..0b103a0f 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -2,14 +2,58 @@ ## Supported Versions -| Version | Supported | -|---------------|-----------| -| >= 1.4.0 | Yes | -| < 1.4.0 | No | - -Security updates are issued only for the 1.4.x line. Earlier versions -(1.3.x and below) no longer receive fixes, including critical ones — -upgrade to the latest 1.4.x to remain covered. +| Version | Supported | +|----------|-----------| +| >= 1.5.0 | Yes | +| 1.4.x | No | +| < 1.4.0 | No | + +Security updates are issued only for the 1.5.x line. The 1.4.x line and +earlier (1.3.x and below) no longer receive fixes, including critical +ones — upgrade to the latest 1.5.x to remain covered. + +### v1.5.x Security Improvements + +The 1.5.x line is the core-engine performance milestone (io_uring / epoll / +adaptive). Several of its changes are memory-safety or DoS-posture +relevant: + +- **io_uring use-after-free hardening under churn**: the io_uring engine + serializes close behind in-flight completions (cancel-then-release) and + tags each completion's `user_data` with a 16-bit per-connection + generation counter. A stale CQE arriving after a socket's slot has been + recycled is now detected by the generation mismatch and dropped instead + of being misrouted to the new connection occupying that slot — closing a + gen-collision window that could cross-wire two connections' buffers or + corrupt the heap under sustained POST / connection-churn load. + +- **H1 parser request-smuggling hardening**: the H1 request parser was + hardened against request-smuggling vectors and RFC 9110/9112 framing + violations (conflicting / duplicate `Content-Length`, `Transfer-Encoding` + vs `Content-Length` ambiguity, malformed chunk framing), so a + front-end / back-end desync cannot be induced through celeris. + +- **Latent data-race / memory-safety fixes**: two races that are + memory-safety bugs under the Go memory model were closed — a + `Start` / `Shutdown` race on the server CPU monitor, and the epoll + async-detach path's `h1State.Detached` flag, now an `atomic.Bool` + published with a release barrier so a detaching connection cannot be + observed half-initialized by the engine loop. + +- **`middleware/secure` default hardening (behavior change)**: + `Cross-Origin-Embedder-Policy` (`require-corp`) and `X-Download-Options` + (`noopen`) are now **off by default** and opt-in, matching Helmet's + posture. The previous COEP default silently broke cross-origin resources + without a corresponding security win for most apps; set + `CrossOriginEmbedderPolicy: "require-corp"` (or `"credentialless"`) + explicitly where cross-origin isolation is required. The default + secure-header count drops from 11 to 9 — **audit your deployment if you + relied on the implicit COEP header**. + +- **Go toolchain bump (1.26.3 → 1.26.4)**: every `go.mod` in the repo + (and the loadgen sub-module) moves to `go 1.26.4` for the stdlib + security fixes in that patch release; CI pins the explicit patch version + so a stale runner cache cannot regress. ### v1.4.2 Security Improvements @@ -164,7 +208,7 @@ posture is conservative: ## Historical (unsupported) -Per-version security notes for the 1.3.x line and earlier are preserved in the git history of this file (`git log SECURITY.md`). Those releases no longer receive fixes — upgrade to the latest 1.4.x to remain covered. +Per-version security notes for the 1.3.x line and earlier are preserved in the git history of this file (`git log SECURITY.md`). Those releases no longer receive fixes — upgrade to the latest 1.5.x to remain covered. ## Reporting a Vulnerability diff --git a/adaptive/controller.go b/adaptive/controller.go index 1687795a..145d1a0d 100644 --- a/adaptive/controller.go +++ b/adaptive/controller.go @@ -9,6 +9,30 @@ import ( "github.com/goceleris/celeris/engine" ) +// The adaptive engine starts on epoll and switches to io_uring under load +// using a DIRECT conns-per-worker policy (no benchmark fingerprinting, no +// CPU monitor required). The thresholds come from the empirical crossover on +// this hardware: epoll and io_uring tie up to ~16 conns/worker; io_uring +// pulls ahead above ~20/worker and keeps scaling while epoll plateaus +// (io_uring ~+14% at 64 conns/worker); epoll wins at ~1 conn (lower latency). +// +// Policy: +// - On epoll, switch UP to io_uring when conns/worker sustains the up +// threshold for sustainTicks consecutive ticks, OR snap immediately when +// conns/worker crosses the heavy-load high-watermark (the fast path). +// - On io_uring, revert DOWN to epoll when conns/worker sustains BELOW the +// down threshold for sustainTicks ticks. The down threshold sits well +// under the up threshold so the band between them is a hysteresis zone +// that prevents flapping. +// - Large-payload workloads (avg bytes/req above largePayloadBytes) are +// link-bound — the engines tie — so an io_uring switch is suppressed to +// avoid pointless churn. +// - A safety revert fires if io_uring is active and the error rate climbs +// above errorRevertRate, regardless of load. +// +// The oscillation lock (3 switches in 5 min → 5 min lock) and the post-switch +// cooldown bound any residual thrash and hold io_uring after the fast snap. + type controllerState struct { activeIsPrimary bool lastSwitch time.Time @@ -17,21 +41,50 @@ type controllerState struct { switchCount int locked bool lockUntil time.Time - lastActiveScore map[engine.EngineType]float64 - lastActiveTime map[engine.EngineType]time.Time + + // upTicks / downTicks count consecutive evaluations that satisfy the + // switch-up / switch-down condition; a normal switch needs sustainTicks + // of them, the heavy-load fast path needs only one. Reset on a switch + // (recordSwitch) and whenever the condition lapses. + upTicks int + downTicks int } type controller struct { - primary engine.Engine - secondary engine.Engine + primary engine.Engine // epoll (low-conns winner / starting engine) + secondary engine.Engine // io_uring (high-conns winner) sampler TelemetrySampler - weights ScoreWeights state controllerState evalInterval time.Duration cooldown time.Duration - threshold float64 - logger *slog.Logger + + upThreshold float64 // conns/worker: epoll → io_uring + downThreshold float64 // conns/worker: io_uring → epoll (hysteresis low edge) + highWatermark float64 // conns/worker: heavy-load fast-path snap + largePayloadBytes float64 // avg bytes/req above which io_uring is suppressed + errorRevertRate float64 // io_uring error rate above which we revert to epoll + sustainTicks int // consecutive ticks required for a normal switch + + // connSwitchEnabled gates the conns-per-worker UP switch (epoll→io_uring). + // Production sets it true ONLY on the epoll-start path with io_uring viable + // and a non-h2c protocol: there, a sustained high-concurrency ramp should + // promote NEW connections to io_uring (it wins ≥~24 conns/worker for h1 + // small payloads). It is false when io_uring is the start engine (nothing + // better to switch up to), when io_uring is unviable, or for h2c. The + // always-on error-revert below is independent of this flag. The engine sets + // it directly from the profile in New(). + connSwitchEnabled bool + + // loadDownRevert gates the LOAD-driven io_uring→epoll revert (evaluateDown). + // It is OFF in production: because pinned conns never migrate, reverting on + // a load dip strands established io_uring keep-alives and routes new conns + // back to epoll mid-ramp — pure harm. The always-on error-revert is + // independent of this flag. Defaults true in newController so the + // conns-per-worker unit tests exercise evaluateDown; New() sets it false. + loadDownRevert bool + + logger *slog.Logger } func newController(primary, secondary engine.Engine, sampler TelemetrySampler, logger *slog.Logger) *controller { @@ -39,20 +92,34 @@ func newController(primary, secondary engine.Engine, sampler TelemetrySampler, l primary: primary, secondary: secondary, sampler: sampler, - weights: DefaultWeights(), - evalInterval: 5 * time.Second, + evalInterval: 1 * time.Second, cooldown: 30 * time.Second, - threshold: 0.15, - logger: logger, + // Thresholds from the epoll-vs-io_uring sweep: io_uring overtakes epoll + // at ~24 conns/worker for h1 small payloads (was 20); the heavy-load + // fast-path snaps only well past the crossover (48); large payloads are + // link-bound (engines tie) above 8 KB so suppress the switch there. + upThreshold: 24.0, + downThreshold: 12.0, + highWatermark: 48.0, + largePayloadBytes: 8192.0, + errorRevertRate: 0.05, + sustainTicks: 2, + // Both default ON so the conns-per-worker unit tests exercise the policy; + // the production New() path sets connSwitchEnabled from the kernel/feature + // profile and loadDownRevert=false (pinning makes load-revert harmful). + connSwitchEnabled: true, + loadDownRevert: true, + logger: logger, state: controllerState{ activeIsPrimary: true, - lastActiveScore: make(map[engine.EngineType]float64), - lastActiveTime: make(map[engine.EngineType]time.Time), }, } } -// evaluate checks whether a switch is warranted. Returns true if a switch should occur. +// evaluate decides whether a switch is warranted given the current load. It +// returns true when the engine should switch to its standby. The decision is +// driven entirely by conns-per-worker (with payload-size and error-rate +// refinements); the frozen check, oscillation lock and cooldown gate it. func (c *controller) evaluate(now time.Time, frozen bool) bool { if frozen { return false @@ -70,112 +137,111 @@ func (c *controller) evaluate(now time.Time, frozen bool) bool { return false } - var active, standby engine.Engine - if c.state.activeIsPrimary { - active = c.primary - standby = c.secondary - } else { - active = c.secondary - standby = c.primary - } - - activeSnap := c.sampler.Sample(active) - baselineActiveScore := computeScore(activeSnap, c.weights) + active := c.activeEngine() + snap := c.sampler.Sample(active) + cpw := snap.ConnsPerWorker - // io_uring bias: the modeled io_uring advantage given the current workload - // (connection count + CPU pressure). Zero outside the empirical sweet spot. - bias := ioUringBias(activeSnap) - - // Apply the bias to the ACTIVE score: reinforce io_uring when it is already - // active (resist leaving it), lightly penalise epoll when conditions favor - // io_uring (encourage leaving it). - activeScore := baselineActiveScore if active.Type() == engine.IOUring { - activeScore *= (1.0 + bias) // Bonus: io_uring already active, reinforce - } else if bias > 0 { - activeScore *= (1.0 - bias*0.5) // Penalty: epoll active but conditions favor io_uring + // Safety error-revert is ALWAYS active, independent of connSwitchEnabled: + // if io_uring starts erroring on this deployment, fall back to epoll. + if snap.ErrorRate > c.errorRevertRate { + c.state.downTicks = 0 + c.state.upTicks = 0 + c.logSwitch("io_uring", "epoll", "error-rate safety revert", cpw, snap) + return true + } + if !c.connSwitchEnabled || !c.loadDownRevert { + // Load-driven down-revert is disabled in production: pinned conns + // never migrate, so reverting only strands io_uring keep-alives and + // routes new conns to epoll mid-ramp. Only the error-revert above moves us. + return false + } + return c.evaluateDown(snap, cpw) } - - // Store active's (biased) score for historical reference. - c.state.lastActiveScore[active.Type()] = activeScore - c.state.lastActiveTime[active.Type()] = now - - // Seed standby with 80% of active if no history exists. - if _, ok := c.state.lastActiveScore[standby.Type()]; !ok { - c.state.lastActiveScore[standby.Type()] = activeScore * 0.80 - c.state.lastActiveTime[standby.Type()] = now + // epoll active. + if !c.connSwitchEnabled { + return false } + return c.evaluateUp(snap, cpw) +} - // Standby estimate. Two independent signals, combined by max: - // - // 1. Historical: the last directly-observed score for the standby engine, - // decayed at 1%/sec. This drives switching when the ACTIVE engine - // genuinely degrades below a previously-measured standby. - // - // 2. Bias-modeled: the standby score implied by the io_uring bias for the - // CURRENT workload, recomputed EACH tick. When io_uring is the standby - // and conditions favor it, the standby is modeled as the unbiased active - // baseline scaled up by the bias; when epoll is the standby (io_uring - // active and favored) it is scaled DOWN so we do not switch back. This - // is what makes an organic epoll→io_uring switch reachable — the - // historical-only path could never exceed the threshold (max attainable - // ratio ~0.70 < 1+threshold). - standbyScore := c.historicalScore(standby.Type(), now) - if modeled := c.biasModeledStandbyScore(standby.Type(), baselineActiveScore, bias); modeled > standbyScore { - standbyScore = modeled - } +// evaluateUp runs while epoll is active and considers a switch UP to io_uring. +func (c *controller) evaluateUp(snap TelemetrySnapshot, cpw float64) bool { + // Large payloads are link-bound: the engines tie, so never switch up. + // Keep upTicks pinned at zero so a later small-payload burst restarts the + // sustain count from scratch. + largePayload := snap.BytesPerReq >= c.largePayloadBytes - if standbyScore > activeScore*(1.0+c.threshold) { - c.logger.Info("switch recommended", - "active", active.Type().String(), - "standby", standby.Type().String(), - "active_score", activeScore, - "standby_score", standbyScore, - ) + switch { + case !largePayload && cpw >= c.highWatermark: + // Heavy-load fast path: snap immediately on a single tick. + c.state.upTicks++ + c.state.downTicks = 0 + c.logSwitch("epoll", "io_uring", "heavy-load fast path", cpw, snap) return true + case !largePayload && cpw >= c.upThreshold: + c.state.upTicks++ + c.state.downTicks = 0 + if c.state.upTicks >= c.sustainTicks { + c.logSwitch("epoll", "io_uring", "sustained high load", cpw, snap) + return true + } + default: + c.state.upTicks = 0 } - + c.state.downTicks = 0 return false } -// biasModeledStandbyScore models the standby engine's score for the CURRENT -// workload from the io_uring bias, using the unbiased active baseline as the -// reference point. The sign of the adjustment depends on which engine is the -// standby: -// -// - io_uring standby: when conditions favor io_uring (bias>0) it is modeled -// as bias-better than the active baseline → standby = baseline*(1+bias). -// - epoll standby (io_uring active and favored): epoll is modeled as -// bias-worse → standby = baseline*(1-bias), so a favorable-for-io_uring -// workload never recommends switching back to epoll. -// -// Recomputed every tick so the estimate tracks live conditions rather than a -// stale seed. -func (c *controller) biasModeledStandbyScore(standby engine.EngineType, baselineActiveScore, bias float64) float64 { - if bias <= 0 { - return baselineActiveScore +// evaluateDown runs while io_uring is active and considers a revert to epoll. +func (c *controller) evaluateDown(snap TelemetrySnapshot, cpw float64) bool { + // Safety revert: an error storm on io_uring beats any load consideration. + if snap.ErrorRate > c.errorRevertRate { + c.state.downTicks = 0 + c.state.upTicks = 0 + c.logSwitch("io_uring", "epoll", "error-rate safety revert", cpw, snap) + return true } - if standby == engine.IOUring { - return baselineActiveScore * (1.0 + bias) + + if cpw < c.downThreshold { + c.state.downTicks++ + if c.state.downTicks >= c.sustainTicks { + c.state.upTicks = 0 + c.logSwitch("io_uring", "epoll", "sustained low load", cpw, snap) + return true + } + } else { + c.state.downTicks = 0 } - return baselineActiveScore * (1.0 - bias) + c.state.upTicks = 0 + return false } -// historicalScore returns the last known score for an engine type, decayed -// at 1% per second since the score was recorded. -func (c *controller) historicalScore(et engine.EngineType, now time.Time) float64 { - score, ok := c.state.lastActiveScore[et] - if !ok { - return 0 +func (c *controller) activeEngine() engine.Engine { + if c.state.activeIsPrimary { + return c.primary } - elapsed := now.Sub(c.state.lastActiveTime[et]).Seconds() - return score * max(0, 1.0-0.01*elapsed) + return c.secondary +} + +func (c *controller) logSwitch(from, to, reason string, cpw float64, snap TelemetrySnapshot) { + c.logger.Info("engine switch recommended", + "from", from, + "to", to, + "reason", reason, + "conns_per_worker", cpw, + "bytes_per_req", snap.BytesPerReq, + "error_rate", snap.ErrorRate, + "active_connections", snap.ActiveConnections, + ) } // recordSwitch updates controller state after a switch has been performed. func (c *controller) recordSwitch(now time.Time) { c.state.activeIsPrimary = !c.state.activeIsPrimary c.state.lastSwitch = now + c.state.upTicks = 0 + c.state.downTicks = 0 c.state.switchTimes[c.state.switchIdx%len(c.state.switchTimes)] = now c.state.switchIdx++ diff --git a/adaptive/controller_test.go b/adaptive/controller_test.go index 8d216e5e..beaf4c6d 100644 --- a/adaptive/controller_test.go +++ b/adaptive/controller_test.go @@ -3,92 +3,287 @@ package adaptive import ( + "log/slog" "testing" "time" "github.com/goceleris/celeris/engine" - "github.com/goceleris/celeris/resource" ) -// TestControllerOrganicSwitch verifies that, in the io_uring sweet spot -// (high connection count + high CPU), the controller eventually recommends an -// epoll→io_uring switch driven purely by the io_uring bias — no pre-seeded -// standby history, no active degradation. -// -// This FAILS against the pre-fix logic: the standby was seeded at -// activeScore*0.80 and only decayed, so standby/active maxed out at ~0.70 and -// could never clear the 1+threshold (1.15) bar. The bias-modeled standby -// estimate makes the switch reachable. -func TestControllerOrganicSwitch(t *testing.T) { - primary := newMockEngine(engine.Epoll) // active - secondary := newMockEngine(engine.IOUring) // standby +// newCtrlEpollActive returns a controller with epoll active (primary) plus the +// synthetic sampler driving its telemetry, cooldown disabled for unit testing. +func newCtrlEpollActive() (*controller, *syntheticSampler) { sampler := newSyntheticSampler() + c := newController(newMockEngine(engine.Epoll), newMockEngine(engine.IOUring), sampler, testLogger()) + c.cooldown = 0 + return c, sampler +} + +// newCtrlIOUringActive returns a controller with io_uring active (primary). +func newCtrlIOUringActive() (*controller, *syntheticSampler) { + sampler := newSyntheticSampler() + c := newController(newMockEngine(engine.IOUring), newMockEngine(engine.Epoll), sampler, testLogger()) + c.cooldown = 0 + return c, sampler +} + +func testLogger() *slog.Logger { return slog.New(slog.DiscardHandler) } + +// TestConnSwitchDisabledGate verifies the kernel-aware gate: with conns-per-worker +// switching OFF (the production default — the feature-gated start engine is +// authoritative on io_uring-best/epoll-best kernels), neither the epoll up-switch +// nor the io_uring down-revert fires regardless of load, but the io_uring +// error-rate safety revert STILL fires. Guards the warmup-dip revert bug where an +// idle/ramp dip on a 7.0 box reverted io_uring→epoll and stranded the load. +func TestConnSwitchDisabledGate(t *testing.T) { + now := time.Now() + + c, sampler := newCtrlEpollActive() + c.connSwitchEnabled = false + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 64}) + for i := 0; i < 5; i++ { + if c.evaluate(now.Add(time.Duration(i)*time.Second), false) { + t.Fatalf("epoll up-switch fired with switching disabled (tick %d)", i) + } + } + + c2, s2 := newCtrlIOUringActive() + c2.connSwitchEnabled = false + s2.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 0}) + for i := 0; i < 5; i++ { + if c2.evaluate(now.Add(time.Duration(i)*time.Second), false) { + t.Fatalf("io_uring down-revert fired with switching disabled (tick %d)", i) + } + } + + c3, s3 := newCtrlIOUringActive() + c3.connSwitchEnabled = false + s3.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 64, ErrorRate: 0.5}) + if !c3.evaluate(now, false) { + t.Fatal("error-rate safety revert did NOT fire with switching disabled") + } +} + +// TestUpSwitchRequiresSustain verifies the normal epoll→io_uring switch needs +// sustainTicks (2) consecutive ticks at or above the up threshold (24). +func TestUpSwitchRequiresSustain(t *testing.T) { + c, sampler := newCtrlEpollActive() + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 24}) + + now := time.Now() + if c.evaluate(now, false) { + t.Fatal("one tick at the up threshold must not switch (sustainTicks=2)") + } + if c.state.upTicks != 1 { + t.Fatalf("upTicks = %d, want 1 after first qualifying tick", c.state.upTicks) + } + if !c.evaluate(now.Add(time.Second), false) { + t.Fatal("second consecutive tick at the up threshold must switch") + } +} - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - e.ctrl.cooldown = 0 +// TestUpSwitchSustainResetsOnDip verifies a dip below the up threshold resets +// the sustain counter, so the count must restart. +func TestUpSwitchSustainResetsOnDip(t *testing.T) { + c, sampler := newCtrlEpollActive() + now := time.Now() - // Active epoll snapshot lands squarely in io_uring's empirical sweet spot. - sampler.Set(engine.Epoll, TelemetrySnapshot{ - ThroughputRPS: 1000, - ActiveConnections: 2048, - CPUUtilization: 0.9, - }) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 26}) + if c.evaluate(now, false) { + t.Fatal("first tick must not switch") + } + // Dip into the hysteresis band — resets upTicks. + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 15}) + if c.evaluate(now.Add(time.Second), false) { + t.Fatal("dip must not switch") + } + if c.state.upTicks != 0 { + t.Fatalf("upTicks = %d, want 0 after a dip below the up threshold", c.state.upTicks) + } + // Back above threshold: needs two more ticks now. + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 26}) + if c.evaluate(now.Add(2*time.Second), false) { + t.Fatal("first tick after reset must not switch") + } + if !c.evaluate(now.Add(3*time.Second), false) { + t.Fatal("second tick after reset must switch") + } +} + +// TestFastPathSnapsImmediately verifies conns/worker at/above the high +// watermark (48) switches on a single tick. +func TestFastPathSnapsImmediately(t *testing.T) { + c, sampler := newCtrlEpollActive() + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 48}) + + if !c.evaluate(time.Now(), false) { + t.Fatal("conns/worker at the high watermark must snap on one tick") + } +} - if e.ActiveEngine().Type() != engine.Epoll { - t.Fatal("expected epoll active initially") +// TestHysteresisBandNoFlap verifies the 12–24 band switches neither way. +func TestHysteresisBandNoFlap(t *testing.T) { + for _, cpw := range []float64{12, 14, 16, 18, 19.9} { + c, sampler := newCtrlEpollActive() + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: cpw}) + now := time.Now() + for i := range 4 { + if c.evaluate(now.Add(time.Duration(i)*time.Second), false) { + t.Fatalf("cpw=%.1f inside hysteresis band must not switch up", cpw) + } + } } +} + +// TestLargePayloadSuppression verifies a large average payload suppresses the +// switch even well above the high watermark. +func TestLargePayloadSuppression(t *testing.T) { + c, sampler := newCtrlEpollActive() + // Exactly at the threshold counts as large (>=). + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 64, BytesPerReq: 8192}) now := time.Now() - switched := false - // Advance well past any cooldown/observation window and let the estimate - // settle over a few ticks. for i := range 5 { - if e.ctrl.evaluate(now.Add(time.Duration(i+1)*time.Minute), false) { - switched = true - break + if c.evaluate(now.Add(time.Duration(i)*time.Second), false) { + t.Fatal("large payload must suppress io_uring switch") } } - if !switched { - t.Fatal("expected organic epoll→io_uring switch in the io_uring sweet spot") - } -} - -// TestControllerNoSwitchOutsideSweetSpot is the inverse: low CPU or too few -// connections yields zero bias, so the controller must NOT recommend a switch -// (no degradation, no favorable conditions). -func TestControllerNoSwitchOutsideSweetSpot(t *testing.T) { - cases := []struct { - name string - snap TelemetrySnapshot - }{ - { - name: "low CPU", - snap: TelemetrySnapshot{ThroughputRPS: 1000, ActiveConnections: 2048, CPUUtilization: 0.10}, - }, - { - name: "too few connections", - snap: TelemetrySnapshot{ThroughputRPS: 1000, ActiveConnections: 32, CPUUtilization: 0.9}, - }, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - primary := newMockEngine(engine.Epoll) - secondary := newMockEngine(engine.IOUring) - sampler := newSyntheticSampler() - - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - e.ctrl.cooldown = 0 - - sampler.Set(engine.Epoll, tc.snap) - - now := time.Now() - for i := range 5 { - if e.ctrl.evaluate(now.Add(time.Duration(i+1)*time.Minute), false) { - t.Fatalf("unexpected switch outside io_uring sweet spot (%s)", tc.name) - } - } - }) + + // Drop below the large-payload threshold → fast path fires. + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 64, BytesPerReq: 8191}) + if !c.evaluate(now.Add(10*time.Second), false) { + t.Fatal("small payload above high watermark must switch") + } +} + +// TestRevertRequiresSustain verifies the io_uring→epoll revert needs +// sustainTicks below the down threshold (12). +func TestRevertRequiresSustain(t *testing.T) { + c, sampler := newCtrlIOUringActive() + sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 8}) + + now := time.Now() + if c.evaluate(now, false) { + t.Fatal("one low tick must not revert (sustainTicks=2)") + } + if !c.evaluate(now.Add(time.Second), false) { + t.Fatal("second consecutive low tick must revert") + } +} + +// TestErrorRevertImmediate verifies an error rate above errorRevertRate (0.05) +// reverts io_uring→epoll immediately regardless of load. +func TestErrorRevertImmediate(t *testing.T) { + c, sampler := newCtrlIOUringActive() + sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 64, ErrorRate: 0.06}) + + if !c.evaluate(time.Now(), false) { + t.Fatal("error rate above errorRevertRate must revert on one tick") + } +} + +// TestNoRevertWhenBusy verifies io_uring stays put above the down threshold +// with a clean error rate. +func TestNoRevertWhenBusy(t *testing.T) { + c, sampler := newCtrlIOUringActive() + sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 16, ErrorRate: 0.0}) + + now := time.Now() + for i := range 5 { + if c.evaluate(now.Add(time.Duration(i)*time.Second), false) { + t.Fatal("must not revert while busy with a clean error rate") + } + } +} + +// TestCooldownGatesRevert verifies the cooldown blocks a revert immediately +// after a switch, but the first switch (no prior switch) is not gated. +func TestCooldownGatesRevert(t *testing.T) { + c, sampler := newCtrlEpollActive() + c.cooldown = 1 * time.Hour + + now := time.Now() + // First switch is not gated (lastSwitch is zero). + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50}) + if !c.evaluate(now, false) { + t.Fatal("first switch must not be cooldown-gated") + } + c.recordSwitch(now) + + // io_uring now active and idle, but the revert is cooldown-gated. + sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 0}) + if c.evaluate(now.Add(time.Minute), false) { + t.Fatal("revert within cooldown must be blocked") + } +} + +// TestFrozenBlocksSwitch verifies the frozen flag short-circuits evaluate. +func TestFrozenBlocksSwitch(t *testing.T) { + c, sampler := newCtrlEpollActive() + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 100}) + if c.evaluate(time.Now(), true) { + t.Fatal("frozen must block all switches") + } +} + +// TestOscillationLockExpiry verifies 3 switches in 5 minutes locks the +// controller, and the lock expires after 5 minutes. +func TestOscillationLockExpiry(t *testing.T) { + sampler := newSyntheticSampler() + c := newController(newMockEngine(engine.Epoll), newMockEngine(engine.IOUring), sampler, testLogger()) + c.cooldown = 0 + + now := time.Now() + for range 3 { + c.recordSwitch(now) + now = now.Add(time.Second) + } + if !c.state.locked { + t.Fatal("expected lock after 3 switches within 5 minutes") + } + + // Locked: a strong load must not switch. + sampler.Set(c.activeEngine().Type(), TelemetrySnapshot{ConnsPerWorker: 100}) + if c.evaluate(now, false) { + t.Fatal("locked controller must not switch") + } + + // After lock expiry, evaluation resumes (lock cleared on the next eval). + after := c.state.lockUntil.Add(time.Second) + _ = c.evaluate(after, false) + if c.state.locked { + t.Fatal("lock should clear once lockUntil has passed") + } +} + +// TestRecordSwitchResetsTicks verifies recordSwitch zeroes both tick counters. +func TestRecordSwitchResetsTicks(t *testing.T) { + c, _ := newCtrlEpollActive() + c.state.upTicks = 5 + c.state.downTicks = 3 + c.recordSwitch(time.Now()) + if c.state.upTicks != 0 || c.state.downTicks != 0 { + t.Fatalf("recordSwitch must reset ticks, got up=%d down=%d", c.state.upTicks, c.state.downTicks) + } + if c.state.activeIsPrimary { + t.Fatal("recordSwitch must toggle activeIsPrimary") + } +} + +// guards against a config drift in the documented thresholds. +func TestDefaultThresholds(t *testing.T) { + c, _ := newCtrlEpollActive() + if c.upThreshold != 24 || c.downThreshold != 12 || c.highWatermark != 48 { + t.Fatalf("threshold drift: up=%.0f down=%.0f hwm=%.0f", c.upThreshold, c.downThreshold, c.highWatermark) + } + if c.largePayloadBytes != 8192 || c.errorRevertRate != 0.05 || c.sustainTicks != 2 { + t.Fatalf("policy drift: largePayload=%.0f errRevert=%.3f sustain=%d", + c.largePayloadBytes, c.errorRevertRate, c.sustainTicks) + } + if c.evalInterval != time.Second { + t.Fatalf("evalInterval = %v, want 1s", c.evalInterval) + } + if c.cooldown != 0 { // overridden to 0 by the helper + t.Fatalf("test helper should zero cooldown, got %v", c.cooldown) } } diff --git a/adaptive/driver_provider_test.go b/adaptive/driver_provider_test.go index dddffd18..510c01b8 100644 --- a/adaptive/driver_provider_test.go +++ b/adaptive/driver_provider_test.go @@ -12,6 +12,7 @@ import ( "golang.org/x/sys/unix" "github.com/goceleris/celeris/engine" + "github.com/goceleris/celeris/probe" "github.com/goceleris/celeris/protocol/h2/stream" "github.com/goceleris/celeris/resource" ) @@ -29,6 +30,14 @@ var _ stream.Handler = noopHandler{} // (we need both primary + secondary up for a meaningful switch test). func newBoundAdaptive(t *testing.T) (*Engine, func()) { t.Helper() + // These tests need BOTH sub-engines for a meaningful switch. Since New() + // now LAZILY builds the standby (and falls back to an epoll start when + // io_uring is unavailable), a no-io_uring host would let New() succeed yet + // fail the first switch when the io_uring standby cannot be built. Skip up + // front when io_uring is genuinely unavailable. + if !probe.Probe().IOUringTier.Available() { + t.Skip("io_uring unavailable: switch test needs both sub-engines") + } cfg := resource.Config{ Addr: "127.0.0.1:0", Protocol: engine.HTTP1, @@ -347,13 +356,8 @@ func TestAdaptiveSwitchAfterDriverQuiescence(t *testing.T) { e, stop := newBoundAdaptive(t) defer stop() - // Seed historical scores so ForceSwitch actually flips active. - now := time.Now() - e.ctrl.state.lastActiveScore[engine.Epoll] = 100 - e.ctrl.state.lastActiveScore[engine.IOUring] = 500 - e.ctrl.state.lastActiveTime[engine.Epoll] = now - e.ctrl.state.lastActiveTime[engine.IOUring] = now - + // ForceSwitch bypasses the decision policy and flips the active engine + // directly, so no telemetry seeding is needed here. local, peer := socketpairNonblocking(t) defer func() { _ = unix.Close(peer) }() defer func() { _ = unix.Close(local) }() diff --git a/adaptive/engine.go b/adaptive/engine.go index 10fe6644..f12ca2cf 100644 --- a/adaptive/engine.go +++ b/adaptive/engine.go @@ -8,6 +8,7 @@ import ( "fmt" "log/slog" "net" + "os" "sync" "sync/atomic" "time" @@ -15,7 +16,7 @@ import ( "github.com/goceleris/celeris/engine" "github.com/goceleris/celeris/engine/epoll" "github.com/goceleris/celeris/engine/iouring" - "github.com/goceleris/celeris/engine/scaler" + "github.com/goceleris/celeris/probe" "github.com/goceleris/celeris/protocol/h2/stream" "github.com/goceleris/celeris/resource" ) @@ -26,9 +27,21 @@ var ( ) // Engine is an adaptive meta-engine that switches between io_uring and epoll. +// +// The two sub-engine slots map to a fixed protocol direction the controller +// keys off: primary is ALWAYS the epoll engine (the controller's +// activeIsPrimary==true means epoll is active) and secondary is ALWAYS the +// io_uring engine. On the public New() path only the START engine is built +// eagerly; the other slot stays nil until the first switch actually needs it +// (see buildStandby + performSwitch). Under the default policy the start engine +// is epoll, so the io_uring standby is built lazily — and only if a sustained +// high-concurrency ramp promotes new conns to it; an engine that never switches +// never constructs its standby, so that heap never exists. newFromEngines +// (tests) populates BOTH slots eagerly, exercising the standby-already-exists +// switch path. type Engine struct { - primary engine.Engine // io_uring - secondary engine.Engine // epoll + primary engine.Engine // epoll (nil until built when it is the lazy standby) + secondary engine.Engine // io_uring (nil until built when it is the lazy standby) active atomic.Pointer[engine.Engine] ctrl *controller cfg resource.Config @@ -39,12 +52,30 @@ type Engine struct { frozen atomic.Bool logger *slog.Logger + // startType is the engine type chosen for the eager start engine. The + // standby is the other type; buildStandby constructs it on demand. + startType engine.EngineType + + // buildStandby constructs the LAZY standby sub-engine on first switch. + // It captures cfg + handler (+ cpuMon for the sampler symmetry) and is + // nil on the newFromEngines (tests) path where both engines are eager. + buildStandby func() (engine.Engine, error) + + // listenCtx / listenWG are captured by Listen so performSwitch can start a + // freshly-built standby's Listen goroutine under the SAME context and wait + // group as the active engine. Shutdown then joins it implicitly via the + // wait group (wg.Wait in Listen) — a never-built standby added nothing to + // the group, so there is nothing to join. Guarded by mu (performSwitch + // holds mu across the whole switch; Listen sets these once under mu). + listenCtx context.Context + listenWG *sync.WaitGroup + // freezeCooldown is the duration to suppress further switches after a switch. // Zero means no cooldown (default). freezeCooldown time.Duration // listenMu guards listenCancel/listenDone, which let Shutdown deterministically - // stop and JOIN the evaluation/scaler goroutines started by Listen. Without + // stop and JOIN the evaluation-loop goroutine started by Listen. Without // this, Shutdown could return (sub-engines stopped) while the eval loop is // still mid-Sample on the CPU monitor the server is about to close. listenMu sync.Mutex @@ -63,12 +94,96 @@ type Engine struct { switchRejected atomic.Uint64 // telemetry: how many switches were blocked by driver FDs } -// New creates a new adaptive engine with epoll as primary and io_uring as secondary. -// Epoll starts first because it has lower H2 latency on current kernels (single-pass -// read→process→write vs io_uring's two-iteration CQE model). The controller may -// switch to io_uring if telemetry indicates it would perform better for the workload. -// Both sub-engines get the full resource config. This is safe because standby -// workers are fully suspended (zero CPU, zero connections, listen sockets closed). +// ioUringViable reports whether io_uring is worth running at all on this host: +// the kernel must expose the fast tier AND RLIMIT_MEMLOCK must be able to fund +// the requested worker count. These are the two t0-knowable disqualifiers from +// the epoll-vs-io_uring sweep: +// +// - Kernel/feature: io_uring loses to epoll on old kernels (missing the +// fast-path setup flags); require the "bundles" era (>6.10) OR the 6.1+ +// fast tier (DEFER_TASKRUN + SINGLE_ISSUER + MULTISHOT_RECV + PROVIDED_BUFFERS). +// - Memlock: io_uring's provided-buffer rings need locked pages per worker +// (minMemlockPerWorker). If RLIMIT_MEMLOCK can't fund the requested workers, +// io_uring caps to a fraction of them and its throughput collapses; epoll +// does not memlock buffer rings, so it keeps all workers. In that case +// io_uring is never the right engine. +func ioUringViable(p engine.CapabilityProfile, cfg resource.Config) bool { + bundlesEra := p.KernelMajor > 6 || (p.KernelMajor == 6 && p.KernelMinor >= 10) + fastTier := p.DeferTaskrun && p.SingleIssuer && p.MultishotRecv && p.ProvidedBuffers + if !bundlesEra && !fastTier { + return false + } + wantWorkers := cfg.Resources.Resolve().Workers + if maxW := maxWorkersForMemlock(); maxW != -1 && maxW < wantWorkers { + return false + } + return true +} + +// maxWorkersForMemlock is the io_uring memlock worker-ceiling probe behind a var +// so tests can inject a low cap without mutating the process RLIMIT_MEMLOCK. +var maxWorkersForMemlock = iouring.MaxWorkersForMemlock + +// chooseStartEngine selects which sub-engine the adaptive meta-engine starts +// (and builds eagerly), from facts knowable at Listen() time only. +// +// THE PINNING CONSTRAINT: an established connection cannot migrate between +// epoll and io_uring, so the START engine decides keep-alive throughput; the +// runtime switch can only route NEW connections. And the workload's +// concurrency — the thing that actually decides which engine wins — is +// unknowable here (no connections exist yet). So the start decision is gated +// only on t0-knowable disqualifiers, with a safe default: +// +// 1. env override CELERIS_ADAPTIVE_START=iouring|epoll (operator escape hatch). +// 2. io_uring not viable (old kernel / missing fast tier / memlock too low) → epoll. +// 3. configured Protocol == H2C → epoll (io_uring's win is h1-small-payload only; +// h2c never benefits — its framing/HPACK cost dwarfs the engine delta). +// 4. explicit operator WorkloadHint == HighConcurrency → io_uring (the ONLY +// input that can express a high-concurrency expectation up front). +// 5. DEFAULT → epoll. Every server ramps from zero connections, i.e. the +// low-concurrency regime where epoll wins on throughput AND tail latency; +// the runtime switch then promotes new conns to io_uring if sustained +// high load develops. +// +// This flips the previous default (io_uring on modern kernels): io_uring now +// wins the start only on an explicit high-concurrency hint, because the +// benchmark-shaped "saturating burst at t0" is the only case where defaulting +// io_uring helps, and it costs the common low/mid-conc + latency cases. +func chooseStartEngine(p engine.CapabilityProfile, cfg resource.Config) engine.EngineType { + switch os.Getenv("CELERIS_ADAPTIVE_START") { + case "iouring": + return engine.IOUring + case "epoll": + return engine.Epoll + case "auto", "": + // fall through to the policy below + default: + // Unknown value: fall through to auto rather than fail hard. + } + + if !ioUringViable(p, cfg) { + return engine.Epoll + } + if cfg.Protocol == engine.H2C { + return engine.Epoll + } + if cfg.Resources.WorkloadHint == resource.WorkloadHighConcurrency { + return engine.IOUring + } + return engine.Epoll +} + +// New creates a new adaptive engine. Only the START engine is built and +// Listen'd eagerly; the other engine (the standby) is constructed lazily on the +// first switch that actually needs it. The start engine is chosen by +// chooseStartEngine from the probed io_uring capabilities (feature-gated, with a +// CELERIS_ADAPTIVE_START env override). +// +// Both sub-engines bind the SAME SO_REUSEPORT port so the adaptive switch is +// transparent: resolvePort pins a concrete port up front, and the lazily-built +// standby reuses it. Building only the start engine eliminates the parked +// standby's GC-rooted heap — on a modern kernel that starts on io_uring and +// never reverts, the epoll standby is never constructed (≈0 standby tax). // // cpuMon is an engine.CPUMonitor (the public interface); when non-nil it // supplies the live sampler with CPU utilization data so the io_uring bias can @@ -92,47 +207,110 @@ func New(cfg resource.Config, handler stream.Handler, cpuMon engine.CPUMonitor) } } - // Suppress the per-engine built-in scaler in both sub-engines — - // adaptive runs ONE higher-level scaler that delegates to whichever - // sub-engine is currently active. Two scalers fighting over the same - // worker pool produced -54 % to +118 % variance on pinning tests - // during the spike-B exploration; gating this way eliminates that. - subCfg := cfg - subCfg.SkipBuiltinScaler = true + // probe.Probe() reads kernel version + io_uring setup feature bits WITHOUT + // constructing an engine, so it is cheap enough for the start decision. + profile := probe.Probe() + startType := chooseStartEngine(profile, cfg) - primary, err := epoll.New(subCfg, handler) - if err != nil { - return nil, fmt.Errorf("epoll sub-engine: %w", err) + sampler := newLiveSampler(cpuMon) + logger := cfg.Logger + if logger == nil { + logger = slog.Default() } - secondary, err := iouring.New(subCfg, handler) - if err != nil { - return nil, fmt.Errorf("io_uring sub-engine: %w", err) + // Constructors for each slot. The standby's constructor is stored on the + // Engine and only invoked on the first switch. The io_uring constructor + // does not take cpuMon (iouring.New has no such parameter); cpuMon already + // feeds the shared sampler via newLiveSampler above. + buildEpoll := func() (engine.Engine, error) { + eng, err := epoll.New(cfg, handler) + if err != nil { + return nil, fmt.Errorf("epoll sub-engine: %w", err) + } + return eng, nil + } + buildIOUring := func() (engine.Engine, error) { + eng, err := iouring.New(cfg, handler) + if err != nil { + return nil, fmt.Errorf("io_uring sub-engine: %w", err) + } + return eng, nil } - - sampler := newLiveSampler(cpuMon) - logger := cfg.Logger e := &Engine{ - primary: primary, - secondary: secondary, cfg: cfg, handler: handler, logger: logger, + startType: startType, } - e.ctrl = newController(primary, secondary, sampler, logger) - - // Start with primary (epoll) for all protocols. Epoll has better H2 - // throughput on current kernels and matches H1 performance. - var initialActive engine.Engine = primary - e.ctrl.state.activeIsPrimary = true - e.active.Store(&initialActive) + var startEngine engine.Engine + if startType == engine.IOUring { + // io_uring is the eager start; epoll is the lazy standby. + // io_uring construction can fail on a kernel that probed as capable + // but cannot actually set up the ring (e.g. low RLIMIT_MEMLOCK). Fall + // back to starting on epoll rather than failing New outright. + eng, err := buildIOUring() + if err != nil { + logger.Warn("io_uring start engine unavailable, falling back to epoll start", "error", err) + e.startType = engine.Epoll + eng, err = buildEpoll() + if err != nil { + return nil, err + } + startEngine = eng + e.primary = eng + e.buildStandby = buildIOUring + } else { + startEngine = eng + e.secondary = eng + e.buildStandby = buildEpoll + } + } else { + // epoll is the eager start; io_uring is the lazy standby. + eng, err := buildEpoll() + if err != nil { + return nil, err + } + startEngine = eng + e.primary = eng + e.buildStandby = buildIOUring + } + // The controller needs BOTH engine TYPES to decide switch direction even + // while the standby engine is nil, but it only ever dereferences the + // ACTIVE engine (activeEngine()). Pass the start engine for the active slot + // and nil for the lazy standby slot — newController stores them; activeIsPrimary + // records which slot the start engine occupies (primary==epoll). + e.ctrl = newController(e.primary, e.secondary, sampler, logger) + e.ctrl.state.activeIsPrimary = e.startType == engine.Epoll + // Re-enable the conns-per-worker UP switch ONLY on the epoll-start path with + // io_uring viable and a non-h2c protocol. Rationale from the sweep: + // - When we START on epoll (the new default), a sustained high-concurrency + // ramp should promote NEW connections to io_uring (it wins ≥~24 conns/ + // worker for h1 small payloads). The switch routes new SYNs only — + // pinned conns stay on epoll — so it helps ramps/churn, and is inert for + // a pure keep-alive burst (which is fine; that case wants WorkloadHint). + // - When we START on io_uring there is nothing better to switch UP to, and + // a load-driven DOWN-revert would only strand pinned io_uring conns — so + // leave switching OFF there (the always-on error-revert still applies). + // - h2c never benefits from io_uring, so never switch up for it. + // The controller's load-driven DOWN-revert is disabled regardless (pinning); + // only the always-on io_uring error-revert can move us back to epoll. + e.ctrl.connSwitchEnabled = e.startType == engine.Epoll && + ioUringViable(profile, cfg) && + cfg.Protocol != engine.H2C + // Load-driven down-revert is always off in production (pinning makes it + // harmful); only the always-on io_uring error-revert can return us to epoll. + e.ctrl.loadDownRevert = false + + e.active.Store(&startEngine) return e, nil } -// newFromEngines creates an adaptive engine from pre-built engines (for testing). +// newFromEngines creates an adaptive engine from pre-built engines (for +// testing). BOTH slots are populated eagerly and buildStandby is left nil, so +// performSwitch exercises the "standby already exists" path (no lazy build). func newFromEngines(primary, secondary engine.Engine, sampler TelemetrySampler, cfg resource.Config) *Engine { logger := cfg.Logger if logger == nil { @@ -144,6 +322,7 @@ func newFromEngines(primary, secondary engine.Engine, sampler TelemetrySampler, secondary: secondary, cfg: cfg, logger: logger, + startType: engine.Epoll, } e.ctrl = newController(primary, secondary, sampler, logger) @@ -155,13 +334,15 @@ func newFromEngines(primary, secondary engine.Engine, sampler TelemetrySampler, return e } -// Listen starts both sub-engines and the evaluation loop. +// Listen starts ONLY the active sub-engine and the evaluation loop. The standby +// is built and Listen'd lazily by performSwitch on the first switch (joined +// under the same ctx + wait group captured here). func (e *Engine) Listen(ctx context.Context) error { innerCtx, innerCancel := context.WithCancel(ctx) defer innerCancel() // Publish the cancel + a done channel so Shutdown can stop and join the - // goroutines this Listen owns (eval loop, scaler) before the server closes + // goroutine this Listen owns (the eval loop) before the server closes // shared resources such as the CPU monitor. done := make(chan struct{}) e.listenMu.Lock() @@ -172,23 +353,25 @@ func (e *Engine) Listen(ctx context.Context) error { var wg sync.WaitGroup - errCh := make(chan error, 2) + // Publish ctx + wg so performSwitch can launch the lazily-built standby's + // Listen goroutine under the same lifetime (Shutdown joins it via wg.Wait). + e.mu.Lock() + e.listenCtx = innerCtx + e.listenWG = &wg + e.mu.Unlock() - wg.Go(func() { - if err := e.primary.Listen(innerCtx); err != nil { - errCh <- fmt.Errorf("primary (epoll): %w", err) - } - }) + errCh := make(chan error, 2) + active := *e.active.Load() wg.Go(func() { - if err := e.secondary.Listen(innerCtx); err != nil { - errCh <- fmt.Errorf("secondary (io_uring): %w", err) + if err := active.Listen(innerCtx); err != nil { + errCh <- fmt.Errorf("active (%s): %w", active.Type().String(), err) } }) - // Wait for both engines to bind their addresses. + // Wait for the ACTIVE engine to bind its address. // io_uring may need multiple tier fallback attempts, so allow ample time — - // but if either sub-engine has already returned an error to errCh + // but if the active sub-engine has already returned an error to errCh // (e.g. ENOMEM at io_uring_setup under low RLIMIT_MEMLOCK), surface it // immediately instead of waiting out the deadline. deadline := time.Now().Add(20 * time.Second) @@ -198,7 +381,7 @@ func (e *Engine) Listen(ctx context.Context) error { defer bindWait.Stop() var startErr error bindLoop: - for e.primary.Addr() == nil || e.secondary.Addr() == nil { + for active.Addr() == nil { select { case startErr = <-errCh: break bindLoop @@ -213,50 +396,22 @@ bindLoop: wg.Wait() return fmt.Errorf("sub-engine startup failed: %w", startErr) } - if e.primary.Addr() == nil || e.secondary.Addr() == nil { + if active.Addr() == nil { innerCancel() wg.Wait() - return fmt.Errorf("sub-engines failed to initialize within 20s deadline") - } - - // Pause standby engine's accept BEFORE publishing Addr so the - // SO_REUSEPORT group only contains the active engine by the time - // callers start dialing. Without this gate, a burst of incoming - // connections in the window between secondary.Listen() succeeding - // and PauseAccept taking effect lands some dials on the standby's - // accept queue; closing that queue then RSTs them. H1 clients - // reconnect transparently but H2 prior-knowledge clients see the - // dial fail mid-handshake. Read c.state.activeIsPrimary under - // switchMu to avoid racing with performSwitch → recordSwitch (a - // concurrent ForceSwitch or controller tick can fire before Listen - // finishes its own setup). - e.switchMu.Lock() - activeIsPrimary := e.ctrl.state.activeIsPrimary - e.switchMu.Unlock() - if activeIsPrimary { - if ac, ok := e.secondary.(engine.AcceptController); ok { - if err := ac.PauseAccept(); err != nil { - innerCancel() - wg.Wait() - return fmt.Errorf("pause secondary: %w", err) - } - } - } else { - if ac, ok := e.primary.(engine.AcceptController); ok { - if err := ac.PauseAccept(); err != nil { - innerCancel() - wg.Wait() - return fmt.Errorf("pause primary: %w", err) - } - } + return fmt.Errorf("active sub-engine failed to initialize within 20s deadline") } - addr := e.primary.Addr() + // No standby to pause: only the active engine is in the SO_REUSEPORT group, + // so publishing Addr cannot expose a dial to a phantom standby listener. + // (The original pause-standby-before-publish-Addr step guarded that window; + // with a lazy standby there is no standby listening here.) + addr := active.Addr() e.addr.Store(&addr) e.logger.Info("adaptive engine listening", "addr", e.cfg.Addr, - "active", (*e.active.Load()).Type().String(), + "active", active.Type().String(), ) // Start evaluation loop. @@ -264,22 +419,10 @@ bindLoop: e.runEvalLoop(innerCtx) }) - // Start the higher-level dynamic worker scaler. This is the only - // worker scaler that runs in an adaptive setup — sub-engines have - // their built-in scalers suppressed via Config.SkipBuiltinScaler. - // Typed cfg.WorkerScaling takes precedence over env vars. The - // algorithm lives in engine/scaler; adaptive provides a switch-aware - // Source via adaptive/scaler.go. - if scalerCfg := scaler.Resolve(e.cfg, e.cfg.Resources.Resolve().Workers); scalerCfg.Enabled { - wg.Go(func() { - e.runScaler(innerCtx, scalerCfg) - }) - } - select { case <-innerCtx.Done(): // Parent context cancelled, or Shutdown cancelled innerCtx directly to - // stop and join the eval/scaler goroutines. + // stop and join the eval-loop goroutine. case err := <-errCh: innerCancel() wg.Wait() @@ -310,6 +453,47 @@ func (e *Engine) runEvalLoop(ctx context.Context) { } } +// buildAndStartStandby constructs the lazy standby sub-engine, launches its +// Listen goroutine under the same ctx + wait group Listen captured (so Shutdown +// joins it), and waits — bounded — for it to bind the shared SO_REUSEPORT port. +// The caller holds e.mu. wantType is purely for error messages. On any failure +// it returns an error and the engine state is left untouched (no slot stored), +// so the current active keeps serving. +func (e *Engine) buildAndStartStandby(wantType engine.EngineType) (engine.Engine, error) { + if e.buildStandby == nil { + return nil, fmt.Errorf("no standby builder for %s", wantType.String()) + } + if e.listenCtx == nil || e.listenWG == nil { + return nil, fmt.Errorf("cannot build standby before Listen has started") + } + + built, err := e.buildStandby() + if err != nil { + return nil, fmt.Errorf("build %s standby: %w", wantType.String(), err) + } + + ctx := e.listenCtx + wg := e.listenWG + wg.Go(func() { + if lerr := built.Listen(ctx); lerr != nil { + e.logger.Warn("lazy standby Listen returned error", + "standby", built.Type().String(), "error", lerr) + } + }) + + // Wait (bounded) for the standby to bind the shared port — once Addr() is + // non-nil it has joined the SO_REUSEPORT group and is accepting, so the + // resume-before-pause overlap is real and connections are never dropped. + deadline := time.Now().Add(5 * time.Second) + for built.Addr() == nil { + if time.Now().After(deadline) || ctx.Err() != nil { + return nil, fmt.Errorf("%s standby failed to bind within 5s", wantType.String()) + } + time.Sleep(5 * time.Millisecond) + } + return built, nil +} + func (e *Engine) performSwitch() { e.mu.Lock() defer e.mu.Unlock() @@ -339,21 +523,92 @@ func (e *Engine) performSwitch() { e.freezeState.Unlock() return } + // Release freezeState across the (possibly slow) lazy standby build + + // Listen + bind-wait below; re-acquired before the active.Store commit. + // Holding it across a multi-second build would block driver + // register/unregister flows (same reasoning as the PauseAccept release + // at the end of this function). e.mu (held for the whole function) + // already serialises performSwitch against itself, so no other switch + // can race the build. + e.freezeState.Unlock() now := time.Now() + // Determine the direction. activeIsPrimary toggles on recordSwitch, so it + // always reflects the engine we are switching AWAY from. e.switchMu.Lock() + switchingFromPrimary := e.ctrl.state.activeIsPrimary + e.switchMu.Unlock() + + // Resolve the standby slot for this direction. On the lazy New() path the + // target slot may be nil and must be built + Listen'd now (it binds the + // shared SO_REUSEPORT port and joins the accept pool). On the + // newFromEngines (tests) path both slots are pre-populated and buildStandby + // is nil, so the build is skipped. + freshlyBuilt := false + if switchingFromPrimary { + // primary (active) → secondary (standby). + if e.secondary == nil { + built, err := e.buildAndStartStandby(engine.IOUring) + if err != nil { + e.logger.Warn("aborting switch: lazy standby build failed; staying on current active", + "standby", engine.IOUring.String(), "error", err) + return + } + // Publish the built engine to both the Engine and controller + // slots under switchMu so a concurrent evaluate (ForceSwitch + // racing the eval loop) never reads a torn controller slot. + e.switchMu.Lock() + e.secondary = built + e.ctrl.secondary = built + e.switchMu.Unlock() + freshlyBuilt = true + } + } else { + // secondary (active) → primary (standby). + if e.primary == nil { + built, err := e.buildAndStartStandby(engine.Epoll) + if err != nil { + e.logger.Warn("aborting switch: lazy standby build failed; staying on current active", + "standby", engine.Epoll.String(), "error", err) + return + } + e.switchMu.Lock() + e.primary = built + e.ctrl.primary = built + e.switchMu.Unlock() + freshlyBuilt = true + } + } + var newActive, newStandby engine.Engine - if e.ctrl.state.activeIsPrimary { - // Switching: primary → secondary. + if switchingFromPrimary { newActive = e.secondary newStandby = e.primary } else { - // Switching: secondary → primary. newActive = e.primary newStandby = e.secondary } - e.switchMu.Unlock() + + // Re-acquire freezeState for the commit and RE-CHECK driverFDs: a driver + // may have registered during the build window above. If so, abort — but the + // freshly-built standby stays cached for the next attempt. Pause its accept + // first so it does not sit in the SO_REUSEPORT pool alongside the (still + // active) old engine; the next switch ResumeAccepts it. + e.freezeState.Lock() + if e.driverFDs.Load() > 0 { + e.switchRejected.Add(1) + e.logger.Warn("refusing engine switch: driver FDs registered during standby build", + "driver_fds", e.driverFDs.Load(), + ) + e.freezeState.Unlock() + if freshlyBuilt { + if ac, ok := newActive.(engine.AcceptController); ok { + _ = ac.PauseAccept() + } + } + return + } // Resume new active BEFORE pausing old — this creates a brief overlap // where both engines listen (via SO_REUSEPORT), which is correct. The @@ -423,8 +678,8 @@ func (e *Engine) maybeThawLocked() { // Shutdown gracefully shuts down both sub-engines. // -// It first cancels and JOINS the goroutines started by Listen (the evaluation -// loop and worker scaler), so that no controller tick can still be sampling +// It first cancels and JOINS the goroutine started by Listen (the evaluation +// loop), so that no controller tick can still be sampling // telemetry — including the CPU monitor the server closes immediately after // Shutdown returns — by the time this function completes. Only then are the // sub-engines shut down. This is purely a join/sequencing concern; it does not @@ -447,16 +702,40 @@ func (e *Engine) Shutdown(ctx context.Context) error { } } - return errors.Join( - e.primary.Shutdown(ctx), - e.secondary.Shutdown(ctx), - ) + // Only shut down engines that exist. On the lazy New() path the standby + // slot is nil if no switch ever built it; cancelling listenCtx (above) + // already unwound the active engine's Listen goroutine and any lazily + // started standby Listen goroutine (both share that ctx + wait group). + e.mu.Lock() + primary := e.primary + secondary := e.secondary + e.mu.Unlock() + + var errs []error + if primary != nil { + errs = append(errs, primary.Shutdown(ctx)) + } + if secondary != nil { + errs = append(errs, secondary.Shutdown(ctx)) + } + return errors.Join(errs...) } -// Metrics aggregates metrics from both sub-engines. +// Metrics aggregates metrics from whichever sub-engines exist. On the lazy +// New() path a never-built standby is nil and contributes nothing. func (e *Engine) Metrics() engine.EngineMetrics { - pm := e.primary.Metrics() - sm := e.secondary.Metrics() + e.mu.Lock() + primary := e.primary + secondary := e.secondary + e.mu.Unlock() + + var pm, sm engine.EngineMetrics + if primary != nil { + pm = primary.Metrics() + } + if secondary != nil { + sm = secondary.Metrics() + } // Both sub-engines were built from the same handler + cfg, so their // AsyncRoutes counts are identical; take one (not the sum) for the // adaptive view. AsyncPromotedConns IS additive — promotions on the diff --git a/adaptive/engine_test.go b/adaptive/engine_test.go index 8bd0df0f..cf277ba0 100644 --- a/adaptive/engine_test.go +++ b/adaptive/engine_test.go @@ -4,6 +4,7 @@ package adaptive import ( "context" + "errors" "net" "sync" "sync/atomic" @@ -99,92 +100,168 @@ func TestInitialBias(t *testing.T) { } } -func TestSwitchTrigger(t *testing.T) { +// newAdaptiveStartingOnEpoll builds an adaptive engine whose active engine is +// epoll (primary), the production starting configuration. The synthetic +// sampler returned lets the test drive conns-per-worker for the active engine. +func newAdaptiveStartingOnEpoll(t *testing.T) (*Engine, *syntheticSampler) { + t.Helper() + primary := newMockEngine(engine.Epoll) // active + secondary := newMockEngine(engine.IOUring) // standby + sampler := newSyntheticSampler() + e := newFromEngines(primary, secondary, sampler, resource.Config{Protocol: engine.HTTP1}) + e.ctrl.cooldown = 0 + return e, sampler +} + +// newAdaptiveStartingOnIOUring builds an adaptive engine whose active engine is +// io_uring (primary), for exercising the revert-to-epoll path. +func newAdaptiveStartingOnIOUring(t *testing.T) (*Engine, *syntheticSampler) { + t.Helper() primary := newMockEngine(engine.IOUring) secondary := newMockEngine(engine.Epoll) sampler := newSyntheticSampler() - - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - e.ctrl.evalInterval = 1 * time.Millisecond + e := newFromEngines(primary, secondary, sampler, resource.Config{Protocol: engine.HTTP1}) e.ctrl.cooldown = 0 + return e, sampler +} - // Make active (io_uring) score 100. - sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: 100, ErrorRate: 0.01}) +// TestSwitchTrigger: sustained high conns/worker on the active epoll engine +// triggers an epoll→io_uring switch after sustainTicks ticks. +func TestSwitchTrigger(t *testing.T) { + e, sampler := newAdaptiveStartingOnEpoll(t) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 24}) - // Pre-seed standby historical score (epoll was previously active with score 150). - e.ctrl.state.lastActiveScore[engine.Epoll] = 150 - e.ctrl.state.lastActiveTime[engine.Epoll] = time.Now() + if e.ActiveEngine().Type() != engine.Epoll { + t.Fatal("expected epoll initially") + } + + now := time.Now() + // First tick arms the sustain counter but must not switch yet. + if e.ctrl.evaluate(now, false) { + t.Fatal("switch must require sustainTicks consecutive ticks, not one") + } + // Second tick crosses sustainTicks → switch. + if !e.ctrl.evaluate(now.Add(time.Second), false) { + t.Fatal("expected switch after sustained high conns/worker") + } + e.performSwitch() if e.ActiveEngine().Type() != engine.IOUring { - t.Fatal("expected io_uring initially") + t.Errorf("expected io_uring after switch, got %v", e.ActiveEngine().Type()) + } +} + +// TestSwitchFastPath: conns/worker above the high watermark snaps to io_uring +// on a single tick (no sustain wait). +func TestSwitchFastPath(t *testing.T) { + e, sampler := newAdaptiveStartingOnEpoll(t) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50}) + + if !e.ctrl.evaluate(time.Now(), false) { + t.Fatal("expected heavy-load fast-path snap on a single tick") } +} + +// TestNoSwitchInHysteresisBand: conns/worker between the down and up +// thresholds (12–20) must not switch in either direction. +func TestNoSwitchInHysteresisBand(t *testing.T) { + e, sampler := newAdaptiveStartingOnEpoll(t) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 16}) - // Run evaluation — should trigger switch (150 > 100*1.15). now := time.Now() - switched := e.ctrl.evaluate(now, false) - if !switched { - t.Fatal("expected switch to be recommended") + for i := range 5 { + if e.ctrl.evaluate(now.Add(time.Duration(i)*time.Second), false) { + t.Fatal("must not switch up inside the 12–20 hysteresis band") + } } +} - e.performSwitch() +// TestLargePayloadSuppressesSwitch: even far above the high watermark, a +// large average payload (link-bound) must suppress the io_uring switch. +func TestLargePayloadSuppressesSwitch(t *testing.T) { + e, sampler := newAdaptiveStartingOnEpoll(t) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 64, BytesPerReq: 32768}) - if e.ActiveEngine().Type() != engine.Epoll { - t.Errorf("expected epoll after switch, got %v", e.ActiveEngine().Type()) + now := time.Now() + for i := range 5 { + if e.ctrl.evaluate(now.Add(time.Duration(i)*time.Second), false) { + t.Fatal("large payload (link-bound) must suppress io_uring switch") + } } } -func TestHysteresis(t *testing.T) { - primary := newMockEngine(engine.IOUring) - secondary := newMockEngine(engine.Epoll) - sampler := newSyntheticSampler() +// TestRevertOnLowLoad: io_uring active, conns/worker drops below the down +// threshold for sustainTicks → revert to epoll. +func TestRevertOnLowLoad(t *testing.T) { + e, sampler := newAdaptiveStartingOnIOUring(t) + sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 4}) + + now := time.Now() + if e.ctrl.evaluate(now, false) { + t.Fatal("revert must require sustainTicks below the down threshold") + } + if !e.ctrl.evaluate(now.Add(time.Second), false) { + t.Fatal("expected revert to epoll after sustained low load") + } +} - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - e.ctrl.cooldown = 1 * time.Hour // Very long cooldown to test blocking. +// TestErrorRevert: an error rate above errorRevertRate forces a revert from +// io_uring to epoll regardless of load (even at high conns/worker). +func TestErrorRevert(t *testing.T) { + e, sampler := newAdaptiveStartingOnIOUring(t) + sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 40, ErrorRate: 0.10}) + + if !e.ctrl.evaluate(time.Now(), false) { + t.Fatal("expected error-rate safety revert from io_uring") + } +} + +// TestNoRevertWhenIOUringBusy: io_uring active with conns/worker above the +// down threshold and no errors must stay on io_uring. +func TestNoRevertWhenIOUringBusy(t *testing.T) { + e, sampler := newAdaptiveStartingOnIOUring(t) + sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 24}) - // Pre-seed standby historical score so initial switch triggers. now := time.Now() - e.ctrl.state.lastActiveScore[engine.Epoll] = 200 - e.ctrl.state.lastActiveTime[engine.Epoll] = now + for i := range 5 { + if e.ctrl.evaluate(now.Add(time.Duration(i)*time.Second), false) { + t.Fatal("must not revert while io_uring is still busy") + } + } +} - // Trigger initial switch. - sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: 100}) +// TestCooldownGate: after a switch, a revert is blocked until the cooldown +// window elapses, even when the revert condition holds. +func TestCooldownGate(t *testing.T) { + e, sampler := newAdaptiveStartingOnEpoll(t) + e.ctrl.cooldown = 1 * time.Hour + now := time.Now() + // Fast-path switch epoll→io_uring. + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50}) if !e.ctrl.evaluate(now, false) { - t.Fatal("expected initial switch") + t.Fatal("expected initial fast-path switch") } e.performSwitch() - // Immediately try to switch back — should be blocked by cooldown. - sampler.Set(engine.Epoll, TelemetrySnapshot{ThroughputRPS: 100}) - + // Now io_uring active and idle — but cooldown blocks the revert. + sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 0}) if e.ctrl.evaluate(now.Add(1*time.Second), false) { - t.Error("switch should be blocked by cooldown") + t.Error("revert should be blocked by cooldown") } } func TestConnectionDraining(t *testing.T) { - primary := newMockEngine(engine.IOUring) - secondary := newMockEngine(engine.Epoll) - sampler := newSyntheticSampler() + e, sampler := newAdaptiveStartingOnEpoll(t) + primary := e.primary.(*mockEngine) + secondary := e.secondary.(*mockEngine) - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - e.ctrl.cooldown = 0 - - // Initial state: primary active, secondary should be paused. - // Simulate the initial pause that Listen() would do. + // Simulate the initial pause that Listen() would do on the standby. _ = secondary.PauseAccept() secondary.pauseCalls.Store(0) // reset counter - // Pre-seed standby historical score. now := time.Now() - e.ctrl.state.lastActiveScore[engine.Epoll] = 200 - e.ctrl.state.lastActiveTime[engine.Epoll] = now - - // Trigger switch. - sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: 100}) + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50}) // fast-path if !e.ctrl.evaluate(now, false) { t.Fatal("expected switch") @@ -202,28 +279,19 @@ func TestConnectionDraining(t *testing.T) { } func TestOscillationLock(t *testing.T) { - primary := newMockEngine(engine.IOUring) - secondary := newMockEngine(engine.Epoll) - sampler := newSyntheticSampler() - - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - e.ctrl.cooldown = 0 + e, sampler := newAdaptiveStartingOnEpoll(t) now := time.Now() - // Perform 3 switches rapidly. + // Perform 3 switches rapidly via the fast path; each iteration sets a + // load that triggers the active engine's switch direction. for range 3 { - activeType := e.ActiveEngine().Type() - sampler.Set(activeType, TelemetrySnapshot{ThroughputRPS: 50}) - other := engine.Epoll - if activeType == engine.Epoll { - other = engine.IOUring + switch e.ActiveEngine().Type() { + case engine.Epoll: + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50}) // switch up + case engine.IOUring: + sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 40, ErrorRate: 0.5}) // error revert } - // Pre-seed standby historical score for each iteration. - e.ctrl.state.lastActiveScore[other] = 200 - e.ctrl.state.lastActiveTime[other] = now - if !e.ctrl.evaluate(now, false) { t.Fatal("expected switch before lock") } @@ -231,16 +299,13 @@ func TestOscillationLock(t *testing.T) { now = now.Add(10 * time.Second) } - // Fourth switch should be locked. - activeType := e.ActiveEngine().Type() - sampler.Set(activeType, TelemetrySnapshot{ThroughputRPS: 50}) - other := engine.Epoll - if activeType == engine.Epoll { - other = engine.IOUring + // Fourth switch should be locked (3 switches within 5 minutes). + switch e.ActiveEngine().Type() { + case engine.Epoll: + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50}) + case engine.IOUring: + sampler.Set(engine.IOUring, TelemetrySnapshot{ConnsPerWorker: 40, ErrorRate: 0.5}) } - e.ctrl.state.lastActiveScore[other] = 200 - e.ctrl.state.lastActiveTime[other] = now - if e.ctrl.evaluate(now, false) { t.Error("expected oscillation lock to prevent switch") } @@ -250,123 +315,164 @@ func TestOscillationLock(t *testing.T) { } func TestOverloadFreeze(t *testing.T) { - primary := newMockEngine(engine.IOUring) - secondary := newMockEngine(engine.Epoll) - sampler := newSyntheticSampler() - - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - e.ctrl.cooldown = 0 + e, sampler := newAdaptiveStartingOnEpoll(t) now := time.Now() - sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: 50}) - - // Pre-seed standby historical score. - e.ctrl.state.lastActiveScore[engine.Epoll] = 200 - e.ctrl.state.lastActiveTime[engine.Epoll] = now + sampler.Set(engine.Epoll, TelemetrySnapshot{ConnsPerWorker: 50}) // fast-path load // Freeze switching. e.FreezeSwitching() - if e.ctrl.evaluate(now, e.frozen.Load()) { t.Error("expected freeze to block evaluation") } // Unfreeze. e.UnfreezeSwitching() - if !e.ctrl.evaluate(now, e.frozen.Load()) { t.Error("expected evaluation to proceed after unfreeze") } } -func TestHistoricalScoreDecay(t *testing.T) { - primary := newMockEngine(engine.IOUring) - secondary := newMockEngine(engine.Epoll) - sampler := newSyntheticSampler() - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) +// TestSwitchTriggersFreezeSuppression is deferred until SetFreezeSuppressor +// is implemented on *Engine (post-v1.0.0). - base := time.Now() - e.ctrl.state.lastActiveScore[engine.Epoll] = 100.0 - e.ctrl.state.lastActiveTime[engine.Epoll] = base +// --- Part A: feature-gated start-engine selection ------------------------ - // At t=0, score should be 100. - got := e.ctrl.historicalScore(engine.Epoll, base) - if got != 100.0 { - t.Errorf("at t=0: score = %f, want 100.0", got) - } +// chooseStartEngine + env-override + ioUringViable coverage now lives in +// start_test.go (the policy was redesigned: default epoll, io_uring only on an +// explicit high-concurrency hint with a viable kernel + memlock + non-h2c). - // At t=50s, score should be 50 (50% decay). - got = e.ctrl.historicalScore(engine.Epoll, base.Add(50*time.Second)) - if got != 50.0 { - t.Errorf("at t=50s: score = %f, want 50.0", got) - } +// --- Part B: lazy standby build on first switch + reuse ------------------ - // At t=100s, score should be 0 (100% decay). - got = e.ctrl.historicalScore(engine.Epoll, base.Add(100*time.Second)) - if got != 0.0 { - t.Errorf("at t=100s: score = %f, want 0.0", got) - } +// newLazyAdaptive builds an adaptive engine in the LAZY shape: epoll is the +// eager active (primary), io_uring is nil and built on demand by a counting +// buildStandby. listenCtx/listenWG are populated as Listen would, so +// performSwitch can launch + bind-wait the lazily-built standby. The returned +// counter pointer records how many times buildStandby ran. +func newLazyAdaptive(t *testing.T) (*Engine, *syntheticSampler, *mockEngine, *int) { + t.Helper() + active := newMockEngine(engine.Epoll) + sampler := newSyntheticSampler() - // At t=200s, score should still be 0 (clamped). - got = e.ctrl.historicalScore(engine.Epoll, base.Add(200*time.Second)) - if got != 0.0 { - t.Errorf("at t=200s: score = %f, want 0.0", got) + var builtCount int + lazyStandby := newMockEngine(engine.IOUring) + e := &Engine{ + primary: active, // epoll active + secondary: nil, // io_uring lazy standby + cfg: resource.Config{Protocol: engine.HTTP1}, + logger: testLogger(), + startType: engine.Epoll, } + e.buildStandby = func() (engine.Engine, error) { + builtCount++ + return lazyStandby, nil + } + e.ctrl = newController(e.primary, e.secondary, sampler, e.logger) + e.ctrl.state.activeIsPrimary = true + e.ctrl.cooldown = 0 + var ap engine.Engine = active + e.active.Store(&ap) + + // Wire the Listen-owned ctx + wait group so performSwitch can start the + // lazily-built standby's Listen goroutine. + ctx, cancel := context.WithCancel(t.Context()) + t.Cleanup(cancel) + var wg sync.WaitGroup + // Hold the wait group above zero for the test's lifetime (mirrors the + // eval-loop goroutine being a wg member in production), so wg.Go from + // performSwitch is always called while the counter is positive. + wg.Add(1) + t.Cleanup(func() { wg.Done() }) + e.listenCtx = ctx + e.listenWG = &wg + + return e, sampler, lazyStandby, &builtCount } -func TestHistoricalScoreSeeding(t *testing.T) { - primary := newMockEngine(engine.IOUring) - secondary := newMockEngine(engine.Epoll) - sampler := newSyntheticSampler() - - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - e.ctrl.cooldown = 0 +// TestLazyStandbyBuiltOnFirstSwitchAndReused verifies the lazy New() path +// builds the standby exactly once (on the first switch) and reuses it on a +// subsequent switch back. +func TestLazyStandbyBuiltOnFirstSwitchAndReused(t *testing.T) { + e, _, lazyStandby, builtCount := newLazyAdaptive(t) - // Active (io_uring) has score 100. No standby history yet. - sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: 100}) + if e.secondary != nil { + t.Fatal("standby must be nil before the first switch") + } + if *builtCount != 0 { + t.Fatalf("buildStandby ran %d times before any switch, want 0", *builtCount) + } - now := time.Now() - e.ctrl.evaluate(now, false) + // First switch: epoll -> io_uring. Must build + cache + activate the standby. + e.performSwitch() - // Standby should be seeded at 80% of active. - standbyScore, ok := e.ctrl.state.lastActiveScore[engine.Epoll] - if !ok { - t.Fatal("expected standby score to be seeded") + if *builtCount != 1 { + t.Fatalf("buildStandby ran %d times after first switch, want 1", *builtCount) } - - // Active score = 1.0*100 - 2.0*0 = 100. 80% = 80. - if standbyScore != 80.0 { - t.Errorf("standby seed score = %f, want 80.0", standbyScore) + if e.secondary == nil { + t.Fatal("standby must be cached after the first switch") + } + if e.ActiveEngine().Type() != engine.IOUring { + t.Fatalf("active = %v after first switch, want io_uring", e.ActiveEngine().Type()) + } + // The lazily-built standby must have been Listen'd (its goroutine ran). + select { + case <-lazyStandby.listenStarted: + case <-time.After(2 * time.Second): + t.Fatal("lazy standby Listen was never started") + } + // New active (io_uring) must have been resumed; old active (epoll) paused. + if lazyStandby.resumeCalls.Load() == 0 { + t.Error("expected ResumeAccept on the freshly-built (now active) standby") + } + if e.primary.(*mockEngine).pauseCalls.Load() == 0 { + t.Error("expected PauseAccept on the old active (epoll)") } -} -func TestSwitchAfterActiveDegrades(t *testing.T) { - primary := newMockEngine(engine.IOUring) - secondary := newMockEngine(engine.Epoll) - sampler := newSyntheticSampler() + // Second switch: io_uring -> epoll. The standby slot for THIS direction is + // epoll (e.primary, already non-nil), so no new build happens. The io_uring + // engine stays cached too. + e.performSwitch() - cfg := resource.Config{Protocol: engine.HTTP1} - e := newFromEngines(primary, secondary, sampler, cfg) - e.ctrl.cooldown = 0 + if *builtCount != 1 { + t.Fatalf("buildStandby ran %d times after second switch, want 1 (reuse)", *builtCount) + } + if e.ActiveEngine().Type() != engine.Epoll { + t.Fatalf("active = %v after second switch, want epoll", e.ActiveEngine().Type()) + } + if e.secondary != lazyStandby { + t.Error("io_uring standby must remain cached (same instance) after switching back") + } - now := time.Now() + // Third switch: epoll -> io_uring again. Reuses the cached io_uring standby. + e.performSwitch() + if *builtCount != 1 { + t.Fatalf("buildStandby ran %d times after third switch, want 1 (reuse)", *builtCount) + } + if e.ActiveEngine().Type() != engine.IOUring { + t.Fatalf("active = %v after third switch, want io_uring", e.ActiveEngine().Type()) + } +} - // Pre-seed standby (epoll) with a strong historical score. - e.ctrl.state.lastActiveScore[engine.Epoll] = 100.0 - e.ctrl.state.lastActiveTime[engine.Epoll] = now +// TestLazyStandbyBuildFailureAbortsSwitch verifies a buildStandby error aborts +// the switch cleanly: the active engine is unchanged and the standby slot stays +// nil. +func TestLazyStandbyBuildFailureAbortsSwitch(t *testing.T) { + e, _, _, _ := newLazyAdaptive(t) + e.buildStandby = func() (engine.Engine, error) { + return nil, errBuildFailed + } - // Active (io_uring) degrades to 50 RPS. - sampler.Set(engine.IOUring, TelemetrySnapshot{ThroughputRPS: 50}) + e.performSwitch() - // Standby historical = 100 * (1 - 0.01*0) = 100. - // Active score = 50. 100 > 50*1.15 = 57.5 → switch. - if !e.ctrl.evaluate(now, false) { - t.Error("expected switch when active degrades below standby historical") + if e.ActiveEngine().Type() != engine.Epoll { + t.Fatalf("active changed to %v after a failed build; must stay epoll", e.ActiveEngine().Type()) + } + if e.secondary != nil { + t.Fatal("standby slot must stay nil after a failed build") + } + if e.ctrl.state.activeIsPrimary != true { + t.Fatal("controller direction must be unchanged after a failed build") } } -// TestSwitchTriggersFreezeSuppression is deferred until SetFreezeSuppressor -// is implemented on *Engine (post-v1.0.0). +var errBuildFailed = errors.New("build failed") diff --git a/adaptive/scaler_h2_dial_race_test.go b/adaptive/pauseaccept_h2_dial_race_test.go similarity index 72% rename from adaptive/scaler_h2_dial_race_test.go rename to adaptive/pauseaccept_h2_dial_race_test.go index 5dfaafaa..28e0ecb0 100644 --- a/adaptive/scaler_h2_dial_race_test.go +++ b/adaptive/pauseaccept_h2_dial_race_test.go @@ -16,23 +16,27 @@ import ( "github.com/goceleris/celeris/resource" ) -// TestAdaptiveScaler_H2DialNoRSTRace mirrors TestAdaptiveH2DialNoRSTRace -// with the dynamic worker scaler enabled. Locks in that the scaler's -// start-high default does not pause workers before the listen FDs -// settle in the SO_REUSEPORT group — which would RST in-flight H2 -// prior-knowledge handshakes mid-flush. +// TestAdaptivePauseAccept_H2DialNoRSTRace is a second angle on the +// H2-dial-RST race that TestAdaptiveH2DialNoRSTRace guards, run with a +// larger (4) worker pool so the standby engine has more listen FDs to +// evict from the SO_REUSEPORT group before Addr() is published. The +// race is the same: if PauseAccept on the standby has not synchronously +// removed every listen FD from the routing pool by the time adaptive +// exposes Addr, a burst of dials gets split across active and standby, +// and the standby's FD close RSTs the conns that landed on it — fatal +// for H2 prior-knowledge handshakes mid-flush. // // Iterations bound at 3: the race only fires on engine spin-up, so we -// just need "scaler + H2 dial burst" coverage in addition to the -// no-scaler test's larger budget. -func TestAdaptiveScaler_H2DialNoRSTRace(t *testing.T) { +// just need an additional burst against a wider pool in addition to the +// main test's larger budget. +func TestAdaptivePauseAccept_H2DialNoRSTRace(t *testing.T) { const iterations = 3 for i := 0; i < iterations; i++ { - runScalerH2Once(t, i) + runPauseAcceptH2Once(t, i) } } -func runScalerH2Once(t *testing.T, iter int) { +func runPauseAcceptH2Once(t *testing.T, iter int) { t.Helper() ln, err := net.Listen("tcp", "127.0.0.1:0") if err != nil { @@ -47,9 +51,8 @@ func runScalerH2Once(t *testing.T, iter int) { Protocol: engine.Auto, EnableH2Upgrade: true, Resources: resource.Resources{ - Workers: 4, // 4 workers so MinActive=2 has 2 paused at start + Workers: 4, }, - WorkerScaling: &resource.WorkerScalingConfig{}, // zero value → start-high } e, err := New(cfg, &h2PrefaceHandler{}, nil) if err != nil { diff --git a/adaptive/scaler.go b/adaptive/scaler.go deleted file mode 100644 index b136d923..00000000 --- a/adaptive/scaler.go +++ /dev/null @@ -1,114 +0,0 @@ -//go:build linux - -package adaptive - -import ( - "context" - "log/slog" - "sync/atomic" - - "github.com/goceleris/celeris/engine" - "github.com/goceleris/celeris/engine/scaler" -) - -// adaptiveScalerSource adapts the adaptive Engine to the scaler.Source -// interface. The active sub-engine can change at runtime -// (performSwitch); this source watches for that and routes -// PauseWorker / ResumeWorker calls to whichever engine is active right -// now. Generation increments on each switch so the shared scaler loop -// can re-baseline the new active engine's worker pause state. -// -// Both sub-engines must implement engine.WorkerScaler — verified via a -// type-assertion in the constructor. Both must have the same NumWorkers; -// if they ever diverge (defensive), the smaller pool is used as the cap. -type adaptiveScalerSource struct { - e *Engine - primary engine.WorkerScaler - standby engine.WorkerScaler - gen atomic.Uint64 - - // lastActive tracks which sub-engine was active on the previous - // scaler-source method call. When activeFor() observes a different - // engine, gen is incremented so the scaler.Run loop notices and - // re-baselines. - lastActive atomic.Pointer[engine.Engine] -} - -// newAdaptiveScalerSource attempts to construct a Source. Returns nil if -// either sub-engine fails the engine.WorkerScaler assertion (defensive -// — should not happen in production since both iouring and epoll -// implement it). -func newAdaptiveScalerSource(e *Engine) *adaptiveScalerSource { - primary, ok1 := e.primary.(engine.WorkerScaler) - standby, ok2 := e.secondary.(engine.WorkerScaler) - if !ok1 || !ok2 { - return nil - } - return &adaptiveScalerSource{e: e, primary: primary, standby: standby} -} - -// activeFor returns the WorkerScaler view of the engine that is -// currently active in adaptive. Bumps Generation if the active engine -// changed since the previous call, so scaler.Run re-baselines. -func (s *adaptiveScalerSource) activeFor() engine.WorkerScaler { - cur := *s.e.active.Load() - prev := s.lastActive.Load() - if prev == nil || *prev != cur { - s.lastActive.Store(&cur) - s.gen.Add(1) - } - if cur == s.e.secondary { - return s.standby - } - return s.primary -} - -func (s *adaptiveScalerSource) NumWorkers() int { - a, b := s.primary.NumWorkers(), s.standby.NumWorkers() - if a < b { - return a - } - return b -} - -func (s *adaptiveScalerSource) ActiveConns() int64 { - // Sum BOTH sub-engines' active conns, not just the currently-active - // one. During an adaptive switch, in-flight conns on the OLD - // sub-engine continue to be served until their clients close — the - // scaler must count them so the desired-worker calculation doesn't - // undershoot the real CPU load (#300 G1). - return s.e.primary.Metrics().ActiveConnections + - s.e.secondary.Metrics().ActiveConnections -} - -func (s *adaptiveScalerSource) PauseWorker(i int) { - s.activeFor().PauseWorker(i) -} - -func (s *adaptiveScalerSource) ResumeWorker(i int) { - s.activeFor().ResumeWorker(i) -} - -func (s *adaptiveScalerSource) Generation() uint64 { - // Refresh: a tick reading Generation should also drive the - // active-engine detection so we don't miss a switch that happens - // between two ticks where neither Pause/Resume nor ActiveConns ran. - s.activeFor() - return s.gen.Load() -} - -func (s *adaptiveScalerSource) Logger() *slog.Logger { return s.e.logger } - -// runScaler is started by Engine.Listen when the scaler is enabled. The -// algorithm itself lives in engine/scaler; this just wires up the -// adaptive-flavoured source. -func (e *Engine) runScaler(ctx context.Context, cfg scaler.Config) { - src := newAdaptiveScalerSource(e) - if src == nil { - if e.logger != nil { - e.logger.Warn("adaptive scaler disabled: sub-engine does not implement WorkerScaler") - } - return - } - scaler.Run(ctx, src, cfg) -} diff --git a/adaptive/scaler_test.go b/adaptive/scaler_test.go deleted file mode 100644 index 5d5bec6b..00000000 --- a/adaptive/scaler_test.go +++ /dev/null @@ -1,116 +0,0 @@ -//go:build linux - -package adaptive - -import ( - "context" - "sync" - "testing" - "time" - - "github.com/goceleris/celeris/engine" - "github.com/goceleris/celeris/engine/scaler" - "github.com/goceleris/celeris/resource" -) - -// scalableMockEngine extends mockEngine with the engine.WorkerScaler -// interface so the adaptive scaler can exercise its -// PauseWorker / ResumeWorker delegation logic against a controllable -// fake. Pause/Resume calls are tracked per-worker and per-direction so -// tests can assert the scaler routed work to the active engine and -// not the standby. -type scalableMockEngine struct { - *mockEngine - numWorkers int - pauseLog []int - resumeLog []int - logMu sync.Mutex -} - -func newScalableMockEngine(et engine.EngineType, n int) *scalableMockEngine { - return &scalableMockEngine{mockEngine: newMockEngine(et), numWorkers: n} -} - -func (m *scalableMockEngine) NumWorkers() int { return m.numWorkers } - -func (m *scalableMockEngine) PauseWorker(i int) { - m.logMu.Lock() - m.pauseLog = append(m.pauseLog, i) - m.logMu.Unlock() -} - -func (m *scalableMockEngine) ResumeWorker(i int) { - m.logMu.Lock() - m.resumeLog = append(m.resumeLog, i) - m.logMu.Unlock() -} - -func (m *scalableMockEngine) snapshotLog() (pause, resume []int) { - m.logMu.Lock() - defer m.logMu.Unlock() - pause = append(pause, m.pauseLog...) - resume = append(resume, m.resumeLog...) - return -} - -// TestAdaptiveScaler_DelegatesToActiveEngine spins the higher-level -// scaler with two mock engines, sets connection counts on the active -// one, and verifies the scaler calls PauseWorker / ResumeWorker on -// the active engine — never on the standby. This is the architectural -// invariant that the v1.4.1 adaptive scaler refactor exists to enforce -// (the pre-refactor design ran two scalers, producing -54 % to +118 % -// pinning-test variance — see PR #257). -func TestAdaptiveScaler_DelegatesToActiveEngine(t *testing.T) { - primary := newScalableMockEngine(engine.Epoll, 8) - secondary := newScalableMockEngine(engine.IOUring, 8) - sampler := newSyntheticSampler() - cfg := resource.Config{} - e := newFromEngines(primary, secondary, sampler, cfg) - - // Active = primary (matches newFromEngines default). - // Set activeConns high enough that desired = 8 (max), so the scaler - // will call ResumeWorker on workers 4..7 of the active engine. - primary.SetMetrics(engine.EngineMetrics{ActiveConnections: 200}) - secondary.SetMetrics(engine.EngineMetrics{ActiveConnections: 0}) - - scfg := scaler.Config{ - Enabled: true, - MinActive: 4, - TargetConnsPerWorker: 20, - Interval: 10 * time.Millisecond, - ScaleUpStep: 4, // burst-resume to active=8 in one tick - ScaleDownStep: 1, - ScaleDownHysteresis: 1, - ScaleDownIdleTicks: 100, // never scale-down within the test budget - } - - ctx, cancel := context.WithCancel(t.Context()) - defer cancel() - go e.runScaler(ctx, scfg) - - // Give the scaler a few ticks to ramp up. - time.Sleep(100 * time.Millisecond) - cancel() - // Tiny grace for the scaler goroutine to observe the cancel. - time.Sleep(20 * time.Millisecond) - - pPause, pResume := primary.snapshotLog() - sPause, sResume := secondary.snapshotLog() - - // Initial state pauses workers 4..7 on the active engine. Then - // scaler resumes 4..7 on the ACTIVE (primary) engine to reach - // desired=8. - if len(pPause) < 4 { - t.Errorf("primary expected ≥4 PauseWorker calls (init), got %d: %v", len(pPause), pPause) - } - if len(pResume) == 0 { - t.Errorf("primary expected ResumeWorker calls (active scaling up), got 0") - } - // Standby (secondary) should NEVER see ResumeWorker — that's the - // invariant. The scaler routes scale-up to the active engine only. - if len(sResume) != 0 { - t.Errorf("secondary should NOT see ResumeWorker calls; got %v", sResume) - } - // And no spurious calls on the secondary either (it's the standby). - _ = sPause -} diff --git a/adaptive/score.go b/adaptive/score.go deleted file mode 100644 index 85034d55..00000000 --- a/adaptive/score.go +++ /dev/null @@ -1,83 +0,0 @@ -//go:build linux - -package adaptive - -// ScoreWeights defines the weighting for each telemetry signal in score -// computation. Higher throughput weight favors faster engines; higher error -// weight penalizes unreliable ones. -type ScoreWeights struct { - // Throughput is the weight applied to requests-per-second in the score. - Throughput float64 - // ErrorRate is the penalty weight applied to the error fraction. - ErrorRate float64 -} - -// DefaultWeights returns the default score weights. -func DefaultWeights() ScoreWeights { - return ScoreWeights{ - Throughput: 1.0, - ErrorRate: 2.0, - } -} - -// computeScore produces a weighted score from a telemetry snapshot. -// Higher is better. -func computeScore(snap TelemetrySnapshot, w ScoreWeights) float64 { - return w.Throughput*snap.ThroughputRPS - w.ErrorRate*snap.ErrorRate -} - -// ioUringBias returns a positive value when conditions favor io_uring over -// epoll, based on benchmark data. The bias is added to io_uring's score (or -// subtracted from epoll's) to enable proactive switching without requiring -// a throughput advantage. -// -// Signals from 3-run median benchmarks (2026-03-27): -// - io_uring CPU efficiency: 2-8% less CPU at 256-4096 connections -// - io_uring p99 tail: 17-39% better at 1024-4096 connections -// - Throughput: equivalent (±2%) at 256-4096 connections -// - io_uring loses at <64 connections (fixed overhead) and >8192 on x86 -// -// Pre-v1.5.0 this function was effectively dead: liveSampler.cpuMon was -// never assigned (celeris#316), so snap.CPUUtilization was always 0 and -// cpuFactor was always 0, making the function return 0 unconditionally. -// v1.5.0 wires the CPU monitor at engine construction; the bias now fires -// when CPU is above 30% and conns land in the 128-8192 sweet spot. -// -// The conn-factor falloff above 8192 is the empirical cost structure on -// x86 — see the bench data above — not a bug. If #318 identifies a -// different cause (e.g. a PbufRing bottleneck, fixed by #322), update -// this comment to point at the new finding. -func ioUringBias(snap TelemetrySnapshot) float64 { - conns := snap.ActiveConnections - cpu := snap.CPUUtilization - - // No bias at very low or very high connection counts. - if conns < 128 || conns > 8192 { - return 0 - } - - // Base bias: proportional to connection count in the sweet spot. - // Peaks at 1024-4096 connections where io_uring's advantages are strongest. - var connFactor float64 - switch { - case conns < 256: - connFactor = 0.1 - case conns < 1024: - connFactor = 0.3 - case conns <= 4096: - connFactor = 0.5 // Peak: io_uring's best range - default: - connFactor = 0.2 // 4096-8192: still beneficial but declining - } - - // CPU factor: bias increases when CPU-bound (io_uring's efficiency advantage - // matters most when cores are saturated). No bias below 30% CPU. - cpuFactor := 0.0 - if cpu > 0.3 { - cpuFactor = (cpu - 0.3) / 0.7 // 0.0 at 30% CPU, 1.0 at 100% - } - - // Combined bias: connection suitability × CPU pressure. - // Maximum bias is 0.5 (50% of throughput score). - return connFactor * cpuFactor -} diff --git a/adaptive/score_test.go b/adaptive/score_test.go deleted file mode 100644 index 599c89da..00000000 --- a/adaptive/score_test.go +++ /dev/null @@ -1,55 +0,0 @@ -//go:build linux - -package adaptive - -import "testing" - -func TestIoUringBiasConnFactorFalloff(t *testing.T) { - tests := []struct { - name string - conns int64 - cpu float64 - wantBias float64 - tolerance float64 - }{ - {name: "below 128: zero", conns: 64, cpu: 0.9, wantBias: 0.0}, - {name: "at 128 boundary: connFactor 0.1", conns: 128, cpu: 1.0, wantBias: 0.1 * (1.0 - 0.3) / 0.7}, - {name: "256 conns peak: 0.5", conns: 2048, cpu: 1.0, wantBias: 0.5 * (1.0 - 0.3) / 0.7}, - {name: "4096 boundary: 0.5 still", conns: 4096, cpu: 1.0, wantBias: 0.5 * (1.0 - 0.3) / 0.7}, - {name: "4097: drops to 0.2", conns: 4097, cpu: 1.0, wantBias: 0.2 * (1.0 - 0.3) / 0.7}, - {name: "above 8192: zero", conns: 9000, cpu: 0.9, wantBias: 0.0}, - {name: "low CPU: zero cpuFactor", conns: 2048, cpu: 0.20, wantBias: 0.5 * 0.0}, - {name: "exactly 30% CPU: zero cpuFactor", conns: 2048, cpu: 0.30, wantBias: 0.5 * 0.0}, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - snap := TelemetrySnapshot{ - ActiveConnections: tt.conns, - CPUUtilization: tt.cpu, - } - got := ioUringBias(snap) - if absDiff(got, tt.wantBias) > tt.tolerance { - t.Errorf("ioUringBias(conns=%d, cpu=%.2f) = %v, want %v ± %v", - tt.conns, tt.cpu, got, tt.wantBias, tt.tolerance) - } - }) - } -} - -func TestComputeScoreTwoTerm(t *testing.T) { - w := DefaultWeights() - snap := TelemetrySnapshot{ThroughputRPS: 1000, ErrorRate: 0.05} - got := computeScore(snap, w) - want := 1.0*1000.0 - 2.0*0.05 - if got != want { - t.Errorf("computeScore = %v, want %v", got, want) - } -} - -func absDiff(a, b float64) float64 { - d := a - b - if d < 0 { - return -d - } - return d -} diff --git a/adaptive/start_test.go b/adaptive/start_test.go new file mode 100644 index 00000000..71aaa498 --- /dev/null +++ b/adaptive/start_test.go @@ -0,0 +1,92 @@ +//go:build linux + +package adaptive + +import ( + "testing" + + "github.com/goceleris/celeris/engine" + "github.com/goceleris/celeris/resource" +) + +// viableProfile is a modern kernel with the full io_uring fast tier. +func viableProfile() engine.CapabilityProfile { + return engine.CapabilityProfile{ + KernelMajor: 6, KernelMinor: 12, + DeferTaskrun: true, SingleIssuer: true, MultishotRecv: true, ProvidedBuffers: true, + } +} + +// oldProfile is a pre-fast-tier kernel (io_uring not worth running). +func oldProfile() engine.CapabilityProfile { + return engine.CapabilityProfile{KernelMajor: 5, KernelMinor: 15} +} + +// withMemlock overrides the memlock probe for the duration of a test. +func withMemlock(t *testing.T, maxWorkers int) { + t.Helper() + prev := maxWorkersForMemlock + maxWorkersForMemlock = func() int { return maxWorkers } + t.Cleanup(func() { maxWorkersForMemlock = prev }) +} + +// TestChooseStartEngine_GateOrder asserts the new start-engine policy: the +// DEFAULT is epoll (the flipped default), io_uring only on an explicit +// high-concurrency hint with a viable kernel + memlock + non-h2c protocol. +func TestChooseStartEngine_GateOrder(t *testing.T) { + cfg := func(proto engine.Protocol, hint resource.WorkloadHint) resource.Config { + return resource.Config{Protocol: proto, Resources: resource.Resources{WorkloadHint: hint}} + } + tests := []struct { + name string + profile engine.CapabilityProfile + cfg resource.Config + memlock int // -1 = no cap + want engine.EngineType + }{ + {"default-is-epoll (viable, no hint)", viableProfile(), cfg(engine.HTTP1, resource.WorkloadUnspecified), -1, engine.Epoll}, + {"low-conc hint -> epoll", viableProfile(), cfg(engine.HTTP1, resource.WorkloadLowConcurrency), -1, engine.Epoll}, + {"high-conc hint + viable -> iouring", viableProfile(), cfg(engine.HTTP1, resource.WorkloadHighConcurrency), -1, engine.IOUring}, + {"high-conc hint but old kernel -> epoll", oldProfile(), cfg(engine.HTTP1, resource.WorkloadHighConcurrency), -1, engine.Epoll}, + {"high-conc hint but h2c -> epoll", viableProfile(), cfg(engine.H2C, resource.WorkloadHighConcurrency), -1, engine.Epoll}, + {"high-conc hint but memlock-starved -> epoll", viableProfile(), cfg(engine.HTTP1, resource.WorkloadHighConcurrency), 1, engine.Epoll}, + {"auto protocol + high-conc hint -> iouring", viableProfile(), cfg(engine.Auto, resource.WorkloadHighConcurrency), -1, engine.IOUring}, + } + t.Setenv("CELERIS_ADAPTIVE_START", "") // neutralize any ambient override + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + withMemlock(t, tc.memlock) + if got := chooseStartEngine(tc.profile, tc.cfg); got != tc.want { + t.Fatalf("chooseStartEngine = %v, want %v", got, tc.want) + } + }) + } +} + +// TestChooseStartEngine_EnvOverride asserts the env escape hatch wins over policy. +func TestChooseStartEngine_EnvOverride(t *testing.T) { + withMemlock(t, -1) + t.Setenv("CELERIS_ADAPTIVE_START", "iouring") + if got := chooseStartEngine(oldProfile(), resource.Config{Protocol: engine.H2C}); got != engine.IOUring { + t.Fatalf("env=iouring should force IOUring even on old/h2c, got %v", got) + } + t.Setenv("CELERIS_ADAPTIVE_START", "epoll") + if got := chooseStartEngine(viableProfile(), resource.Config{Resources: resource.Resources{WorkloadHint: resource.WorkloadHighConcurrency}}); got != engine.Epoll { + t.Fatalf("env=epoll should force Epoll even with high-conc hint, got %v", got) + } +} + +// TestIOUringViable covers the two t0 disqualifiers. +func TestIOUringViable(t *testing.T) { + withMemlock(t, -1) + if !ioUringViable(viableProfile(), resource.Config{}) { + t.Fatal("viable profile + unlimited memlock should be viable") + } + if ioUringViable(oldProfile(), resource.Config{}) { + t.Fatal("old kernel should be non-viable") + } + withMemlock(t, 1) // 1 worker < resolved workers -> non-viable + if ioUringViable(viableProfile(), resource.Config{}) { + t.Fatal("memlock-starved should be non-viable") + } +} diff --git a/adaptive/telemetry.go b/adaptive/telemetry.go index 992d74d2..5b52d1a1 100644 --- a/adaptive/telemetry.go +++ b/adaptive/telemetry.go @@ -22,6 +22,20 @@ type TelemetrySnapshot struct { // CPUUtilization is the estimated CPU usage fraction (0.0-1.0). Read from // the live sampler's CPUMonitor; zero when no monitor is wired. CPUUtilization float64 + // ConnsPerWorker is ActiveConnections divided by the engine's worker + // count. This is the PRIMARY load signal driving engine selection: + // epoll and io_uring tie at low conns/worker, but io_uring pulls ahead + // and keeps scaling above ~20/worker while epoll plateaus. + ConnsPerWorker float64 + // AcceptRate is the new-connection arrival rate (accepts/sec) over the + // last sampling interval, derived like ThroughputRPS. A secondary load + // signal: a high accept rate indicates connection churn. + AcceptRate float64 + // BytesPerReq is the average payload size (read+written bytes per + // request) over the last interval. When this exceeds the controller's + // large-payload threshold the workload is link-bound — the engines tie + // — so the controller suppresses an io_uring switch to avoid churn. + BytesPerReq float64 } // TelemetrySampler produces telemetry snapshots from an engine. @@ -54,6 +68,11 @@ func (s *liveSampler) Sample(e engine.Engine) TelemetrySnapshot { ActiveConnections: m.ActiveConnections, } + // ConnsPerWorker is a point-in-time ratio (not a delta), so it needs no + // prior sample. max(Workers, 1) guards the pre-Listen window where the + // worker count is still zero. + snap.ConnsPerWorker = float64(m.ActiveConnections) / float64(max(m.Workers, 1)) + prev, hasPrev := s.prevMetrics[et] prevT, hasT := s.prevTime[et] if hasPrev && hasT { @@ -61,9 +80,13 @@ func (s *liveSampler) Sample(e engine.Engine) TelemetrySnapshot { if elapsed > 0 { deltaReqs := m.RequestCount - prev.RequestCount deltaErrs := m.ErrorCount - prev.ErrorCount + deltaAccepts := m.AcceptCount - prev.AcceptCount + deltaBytes := (m.BytesRead + m.BytesWritten) - (prev.BytesRead + prev.BytesWritten) snap.ThroughputRPS = float64(deltaReqs) / elapsed + snap.AcceptRate = float64(deltaAccepts) / elapsed if deltaReqs > 0 { snap.ErrorRate = float64(deltaErrs) / float64(deltaReqs) + snap.BytesPerReq = float64(deltaBytes) / float64(deltaReqs) } } } diff --git a/async_improvements_test.go b/async_improvements_test.go new file mode 100644 index 00000000..e72a2a5c --- /dev/null +++ b/async_improvements_test.go @@ -0,0 +1,132 @@ +package celeris + +import ( + "sync/atomic" + "testing" +) + +// --- #1: Server.AsyncHandlers() reports the EFFECTIVE async state --------- + +// TestAsyncHandlersEffective_PerRouteAsync verifies the driver-footgun fix: a +// server with Config.AsyncHandlers=false but a .Async() route reports +// AsyncHandlers()==true, so a WithEngine driver picks its netpoll fast path +// under the recommended per-route-async idiom (instead of the slow mini-loop). +func TestAsyncHandlersEffective_PerRouteAsync(t *testing.T) { + s := New(Config{Engine: Std, AsyncHandlers: false}) + if s.AsyncHandlers() { + t.Fatal("no async routes + AsyncHandlers=false must report false") + } + s.GET("/db", noopHandler).Async() + if !s.AsyncHandlers() { + t.Fatal("a .Async() route must make AsyncHandlers() effective-true (driver fast path)") + } +} + +// TestAsyncHandlersEffective_ServerDefault verifies the server-level flag alone +// still reports true (unchanged behavior). +func TestAsyncHandlersEffective_ServerDefault(t *testing.T) { + s := New(Config{Engine: Std, AsyncHandlers: true}) + if !s.AsyncHandlers() { + t.Fatal("AsyncHandlers=true must report true") + } +} + +// TestAsyncHandlersEffective_GroupAsync verifies a group-level .Async() route +// also flips the effective state. +func TestAsyncHandlersEffective_GroupAsync(t *testing.T) { + s := New(Config{Engine: Std, AsyncHandlers: false}) + api := s.Group("/api").Async() + api.GET("/x", noopHandler) + if !s.AsyncHandlers() { + t.Fatal("a group .Async() route must make AsyncHandlers() effective-true") + } +} + +// --- #4: .UsesDriver() is an intent-revealing alias for .Async() ---------- + +func TestUsesDriver(t *testing.T) { + s := New(Config{Engine: Std, AsyncHandlers: false}) + s.GET("/cache/:key", noopHandler).UsesDriver() + if !s.router.routeAsync("GET", "/cache/anything") { + t.Fatal(".UsesDriver() must resolve hard-async") + } + if s.router.adaptiveRoutes["/cache/:key"] { + t.Fatal(".UsesDriver() (== .Async()) must not be adaptive") + } + if !s.AsyncHandlers() { + t.Fatal(".UsesDriver() must make AsyncHandlers() effective-true") + } +} + +// --- #3: immediate promotion of an unambiguously-blocking inline run ------- + +// TestAdaptiveBlockingThresholdConst guards the threshold ordering: the +// immediate-promote bar must sit above the slow bar so the streak path still +// owns the borderline band. +func TestAdaptiveBlockingThresholdConst(t *testing.T) { + if adaptiveBlockingThreshold <= adaptivePromoteThreshold { + t.Fatalf("blocking threshold %v must exceed slow threshold %v", + adaptiveBlockingThreshold, adaptivePromoteThreshold) + } +} + +// TestPromoteRouteImmediate verifies a single unambiguously-blocking inline run +// promotes the route immediately (no adaptivePromoteStreak wait), so a +// forgotten-async blocking handler stalls a worker for at most one request. +func TestPromoteRouteImmediate(t *testing.T) { + s := New(Config{AsyncHandlers: true}) + s.GET("/slow", noopHandler) // inherits default → adaptive + r := s.router + if !r.adaptiveRoutes["/slow"] { + t.Fatal("/slow should be adaptive under AsyncHandlers=true") + } + if r.isPromoted("/slow") { + t.Fatal("must not be promoted before any blocking run") + } + r.promoteRouteImmediate("/slow") // one >adaptiveBlockingThreshold run + if !r.isPromoted("/slow") { + t.Fatal("a single blocking inline run must promote immediately") + } + if !r.routeAsync("GET", "/slow") { + t.Fatal("a promoted route must resolve async") + } +} + +// TestPromoteImmediate_ClearsFastStreak verifies the immediate promotion clears +// any accumulated fast streak so the route cannot wrongly settle on stale fast +// runs after the promotion TTL expires. +func TestPromoteImmediate_ClearsFastStreak(t *testing.T) { + s := New(Config{AsyncHandlers: true}) + s.GET("/slow", noopHandler) + r := s.router + for i := 0; i < 10; i++ { + r.recordInlineRun("/slow", false) // accumulate fast runs + } + r.promoteRouteImmediate("/slow") + if !r.isPromoted("/slow") { + t.Fatal("must be promoted") + } + if v, ok := r.fastStreak.Load("/slow"); ok { + if got := v.(*atomic.Int32).Load(); got != 0 { + t.Fatalf("fast streak must be cleared on immediate promote, got %d", got) + } + } +} + +// TestStreakStillRequiresEight verifies the slow-streak path is unchanged: a +// borderline-slow (300µs–2ms) route still needs adaptivePromoteStreak runs. +func TestStreakStillRequiresEight(t *testing.T) { + s := New(Config{AsyncHandlers: true}) + s.GET("/borderline", noopHandler) + r := s.router + for i := 0; i < adaptivePromoteStreak-1; i++ { + r.recordInlineRun("/borderline", true) + } + if r.isPromoted("/borderline") { + t.Fatalf("must not promote before %d consecutive slow runs", adaptivePromoteStreak) + } + r.recordInlineRun("/borderline", true) // the 8th + if !r.isPromoted("/borderline") { + t.Fatalf("must promote on the %dth consecutive slow run", adaptivePromoteStreak) + } +} diff --git a/async_promote_integration_linux_test.go b/async_promote_integration_linux_test.go new file mode 100644 index 00000000..736c844a --- /dev/null +++ b/async_promote_integration_linux_test.go @@ -0,0 +1,75 @@ +//go:build linux + +package celeris + +import ( + "context" + "net" + "net/http" + "testing" + "time" +) + +// TestAdaptiveImmediatePromote_Epoll verifies improvement #3 end-to-end on the +// real epoll engine (which runs HandleStream's inline timing): under +// AsyncHandlers=true, an UNMARKED route whose handler blocks for longer than +// adaptiveBlockingThreshold is promoted to async dispatch on the FIRST request, +// not after adaptivePromoteStreak (8) of them. A fast route used only for +// readiness must NOT promote. +func TestAdaptiveImmediatePromote_Epoll(t *testing.T) { + s := New(Config{Engine: Epoll, AsyncHandlers: true}) + s.GET("/ping", func(c *Context) error { return c.String(http.StatusOK, "ok") }) + s.GET("/slow", func(c *Context) error { + time.Sleep(3 * time.Millisecond) // > adaptiveBlockingThreshold (2ms) + return c.String(http.StatusOK, "slow") + }) + if !s.router.adaptiveRoutes["/slow"] || !s.router.adaptiveRoutes["/ping"] { + t.Fatal("both routes must be adaptive under AsyncHandlers=true") + } + + ln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("listen: %v", err) + } + go func() { _ = s.StartWithListener(ln) }() + defer func() { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = s.Shutdown(ctx) + }() + + base := "http://" + ln.Addr().String() + client := &http.Client{} + // Readiness on the FAST route only (so /slow is untouched until our 1 probe). + deadline := time.Now().Add(5 * time.Second) + ready := false + for time.Now().Before(deadline) { + if resp, err := client.Get(base + "/ping"); err == nil { + _ = resp.Body.Close() + ready = true + break + } + time.Sleep(10 * time.Millisecond) + } + if !ready { + t.Fatal("server did not become ready") + } + + if s.router.isPromoted("/slow") { + t.Fatal("/slow must not be promoted before any request to it") + } + // Exactly ONE request to the blocking route. + resp, err := client.Get(base + "/slow") + if err != nil { + t.Fatalf("GET /slow: %v", err) + } + _ = resp.Body.Close() + + if !s.router.isPromoted("/slow") { + t.Fatal("#3: a single >2ms inline run must promote /slow immediately (got not-promoted)") + } + // The fast route hammered for readiness must never promote. + if s.router.isPromoted("/ping") { + t.Fatal("/ping (fast) must not be promoted") + } +} diff --git a/config.go b/config.go index ecb84547..e69a25d9 100644 --- a/config.go +++ b/config.go @@ -40,6 +40,25 @@ const ( // String returns the engine type name. func (t EngineType) String() string { return engine.EngineType(t).String() } +// WorkloadHint is an OPTIONAL declaration of expected steady-state concurrency, +// used only by the Adaptive engine's start-engine decision. Because connections +// cannot migrate between epoll and io_uring, the START engine decides keep-alive +// throughput — and the concurrency is unknowable when the server binds. This +// hint is the ONLY way to make Adaptive START on io_uring; without it Adaptive +// starts on epoll (best for the ramp-from-zero / low-concurrency / latency case) +// and promotes new connections to io_uring under sustained high load. +type WorkloadHint resource.WorkloadHint + +const ( + // WorkloadUnspecified (default) → start epoll, promote under load. + WorkloadUnspecified WorkloadHint = WorkloadHint(resource.WorkloadUnspecified) + // WorkloadLowConcurrency → thin/latency-sensitive traffic; stay epoll. + WorkloadLowConcurrency WorkloadHint = WorkloadHint(resource.WorkloadLowConcurrency) + // WorkloadHighConcurrency → many h1 keep-alive conns/worker; start io_uring + // (when the kernel and RLIMIT_MEMLOCK allow it). + WorkloadHighConcurrency WorkloadHint = WorkloadHint(resource.WorkloadHighConcurrency) +) + // Config holds the public server configuration. type Config struct { // Addr is the TCP address to listen on (e.g. ":8080"). @@ -52,6 +71,12 @@ type Config struct { // Workers is the number of I/O worker goroutines (default GOMAXPROCS). Workers int + // WorkloadHint optionally declares the expected steady-state concurrency. + // It only affects the Adaptive engine's start-engine choice (see WorkloadHint): + // the zero value starts on epoll; WorkloadHighConcurrency starts on io_uring + // when the kernel + memlock allow. + WorkloadHint WorkloadHint + // ReadTimeout is the max duration for reading the entire request. // Zero uses the default (60s). Set to -1 for no timeout. ReadTimeout time.Duration @@ -133,13 +158,20 @@ type Config struct { // groups can override it per handler with [Route.Async] / // [RouteGroup.Async] (most-specific wins: route > group > this // default), so a server with mostly CPU routes + a few DB routes can - // keep this false and mark just the DB routes .Async(), or set this - // true and mark hot CPU routes .Async(false). On HTTP/2 the override - // is honored per-stream (sync routes run inline on the event loop, - // async routes dispatch to the worker pool). Note: celeris drivers - // opened WithEngine(srv) consult the server-level AsyncHandlers (not - // per-route overrides) for their auto-async path selection — set this - // true when using WithEngine drivers under per-route async. + // keep this false and mark just the DB routes .Async() / .UsesDriver(), + // or set this true and mark hot CPU routes .Async(false). On HTTP/2 the + // override is honored per-stream (sync routes run inline on the event + // loop, async routes dispatch to the worker pool). + // + // DRIVERS: celeris drivers opened WithEngine(srv) pick their netpoll-park + // fast path from the server's EFFECTIVE async state — true when this flag + // is set OR any route is .Async(). So "keep this false + mark DB routes + // .Async()/.UsesDriver()" selects the fast driver path too, PROVIDED the + // driver is opened AFTER those routes are registered (the effective state + // is read at driver construction); otherwise set this true. Setting this + // true also enables the adaptive safety net that auto-promotes any unmarked + // handler slower than ~300µs, at a small learning-phase cost that settles to + // zero for static routes. // // Default: false. AsyncHandlers bool @@ -177,34 +209,6 @@ type Config struct { // - non-nil false: force disabled, even on Protocol=Auto. Useful when // the engine intentionally only serves HTTP/1. EnableH2Upgrade *bool - - // WorkerScaling configures the dynamic worker scaler. As of v1.4.6 - // the scaler is DEFAULT-ON — leaving this field nil resolves to a - // zero-value [resource.WorkerScalingConfig], which activates the - // scaler with the data-validated defaults (Strategy=StartHigh, - // MinActive=max(2, NumCPU/2), TargetConnsPerWorker=20, - // Interval=250ms, ScaleUpStep=2, ScaleDownStep=1, - // ScaleDownHysteresis=1, ScaleDownIdleTicks=4). This matches the - // "just-works" public design intent already in place for the - // Engine=Adaptive and Protocol=Auto defaults. - // - // Pre-v1.4.6 behaviour (scaler always disabled unless explicitly - // configured) is achievable by passing a non-nil struct that - // effectively makes the scaler a no-op: - // - // WorkerScaling: &resource.WorkerScalingConfig{ - // MinActive: runtime.GOMAXPROCS(0), // pin at NumCPU; never scale down - // } - // - // Set to a non-nil pointer with custom values to override one or - // more defaults. See [resource.WorkerScalingConfig] for tuning. - // The scaler keeps connections-per-active-worker around the target - // ratio by pausing/resuming workers; this dramatically improves - // CQE/event batching at low/mid concurrency where the static - // numCPU default would otherwise lose 30-90 % CPU/req to under- - // batched syscalls. See PR #257 / issue #281 for the full - // rationale and benchmark data. - WorkerScaling *resource.WorkerScalingConfig } // EngineMetrics is a point-in-time snapshot of engine-level performance counters. @@ -250,23 +254,13 @@ func (c Config) toResourceConfig() resource.Config { if c.MaxConns > 0 { rc.Resources.MaxConns = c.MaxConns } + rc.Resources.WorkloadHint = resource.WorkloadHint(c.WorkloadHint) rc.MaxRequestBodySize = c.MaxRequestBodySize rc.AsyncHandlers = c.AsyncHandlers rc.OnExpectContinue = c.OnExpectContinue rc.OnConnect = c.OnConnect rc.OnDisconnect = c.OnDisconnect - // Dynamic worker scaler default-on (issue #281). A nil - // WorkerScaling field — i.e. the user did not configure it — now - // resolves to a zero-value struct so the scaler activates with - // the data-validated defaults. Opt-out is documented as setting - // MinActive=NumCPU (no-op scaler), which preserves backward - // compatibility for users who really do want pre-v1.4.6 behaviour. - if c.WorkerScaling != nil { - rc.WorkerScaling = c.WorkerScaling - } else { - rc.WorkerScaling = &resource.WorkerScalingConfig{} - } // h2c upgrade resolution. Nil → protocol-dependent default (Auto → true, // HTTP1/H2C → false). Non-nil → user override honored verbatim. diff --git a/context.go b/context.go index b0736478..c3f142aa 100644 --- a/context.go +++ b/context.go @@ -168,6 +168,19 @@ type Context struct { hostOverride string respHdrBuf [16][2]string // reusable buffer for response headers (avoids heap escape) + // respHdrScratch retains the heap backing array the moment a middleware + // stack pushes respHeaders past respHdrBuf's 16 slots (e.g. chain-fullstack: + // 15 user headers + content-type + content-length = 17). Reused across + // requests so a >16-header route reallocates once per pooled Context, not + // once per request. nil until the first overflow. + respHdrScratch [][2]string + // blobHdrScratch is the reused assembly buffer for Blob's response header + // list (content-type + content-length + user headers) when the total exceeds + // respHdrBuf's 16 slots. Without it, Blob allocates make([][2]string,0,total) + // on EVERY many-header response (chain-fullstack: 18 headers ⇒ the dominant + // per-request alloc + GC pressure). Reused so it reallocates once per pooled + // Context, not once per request. nil until the first many-header Blob. + blobHdrScratch [][2]string trustedNets []*net.IPNet @@ -545,26 +558,27 @@ func (c *Context) reset() { c.fullPath = "" c.statusCode = 200 if n := len(c.respHeaders); n > 0 { - // respHeaders shares the respHdrBuf backing array up to its - // fixed capacity (16). Middleware stacks that emit more than 16 - // response headers (kitchen_sink with secure + cache + ratelimit - // + etag etc. easily exceeds it) trigger append() to allocate a - // new backing array — respHeaders now has > 16 entries but - // respHdrBuf is still just 16. Without this clamp, - // `clear(c.respHdrBuf[:n])` panics with "slice bounds out of - // range [:N] with length 16" — the panic propagates through - // recoverAndRelease and aborts the iouring/epoll async handler - // AFTER WriteResponse has queued bytes into the per-conn - // writeBuf but BEFORE flushSend pushes them to the socket, so - // the client sees an empty response. std-engine escapes the - // damage because Go's net/http writes the response inline - // before the cleanup panic. - if n > len(c.respHdrBuf) { - n = len(c.respHdrBuf) + // When a middleware stack emits >16 response headers (secure + cache + // + ratelimit + etag + ... easily exceeds it), append() has moved + // respHeaders onto a heap backing array. Retain it as respHdrScratch + // and reuse it next request rather than dropping it (which forced a + // fresh ~576B alloc on every header-heavy request). The cap check is + // the correct discriminator: cap<=16 means respHeaders is still the + // inline respHdrBuf, so clear only its used prefix; cap>16 means a + // heap array, clear its full length. (This also removes the old clamp + // that papered over a clear(respHdrBuf[:n>16]) bounds panic.) + if cap(c.respHeaders) > len(c.respHdrBuf) { + clear(c.respHeaders) + c.respHdrScratch = c.respHeaders + } else { + clear(c.respHdrBuf[:n]) } - clear(c.respHdrBuf[:n]) } - c.respHeaders = c.respHdrBuf[:0] + if c.respHdrScratch != nil { + c.respHeaders = c.respHdrScratch[:0] + } else { + c.respHeaders = c.respHdrBuf[:0] + } c.written = false c.aborted = false c.bytesWritten = 0 diff --git a/context_response.go b/context_response.go index ffe673c6..7a5d749e 100644 --- a/context_response.go +++ b/context_response.go @@ -589,7 +589,14 @@ func (c *Context) Blob(code int, contentType string, data []byte) error { headers = append(headers, [2]string{"content-length", itoa(len(data))}) headers = append(headers, tmp[:nUser]...) } else { - headers = make([][2]string, 0, total) + // Reuse a per-Context scratch instead of allocating per request — the + // dominant chain-fullstack alloc (18 headers > respHdrBuf's 16) lived + // here. respHeaders never aliases blobHdrScratch (separate buffers; the + // append below copies the [2]string values). + if cap(c.blobHdrScratch) < total { + c.blobHdrScratch = make([][2]string, 0, total) + } + headers = c.blobHdrScratch[:0] headers = append(headers, [2]string{"content-type", ct}) headers = append(headers, [2]string{"content-length", itoa(len(data))}) headers = append(headers, c.respHeaders...) @@ -761,9 +768,13 @@ func (c *Context) SetResponseHeaders(headers [][2]string) { if len(headers) <= len(c.respHdrBuf) { copy(c.respHdrBuf[:len(headers)], headers) c.respHeaders = c.respHdrBuf[:len(headers)] + } else if cap(c.respHdrScratch) >= len(headers) { + c.respHeaders = c.respHdrScratch[:len(headers)] + copy(c.respHeaders, headers) } else { c.respHeaders = make([][2]string, len(headers)) copy(c.respHeaders, headers) + c.respHdrScratch = c.respHeaders } } diff --git a/context_test.go b/context_test.go index 98429d31..8f2be7a6 100644 --- a/context_test.go +++ b/context_test.go @@ -249,6 +249,73 @@ func TestContextResetWithOverflowedRespHeaders(t *testing.T) { } } +// TestContextRespHeaderOverflowReuseZeroAlloc locks in celeris#360: once a +// middleware stack pushes respHeaders past the inline 16-slot respHdrBuf, the +// grown heap backing array is retained as respHdrScratch and reused on every +// subsequent request — so a >16-header route allocates the backing array ONCE +// per pooled Context, not once per request (chain-fullstack: 17 headers). +func TestContextRespHeaderOverflowReuseZeroAlloc(t *testing.T) { + s, _ := newTestStream("GET", "/test") + defer s.Release() + c := acquireContext(s) + defer releaseContext(c) + + // 17 clean lowercase headers > respHdrBuf's 16 slots → forces the heap path. + hdrs := [][2]string{ + {"h00", "v"}, {"h01", "v"}, {"h02", "v"}, {"h03", "v"}, {"h04", "v"}, + {"h05", "v"}, {"h06", "v"}, {"h07", "v"}, {"h08", "v"}, {"h09", "v"}, + {"h10", "v"}, {"h11", "v"}, {"h12", "v"}, {"h13", "v"}, {"h14", "v"}, + {"h15", "v"}, {"h16", "v"}, + } + avg := testing.AllocsPerRun(500, func() { + for _, h := range hdrs { + c.SetHeader(h[0], h[1]) + } + c.reset() + }) + if avg != 0 { + t.Fatalf("overflowed respHeaders must reuse the scratch backing array: got %.2f allocs/op, want 0", avg) + } + if cap(c.respHdrScratch) < len(hdrs) { + t.Fatalf("respHdrScratch should retain a >=%d cap backing array, got cap=%d", len(hdrs), cap(c.respHdrScratch)) + } +} + +// nopRW is a no-op ResponseWriter so an alloc test can isolate the Context path +// from the mock writer's own header/body copies. +type nopRW struct{} + +func (nopRW) WriteResponse(_ *stream.Stream, _ int, _ [][2]string, _ []byte) error { return nil } + +// TestContextBlobManyHeadersZeroAlloc locks in the chain-fullstack fix: when a +// response carries more than the inline-buffer's headers (15 user + content-type +// + content-length = 17 > 16), Blob must reuse blobHdrScratch instead of +// allocating make([][2]string,0,total) every request — that was the dominant +// per-request allocation (≈77% of chain-fullstack allocs → GC pressure). +func TestContextBlobManyHeadersZeroAlloc(t *testing.T) { + s, _ := newTestStream("GET", "/test") + defer s.Release() + s.ResponseWriter = nopRW{} // the default mock writer allocates; isolate Blob + c := acquireContext(s) + defer releaseContext(c) + + // 15 user headers ⇒ total 17 > respHdrBuf's 16 ⇒ Blob's many-header path. + for i := 0; i < 15; i++ { + c.SetHeader("h"+strconv.Itoa(i), "v") + } + body := []byte("hello") + avg := testing.AllocsPerRun(500, func() { + c.written = false + _ = c.Blob(200, "application/json", body) + }) + if avg != 0 { + t.Fatalf("Blob many-header path must reuse blobHdrScratch: got %.2f allocs/op, want 0", avg) + } + if cap(c.blobHdrScratch) < 17 { + t.Fatalf("blobHdrScratch should retain a >=17 cap backing array, got cap=%d", cap(c.blobHdrScratch)) + } +} + func TestContextFullPathResetOnRelease(t *testing.T) { s, _ := newTestStream("GET", "/test") defer s.Release() diff --git a/engine/engine.go b/engine/engine.go index 295e375a..aeeaf6b2 100644 --- a/engine/engine.go +++ b/engine/engine.go @@ -43,24 +43,6 @@ type SwitchFreezer interface { UnfreezeSwitching() } -// WorkerScaler is implemented by engines that support per-worker pause/resume -// for dynamic capacity adjustment based on load. Used by the higher-level -// scaler in the adaptive engine to delegate worker activation to whichever -// sub-engine is currently active. Per-worker pause is asynchronous — the -// worker drains in-flight connections before going SUSPENDED. Resume wakes a -// suspended worker so it re-creates its listen socket and rejoins the -// SO_REUSEPORT group. -type WorkerScaler interface { - // NumWorkers returns the total worker pool size (max active count). - NumWorkers() int - // PauseWorker deactivates worker i. Asynchronous; returns immediately. - // Idempotent — pausing an already-paused worker is a no-op. - PauseWorker(i int) - // ResumeWorker reactivates worker i. Wakes the worker if SUSPENDED. - // Idempotent — resuming an active worker is a no-op. - ResumeWorker(i int) -} - // SendfileCapable is an optional interface implemented by engines that // support zero-copy file responses via sendfile(2). The H1 static-file // response path type-asserts the engine for it; engines that do not @@ -124,4 +106,27 @@ type EngineMetrics struct { //nolint:revive // user-approved name // how often the inline → goroutine handoff fires vs the pure-sync // inline fast path. AsyncPromotedConns uint64 + // Workers is the number of I/O workers (io_uring) or event loops + // (epoll) the engine is running. Static after Listen. The adaptive + // controller divides ActiveConnections by it to derive the + // conns-per-worker load signal that drives engine selection. + Workers int + // AcceptCount is the cumulative number of connections accepted by this + // engine since start. Together with elapsed time it yields the accept + // rate (new-connection arrival rate) used as a secondary load signal. + AcceptCount uint64 + // CloseCount is the cumulative number of connections closed by this + // engine since start. AcceptCount - CloseCount tracks the live count; + // a high close rate relative to accepts indicates short-lived + // churn-style connections. + CloseCount uint64 + // BytesRead is the cumulative number of payload bytes received from + // the network across all connections. Used with BytesWritten and + // RequestCount to derive the average bytes-per-request signal that + // suppresses io_uring selection for link-bound (large-payload) + // workloads where the engines tie. + BytesRead uint64 + // BytesWritten is the cumulative number of payload bytes sent to the + // network across all connections. See BytesRead. + BytesWritten uint64 } diff --git a/engine/epoll/engine.go b/engine/epoll/engine.go index 8096b699..a1f49d14 100644 --- a/engine/epoll/engine.go +++ b/engine/epoll/engine.go @@ -13,7 +13,6 @@ import ( "time" "github.com/goceleris/celeris/engine" - "github.com/goceleris/celeris/engine/scaler" "github.com/goceleris/celeris/internal/platform" "github.com/goceleris/celeris/protocol/h2/stream" "github.com/goceleris/celeris/resource" @@ -34,6 +33,17 @@ type Engine struct { // asyncPromoted counts inline → dispatch-goroutine promotions // across all loops (celeris #300). asyncPromoted atomic.Uint64 + // acceptCount / closeCount track cumulative connection lifecycle + // events; bytesRead / bytesWritten track cumulative payload bytes. + // All four feed the adaptive controller's load signals. Bytes are + // batched per-loop and flushed once per event-loop iteration + // (mirroring reqCount) to avoid hot-path cache-line bouncing; + // accepts/closes are infrequent so they increment directly like + // activeConns. + acceptCount atomic.Uint64 + closeCount atomic.Uint64 + bytesRead atomic.Uint64 + bytesWritten atomic.Uint64 } // asyncRoutes is the static AsyncRoutes count snapshotted at // construction from the handler's AsyncRouteCount (#300 G3). @@ -91,7 +101,9 @@ func (e *Engine) Listen(ctx context.Context) error { l := newLoop(i, cpus[i], e.handler, resolved, e.cfg, &e.metrics.reqCount, &e.metrics.activeConns, &e.metrics.errCount, - &e.metrics.asyncPromoted, &e.acceptPaused) + &e.metrics.asyncPromoted, &e.acceptPaused, + &e.metrics.acceptCount, &e.metrics.closeCount, + &e.metrics.bytesRead, &e.metrics.bytesWritten) e.loops[i] = l } e.mu.Unlock() @@ -126,16 +138,6 @@ func (e *Engine) Listen(ctx context.Context) error { ) } - // Dynamic loop scaler. Typed cfg.WorkerScaling takes precedence over - // env vars. Suppressed when wrapped by adaptive — adaptive runs ONE - // higher-level scaler that delegates to the active sub-engine. The - // algorithm itself lives in engine/scaler. - if !e.cfg.SkipBuiltinScaler { - if scalerCfg := scaler.Resolve(e.cfg, len(e.loops)); scalerCfg.Enabled { - go e.runScaler(innerCtx, scalerCfg, &e.metrics.activeConns) - } - } - <-ctx.Done() wg.Wait() return nil @@ -176,6 +178,11 @@ func (e *Engine) Metrics() engine.EngineMetrics { ErrorCount: e.metrics.errCount.Load(), AsyncRoutes: e.asyncRoutes, AsyncPromotedConns: e.metrics.asyncPromoted.Load(), + Workers: len(e.loops), + AcceptCount: e.metrics.acceptCount.Load(), + CloseCount: e.metrics.closeCount.Load(), + BytesRead: e.metrics.bytesRead.Load(), + BytesWritten: e.metrics.bytesWritten.Load(), } } @@ -250,7 +257,6 @@ func (e *Engine) ResumeAccept() error { var ( _ engine.Engine = (*Engine)(nil) _ engine.AcceptController = (*Engine)(nil) - _ engine.WorkerScaler = (*Engine)(nil) ) // Addr returns the bound listener address. diff --git a/engine/epoll/loop.go b/engine/epoll/loop.go index 72c8e591..58369767 100644 --- a/engine/epoll/loop.go +++ b/engine/epoll/loop.go @@ -77,11 +77,6 @@ type Loop struct { wake chan struct{} wakeMu sync.Mutex suspended atomic.Bool - // inactive is the per-worker pause flag used by the dynamic worker - // scaler. ORed with acceptPaused (which is engine-wide) when computing - // effective paused state. The scaler flips this to deactivate idle - // loops under low load and reactivate them under burst load. - inactive atomic.Bool // listenFDClosed signals that the loop has closed its listen FD in // response to acceptPaused=true. PauseAccept polls this so it only // returns once the SO_REUSEPORT group has actually shed this listener @@ -93,14 +88,20 @@ type Loop struct { // re-arms the signal. listenFDClosed atomic.Bool - reqCount *atomic.Uint64 - activeConns *atomic.Int64 - errCount *atomic.Uint64 - asyncPromoted *atomic.Uint64 // cumulative inline → dispatch promotions (#300) - reqBatch uint64 // batched request count, flushed to reqCount per iteration - tickCounter uint32 - consecutiveEmpty uint32 // consecutive iterations with no events (for adaptive timeout) - cachedNow int64 // cached time.Now().UnixNano(), refreshed once per events return + reqCount *atomic.Uint64 + activeConns *atomic.Int64 + errCount *atomic.Uint64 + asyncPromoted *atomic.Uint64 // cumulative inline → dispatch promotions (#300) + acceptCount *atomic.Uint64 // cumulative accepts (engine-wide, shared) + closeCount *atomic.Uint64 // cumulative closes (engine-wide, shared) + bytesRead *atomic.Uint64 // cumulative recv payload bytes (engine-wide, shared) + bytesWritten *atomic.Uint64 // cumulative send payload bytes (engine-wide, shared) + reqBatch uint64 // batched request count, flushed to reqCount per iteration + bytesReadBatch uint64 // batched recv bytes, flushed to bytesRead per iteration + bytesWrittenBatch uint64 // batched send bytes, flushed to bytesWritten per iteration + tickCounter uint32 + consecutiveEmpty uint32 // consecutive iterations with no events (for adaptive timeout) + cachedNow int64 // cached time.Now().UnixNano(), refreshed once per events return // fdCapDrops counts accepted fds that fell outside the l.conns table // (fd >= connTableSize) and were force-closed in acceptAll. Worker- @@ -148,7 +149,8 @@ type Loop struct { func newLoop(id, cpuID int, handler stream.Handler, resolved resource.ResolvedResources, cfg resource.Config, reqCount *atomic.Uint64, activeConns *atomic.Int64, errCount *atomic.Uint64, - asyncPromoted *atomic.Uint64, acceptPaused *atomic.Bool) *Loop { + asyncPromoted *atomic.Uint64, acceptPaused *atomic.Bool, + acceptCount, closeCount, bytesRead, bytesWritten *atomic.Uint64) *Loop { return &Loop{ id: id, @@ -167,8 +169,10 @@ func newLoop(id, cpuID int, handler stream.Handler, acceptPaused: acceptPaused, wake: make(chan struct{}), ready: make(chan error, 1), + // TCPNoDelay is omitted here: accepted sockets inherit TCP_NODELAY + // from the listen socket (set in createListenSocket), so per-accept + // ApplyFD doesn't need to re-issue it. sockOpts: sockopts.Options{ - TCPNoDelay: true, TCPQuickAck: true, SOBusyPoll: 50 * time.Microsecond, RecvBuf: resolved.SocketRecv, @@ -178,6 +182,10 @@ func newLoop(id, cpuID int, handler stream.Handler, activeConns: activeConns, errCount: errCount, asyncPromoted: asyncPromoted, + acceptCount: acceptCount, + closeCount: closeCount, + bytesRead: bytesRead, + bytesWritten: bytesWritten, h2cfg: conn.H2Config{ MaxConcurrentStreams: cfg.MaxConcurrentStreams, InitialWindowSize: cfg.InitialWindowSize, @@ -282,8 +290,7 @@ func (l *Loop) run(ctx context.Context) { // Cache the atomic load: ACTIVE→DRAINING and SUSPENDED→ACTIVE // branches both read it. Saves 1 atomic load per event-loop // iteration on the steady-state hot path. - // OR with the per-loop inactive flag (dynamic scaler). - paused := l.acceptPaused.Load() || l.inactive.Load() + paused := l.acceptPaused.Load() if l.listenFD >= 0 && paused { // Drain any pending accepts from the kernel's listen queue so // they get a clean shutdown (FIN) rather than the RST that @@ -465,7 +472,7 @@ func (l *Loop) run(ctx context.Context) { if mu := cs.detachMu; mu != nil { mu.Lock() } - err := flushWrites(cs) + err := l.flushWrites(cs, true) if err != nil { // Surface I/O failure to detached middleware before closing. if cs.h1State != nil && cs.h1State.OnError != nil { @@ -510,7 +517,7 @@ func (l *Loop) run(ctx context.Context) { if cs != nil && cs.h2State != nil && cs.h2State.WriteQueuePending() { cs.h2State.DrainWriteQueue(cs.writeFn) if cs.writePos < len(cs.writeBuf) { - if fErr := flushWrites(cs); fErr != nil { + if fErr := l.flushWrites(cs, true); fErr != nil { l.removeDirty(cs) l.closeConn(fd) } else if cs.writePos < len(cs.writeBuf) { @@ -531,6 +538,17 @@ func (l *Loop) run(ctx context.Context) { l.reqBatch = 0 } + // Flush batched payload-byte counters with the same per-iteration + // cadence as reqCount, for the same cache-line-contention reason. + if l.bytesReadBatch > 0 { + l.bytesRead.Add(l.bytesReadBatch) + l.bytesReadBatch = 0 + } + if l.bytesWrittenBatch > 0 { + l.bytesWritten.Add(l.bytesWrittenBatch) + l.bytesWrittenBatch = 0 + } + // Check connection timeouts. Default cadence is every 1024 iterations // (~100ms under load); when detached conns exist with idle deadlines // the gate tightens to every 32 iterations (~50ms idle wall time) @@ -549,10 +567,9 @@ func (l *Loop) run(ctx context.Context) { } // DRAINING → SUSPENDED: no listen socket, no connections, events processed. - // Combined paused: engine-wide OR per-loop (dynamic scaler). - if l.listenFD < 0 && l.connCount == 0 && (l.acceptPaused.Load() || l.inactive.Load()) { + if l.listenFD < 0 && l.connCount == 0 && l.acceptPaused.Load() { l.wakeMu.Lock() - if !l.acceptPaused.Load() && !l.inactive.Load() { + if !l.acceptPaused.Load() { l.wakeMu.Unlock() continue } @@ -661,6 +678,7 @@ func (l *Loop) acceptAll(ctx context.Context, now int64) { } cs.writeFn = l.makeWriteFn(cs) l.activeConns.Add(1) + l.acceptCount.Add(1) if l.cfg.OnConnect != nil { l.cfg.OnConnect(cs.remoteAddr) @@ -719,7 +737,7 @@ func (l *Loop) drainRead(fd int, now int64) { if mu := cs.detachMu; mu != nil { mu.Lock() } - _ = flushWrites(cs) + _ = l.flushWrites(cs, true) // Surface read failure to detached middleware (e.g. WS). if cs.h1State != nil && cs.h1State.OnError != nil { cs.h1State.OnError(err) @@ -734,7 +752,7 @@ func (l *Loop) drainRead(fd int, now int64) { if mu := cs.detachMu; mu != nil { mu.Lock() } - _ = flushWrites(cs) + _ = l.flushWrites(cs, true) // Surface peer-close (EOF) to detached middleware. if cs.h1State != nil && cs.h1State.OnError != nil { cs.h1State.OnError(errPeerClosed) @@ -747,6 +765,9 @@ func (l *Loop) drainRead(fd int, now int64) { } cs.lastActivity = now + // n > 0 here (the err and EOF cases returned above): bytes read on + // this iteration, into either cs.buf or the zero-copy bodyBuf. + l.bytesReadBatch += uint64(n) // Direct-into-bodyBuf completion path: we read straight into // H1State.bodyBuf; dispatch the handler if the body is full, @@ -769,7 +790,7 @@ func (l *Loop) drainRead(fd int, now int64) { if mu := cs.detachMu; mu != nil { mu.Lock() } - _ = flushWrites(cs) + _ = l.flushWrites(cs, true) if mu := cs.detachMu; mu != nil { mu.Unlock() } @@ -786,7 +807,7 @@ func (l *Loop) drainRead(fd int, now int64) { if mu := cs.detachMu; mu != nil { mu.Lock() } - _ = flushWrites(cs) + _ = l.flushWrites(cs, true) if mu := cs.detachMu; mu != nil { mu.Unlock() } @@ -797,7 +818,7 @@ func (l *Loop) drainRead(fd int, now int64) { if mu := cs.detachMu; mu != nil { mu.Lock() } - if err := flushWrites(cs); err != nil { + if err := l.flushWrites(cs, true); err != nil { if mu := cs.detachMu; mu != nil { mu.Unlock() } @@ -947,7 +968,7 @@ func (l *Loop) drainRead(fd int, now int64) { // skipped by the `continue` so we must flush explicitly here, // otherwise the client blocks forever waiting for the 101. if cs.writePos < len(cs.writeBuf) { - if fErr := flushWrites(cs); fErr != nil { + if fErr := l.flushWrites(cs, true); fErr != nil { l.closeConn(fd) return } @@ -1008,7 +1029,7 @@ func (l *Loop) drainRead(fd int, now int64) { if mu := cs.detachMu; mu != nil { mu.Lock() } - _ = flushWrites(cs) + _ = l.flushWrites(cs, true) cs.pendingBytes = 0 if cs.h1State != nil && cs.h1State.OnError != nil { cs.h1State.OnError(processErr) @@ -1029,7 +1050,7 @@ func (l *Loop) drainRead(fd int, now int64) { mu.Lock() } if csWritePending(cs) { - if fErr := flushWrites(cs); fErr != nil { + if fErr := l.flushWrites(cs, true); fErr != nil { if cs.h1State != nil && cs.h1State.OnError != nil { cs.h1State.OnError(fErr) } @@ -1123,6 +1144,7 @@ func (l *Loop) hijackConn(fd int) (net.Conn, error) { l.conns[fd] = nil l.connCount-- l.activeConns.Add(-1) + l.closeCount.Add(1) f := os.NewFile(uintptr(fd), "tcp") c, err := net.FileConn(f) @@ -1655,7 +1677,10 @@ func (l *Loop) runAsyncHandler(cs *connState) { // flushWrites may partially complete; residual bytes // stay in cs.writeBuf and the worker retries via // markDirty once drainDetachQueue picks us up. - if err := flushWrites(cs); err != nil { + // Dispatch-goroutine call site: not the loop thread, so + // byte accounting must hit the shared atomic, not the + // per-loop batch (false → atomic path). + if err := l.flushWrites(cs, false); err != nil { promoteErr = err } } @@ -1738,7 +1763,8 @@ func (l *Loop) runAsyncHandler(cs *connState) { // race the middleware's own write path. if !errors.Is(processErr, conn.ErrHijacked) && (cs.writePos < len(cs.writeBuf) || len(cs.bodyBuf) > 0) { - flushErr = flushWrites(cs) + // Dispatch-goroutine call site (false → shared-atomic byte path). + flushErr = l.flushWrites(cs, false) } // Resync pendingBytes with actual buffer state. makeWriteFn uses // pendingBytes to enforce writeCap backpressure; without this @@ -1945,7 +1971,7 @@ func (l *Loop) handleWritable(cs *connState) { if mu := cs.detachMu; mu != nil { mu.Lock() } - err := flushWrites(cs) + err := l.flushWrites(cs, true) drained := err == nil && !csWritePending(cs) if err == nil { if drained { @@ -2238,6 +2264,7 @@ func (l *Loop) closeConn(fd int) { l.conns[fd] = nil l.connCount-- l.activeConns.Add(-1) + l.closeCount.Add(1) if l.cfg.OnDisconnect != nil { l.cfg.OnDisconnect(cs.remoteAddr) @@ -2379,6 +2406,10 @@ func createListenSocket(addr string) (int, error) { return -1, err } + // TCP_NODELAY on the listen socket: Linux copies it onto every accepted + // socket at SYN time, so per-accept ApplyFD can skip its own NODELAY + // setsockopt (one fewer syscall per accept on the hot path). + _ = unix.SetsockoptInt(fd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) // TCP_DEFER_ACCEPT: kernel holds connections until data arrives, // eliminating wasted accept+wait cycles for idle connections. _ = unix.SetsockoptInt(fd, unix.IPPROTO_TCP, unix.TCP_DEFER_ACCEPT, 1) diff --git a/engine/epoll/nodelay_inherit_linux_test.go b/engine/epoll/nodelay_inherit_linux_test.go new file mode 100644 index 00000000..b6d7bcdf --- /dev/null +++ b/engine/epoll/nodelay_inherit_linux_test.go @@ -0,0 +1,92 @@ +//go:build linux + +package epoll + +import ( + "net" + "testing" + "time" + + "golang.org/x/sys/unix" +) + +// TestListenSocketHasNoDelay asserts createListenSocket enables TCP_NODELAY on +// the listen socket itself — the precondition for accepted sockets inheriting +// it (#337). +func TestListenSocketHasNoDelay(t *testing.T) { + lfd, err := createListenSocket("127.0.0.1:0") + if err != nil { + t.Skipf("listen socket unavailable: %v", err) + } + t.Cleanup(func() { _ = unix.Close(lfd) }) + + v, err := unix.GetsockoptInt(lfd, unix.IPPROTO_TCP, unix.TCP_NODELAY) + if err != nil { + t.Fatalf("getsockopt TCP_NODELAY: %v", err) + } + if v == 0 { + t.Fatal("listen socket TCP_NODELAY not set") + } +} + +// TestAcceptedConnInheritsNoDelay verifies accepted sockets carry TCP_NODELAY +// without acceptAll re-issuing the per-accept setsockopt. The kernel copies +// TCP_NODELAY from the listen socket at SYN time; this guards that the +// per-accept NODELAY removal (#337) leaves accepted conns nagle-disabled. +func TestAcceptedConnInheritsNoDelay(t *testing.T) { + lfd, err := createListenSocket("127.0.0.1:0") + if err != nil { + t.Skipf("listen socket unavailable: %v", err) + } + t.Cleanup(func() { _ = unix.Close(lfd) }) + + la := boundAddr(lfd) + if la == nil { + t.Skip("could not resolve bound addr") + } + tcpAddr, ok := la.(*net.TCPAddr) + if !ok { + t.Skipf("unexpected addr type %T", la) + } + + cfd, err := unix.Socket(unix.AF_INET, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) + if err != nil { + t.Skipf("client socket unavailable: %v", err) + } + t.Cleanup(func() { _ = unix.Close(cfd) }) + + var sa unix.SockaddrInet4 + sa.Port = tcpAddr.Port + copy(sa.Addr[:], tcpAddr.IP.To4()) + if err := unix.Connect(cfd, &sa); err != nil { + t.Fatalf("connect: %v", err) + } + + // TCP_DEFER_ACCEPT is set on the listener, so the connection only becomes + // acceptable once data arrives — send a byte before accepting. + if _, err := unix.Write(cfd, []byte{'x'}); err != nil { + t.Fatalf("write: %v", err) + } + + var afd int + for i := 0; i < 200; i++ { + afd, _, err = unix.Accept4(lfd, unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC) + if err == unix.EAGAIN { + time.Sleep(time.Millisecond) + continue + } + break + } + if err != nil { + t.Fatalf("accept: %v", err) + } + t.Cleanup(func() { _ = unix.Close(afd) }) + + v, err := unix.GetsockoptInt(afd, unix.IPPROTO_TCP, unix.TCP_NODELAY) + if err != nil { + t.Fatalf("getsockopt TCP_NODELAY on accepted fd: %v", err) + } + if v == 0 { + t.Fatal("accepted socket did not inherit TCP_NODELAY from listener") + } +} diff --git a/engine/epoll/review_v150_test.go b/engine/epoll/review_v150_test.go index 64538b57..745c2b0a 100644 --- a/engine/epoll/review_v150_test.go +++ b/engine/epoll/review_v150_test.go @@ -80,11 +80,15 @@ func TestCheckTimeoutsClosesAllExpired(t *testing.T) { t.Cleanup(func() { _ = unix.Close(epfd) }) l := &Loop{ - epollFD: epfd, - conns: make([]*connState, 1024), - liveConns: make([]int, 0, 8), - activeConns: &atomic.Int64{}, - cfg: resource.Config{IdleTimeout: time.Second}, + epollFD: epfd, + conns: make([]*connState, 1024), + liveConns: make([]int, 0, 8), + activeConns: &atomic.Int64{}, + closeCount: &atomic.Uint64{}, + acceptCount: &atomic.Uint64{}, + bytesRead: &atomic.Uint64{}, + bytesWritten: &atomic.Uint64{}, + cfg: resource.Config{IdleTimeout: time.Second}, } past := time.Now().Add(-time.Hour).UnixNano() @@ -168,17 +172,21 @@ func TestAcceptAllDrainsBacklogNoStrand(t *testing.T) { } l := &Loop{ - epollFD: epfd, - listenFD: lfd, - conns: make([]*connState, connTableSize), - liveConns: make([]int, 0, 256), - activeConns: &atomic.Int64{}, - errCount: &atomic.Uint64{}, - reqCount: &atomic.Uint64{}, - eventFD: -1, - timerFD: -1, - resolved: resource.ResolvedResources{BufferSize: 8192}, - cfg: resource.Config{}, + epollFD: epfd, + listenFD: lfd, + conns: make([]*connState, connTableSize), + liveConns: make([]int, 0, 256), + activeConns: &atomic.Int64{}, + errCount: &atomic.Uint64{}, + reqCount: &atomic.Uint64{}, + acceptCount: &atomic.Uint64{}, + closeCount: &atomic.Uint64{}, + bytesRead: &atomic.Uint64{}, + bytesWritten: &atomic.Uint64{}, + eventFD: -1, + timerFD: -1, + resolved: resource.ResolvedResources{BufferSize: 8192}, + cfg: resource.Config{}, } const want = 200 @@ -261,12 +269,16 @@ func TestHijackDefersReleaseWhileAsyncGoroutineActive(t *testing.T) { } l := &Loop{ - epollFD: epfd, - conns: make([]*connState, connTableSize), - liveConns: make([]int, 0, 8), - activeConns: &atomic.Int64{}, - eventFD: -1, - cfg: resource.Config{}, + epollFD: epfd, + conns: make([]*connState, connTableSize), + liveConns: make([]int, 0, 8), + activeConns: &atomic.Int64{}, + closeCount: &atomic.Uint64{}, + acceptCount: &atomic.Uint64{}, + bytesRead: &atomic.Uint64{}, + bytesWritten: &atomic.Uint64{}, + eventFD: -1, + cfg: resource.Config{}, } // Async-mode conn: detachMu set, dispatch goroutine "alive" (asyncRun). @@ -345,12 +357,16 @@ func TestHijackSyncReleasesImmediately(t *testing.T) { } l := &Loop{ - epollFD: epfd, - conns: make([]*connState, connTableSize), - liveConns: make([]int, 0, 8), - activeConns: &atomic.Int64{}, - eventFD: -1, - cfg: resource.Config{}, + epollFD: epfd, + conns: make([]*connState, connTableSize), + liveConns: make([]int, 0, 8), + activeConns: &atomic.Int64{}, + closeCount: &atomic.Uint64{}, + acceptCount: &atomic.Uint64{}, + bytesRead: &atomic.Uint64{}, + bytesWritten: &atomic.Uint64{}, + eventFD: -1, + cfg: resource.Config{}, } // Sync mode: detachMu nil, no dispatch goroutine. cs := &connState{fd: local, liveIdx: -1} diff --git a/engine/epoll/scaler.go b/engine/epoll/scaler.go deleted file mode 100644 index f8e948ad..00000000 --- a/engine/epoll/scaler.go +++ /dev/null @@ -1,62 +0,0 @@ -//go:build linux - -package epoll - -import ( - "context" - "log/slog" - "sync/atomic" - - "github.com/goceleris/celeris/engine/scaler" -) - -// epollScalerSource adapts the epoll Engine to the scaler.Source -// interface. Generation always returns 0 — epoll isn't a meta-engine. -type epollScalerSource struct { - e *Engine - activeConns *atomic.Int64 -} - -func (s *epollScalerSource) NumWorkers() int { return s.e.NumWorkers() } -func (s *epollScalerSource) ActiveConns() int64 { return s.activeConns.Load() } -func (s *epollScalerSource) PauseWorker(i int) { s.e.PauseWorker(i) } -func (s *epollScalerSource) ResumeWorker(i int) { s.e.ResumeWorker(i) } -func (s *epollScalerSource) Generation() uint64 { return 0 } -func (s *epollScalerSource) Logger() *slog.Logger { return s.e.cfg.Logger } - -// runScaler is started by Engine.Listen when scaler config is enabled. -// All algorithm logic lives in engine/scaler. -func (e *Engine) runScaler(ctx context.Context, cfg scaler.Config, activeConns *atomic.Int64) { - scaler.Run(ctx, &epollScalerSource{e: e, activeConns: activeConns}, cfg) -} - -// PauseWorker deactivates loop i. The loop drains in-flight conns and -// goes SUSPENDED. Asynchronous; returns immediately. -func (e *Engine) PauseWorker(i int) { - e.mu.Lock() - defer e.mu.Unlock() - if i < 0 || i >= len(e.loops) { - return - } - e.loops[i].inactive.Store(true) -} - -// ResumeWorker reactivates loop i. Wakes the loop from SUSPENDED if it was -// already idle. -func (e *Engine) ResumeWorker(i int) { - e.mu.Lock() - defer e.mu.Unlock() - if i < 0 || i >= len(e.loops) { - return - } - l := e.loops[i] - l.inactive.Store(false) - l.listenFDClosed.Store(false) - l.wakeMu.Lock() - if l.suspended.Load() { - close(l.wake) - l.wake = make(chan struct{}) - l.suspended.Store(false) - } - l.wakeMu.Unlock() -} diff --git a/engine/epoll/scaler_test.go b/engine/epoll/scaler_test.go deleted file mode 100644 index ee7507ac..00000000 --- a/engine/epoll/scaler_test.go +++ /dev/null @@ -1,8 +0,0 @@ -//go:build linux - -package epoll - -// The scaler algorithm itself lives in engine/scaler and has its own -// test suite there. The epoll engine's scaler.go is just the -// engine.WorkerScaler adapter (PauseWorker / ResumeWorker on loops -// slice). diff --git a/engine/epoll/writer.go b/engine/epoll/writer.go index 75f58dc6..7f4526f4 100644 --- a/engine/epoll/writer.go +++ b/engine/epoll/writer.go @@ -16,16 +16,33 @@ import ( // writev(2) with iovec = [writeBuf[writePos:], bodyBuf]. Saves one // full body-sized memcpy per request compared to appending the body // into writeBuf first. -func flushWrites(cs *connState) error { +// onLoopThread distinguishes the worker (event-loop) call sites from the +// async-dispatch-goroutine ones. Loop-thread bytes accumulate in the +// non-atomic per-loop batch (flushed once per iteration, like reqBatch); +// goroutine bytes go straight to the shared atomic. Keeping the two paths +// apart is what makes the batch field worker-thread-only and therefore +// race-free — the dispatch goroutine never touches bytesWrittenBatch. +func (l *Loop) addWrittenBytes(n int, onLoopThread bool) { + if n <= 0 { + return + } + if onLoopThread { + l.bytesWrittenBatch += uint64(n) + return + } + l.bytesWritten.Add(uint64(n)) +} + +func (l *Loop) flushWrites(cs *connState, onLoopThread bool) error { if len(cs.bodyBuf) > 0 { - return flushWritesV(cs) + return l.flushWritesV(cs, onLoopThread) } if cs.writePos >= len(cs.writeBuf) { cs.writeBuf = cs.writeBuf[:0] cs.writePos = 0 // writeBuf is empty — if a zero-copy sendfile response is pending, // drive it now (after any prior pipelined writeBuf bytes flushed). - return flushSendfile(cs) + return l.flushSendfile(cs, onLoopThread) } n, err := unix.Write(cs.fd, cs.writeBuf[cs.writePos:]) if err != nil { @@ -36,6 +53,7 @@ func flushWrites(cs *connState) error { cs.writePos = 0 return err } + l.addWrittenBytes(n, onLoopThread) cs.writePos += n if cs.writePos >= len(cs.writeBuf) { // Fully flushed @@ -43,7 +61,7 @@ func flushWrites(cs *connState) error { cs.writePos = 0 // writeBuf drained this call — continue into the sendfile body if a // sendfile response is pending. Keeps header→body ordering correct. - return flushSendfile(cs) + return l.flushSendfile(cs, onLoopThread) } else if cs.writePos > cap(cs.writeBuf)/2 { // Amortized compaction: only copy when more than half the buffer is consumed. // This reduces the average cost from O(n) per partial write to O(1) amortized. @@ -66,12 +84,13 @@ func flushWrites(cs *connState) error { // the writeBuf path above. On a real I/O error the file is released and // the error is surfaced (the caller closes the conn). On completion the // dup'd file is closed and cs.sendfile cleared. -func flushSendfile(cs *connState) error { +func (l *Loop) flushSendfile(cs *connState, onLoopThread bool) error { st := cs.sendfile if st == nil { return nil } - _, err := st.advance(cs.fd) + sent, err := st.advance(cs.fd) + l.addWrittenBytes(int(sent), onLoopThread) if err != nil { if err == unix.EAGAIN || err == unix.EWOULDBLOCK { return nil // send buffer full; resume on next writable edge @@ -120,7 +139,7 @@ func csPendingBytes(cs *connState) int { // is non-nil; clears bodyBuf once fully sent. Partial writev collapses // the remainder into writeBuf so the next flushWrites call uses the // plain write path. -func flushWritesV(cs *connState) error { +func (l *Loop) flushWritesV(cs *connState, onLoopThread bool) error { headerRem := cs.writeBuf[cs.writePos:] iovs := [2][]byte{headerRem, cs.bodyBuf} n, err := unix.Writev(cs.fd, iovs[:]) @@ -133,6 +152,7 @@ func flushWritesV(cs *connState) error { cs.bodyBuf = nil return err } + l.addWrittenBytes(n, onLoopThread) total := len(headerRem) + len(cs.bodyBuf) switch { case n >= total: diff --git a/engine/iouring/async_revert_test.go b/engine/iouring/async_revert_test.go new file mode 100644 index 00000000..c298e2cd --- /dev/null +++ b/engine/iouring/async_revert_test.go @@ -0,0 +1,243 @@ +//go:build linux + +package iouring + +import ( + "bufio" + "context" + "fmt" + "io" + "net" + "strings" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/goceleris/celeris/engine" + "github.com/goceleris/celeris/protocol/h2/stream" + "github.com/goceleris/celeris/resource" +) + +// revertResolverHandler is an AsyncRouteResolver whose async classification is +// flipped at runtime, simulating celeris#356 route promotion (true) and the +// celeris#364 TTL de-promotion (false) without waiting on the real router TTL. +type revertResolverHandler struct{ async atomic.Bool } + +func (h *revertResolverHandler) HandleStream(_ context.Context, s *stream.Stream) error { + if s.ResponseWriter == nil { + return nil + } + return s.ResponseWriter.WriteResponse(s, 200, + [][2]string{{"content-type", "text/plain"}, {"content-length", "2"}}, []byte("ok")) +} +func (h *revertResolverHandler) RouteAsync(_, _ string) bool { return h.async.Load() } +func (h *revertResolverHandler) HasAsyncRoutes() bool { return true } + +func startRevertEngine(t *testing.T, h stream.Handler) *Engine { + t.Helper() + ln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("pick port: %v", err) + } + addr := ln.Addr().String() + _ = ln.Close() + + e, err := New(resource.Config{ + Addr: addr, + Protocol: engine.HTTP1, + Resources: resource.Resources{Workers: 4}, + AsyncHandlers: true, + }, h) + if err != nil { + t.Skipf("iouring engine unavailable: %v", err) + } + ctx, cancel := context.WithCancel(t.Context()) + errCh := make(chan error, 1) + go func() { errCh <- e.Listen(ctx) }() + t.Cleanup(func() { + cancel() + select { + case <-errCh: + case <-time.After(3 * time.Second): + } + }) + dl := time.Now().Add(30 * time.Second) + for time.Now().Before(dl) && e.Addr() == nil { + select { + case err := <-errCh: + if err != nil && (strings.Contains(err.Error(), "cannot allocate memory") || + strings.Contains(err.Error(), "io_uring_setup") || strings.Contains(err.Error(), "tier")) { + t.Skipf("io_uring unavailable on this runner: %v", err) + } + t.Fatalf("engine.Listen returned early: %v", err) + default: + } + time.Sleep(10 * time.Millisecond) + } + if e.Addr() == nil { + t.Fatal("engine did not bind in time") + } + return e +} + +// sendKeepAlive issues one keep-alive GET on an existing conn and reads the +// full 2-byte "ok" response, leaving the conn open for the next request. +func sendKeepAlive(c net.Conn, br *bufio.Reader, timeout time.Duration) error { + _ = c.SetDeadline(time.Now().Add(timeout)) + if _, err := c.Write([]byte("GET /x HTTP/1.1\r\nHost: x\r\n\r\n")); err != nil { + return fmt.Errorf("write: %w", err) + } + statusLine, err := br.ReadString('\n') + if err != nil { + return fmt.Errorf("read status: %w", err) + } + if len(statusLine) < 12 || statusLine[9:12] != "200" { + return fmt.Errorf("bad status: %q", statusLine) + } + for { + line, err := br.ReadString('\n') + if err != nil { + return fmt.Errorf("read header: %w", err) + } + if line == "\r\n" || line == "\n" { + break + } + } + body := make([]byte, 2) + if _, err := io.ReadFull(br, body); err != nil { + return fmt.Errorf("read body: %w", err) + } + if string(body) != "ok" { + return fmt.Errorf("bad body: %q", body) + } + return nil +} + +// TestAsyncConnRevertsOnRouteDepromotion is the celeris#364 conn-level revert +// regression: a keep-alive conn promoted to async dispatch must return to the +// inline fast path once its route de-promotes (RouteAsync flips false), proven +// by its ability to RE-promote afterwards (a still-async conn cannot re-promote +// — it never re-runs the inline ErrAsyncDispatch gate). +func TestAsyncConnRevertsOnRouteDepromotion(t *testing.T) { + h := &revertResolverHandler{} + h.async.Store(true) + e := startRevertEngine(t, h) + target := e.Addr().String() + + c, err := net.DialTimeout("tcp", target, 2*time.Second) + if err != nil { + t.Fatalf("dial: %v", err) + } + defer func() { _ = c.Close() }() + br := bufio.NewReader(c) + send := func(phase string) { + if err := sendKeepAlive(c, br, 2*time.Second); err != nil { + t.Fatalf("%s request failed: %v", phase, err) + } + } + + // Phase A — RouteAsync=true: the conn promotes to async dispatch. + for i := 0; i < 20; i++ { + send("promote") + } + p1 := e.Metrics().AsyncPromotedConns + if p1 == 0 { + t.Fatal("conn never promoted under RouteAsync=true") + } + + // Phase B — RouteAsync=false: the dispatch goroutine must revert the conn + // to inline at its next idle point. Requests must keep succeeding. + h.async.Store(false) + for i := 0; i < 30; i++ { + send("revert") + time.Sleep(time.Millisecond) + } + + // Phase C — RouteAsync=true again: a reverted (now inline) conn re-promotes; + // a conn still stuck async would NOT, so AsyncPromotedConns must increase. + h.async.Store(true) + for i := 0; i < 30; i++ { + send("repromote") + } + p2 := e.Metrics().AsyncPromotedConns + if p2 <= p1 { + t.Fatalf("conn did not revert+re-promote (still stuck async): promoted before=%d after=%d", p1, p2) + } + t.Logf("promotions: after-A=%d after-C=%d (revert confirmed by re-promotion)", p1, p2) +} + +// TestAsyncConnRevertRace hammers the promote/feed/revert/re-promote interaction +// across many keep-alive conns with RouteAsync flipping continuously. Run under +// -race it validates that the worker recv path and the dispatch goroutine never +// race on cs.asyncPromoted / asyncInBuf during a revert. Gated on -short. +func TestAsyncConnRevertRace(t *testing.T) { + if testing.Short() { + t.Skip("revert race test needs sustained toggling load; -short skips it") + } + h := &revertResolverHandler{} + h.async.Store(true) + e := startRevertEngine(t, h) + target := e.Addr().String() + + const ( + concurrency = 64 + duration = 10 * time.Second + ) + stop := make(chan struct{}) + // Toggler: flip the route's async classification fast enough that conns are + // constantly promoting and reverting. + go func() { + tk := time.NewTicker(3 * time.Millisecond) + defer tk.Stop() + for { + select { + case <-stop: + return + case <-tk.C: + h.async.Store(!h.async.Load()) + } + } + }() + + deadline := time.Now().Add(duration) + var ok, failed atomic.Int64 + var wg sync.WaitGroup + for i := 0; i < concurrency; i++ { + wg.Add(1) + go func() { + defer wg.Done() + c, err := net.DialTimeout("tcp", target, 2*time.Second) + if err != nil { + failed.Add(1) + return + } + defer func() { _ = c.Close() }() + br := bufio.NewReader(c) + for time.Now().Before(deadline) { + if err := sendKeepAlive(c, br, 2*time.Second); err != nil { + failed.Add(1) + return // conn broken; a corrupted response would surface here + } + ok.Add(1) + } + }() + } + wg.Wait() + close(stop) + + if ok.Load() < 1000 { + t.Fatalf("too few successful requests (ok=%d failed=%d) — server may have stalled", ok.Load(), failed.Load()) + } + // Liveness epilogue. + h.async.Store(false) + c, err := net.DialTimeout("tcp", target, 2*time.Second) + if err != nil { + t.Fatalf("engine not serving after revert churn: %v", err) + } + defer func() { _ = c.Close() }() + if err := sendKeepAlive(c, bufio.NewReader(c), 2*time.Second); err != nil { + t.Fatalf("engine not serving after revert churn: %v", err) + } + t.Logf("ok=%d failed=%d over %s", ok.Load(), failed.Load(), duration) +} diff --git a/engine/iouring/buf_ring_scale_test.go b/engine/iouring/buf_ring_scale_test.go index e535d782..6aa5a5fe 100644 --- a/engine/iouring/buf_ring_scale_test.go +++ b/engine/iouring/buf_ring_scale_test.go @@ -16,7 +16,7 @@ func TestResolveBufRingCountDefaults(t *testing.T) { targetConns int want int }{ - // Formula: 2 * TargetConnsPerWorker, rounded up to a power of 2, + // Formula: 2 * connsPerWorker, rounded up to a power of 2, // clamped to [bufRingCountMin=1024, bufRingCountMax]. The ring is // PER-WORKER, so the result MUST NOT depend on Workers (celeris#322 // follow-up — the prior 2*Workers*target over-sized every worker's diff --git a/engine/iouring/check_timeouts_test.go b/engine/iouring/check_timeouts_test.go index 50178e81..8d0c6512 100644 --- a/engine/iouring/check_timeouts_test.go +++ b/engine/iouring/check_timeouts_test.go @@ -65,6 +65,7 @@ func TestCheckTimeoutsSwapRemoveNoSkip(t *testing.T) { liveConns: make([]int, 0, n), errCount: &atomic.Uint64{}, activeConns: &atomic.Int64{}, + closeCount: &atomic.Uint64{}, cfg: resource.Config{IdleTimeout: time.Second}, } past := time.Now().Add(-time.Hour).UnixNano() diff --git a/engine/iouring/conn.go b/engine/iouring/conn.go index 06a8476f..4521400d 100644 --- a/engine/iouring/conn.go +++ b/engine/iouring/conn.go @@ -149,10 +149,20 @@ type connState struct { asyncClosed atomic.Bool // asyncPromoted is set once an async-marked route has been observed // on this conn while it ran inline on the worker (per-handler async, - // celeris #300). Sticky: once promoted every subsequent recv goes - // straight to the dispatch goroutine instead of retrying the inline - // fast path. Reset on release. - asyncPromoted bool + // celeris #300). Once promoted, recv goes to the dispatch goroutine. + // REVERSIBLE (celeris#364): the dispatch goroutine clears it to revert + // the conn to inline when the promoting route de-promotes. Atomic because + // the worker reads it on the recv hot path while the goroutine may clear + // it; the worker re-reads it under asyncInMu before feeding to close the + // feed-vs-revert race. Reset on release. + asyncPromoted atomic.Bool + // promotedMethod/promotedPath record the route that forced this conn's + // promotion (celeris#364). Written by the worker before the dispatch + // goroutine starts (happens-before), read by the goroutine to decide + // revert. Empty => not revert-eligible (e.g. promoted for a buffered + // partial-header / chunked continuation, where no full route is known). + promotedMethod string + promotedPath string // asyncH2Promoted signals the worker that runAsyncHandler observed // ErrUpgradeH2C and completed the cs-local H1→H2 state swap under // detachMu. drainDetachQueue finishes the promotion by appending @@ -304,7 +314,9 @@ func releaseConnState(cs *connState) { cs.asyncOutBuf = cs.asyncOutBuf[:0] cs.asyncRun = false cs.asyncClosed.Store(false) - cs.asyncPromoted = false + cs.asyncPromoted.Store(false) + cs.promotedMethod = "" + cs.promotedPath = "" cs.asyncDetachUnlocked = false cs.asyncDetachPending = false cs.bodyBuf = nil diff --git a/engine/iouring/consts.go b/engine/iouring/consts.go index 3ff11afd..a8e24182 100644 --- a/engine/iouring/consts.go +++ b/engine/iouring/consts.go @@ -41,8 +41,8 @@ const ( // opPROVIDEBUFFERS = 31 removed in v1.5.0 (celeris#320) — the legacy // BufferGroup / PROVIDE_BUFFERS SQE was never wired into the engine. // The supported path is IORING_REGISTER_PBUF_RING (BufferRing above). - opSHUTDOWN = 52 // IORING_OP_SHUTDOWN (kernel 5.11+) - opSENDZC = 53 // IORING_OP_SEND_ZC (kernel 6.0+) + opSHUTDOWN = 34 // IORING_OP_SHUTDOWN (kernel 5.11+) + opSENDZC = 47 // IORING_OP_SEND_ZC (kernel 6.0+) ) // SQE flags. @@ -57,7 +57,7 @@ const ( const ( cqeFBuffer = 1 << 0 cqeFMore = 1 << 1 - cqeFNotif = 1 << 2 // IORING_CQE_F_NOTIF: zero-copy send notification + cqeFNotif = 1 << 3 // IORING_CQE_F_NOTIF: zero-copy send notification ) // Accept flags. @@ -104,3 +104,11 @@ const ( sqeSize = 64 cqeSize = 16 ) + +// sendZCMinBytes is the payload threshold below which SEND_ZC is not worth its +// cost. Zero-copy adds a second (NOTIF) CQE per send and holds zcNotifPending +// across the buffer's DMA lifetime, stalling the next flush; it also forfeits +// the SEND→RECV link on the keep-alive fast path. Only above this size does the +// avoided memcpy outweigh the extra completion, so small responses use a plain +// linked SEND (1 CQE, immediate buffer reuse). +const sendZCMinBytes = 4096 diff --git a/engine/iouring/engine.go b/engine/iouring/engine.go index 722c64bf..2b4f67e2 100644 --- a/engine/iouring/engine.go +++ b/engine/iouring/engine.go @@ -12,7 +12,6 @@ import ( "time" "github.com/goceleris/celeris/engine" - "github.com/goceleris/celeris/engine/scaler" "github.com/goceleris/celeris/internal/platform" "github.com/goceleris/celeris/probe" "github.com/goceleris/celeris/protocol/h2/stream" @@ -36,6 +35,17 @@ type Engine struct { // asyncPromoted counts the cumulative inline → dispatch-goroutine // promotions across all workers (celeris #300). asyncPromoted atomic.Uint64 + // acceptCount / closeCount track cumulative connection lifecycle + // events; bytesRead / bytesWritten track cumulative payload bytes. + // All four feed the adaptive controller's load signals. Bytes are + // batched per-worker and flushed once per event-loop iteration + // (mirroring reqCount) to avoid hot-path cache-line bouncing; + // accepts/closes are infrequent so they increment directly like + // activeConns. + acceptCount atomic.Uint64 + closeCount atomic.Uint64 + bytesRead atomic.Uint64 + bytesWritten atomic.Uint64 } // asyncRoutes is cached from the handler's HasAsyncRoutes/route count // at construction so Metrics() doesn't pay the type-assertion per @@ -235,16 +245,6 @@ func (e *Engine) Listen(ctx context.Context) error { ) } - // Dynamic worker scaler. Typed cfg.WorkerScaling takes precedence over - // env vars. Suppressed when wrapped by adaptive — adaptive runs ONE - // higher-level scaler that delegates to the active sub-engine. The - // algorithm itself lives in engine/scaler; this is just the call site. - if !e.cfg.SkipBuiltinScaler { - if scalerCfg := scaler.Resolve(e.cfg, len(workers)); scalerCfg.Enabled { - go e.runScaler(innerCtx, scalerCfg, &e.metrics.activeConns) - } - } - <-ctx.Done() // Workers use SubmitAndWaitTimeout and check ctx.Err() on each iteration, // so they will exit within ~100ms of context cancellation. @@ -259,7 +259,9 @@ func (e *Engine) createWorkers(tier TierStrategy, cpus []int, w, err := newWorker(i, cpus[i], tier, e.handler, resolved, e.cfg, &e.metrics.reqCount, &e.metrics.activeConns, &e.metrics.errCount, - &e.metrics.asyncPromoted, &e.acceptPaused) + &e.metrics.asyncPromoted, &e.acceptPaused, + &e.metrics.acceptCount, &e.metrics.closeCount, + &e.metrics.bytesRead, &e.metrics.bytesWritten) if err != nil { // Clean up already-created workers. for _, prev := range workers[:i] { @@ -314,6 +316,11 @@ func (e *Engine) Metrics() engine.EngineMetrics { ErrorCount: e.metrics.errCount.Load(), AsyncRoutes: e.asyncRoutes, AsyncPromotedConns: e.metrics.asyncPromoted.Load(), + Workers: len(e.workers), + AcceptCount: e.metrics.acceptCount.Load(), + CloseCount: e.metrics.closeCount.Load(), + BytesRead: e.metrics.bytesRead.Load(), + BytesWritten: e.metrics.bytesWritten.Load(), } } @@ -388,7 +395,6 @@ var ( _ engine.Engine = (*Engine)(nil) _ engine.AcceptController = (*Engine)(nil) _ engine.EventLoopProvider = (*Engine)(nil) - _ engine.WorkerScaler = (*Engine)(nil) ) // NumWorkers returns the number of worker event loops available for diff --git a/engine/iouring/engine_test.go b/engine/iouring/engine_test.go index cdee385a..f44acdf8 100644 --- a/engine/iouring/engine_test.go +++ b/engine/iouring/engine_test.go @@ -197,9 +197,11 @@ func TestSelectTierOptionalWithoutSendZC(t *testing.T) { } } -func TestSelectTierOptionalWithDeferTaskrun(t *testing.T) { - // SQPOLL is incompatible with both DEFER_TASKRUN and COOP_TASKRUN. - // SQPOLL path must use neither IPI-related flag. +func TestSelectTierOptionalUsesTaskrunNotSQPoll(t *testing.T) { + // #377: SQPOLL is never selected. Per-worker rings would spawn one kernel + // poll thread per worker (N spinning cores), and the dormant SQPOLL submit + // path has a latent SQ-tail-publish race. The optional tier uses the + // task-run completion model like the high tier instead. profile := engine.CapabilityProfile{ IOUringTier: engine.Optional, CoopTaskrun: true, @@ -212,18 +214,36 @@ func TestSelectTierOptionalWithDeferTaskrun(t *testing.T) { t.Fatal("expected non-nil tier") } flags := tier.SetupFlags() - if flags&setupDeferTaskrun != 0 { - t.Error("DEFER_TASKRUN must not be set with SQPOLL (incompatible)") + if flags&setupSQPoll != 0 { + t.Error("SQPOLL must not be selected (#377)") } - if flags&setupCoopTaskrun != 0 { - t.Error("COOP_TASKRUN must not be set with SQPOLL (incompatible)") + if flags&setupDeferTaskrun == 0 { + t.Error("expected DEFER_TASKRUN when available") + } + if flags&setupSingleIssuer == 0 { + t.Error("expected SINGLE_ISSUER") } - if flags&setupSQPoll == 0 { - t.Error("expected SQPOLL in setup flags") + if tier.SQPollIdle() != 0 { + t.Error("SQPollIdle must be 0 when SQPOLL is disabled") } if !tier.SupportsFixedFiles() { t.Error("fixed files should be enabled when profile.FixedFiles is true") } + + // Without DEFER_TASKRUN (e.g. a 6.0 kernel), fall back to COOP_TASKRUN — + // still never SQPOLL. + coopProfile := engine.CapabilityProfile{ + IOUringTier: engine.Optional, + CoopTaskrun: true, + SQPoll: true, + } + coopFlags := SelectTier(coopProfile, 0).SetupFlags() + if coopFlags&setupSQPoll != 0 { + t.Error("SQPOLL must not be selected without DeferTaskrun either (#377)") + } + if coopFlags&setupCoopTaskrun == 0 { + t.Error("expected COOP_TASKRUN when DEFER_TASKRUN unavailable") + } } func TestSelectTierNone(t *testing.T) { diff --git a/engine/iouring/probe.go b/engine/iouring/probe.go index 19875881..3a9ce69e 100644 --- a/engine/iouring/probe.go +++ b/engine/iouring/probe.go @@ -201,7 +201,7 @@ func probeSendZC() (SendZCProbeResult, string) { entry = ring.cqeAt(cqHead) notifFlags := entry.Flags - isNotif := notifFlags&0x04 != 0 // CQE_F_NOTIF + isNotif := notifFlags&cqeFNotif != 0 // CQE_F_NOTIF (1<<3); 0x04 is SOCK_NONEMPTY notifRes := entry.Res ring.EndCQ(cqHead + 1) diff --git a/engine/iouring/ring.go b/engine/iouring/ring.go index f2d3ed87..d4f54d75 100644 --- a/engine/iouring/ring.go +++ b/engine/iouring/ring.go @@ -19,6 +19,29 @@ import ( // we already promised it'd fit. const minMemlockPerWorker = 12 * 1024 * 1024 +// MaxWorkersForMemlock returns the io_uring worker ceiling imposed by +// RLIMIT_MEMLOCK, or -1 when the soft limit is unlimited (RLIM_INFINITY) or +// cannot be read — i.e. "no cap". It performs no syscall beyond Getrlimit and +// constructs no ring, so it is safe to call from the adaptive start decision +// BEFORE building the io_uring engine. The returned ceiling is at least 1 when +// a finite limit is present (createWorkers/NewRing will surface a precise +// ENOMEM if even one worker doesn't fit). This is the single source of truth +// for the rlim.Cur / minMemlockPerWorker math (capWorkersToMemlock calls it). +func MaxWorkersForMemlock() int { + var rlim unix.Rlimit + if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil { + return -1 + } + if rlim.Cur == ^uint64(0) { + return -1 + } + maxByMemlock := int(rlim.Cur / minMemlockPerWorker) + if maxByMemlock < 1 { + maxByMemlock = 1 + } + return maxByMemlock +} + // capWorkersToMemlock returns min(want, RLIMIT_MEMLOCK / minMemlockPerWorker). // When the soft limit is unlimited (RLIM_INFINITY), the request is honoured // as-is. When the cap forces a reduction, the chosen logger is informed so @@ -30,20 +53,15 @@ func capWorkersToMemlock(want int, logger *slog.Logger) int { if want <= 1 { return want } - var rlim unix.Rlimit - if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil { - return want - } - if rlim.Cur == ^uint64(0) { + maxByMemlock := MaxWorkersForMemlock() + if maxByMemlock == -1 { return want } - maxByMemlock := int(rlim.Cur / minMemlockPerWorker) - if maxByMemlock < 1 { - maxByMemlock = 1 - } if maxByMemlock >= want { return want } + var rlim unix.Rlimit + _ = unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlim) if logger != nil { logger.Warn("io_uring workers capped by RLIMIT_MEMLOCK", "requested", want, @@ -231,6 +249,14 @@ func (r *Ring) mmap() error { } // GetSQE returns a pointer to the next available SQE, or nil if the ring is full. +// +// NOT SQPOLL-safe: this advances the SHARED SQ tail before the caller writes +// the SQE payload, which is correct only because io_uring_enter is the kernel +// sync point on the non-SQPOLL submit path. Under IORING_SETUP_SQPOLL the +// kernel poll thread reads the tail continuously and could consume a +// half-written SQE — so SQPOLL must not be enabled without first switching to +// a deferred local tail + a release-store publish after fill. No tier selects +// SQPOLL (see optionalTier / #377), so this path is never exercised today. func (r *Ring) GetSQE() unsafe.Pointer { var tail, head uint32 if r.singleIssuer { diff --git a/engine/iouring/scaler.go b/engine/iouring/scaler.go deleted file mode 100644 index d4ef07fd..00000000 --- a/engine/iouring/scaler.go +++ /dev/null @@ -1,65 +0,0 @@ -//go:build linux - -package iouring - -import ( - "context" - "log/slog" - "sync/atomic" - - "github.com/goceleris/celeris/engine/scaler" -) - -// iouringScalerSource adapts the iouring Engine to the scaler.Source -// interface. Generation always returns 0 — iouring isn't a meta-engine -// and doesn't switch identities at runtime; only the adaptive engine's -// source returns non-zero generations. -type iouringScalerSource struct { - e *Engine - activeConns *atomic.Int64 -} - -func (s *iouringScalerSource) NumWorkers() int { return s.e.NumWorkers() } -func (s *iouringScalerSource) ActiveConns() int64 { return s.activeConns.Load() } -func (s *iouringScalerSource) PauseWorker(i int) { s.e.PauseWorker(i) } -func (s *iouringScalerSource) ResumeWorker(i int) { s.e.ResumeWorker(i) } -func (s *iouringScalerSource) Generation() uint64 { return 0 } -func (s *iouringScalerSource) Logger() *slog.Logger { return s.e.cfg.Logger } - -// runScaler is started by Engine.Listen when scaler config is enabled. -// All algorithm logic lives in engine/scaler; this is a thin source -// adapter so the iouring engine plugs into the shared loop. -func (e *Engine) runScaler(ctx context.Context, cfg scaler.Config, activeConns *atomic.Int64) { - scaler.Run(ctx, &iouringScalerSource{e: e, activeConns: activeConns}, cfg) -} - -// PauseWorker deactivates worker i. The worker drains in-flight conns -// and goes SUSPENDED. Asynchronous; returns immediately. -func (e *Engine) PauseWorker(i int) { - e.mu.Lock() - defer e.mu.Unlock() - if i < 0 || i >= len(e.workers) { - return - } - e.workers[i].inactive.Store(true) -} - -// ResumeWorker reactivates worker i. Wakes the worker from SUSPENDED if -// it was already idle. -func (e *Engine) ResumeWorker(i int) { - e.mu.Lock() - defer e.mu.Unlock() - if i < 0 || i >= len(e.workers) { - return - } - w := e.workers[i] - w.inactive.Store(false) - w.listenFDClosed.Store(false) - w.wakeMu.Lock() - if w.suspended.Load() { - close(w.wake) - w.wake = make(chan struct{}) - w.suspended.Store(false) - } - w.wakeMu.Unlock() -} diff --git a/engine/iouring/scaler_test.go b/engine/iouring/scaler_test.go deleted file mode 100644 index 91819e3d..00000000 --- a/engine/iouring/scaler_test.go +++ /dev/null @@ -1,10 +0,0 @@ -//go:build linux - -package iouring - -// The scaler algorithm itself lives in engine/scaler and has its own -// test suite there. The iouring engine's scaler.go is just the -// engine.WorkerScaler adapter (PauseWorker / ResumeWorker on workers -// slice). The runtime exercises this adapter via the existing engine -// integration tests (e.g., async_churn_uaf_test.go) and the spike-B -// strict-matrix runs that ran with CELERIS_DYN_WORKERS=1. diff --git a/engine/iouring/send_zc_gate_test.go b/engine/iouring/send_zc_gate_test.go new file mode 100644 index 00000000..dc99302f --- /dev/null +++ b/engine/iouring/send_zc_gate_test.go @@ -0,0 +1,81 @@ +//go:build linux + +package iouring + +import ( + "testing" + "unsafe" +) + +// TestUseSendZC pins the zero-copy gating policy (celeris#332): ZC is only +// chosen for unlinked sends whose payload is at least sendZCMinBytes. Below the +// threshold, or for any linked send, or when the capability is absent, the plain +// SEND path must win. +func TestUseSendZC(t *testing.T) { + cases := []struct { + name string + sendZC bool + linked bool + n int + want bool + }{ + {"large-unlinked-capable", true, false, sendZCMinBytes, true}, + {"above-threshold", true, false, sendZCMinBytes + 1, true}, + {"just-below-threshold", true, false, sendZCMinBytes - 1, false}, + {"small-unlinked", true, false, 100, false}, + {"empty", true, false, 0, false}, + {"large-linked", true, true, sendZCMinBytes * 4, false}, + {"large-no-capability", false, false, sendZCMinBytes * 4, false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := useSendZC(tc.sendZC, tc.linked, tc.n); got != tc.want { + t.Errorf("useSendZC(%v, %v, %d) = %v, want %v", + tc.sendZC, tc.linked, tc.n, got, tc.want) + } + }) + } +} + +// prepSendSQEOpcode runs prepSendSQE against a freshly zeroed SQE for a worker +// with ZC enabled and an unlinked send of payload size n, returning the chosen +// opcode byte. +func prepSendSQEOpcode(t *testing.T, n int) byte { + t.Helper() + var sqe [sqeSize]byte + w := &Worker{sendZC: true} + cs := &connState{fd: 7, sendBuf: make([]byte, n)} + w.prepSendSQE(unsafe.Pointer(&sqe[0]), cs, false) + return sqe[0] +} + +// TestPrepSendSQEGatesBySize verifies the worker async flush path emits a plain +// SEND for sub-threshold responses (1 CQE, no NOTIF stall) and SEND_ZC only once +// the payload reaches sendZCMinBytes. +func TestPrepSendSQEGatesBySize(t *testing.T) { + if op := prepSendSQEOpcode(t, 100); op != opSEND { + t.Errorf("small response opcode = %d, want opSEND(%d)", op, opSEND) + } + if op := prepSendSQEOpcode(t, sendZCMinBytes-1); op != opSEND { + t.Errorf("just-below-threshold opcode = %d, want opSEND(%d)", op, opSEND) + } + if op := prepSendSQEOpcode(t, sendZCMinBytes); op != opSENDZC { + t.Errorf("at-threshold opcode = %d, want opSENDZC(%d)", op, opSENDZC) + } +} + +// TestPrepSendSQELinkedNeverZC guards the link invariant: a linked send must +// never use SEND_ZC (the NOTIF CQE would break the SEND→RECV chain), regardless +// of payload size. +func TestPrepSendSQELinkedNeverZC(t *testing.T) { + var sqe [sqeSize]byte + w := &Worker{sendZC: true} + cs := &connState{fd: 7, sendBuf: make([]byte, sendZCMinBytes*4)} + w.prepSendSQE(unsafe.Pointer(&sqe[0]), cs, true) + if sqe[0] != opSEND { + t.Errorf("linked large send opcode = %d, want opSEND(%d)", sqe[0], opSEND) + } + if sqe[1]&sqeIOLink == 0 { + t.Errorf("linked send missing IOSQE_IO_LINK in flags 0x%02x", sqe[1]) + } +} diff --git a/engine/iouring/tier.go b/engine/iouring/tier.go index 7860c302..ad07d406 100644 --- a/engine/iouring/tier.go +++ b/engine/iouring/tier.go @@ -24,9 +24,9 @@ type TierStrategy interface { } // SelectTier returns the highest available tier strategy for the given profile. -// sqPollIdle is the objective-specific SQPOLL thread idle timeout; if zero, -// defaults to 2000ms. -func SelectTier(profile engine.CapabilityProfile, sqPollIdle time.Duration) TierStrategy { +// The sqPollIdle parameter is retained for signature stability but no longer +// used: SQPOLL is not selected by any tier (see optionalTier doc / #377). +func SelectTier(profile engine.CapabilityProfile, _ time.Duration) TierStrategy { switch { // DEFER_TASKRUN: completions run in worker's context (no extra kernel thread). // Preferred over SQPOLL because the SQPOLL kernel thread steals CPU from workers. @@ -40,12 +40,7 @@ func SelectTier(profile engine.CapabilityProfile, sqPollIdle time.Duration) Tier multishotRecv: profile.MultishotRecv, } case profile.IOUringTier >= engine.Optional && profile.SQPoll: - idle := uint32(sqPollIdle.Milliseconds()) - if idle == 0 { - idle = 2000 - } return &optionalTier{ - sqPollIdle: idle, deferTaskrun: profile.DeferTaskrun, fixedFiles: profile.FixedFiles, sendZC: profile.SendZC, @@ -156,7 +151,7 @@ func (t *highTier) PrepareSend(ring *Ring, fd int, buf []byte, linked bool) { if sqe == nil { return } - if t.sendZC && !linked { + if useSendZC(t.sendZC, linked, len(buf)) { if t.fixedFiles { prepSendZCFixed(sqe, fd, buf, false) } else { @@ -171,9 +166,18 @@ func (t *highTier) PrepareSend(ring *Ring, fd int, buf []byte, linked bool) { setSQEUserData(sqe, encodeUserData(udSend, fd)) } -// optionalTier: kernel 6.0+, adds SQPOLL, SEND_ZC. With 6.1+: DEFER_TASKRUN, fixed files. +// optionalTier: kernel 6.0+ with provided buffers but below the High tier +// (e.g. a 6.x kernel where the provided-buffers probe disabled the High path). +// Uses the task-run completion model, NOT SQPOLL. +// +// SQPOLL is deliberately never used here: celeris runs one io_uring ring per +// worker, so SQPOLL would spawn one kernel poll thread PER worker (N spinning +// cores — measured -83% throughput / 75% idle CPU on a 16-worker box), and the +// dormant SQPOLL submit path has a latent SQ-tail-publish race in GetSQE +// (the shared tail is advanced before the SQE payload is written, which is +// safe only because io_uring_enter is the sync point on the non-SQPOLL path). +// See #377. SQPollIdle returns 0 so the worker never enters the SQPOLL branch. type optionalTier struct { - sqPollIdle uint32 deferTaskrun bool fixedFiles bool sendZC bool @@ -183,16 +187,20 @@ type optionalTier struct { func (t *optionalTier) Tier() engine.Tier { return engine.Optional } func (t *optionalTier) SetupFlags() uint32 { - // SQPOLL is incompatible with both DEFER_TASKRUN and COOP_TASKRUN. - // SINGLE_ISSUER is compatible and enables SQ head optimization. - return setupSQPoll | setupSingleIssuer + // Mirror highTier: task-run completions in the worker's own context, no + // extra kernel thread. (Was setupSQPoll|setupSingleIssuer — see the type + // doc and #377 for why SQPOLL is not used.) + if t.deferTaskrun { + return setupDeferTaskrun | setupSingleIssuer + } + return setupCoopTaskrun | setupSingleIssuer } func (t *optionalTier) SupportsProvidedBuffers() bool { return true } func (t *optionalTier) SupportsMultishotAccept() bool { return t.multishotAccept } func (t *optionalTier) SupportsMultishotRecv() bool { return t.multishotRecv } func (t *optionalTier) SupportsFixedFiles() bool { return t.fixedFiles } func (t *optionalTier) SupportsSendZC() bool { return t.sendZC } -func (t *optionalTier) SQPollIdle() uint32 { return t.sqPollIdle } +func (t *optionalTier) SQPollIdle() uint32 { return 0 } // SQPOLL disabled — see type doc / #377 func (t *optionalTier) PrepareAccept(ring *Ring, listenFD int) { sqe := ring.GetSQE() @@ -224,7 +232,7 @@ func (t *optionalTier) PrepareSend(ring *Ring, fd int, buf []byte, linked bool) if sqe == nil { return } - if t.sendZC && !linked { + if useSendZC(t.sendZC, linked, len(buf)) { // SEND_ZC cannot be linked (the notification CQE would break // the link chain), so fall back to regular SEND for linked ops. if t.fixedFiles { diff --git a/engine/iouring/worker.go b/engine/iouring/worker.go index c673c2d4..d7f31de9 100644 --- a/engine/iouring/worker.go +++ b/engine/iouring/worker.go @@ -71,17 +71,24 @@ const bufRingCountMax = 1 << 15 // 32768 entries × 8 KiB = 256 MiB worst case ( // the case for very-high-concurrency benchmarks (16k+ connections) where // each worker may have more in-flight multishot recvs than the formula // anticipates. Setting 0 (or leaving the env var unset) reverts to -// auto-scaling from Workers × TargetConnsPerWorker. +// auto-scaling from the per-worker conn target. const envPbufCount = "CELERIS_IOURING_PBUF_COUNT" +// defaultConnsPerWorker is the per-worker connection target used to size +// the provided-buffer ring. The ring is sized at 2 buffers per conn at +// this target, giving comfortable headroom so the kernel rarely stalls +// waiting for buffer returns. Operators override the resulting ring size +// directly via CELERIS_IOURING_PBUF_COUNT. +const defaultConnsPerWorker = 20 + // resolveBufRingCount picks the provided-buffer-ring size for a worker. // The default formula is `nextPowerOf2(max(bufRingCountMin, 2 * -// TargetConnsPerWorker))`, i.e. two buffers per conn in the scaler's -// steady-state target — enough headroom that the kernel rarely stalls -// waiting for buffer returns. Above 1024 conns the previous hard-coded -// 1024 was too small: buffers were reused aggressively, the kernel -// stalled, and the very behaviour the ring is designed to optimise -// (multishot recv CQE batching) collapsed into CQE storms (celeris#322). +// connsPerWorker))`, i.e. two buffers per conn at the per-worker conn +// target — enough headroom that the kernel rarely stalls waiting for +// buffer returns. Above 1024 conns the previous hard-coded 1024 was too +// small: buffers were reused aggressively, the kernel stalled, and the +// very behaviour the ring is designed to optimise (multishot recv CQE +// batching) collapsed into CQE storms (celeris#322). // // The ring is PER-WORKER: NewBufferRing is created once per Worker on its // own ring, so the scaling MUST be driven by the per-worker conn target, @@ -90,7 +97,7 @@ const envPbufCount = "CELERIS_IOURING_PBUF_COUNT" // mmap'd RSS per worker and risking the kernel cap on large boxes // (celeris#322 follow-up). // Operators can override via CELERIS_IOURING_PBUF_COUNT. -func resolveBufRingCount(_ resource.ResolvedResources, scalerTargetConnsPerWorker int) int { +func resolveBufRingCount(_ resource.ResolvedResources, connsPerWorker int) int { if v := os.Getenv(envPbufCount); v != "" { if n, err := strconv.Atoi(v); err == nil && n > 0 { if n&(n-1) != 0 { @@ -99,9 +106,9 @@ func resolveBufRingCount(_ resource.ResolvedResources, scalerTargetConnsPerWorke return clampBufRingCount(n) } } - target := scalerTargetConnsPerWorker + target := connsPerWorker if target <= 0 { - target = 20 // mirrors scaler.Resolve's default + target = defaultConnsPerWorker } scaled := 2 * target if scaled < bufRingCountMin { @@ -224,11 +231,6 @@ type Worker struct { wake chan struct{} wakeMu sync.Mutex suspended atomic.Bool - // inactive is the per-worker pause flag used by the dynamic worker - // scaler. ORed with acceptPaused (which is engine-wide) when computing - // effective paused state. The scaler flips this to deactivate idle - // workers under low load and reactivate them under burst load. - inactive atomic.Bool // listenFDClosed signals that the worker has cancelled in-flight // accept SQEs and closed its listen FD in response to acceptPaused // being set. PauseAccept polls this so it only returns once the @@ -239,11 +241,17 @@ type Worker struct { // RST as it pauses). listenFDClosed atomic.Bool - reqCount *atomic.Uint64 - activeConns *atomic.Int64 - errCount *atomic.Uint64 - asyncPromoted *atomic.Uint64 // cumulative inline → dispatch promotions (#300) - reqBatch uint64 // batched request count, flushed to reqCount per iteration + reqCount *atomic.Uint64 + activeConns *atomic.Int64 + errCount *atomic.Uint64 + asyncPromoted *atomic.Uint64 // cumulative inline → dispatch promotions (#300) + acceptCount *atomic.Uint64 // cumulative accepts (engine-wide, shared) + closeCount *atomic.Uint64 // cumulative closes (engine-wide, shared) + bytesRead *atomic.Uint64 // cumulative recv payload bytes (engine-wide, shared) + bytesWritten *atomic.Uint64 // cumulative send payload bytes (engine-wide, shared) + reqBatch uint64 // batched request count, flushed to reqCount per iteration + bytesReadBatch uint64 // batched recv bytes, flushed to bytesRead per iteration + bytesWrittenBatch uint64 // batched send bytes, flushed to bytesWritten per iteration tickCounter uint32 cachedNow int64 // cached time.Now().UnixNano(), refreshed every 64 iterations @@ -315,7 +323,8 @@ type Worker struct { func newWorker(id, cpuID int, tier TierStrategy, handler stream.Handler, resolved resource.ResolvedResources, cfg resource.Config, reqCount *atomic.Uint64, activeConns *atomic.Int64, errCount *atomic.Uint64, - asyncPromoted *atomic.Uint64, acceptPaused *atomic.Bool) (*Worker, error) { //nolint:unparam // error return used by callers for future fallible init + asyncPromoted *atomic.Uint64, acceptPaused *atomic.Bool, + acceptCount, closeCount, bytesRead, bytesWritten *atomic.Uint64) (*Worker, error) { //nolint:unparam // error return used by callers for future fallible init // Listen socket creation is deferred to run() (after CPU pinning and NUMA // binding) so that the kernel allocates socket internal buffers on the @@ -342,6 +351,10 @@ func newWorker(id, cpuID int, tier TierStrategy, handler stream.Handler, activeConns: activeConns, errCount: errCount, asyncPromoted: asyncPromoted, + acceptCount: acceptCount, + closeCount: closeCount, + bytesRead: bytesRead, + bytesWritten: bytesWritten, acceptPaused: acceptPaused, wake: make(chan struct{}), ready: make(chan error, 1), @@ -431,15 +444,11 @@ func (w *Worker) run(ctx context.Context) { // The ring size scales with the worker's per-conn target (celeris#322): // the previous hard-coded 1024 entries was undersized above ~1024 conns // and produced CQE storms as the kernel stalled waiting for buffer - // returns. The formula gives 2 buffers per conn at the scaler's - // steady-state target — comfortable headroom without runaway RSS. + // returns. The formula gives 2 buffers per conn at the per-worker conn + // target — comfortable headroom without runaway RSS. // CELERIS_IOURING_PBUF_COUNT overrides the auto-scaled value. if w.tier.SupportsMultishotRecv() && os.Getenv("CELERIS_IOURING_MULTISHOT_RECV") == "1" { - var targetConns int - if w.cfg.WorkerScaling != nil { - targetConns = w.cfg.WorkerScaling.TargetConnsPerWorker - } - bufRingCount := resolveBufRingCount(w.resolved, targetConns) + bufRingCount := resolveBufRingCount(w.resolved, defaultConnsPerWorker) br, err := NewBufferRing(w.ring, bufRingGroupID, bufRingCount, w.resolved.BufferSize) if err != nil { w.logger.Warn("ring-mapped buffer registration failed, using per-connection buffers", @@ -481,8 +490,7 @@ func (w *Worker) run(ctx context.Context) { // Cache the atomic load: same value used by the two branches // below and (further down) the SUSPENDED check. Saves 2 atomic // loads per event-loop iteration on the steady-state hot path. - // OR with the per-worker inactive flag (dynamic scaler). - paused := w.acceptPaused.Load() || w.inactive.Load() + paused := w.acceptPaused.Load() if w.listenFD >= 0 && paused { if sqe := w.ring.GetSQE(); sqe != nil { prepCancelFDSkipSuccess(sqe, w.listenFD) @@ -674,6 +682,17 @@ func (w *Worker) run(ctx context.Context) { w.reqBatch = 0 } + // Flush batched payload-byte counters with the same per-iteration + // cadence as reqCount, for the same cache-line-contention reason. + if w.bytesReadBatch > 0 { + w.bytesRead.Add(w.bytesReadBatch) + w.bytesReadBatch = 0 + } + if w.bytesWrittenBatch > 0 { + w.bytesWritten.Add(w.bytesWrittenBatch) + w.bytesWrittenBatch = 0 + } + // Single atomic publish for all batched buffer returns (P0). if w.hasBufReturns { w.bufRing.PublishBuffers() @@ -782,7 +801,6 @@ func (w *Worker) run(ctx context.Context) { // DRAINING → SUSPENDED: no listen socket, no connections, CQEs processed. // Checked after CQE processing so accept CQEs for connections that // completed before the listen socket close are served, not leaked. - // Combined paused: engine-wide OR per-worker (dynamic scaler). // // hasDriverConns gate: connCount only counts HTTP conns. An // EventLoopProvider driver may still have live conns in @@ -790,9 +808,9 @@ func (w *Worker) run(ctx context.Context) { // park its event loop and starve those driver conns of CQE // servicing. Stay active while any driver conn is registered (v1.5.0 // review 2.10). - if w.listenFD < 0 && w.connCount == 0 && !w.hasDriverConns.Load() && (w.acceptPaused.Load() || w.inactive.Load()) { + if w.listenFD < 0 && w.connCount == 0 && !w.hasDriverConns.Load() && w.acceptPaused.Load() { w.wakeMu.Lock() - if !w.acceptPaused.Load() && !w.inactive.Load() { + if !w.acceptPaused.Load() { w.wakeMu.Unlock() continue } @@ -1184,6 +1202,7 @@ func (w *Worker) onAcceptedFD(ctx context.Context, newFD int, now int64, isFixed } cs.writeFn = w.makeWriteFn(cs) w.activeConns.Add(1) + w.acceptCount.Add(1) if w.cfg.OnConnect != nil { w.cfg.OnConnect(cs.remoteAddr) @@ -1225,6 +1244,7 @@ func (w *Worker) hijackConn(fd int) (net.Conn, error) { w.conns[fd] = nil w.connCount-- w.activeConns.Add(-1) + w.closeCount.Add(1) // Cancel-then-release discipline, hijack variant: a single-shot // recv SQE is virtually always still armed on cs.buf here. The fd // lives on under the caller's net.Conn, so an uncancelled recv would @@ -1424,7 +1444,7 @@ func (w *Worker) initProtocol(cs *connState) { // process on the first /ws or /events request. The // asyncPromoted gate restricts the Unlock to the dispatch- // goroutine path that actually holds the lock. See celeris#309. - if w.async && cs.asyncPromoted && cs.detachMu != nil && !cs.asyncDetachUnlocked { + if w.async && cs.asyncPromoted.Load() && cs.detachMu != nil && !cs.asyncDetachUnlocked { cs.asyncDetachUnlocked = true cs.detachMu.Unlock() } @@ -1573,6 +1593,9 @@ func (w *Worker) handleRecv(c *completionEntry, fd int, now int64) { } cs.lastActivity = now + // c.Res > 0 here (the c.Res <= 0 cases returned above): bytes received + // on this recv CQE, regardless of which buffer they landed in. + w.bytesReadBatch += uint64(c.Res) // Direct-into-bodyBuf path: the previous recv SQE targeted // H1State.bodyBuf (NextRecvBuf). The CQE's Res applies to bodyBuf, @@ -1725,8 +1748,20 @@ func (w *Worker) handleRecv(c *completionEntry, fd int, now int64) { // which point the conn is promoted and its stashed request handed to // the goroutine. This lets sync routes run inline on the worker // (no handoff) on a server that mixes sync + async handlers. - if w.async && cs.asyncPromoted && (w.h1Only || engine.Protocol(cs.protocol.Load()) == engine.HTTP1) { + asyncFeed := false + if w.async && cs.asyncPromoted.Load() && (w.h1Only || engine.Protocol(cs.protocol.Load()) == engine.HTTP1) { cs.asyncInMu.Lock() + // celeris#364: re-check under asyncInMu — the dispatch goroutine clears + // asyncPromoted (reverting the conn to inline) under this same lock. If + // it won the race, do NOT feed asyncInBuf (the goroutine is exiting); + // fall through to inline with the unconsumed `data` instead. + if cs.asyncPromoted.Load() { + asyncFeed = true + } else { + cs.asyncInMu.Unlock() + } + } + if asyncFeed { // Backpressure: drop the conn if the dispatch goroutine is // falling behind. Prevents a pipelining client from ballooning // asyncInBuf without bound. @@ -1793,7 +1828,7 @@ func (w *Worker) handleRecv(c *completionEntry, fd int, now int64) { // hasn't been promoted yet, run ProcessH1 inline in InlineMode so it // bails (ErrAsyncDispatch) when it hits an async route. The flag is // set only around the ProcessH1 call(s) below. - tryInline := w.async && !cs.asyncPromoted && cs.h1State != nil && + tryInline := w.async && !cs.asyncPromoted.Load() && cs.h1State != nil && (w.h1Only || engine.Protocol(cs.protocol.Load()) == engine.HTTP1) // h1Only mode (Protocol=HTTP1 + EnableH2Upgrade=false): no atomic // Load, no switch dispatch, no upgrade-handling block — ProcessH1 @@ -1891,7 +1926,15 @@ func (w *Worker) handleRecv(c *completionEntry, fd int, now int64) { // goes straight to the dispatch path (asyncPromoted guard above). if tryInline { if errors.Is(processErr, conn.ErrAsyncDispatch) { - cs.asyncPromoted = true + // celeris#364: record the route that forced promotion (single-shot + // recv only — the revert path assumes the worker-owned cs.buf recv + // model). Set BEFORE asyncPromoted/goroutine start so the dispatch + // goroutine observes it (happens-before). Empty path => the + // goroutine treats the conn as not revert-eligible. + if w.bufRing == nil { + cs.promotedMethod, cs.promotedPath = cs.h1State.CurrentRoute() + } + cs.asyncPromoted.Store(true) w.asyncPromoted.Add(1) stashed := cs.h1State.TakeBufferedBytes() // Flush any inline-handled response (pipelined sync request @@ -1918,8 +1961,10 @@ func (w *Worker) handleRecv(c *completionEntry, fd int, now int64) { // continuation runs on the dispatch goroutine — the partial-state // parse paths must not run inline (only the fresh-parse site // honors the async check). - if processErr == nil && cs.h1State.HasPendingData() { - cs.asyncPromoted = true + if processErr == nil && cs.h1State.HasPendingDispatchState() { + // No complete request parsed yet (buffered headers / chunked), so + // no route to record — leave promotedPath empty: not revert-eligible. + cs.asyncPromoted.Store(true) w.asyncPromoted.Add(1) } } @@ -2119,6 +2164,11 @@ func (w *Worker) completeSend(cs *connState, fd int, sent int, now int64) { return } + // sent >= 0 here: payload bytes flushed by this send completion + // (covers regular SEND and the SEND_ZC NOTIF path, both of which + // reach completeSend with the byte count). + w.bytesWrittenBatch += uint64(sent) + // Partial-send handling, split by whether we issued a plain SEND // (sendBuf only) or a WRITEV (sendBuf + sendBody). Partial WRITEV // responses collapse the remainder into sendBuf so the retry path @@ -2470,6 +2520,7 @@ func (w *Worker) finishClose(fd int) { w.conns[fd] = nil w.connCount-- w.activeConns.Add(-1) + w.closeCount.Add(1) if w.cfg.OnDisconnect != nil && cs != nil { w.cfg.OnDisconnect(cs.remoteAddr) @@ -2570,6 +2621,7 @@ func (w *Worker) finishCloseDetached(fd int, cs *connState) { w.conns[fd] = nil w.connCount-- w.activeConns.Add(-1) + w.closeCount.Add(1) if w.cfg.OnDisconnect != nil { w.cfg.OnDisconnect(cs.remoteAddr) @@ -2688,6 +2740,21 @@ func (w *Worker) promoteConnToAsync(cs *connState, _ int, stashed []byte, c *com } } +// canRevertToInline reports whether a promoted conn should be reverted to the +// inline fast path (celeris#364). True when single-shot recv is in use, the +// conn recorded the route that promoted it, and that route's promotion has +// since expired (RouteAsync now false — the route-level TTL de-promotion). +// Called by runAsyncHandler ONLY while holding asyncInMu with asyncInBuf empty. +func (w *Worker) canRevertToInline(cs *connState) bool { + return w.bufRing == nil && cs.promotedPath != "" && cs.h1State != nil && + cs.h1State.RouteAsync != nil && + // Clean request boundary only: never revert mid-request (a partial body + // or buffered headers still accumulating), so h1State ownership flips + // back to the worker between requests, exactly like a fresh inline conn. + !cs.h1State.HasPendingData() && + !cs.h1State.RouteAsync(cs.promotedMethod, cs.promotedPath) +} + func (w *Worker) runAsyncHandler(cs *connState) { defer w.asyncWG.Done() defer func() { @@ -2720,6 +2787,19 @@ func (w *Worker) runAsyncHandler(cs *connState) { for { cs.asyncInMu.Lock() for len(cs.asyncInBuf) == 0 && !cs.asyncClosed.Load() { + // celeris#364: revert this conn to inline when the route that + // promoted it has de-promoted (its TTL expired). Safe ONLY here: + // asyncInBuf is empty (no in-flight input, last response already + // written) and we hold asyncInMu, which the worker's feed path + // re-acquires and re-checks asyncPromoted against — so clearing it + // here cannot race a concurrent feed. The worker owns recv and + // resumes the inline fast path on the next CQE. + if w.canRevertToInline(cs) { + cs.asyncPromoted.Store(false) + cs.asyncRun = false + cs.asyncInMu.Unlock() + return + } cs.asyncCond.Wait() } if cs.asyncClosed.Load() { @@ -3052,10 +3132,13 @@ func (w *Worker) pickRecvTarget(cs *connState) []byte { // completed, so no kernel SQE still targets the old bodyBuf array. The // body path below re-sets the pin when it arms into bodyBuf again. cs.bodyRecvPin = nil - // Async mode: the dispatch goroutine owns h1State; the worker cannot - // safely observe NextRecvBuf without synchronization. Always use - // cs.buf so the goroutine handles body accumulation on its side. - if w.async || w.bufRing != nil || cs.h1State == nil { + // Async mode: only a PROMOTED conn hands h1State to the dispatch + // goroutine, which the worker cannot safely observe NextRecvBuf against. + // A non-promoted conn (celeris#356 inline-first) runs ProcessH1 on the + // worker itself (tryInline), so the worker owns h1State exactly as the + // sync path does and the zero-copy direct-into-bodyBuf recv is safe — + // gate the bail on cs.asyncPromoted, not blanket w.async. + if (w.async && cs.asyncPromoted.Load()) || w.bufRing != nil || cs.h1State == nil { return cs.buf } if !w.h1Only && engine.Protocol(cs.protocol.Load()) != engine.HTTP1 { @@ -3289,10 +3372,12 @@ func (w *Worker) flushSend(cs *connState) bool { } // prepSendSQE prepares a SEND or SEND_ZC SQE based on worker capabilities. -// SEND_ZC is only used for unlinked sends (the notification CQE would break -// the link chain). Linked sends always use regular SEND. +// SEND_ZC is only used for unlinked sends at or above sendZCMinBytes (the +// notification CQE would break the link chain, and on small payloads its extra +// completion costs more than the avoided memcpy). Smaller and linked sends use +// regular SEND. func (w *Worker) prepSendSQE(sqe unsafe.Pointer, cs *connState, linked bool) { - if w.sendZC && !linked { + if useSendZC(w.sendZC, linked, len(cs.sendBuf)) { if cs.fixedFile { prepSendZCFixed(sqe, cs.fd, cs.sendBuf, false) } else { @@ -3305,6 +3390,13 @@ func (w *Worker) prepSendSQE(sqe unsafe.Pointer, cs *connState, linked bool) { } } +// useSendZC decides whether a send should use zero-copy. ZC is only viable for +// unlinked sends whose payload is large enough that the saved memcpy outweighs +// the extra NOTIF CQE (see sendZCMinBytes). +func useSendZC(sendZC, linked bool, n int) bool { + return sendZC && !linked && n >= sendZCMinBytes +} + // flushSendLink is like flushSend but links a RECV SQE after the SEND using // IOSQE_IO_LINK. The kernel chains the operations: when SEND completes, RECV // starts automatically without another io_uring_enter. This eliminates one diff --git a/engine/scaler/doc.go b/engine/scaler/doc.go deleted file mode 100644 index c5ebe959..00000000 --- a/engine/scaler/doc.go +++ /dev/null @@ -1,16 +0,0 @@ -// Package scaler implements the dynamic worker scaler shared by the -// iouring, epoll, and adaptive engines. The scaler tracks the active -// connection count and adjusts how many workers participate in the -// SO_REUSEPORT group so connections-per-active-worker stays near the -// configured target — this keeps CQE/event batching density in the -// sweet spot. -// -// Engines plug in via the [Source] interface. Per-engine sources -// (engine/iouring/scaler.go, engine/epoll/scaler.go, adaptive/scaler.go) -// adapt the engine to this contract; the algorithm in [Run] is shared. -// -// Implementation is Linux-only because the underlying engines that -// surface a [Source] are themselves Linux-only. This package compiles -// to an empty stub on non-Linux GOOS so godoc and reflection-based -// tooling still see the public types and contract. -package scaler diff --git a/engine/scaler/scaler.go b/engine/scaler/scaler.go deleted file mode 100644 index ebd06ea2..00000000 --- a/engine/scaler/scaler.go +++ /dev/null @@ -1,285 +0,0 @@ -//go:build linux - -// Package-level documentation lives in doc.go (build-unconstrained so -// pkg.go.dev can render it on non-Linux). Per-engine [Source] sources: -// iouring + epoll report Generation()=0 always; the adaptive engine's -// source increments it on each engine switch so the scaler can -// re-baseline the new active engine's worker pause state. - -package scaler - -import ( - "context" - "log/slog" - "os" - "strconv" - "time" - - "github.com/goceleris/celeris/resource" -) - -// Source is the engine-side interface the scaler uses to read load and -// adjust worker activation. Implementations must be safe for concurrent -// use; the scaler calls these from its own goroutine. -type Source interface { - // NumWorkers returns the total worker pool size (= max active count). - // Must be stable across the lifetime of a scaler run. - NumWorkers() int - // ActiveConns returns the current active connection count. Read every tick. - ActiveConns() int64 - // PauseWorker deactivates worker i. Idempotent. - PauseWorker(i int) - // ResumeWorker reactivates worker i. Idempotent. - ResumeWorker(i int) - // Generation returns a counter that increments whenever the - // underlying engine identity changes (e.g., adaptive engine switch). - // Per-engine sources return 0 always. The scaler uses this to detect - // switches and re-baseline the active count on the new engine. - Generation() uint64 - // Logger returns the slog.Logger used for scaler diagnostics. May - // return nil to suppress log output. - Logger() *slog.Logger -} - -// Config holds the resolved scaler parameters. All fields are mutable -// before Run starts; the loop reads them once at startup. Use -// [Resolve] to derive a Config from a resource.Config (handles both -// the typed WorkerScaling field and the env-var fallback). -type Config struct { - // Enabled gates the entire scaler. Resolve sets this to true when - // either the typed config or the CELERIS_DYN_WORKERS env is set. - Enabled bool - // StartHigh seeds the scaler at NumWorkers active and scales down - // on idle. Default true (data-validated; preserves SO_REUSEPORT - // distribution at startup, +34-78% over start-low on ramp scenarios). - StartHigh bool - // MinActive is the floor on active worker count. - MinActive int - // TargetConnsPerWorker is the scaling target. desired = ceil(conns/Target). - TargetConnsPerWorker int - // Interval is the tick cadence. Default 250ms. - Interval time.Duration - // ScaleUpStep is the max workers added per tick (burst limit). - ScaleUpStep int - // ScaleDownStep is the max workers removed per tick. - ScaleDownStep int - // ScaleDownHysteresis: scale-down requires desired ≤ active - hyst - 1. - ScaleDownHysteresis int - // ScaleDownIdleTicks: consecutive sub-threshold ticks needed to scale down. - ScaleDownIdleTicks int - // Trace logs every scaler decision when true. - Trace bool -} - -// Resolve produces a Config from the engine's resource.Config. Typed -// cfg.WorkerScaling takes precedence; env vars are the legacy fallback. -// Returns Enabled=false when neither configures the scaler. -func Resolve(cfg resource.Config, numWorkers int) Config { - if cfg.WorkerScaling != nil { - return fromTyped(cfg.WorkerScaling, numWorkers) - } - return fromEnv(numWorkers) -} - -func fromTyped(c *resource.WorkerScalingConfig, numWorkers int) Config { - minActive := c.MinActive - if minActive == 0 { - minActive = numWorkers / 2 - if minActive < 2 { - minActive = 2 - } - } - if minActive > numWorkers { - minActive = numWorkers - } - target := c.TargetConnsPerWorker - if target == 0 { - target = 20 - } - interval := c.Interval - if interval == 0 { - interval = 250 * time.Millisecond - } - upStep := c.ScaleUpStep - if upStep == 0 { - upStep = 2 - } - downStep := c.ScaleDownStep - if downStep == 0 { - downStep = 1 - } - hyst := c.ScaleDownHysteresis - if hyst == 0 { - hyst = 1 - } - idleTicks := c.ScaleDownIdleTicks - if idleTicks == 0 { - idleTicks = 4 - } - return Config{ - Enabled: true, - StartHigh: c.Strategy != resource.ScalingStrategyStartLow, - MinActive: minActive, - TargetConnsPerWorker: target, - Interval: interval, - ScaleUpStep: upStep, - ScaleDownStep: downStep, - ScaleDownHysteresis: hyst, - ScaleDownIdleTicks: idleTicks, - Trace: c.Trace, - } -} - -func fromEnv(numWorkers int) Config { - // The CELERIS_DYN_* env vars are the legacy fallback for the typed - // [resource.WorkerScalingConfig]. They remain active so deployed - // manifests that pre-date v1.4.6 (when the typed config shipped) - // keep working without an edit. New code should use - // celeris.Config.WorkerScaling directly. - getInt := func(k string, def int) int { - if v := os.Getenv(k); v != "" { - if n, err := strconv.Atoi(v); err == nil { - return n - } - } - return def - } - enabled := getInt("CELERIS_DYN_WORKERS", 0) != 0 - minActive := getInt("CELERIS_DYN_MIN", numWorkers/2) - if minActive < 1 { - minActive = 1 - } - if minActive > numWorkers { - minActive = numWorkers - } - return Config{ - Enabled: enabled, - // Defaults to true (data-validated; matches the typed-config - // Strategy=ScalingStrategyStartHigh default). start-low pauses - // workers BEFORE the engine has finished settling its listen - // FDs in the SO_REUSEPORT group, which races against incoming - // H2 prior-knowledge dials and produces RST mid-handshake (seen - // in the v1.4.1 strict-matrix run on get-json-64k-h2/adaptive - // before this change). Users who explicitly want start-low can - // set CELERIS_DYN_START_HIGH=0. - StartHigh: os.Getenv("CELERIS_DYN_START_HIGH") != "0", - MinActive: minActive, - TargetConnsPerWorker: getInt("CELERIS_DYN_TARGET", 20), - Interval: time.Duration(getInt("CELERIS_DYN_INTERVAL", 250)) * time.Millisecond, - ScaleUpStep: getInt("CELERIS_DYN_UPSTEP", 2), - ScaleDownStep: getInt("CELERIS_DYN_DOWNSTEP", 1), - ScaleDownHysteresis: getInt("CELERIS_DYN_DOWNHYST", 1), - ScaleDownIdleTicks: getInt("CELERIS_DYN_DOWNIDLE", 4), - Trace: os.Getenv("CELERIS_DYN_TRACE") == "1", - } -} - -// Run executes the scaler loop until ctx is canceled. Reads load from -// src.ActiveConns(), pauses/resumes workers via src.PauseWorker / -// src.ResumeWorker, and re-baselines on Generation changes. -// -// Caller must check cfg.Enabled before invoking Run; Run does not -// short-circuit when disabled. -func Run(ctx context.Context, src Source, cfg Config) { - totalWorkers := src.NumWorkers() - var active int - if cfg.StartHigh { - active = totalWorkers - } else { - active = cfg.MinActive - for i := cfg.MinActive; i < totalWorkers; i++ { - src.PauseWorker(i) - } - } - - if log := src.Logger(); log != nil { - log.Info("dynamic worker scaler started", - "min_active", cfg.MinActive, - "max", totalWorkers, - "target_conns_per_worker", cfg.TargetConnsPerWorker, - "interval_ms", int(cfg.Interval/time.Millisecond), - "start_high", cfg.StartHigh, - "up_step", cfg.ScaleUpStep, - "down_step", cfg.ScaleDownStep, - "down_hyst", cfg.ScaleDownHysteresis, - "down_idle_ticks", cfg.ScaleDownIdleTicks) - } - - ticker := time.NewTicker(cfg.Interval) - defer ticker.Stop() - idleTicks := 0 - lastGen := src.Generation() - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - if gen := src.Generation(); gen != lastGen { - // Engine switched (only meaningful for adaptive). Re-baseline - // the new engine to the current `active` count: 0..active-1 - // resumed, active..total-1 paused. This is idempotent against - // the new engine's existing per-worker state. - for i := 0; i < totalWorkers; i++ { - if i < active { - src.ResumeWorker(i) - } else { - src.PauseWorker(i) - } - } - lastGen = gen - idleTicks = 0 - continue - } - active, idleTicks = tick(src, cfg, active, idleTicks) - } - } -} - -// tick is a single iteration of the scaler decision loop. Split out -// for unit-testing the algorithm without spinning up a real engine. -func tick(src Source, cfg Config, active, idleTicks int) (int, int) { - totalWorkers := src.NumWorkers() - conns := src.ActiveConns() - desired := int((conns + int64(cfg.TargetConnsPerWorker) - 1) / int64(cfg.TargetConnsPerWorker)) - if desired < cfg.MinActive { - desired = cfg.MinActive - } - if desired > totalWorkers { - desired = totalWorkers - } - if cfg.Trace { - if log := src.Logger(); log != nil { - log.Info("scaler tick", - "conns", conns, "desired", desired, "active", active, "idle_ticks", idleTicks) - } - } - - switch { - case desired > active: - step := desired - active - if step > cfg.ScaleUpStep { - step = cfg.ScaleUpStep - } - for n := 0; n < step; n++ { - src.ResumeWorker(active) - active++ - } - idleTicks = 0 - case desired <= active-cfg.ScaleDownHysteresis-1: - idleTicks++ - if idleTicks >= cfg.ScaleDownIdleTicks { - step := active - desired - if step > cfg.ScaleDownStep { - step = cfg.ScaleDownStep - } - for n := 0; n < step && active > cfg.MinActive; n++ { - active-- - src.PauseWorker(active) - } - idleTicks = 0 - } - default: - idleTicks = 0 - } - return active, idleTicks -} diff --git a/engine/scaler/scaler_test.go b/engine/scaler/scaler_test.go deleted file mode 100644 index bcce2b7d..00000000 --- a/engine/scaler/scaler_test.go +++ /dev/null @@ -1,327 +0,0 @@ -//go:build linux - -package scaler - -import ( - "context" - "log/slog" - "sync" - "testing" - "time" - - "github.com/goceleris/celeris/resource" -) - -// TestResolve_Defaults locks in the data-validated defaults captured in -// the v1.4.1 spike-B sweep: start-high, min=numCPU/2, target=20, -// interval=250ms, upStep=2, downStep=1, hyst=1, idleTicks=4. -func TestResolve_Defaults(t *testing.T) { - t.Parallel() - rcfg := resource.Config{WorkerScaling: &resource.WorkerScalingConfig{}} - c := Resolve(rcfg, 8) - if !c.Enabled { - t.Fatal("typed config (non-nil) must enable the scaler") - } - if !c.StartHigh { - t.Errorf("StartHigh: expected true (zero value Strategy = StartHigh), got false") - } - if c.MinActive != 4 { - t.Errorf("MinActive: expected 4 (numCPU/2 with numCPU=8), got %d", c.MinActive) - } - if c.TargetConnsPerWorker != 20 { - t.Errorf("TargetConnsPerWorker: expected 20, got %d", c.TargetConnsPerWorker) - } - if c.Interval != 250*time.Millisecond { - t.Errorf("Interval: expected 250ms, got %v", c.Interval) - } - if c.ScaleUpStep != 2 { - t.Errorf("ScaleUpStep: expected 2, got %d", c.ScaleUpStep) - } - if c.ScaleDownStep != 1 { - t.Errorf("ScaleDownStep: expected 1, got %d", c.ScaleDownStep) - } - if c.ScaleDownHysteresis != 1 { - t.Errorf("ScaleDownHysteresis: expected 1, got %d", c.ScaleDownHysteresis) - } - if c.ScaleDownIdleTicks != 4 { - t.Errorf("ScaleDownIdleTicks: expected 4, got %d", c.ScaleDownIdleTicks) - } -} - -// TestResolve_StartLow verifies opt-in to start-low. -func TestResolve_StartLow(t *testing.T) { - t.Parallel() - rcfg := resource.Config{WorkerScaling: &resource.WorkerScalingConfig{ - Strategy: resource.ScalingStrategyStartLow, - }} - c := Resolve(rcfg, 8) - if c.StartHigh { - t.Errorf("StartHigh: expected false (Strategy=StartLow), got true") - } -} - -// TestResolve_MinActiveClamping covers the floor + cap on MinActive. -func TestResolve_MinActiveClamping(t *testing.T) { - t.Parallel() - cases := []struct { - name string - minActive int - numWorkers int - want int - }{ - {"zero defaults to numCPU/2", 0, 8, 4}, - {"zero with small pool floors to 2", 0, 2, 2}, - {"explicit 1 respected", 1, 8, 1}, - {"above pool clamped to pool", 16, 8, 8}, - {"explicit 3", 3, 8, 3}, - } - for _, tc := range cases { - tc := tc - t.Run(tc.name, func(t *testing.T) { - t.Parallel() - c := Resolve(resource.Config{WorkerScaling: &resource.WorkerScalingConfig{MinActive: tc.minActive}}, tc.numWorkers) - if c.MinActive != tc.want { - t.Errorf("MinActive: got %d, want %d", c.MinActive, tc.want) - } - }) - } -} - -// TestResolve_ConfigBeatsEnv verifies typed config wins over env vars. -func TestResolve_ConfigBeatsEnv(t *testing.T) { - t.Setenv("CELERIS_DYN_TARGET", "999") - rcfg := resource.Config{WorkerScaling: &resource.WorkerScalingConfig{TargetConnsPerWorker: 25}} - c := Resolve(rcfg, 4) - if c.TargetConnsPerWorker != 25 { - t.Errorf("typed config did not take precedence over env: got %d, want 25", c.TargetConnsPerWorker) - } - if !c.Enabled { - t.Error("typed config should produce Enabled=true") - } -} - -// TestResolve_EnvFallback verifies the env-var path is the fallback. -func TestResolve_EnvFallback(t *testing.T) { - t.Setenv("CELERIS_DYN_WORKERS", "1") - t.Setenv("CELERIS_DYN_TARGET", "33") - c := Resolve(resource.Config{}, 4) - if !c.Enabled { - t.Fatal("CELERIS_DYN_WORKERS=1 should enable the scaler") - } - if c.TargetConnsPerWorker != 33 { - t.Errorf("env target read incorrectly: got %d, want 33", c.TargetConnsPerWorker) - } -} - -// TestResolve_DisabledByDefault verifies the legacy zero-config path. -func TestResolve_DisabledByDefault(t *testing.T) { - t.Setenv("CELERIS_DYN_WORKERS", "") - c := Resolve(resource.Config{}, 4) - if c.Enabled { - t.Errorf("scaler should be disabled when neither env nor config provides it") - } -} - -// fakeSource implements Source for unit-testing the algorithm without -// spinning up an engine. PauseWorker / ResumeWorker calls are tracked -// per worker. -type fakeSource struct { - mu sync.Mutex - numWorkers int - conns int64 - gen uint64 - pauseLog []int - resumeLog []int - paused []bool -} - -func newFake(n int) *fakeSource { return &fakeSource{numWorkers: n, paused: make([]bool, n)} } - -func (s *fakeSource) NumWorkers() int { return s.numWorkers } -func (s *fakeSource) ActiveConns() int64 { - s.mu.Lock() - defer s.mu.Unlock() - return s.conns -} -func (s *fakeSource) PauseWorker(i int) { - s.mu.Lock() - defer s.mu.Unlock() - s.pauseLog = append(s.pauseLog, i) - if i >= 0 && i < len(s.paused) { - s.paused[i] = true - } -} -func (s *fakeSource) ResumeWorker(i int) { - s.mu.Lock() - defer s.mu.Unlock() - s.resumeLog = append(s.resumeLog, i) - if i >= 0 && i < len(s.paused) { - s.paused[i] = false - } -} -func (s *fakeSource) Generation() uint64 { - s.mu.Lock() - defer s.mu.Unlock() - return s.gen -} -func (s *fakeSource) Logger() *slog.Logger { return nil } - -func (s *fakeSource) setConns(n int64) { - s.mu.Lock() - s.conns = n - s.mu.Unlock() -} -func (s *fakeSource) bumpGen() { - s.mu.Lock() - s.gen++ - s.mu.Unlock() -} -func (s *fakeSource) numActive() int { - s.mu.Lock() - defer s.mu.Unlock() - n := 0 - for _, p := range s.paused { - if !p { - n++ - } - } - return n -} - -// runAsync starts Run in a goroutine and returns a stop func plus a -// Done chan so callers can poll the source state without depending on -// the run-loop's wall-clock progress. Older versions of these tests -// used a fixed-duration ctx and asserted at the end; under -race or -// on slow CPUs the time.Ticker can fire fewer than expected times in -// a 200ms window, leading to flakes (msa2-client hit "got 7" once). -func runAsync(t *testing.T, src *fakeSource, cfg Config) (stop func(), done <-chan struct{}) { - t.Helper() - ctx, cancel := context.WithCancel(context.Background()) - d := make(chan struct{}) - go func() { - defer close(d) - Run(ctx, src, cfg) - }() - return cancel, d -} - -// waitForActive polls src.numActive() until it matches want or -// timeout elapses. Returns true on success. Takes the place of a -// fixed-time sleep in scaler tests. -func waitForActive(t *testing.T, src *fakeSource, want int, timeout time.Duration) bool { - t.Helper() - deadline := time.Now().Add(timeout) - for time.Now().Before(deadline) { - if src.numActive() == want { - return true - } - time.Sleep(2 * time.Millisecond) - } - t.Errorf("numActive: waited %v for %d, last=%d", timeout, want, src.numActive()) - return false -} - -// TestRun_StartHighScalesDownOnIdle verifies the start-high default -// behaviour: starts at NumWorkers active, scales down to a floor of -// MinActive when load is low. -// -// Floor: with ScaleDownHysteresis=0 the floor IS MinActive. With -// hysteresis>0 the floor is MinActive+hysteresis to prevent flapping -// near the boundary; that's tested in -// TestRun_HysteresisFloorPreservedAboveMinActive. -func TestRun_StartHighScalesDownOnIdle(t *testing.T) { - src := newFake(8) - src.setConns(0) // idle from the start - cfg := Config{ - Enabled: true, StartHigh: true, - MinActive: 2, TargetConnsPerWorker: 20, - Interval: 5 * time.Millisecond, - ScaleUpStep: 2, ScaleDownStep: 1, - ScaleDownHysteresis: 0, ScaleDownIdleTicks: 2, - } - stop, done := runAsync(t, src, cfg) - waitForActive(t, src, 2, 2*time.Second) - stop() - <-done -} - -// TestRun_HysteresisFloorPreservedAboveMinActive verifies that with the -// default hysteresis=1 the algorithm stops scaling at MinActive+1, not -// at MinActive. This prevents flapping near the boundary when conn -// count oscillates ±1 around the threshold. -func TestRun_HysteresisFloorPreservedAboveMinActive(t *testing.T) { - src := newFake(8) - src.setConns(0) - cfg := Config{ - Enabled: true, StartHigh: true, - MinActive: 2, TargetConnsPerWorker: 20, - Interval: 5 * time.Millisecond, - ScaleUpStep: 2, ScaleDownStep: 1, - ScaleDownHysteresis: 1, ScaleDownIdleTicks: 2, - } - stop, done := runAsync(t, src, cfg) - // Floor is MinActive + hyst = 3. - waitForActive(t, src, 3, 2*time.Second) - stop() - <-done -} - -// TestRun_StartLowScalesUpOnLoad verifies the start-low path: starts at -// MinActive, scales up to ceil(conns/target) when load arrives. -func TestRun_StartLowScalesUpOnLoad(t *testing.T) { - src := newFake(8) - cfg := Config{ - Enabled: true, StartHigh: false, - MinActive: 2, TargetConnsPerWorker: 20, - Interval: 5 * time.Millisecond, - ScaleUpStep: 4, ScaleDownStep: 1, - ScaleDownHysteresis: 1, ScaleDownIdleTicks: 100, - } - // Set conns so desired = ceil(160/20) = 8 (full pool). - src.setConns(160) - stop, done := runAsync(t, src, cfg) - waitForActive(t, src, 8, 2*time.Second) - stop() - <-done -} - -// TestRun_GenerationChangeRebaselines verifies that incrementing -// Generation triggers re-baseline of the active count on the new -// underlying engine. This is the adaptive switch-handling invariant. -func TestRun_GenerationChangeRebaselines(t *testing.T) { - src := newFake(8) - src.setConns(60) // desired = 3, but with start-high we start at 8 - cfg := Config{ - Enabled: true, StartHigh: true, - MinActive: 2, TargetConnsPerWorker: 20, - Interval: 5 * time.Millisecond, - ScaleUpStep: 2, ScaleDownStep: 1, - ScaleDownHysteresis: 1, ScaleDownIdleTicks: 100, - } - // Run for a few ticks so the scaler settles. Then bump Generation - // — the next tick should re-baseline the (still-fake) source by - // pausing/resuming workers to enforce the current `active`. - ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) - defer cancel() - go Run(ctx, src, cfg) - time.Sleep(15 * time.Millisecond) - - // Reset the logs; the next bump should write a full re-baseline burst. - src.mu.Lock() - src.pauseLog = nil - src.resumeLog = nil - src.mu.Unlock() - src.bumpGen() - time.Sleep(15 * time.Millisecond) - cancel() - time.Sleep(10 * time.Millisecond) - - src.mu.Lock() - pauseLogLen := len(src.pauseLog) - resumeLogLen := len(src.resumeLog) - src.mu.Unlock() - if pauseLogLen+resumeLogLen < src.NumWorkers() { - t.Errorf("expected re-baseline to issue Pause+Resume calls covering all %d workers; got %d pauses + %d resumes", - src.NumWorkers(), pauseLogLen, resumeLogLen) - } -} diff --git a/handler.go b/handler.go index 7c0e618b..1eb3b060 100644 --- a/handler.go +++ b/handler.go @@ -131,7 +131,28 @@ func (a *routerAdapter) HandleStream(ctx context.Context, s *stream.Stream) erro c.handlers = handlers c.fullPath = fullPath - if err := c.Next(); err != nil { + // celeris#356: an adaptive route (inherited the AsyncHandlers=true default) + // runs INLINE here until observed to block. Time this inline run; if the + // handler chain exceeds adaptivePromoteThreshold it is genuinely blocking, + // so promote the route to async dispatch — future requests then run on a + // goroutine instead of stalling the event-loop worker. Non-adaptive configs + // (no AsyncHandlers default) hit the empty-map fast path and skip timing. + rt := a.server.router + if rt.adaptiveRoutes[fullPath] && rt.adaptiveLearning(fullPath) { + start := time.Now() + err := c.Next() + dur := time.Since(start) + if dur > adaptiveBlockingThreshold { + // Unambiguously a blocking I/O round-trip: promote on the first such + // run rather than waiting for adaptivePromoteStreak slow runs. + rt.promoteRouteImmediate(fullPath) + } else { + rt.recordInlineRun(fullPath, dur > adaptivePromoteThreshold) + } + if err != nil { + a.handleError(c, s, err) + } + } else if err := c.Next(); err != nil { a.handleError(c, s, err) } if c.buffered && !c.written { @@ -141,6 +162,55 @@ func (a *routerAdapter) HandleStream(ctx context.Context, s *stream.Stream) erro return nil } +// adaptivePromoteThreshold is the inline handler duration that counts as a +// "slow" run for adaptive promotion (celeris#356). A non-blocking handler — +// even a heavy middleware chain — returns in tens of microseconds; a blocking +// one (DB/cache round-trip) takes hundreds of µs to ms. The bar sits well above +// the CPU-bound range so a transient GC/scheduling burst cannot push a +// CPU-bound chain over it for adaptivePromoteStreak consecutive runs and +// wrongly promote it to the slower async path — a 50µs bar did exactly that, +// intermittently collapsing iouring-async chain-fullstack (celeris#364). +// Genuinely-blocking routes are marked .Async() explicitly (opting out of +// adaptive); auto-promotion is a safety net for an unmarked handler that blocks +// on EVERY request. +const adaptivePromoteThreshold = 300 * time.Microsecond + +// adaptiveBlockingThreshold is the inline duration that is UNAMBIGUOUSLY a +// blocking I/O round-trip (not CPU work under contention). A single inline run +// over this bar promotes the route IMMEDIATELY, skipping the +// adaptivePromoteStreak hysteresis — a genuinely-blocking handler that an +// operator forgot to mark .Async() then stalls a worker for at most one request +// instead of adaptivePromoteStreak of them. The bar sits far above any CPU-bound +// chain's wall-clock (even under GC/scheduling jitter), so it cannot misfire on +// a CPU route; the 300µs/streak path still handles the borderline 300µs–2ms band. +const adaptiveBlockingThreshold = 2 * time.Millisecond + +// adaptivePromoteStreak is how many CONSECUTIVE slow inline runs promote an +// adaptive route to async. The consecutive requirement (a fast run resets the +// streak) makes a one-off cold start / GC pause harmless, while a handler that +// blocks on every request promotes within a handful of requests. +const adaptivePromoteStreak = 8 + +// adaptiveSettleStreak is how many CONSECUTIVE fast inline runs SETTLE an +// adaptive route (celeris#361): proven non-blocking, it is removed from the +// timed path so the hot loop stops paying two time.Now() vDSO calls per +// request forever. High enough that only consistently-static routes settle (a +// slow run resets the streak); at scale a static route settles in well under a +// millisecond. A genuinely-blocking handler should be marked .Async() — it +// promotes long before it could settle. +const adaptiveSettleStreak = 256 + +// adaptivePromoteTTL bounds how long a promotion lasts before the route is +// re-evaluated inline (celeris#364). Promotion is otherwise terminal — a +// promoted route runs async and is never re-timed — so a CPU-bound chain that +// was falsely promoted by a transient load/jitter spike (inline wall-clock +// crossing adaptivePromoteThreshold under worker contention, not actual +// blocking) stayed on the ~32%-slower async path until restart. After the TTL +// the route runs inline again and re-settles if fast, or re-promotes within +// adaptivePromoteStreak runs if genuinely blocking. The clock is read only for +// already-promoted routes, so the fast path is unaffected. +const adaptivePromoteTTL = 5 * time.Second + // recoverAndRelease handles panic recovery and context release. Extracted to a // separate noinline function so that HandleStream's stack frame is not inflated // by the deferred closure and debug.Stack() call (P5). diff --git a/internal/conn/h1.go b/internal/conn/h1.go index ea5b0d7d..415f6164 100644 --- a/internal/conn/h1.go +++ b/internal/conn/h1.go @@ -246,6 +246,29 @@ func (s *H1State) HasPendingData() bool { return s.buffer.Len() > 0 || s.bodyNeeded > 0 } +// HasPendingDispatchState reports whether ProcessH1 left partial state that +// MUST be continued on the dispatch goroutine (InlineMode=false): buffered +// partial headers or a chunked body in progress, both of which resume via the +// buffered parse path that does not re-run the per-route async check. A +// fixed-length body in progress (bodyNeeded > 0) is deliberately EXCLUDED: its +// continuation re-parses the already-async-checked (so provably non-async) +// request and dispatches the handler inline on the worker — exactly as the +// sync engine does — so it must NOT force the conn onto the slower async +// dispatch path. Promoting on it permanently poisoned keep-alive conns that +// hit even one split-across-recvs body (the post-4k regression). +func (s *H1State) HasPendingDispatchState() bool { + return s.buffer.Len() > 0 && s.bodyNeeded <= 0 +} + +// CurrentRoute returns the method and path of the request currently parsed into +// the H1 state. The engine records these when a route forces an async promotion +// so the dispatch goroutine can revert the connection to inline once that +// route's promotion expires (celeris#364). req.Method/req.Path are interned +// (stable) strings, safe to retain past the recv buffer's lifetime. +func (s *H1State) CurrentRoute() (method, path string) { + return s.req.Method, s.req.Path +} + // UpdateWriteFn replaces the response adapter's write function. Called by // OnDetach to route StreamWriter writes through the mutex-guarded writeFn. func (s *H1State) UpdateWriteFn(fn func([]byte)) { diff --git a/middleware/compress/go.mod b/middleware/compress/go.mod index d14eeb27..d68ab9da 100644 --- a/middleware/compress/go.mod +++ b/middleware/compress/go.mod @@ -4,7 +4,7 @@ go 1.26.4 require ( github.com/andybalholm/brotli v1.2.1 - github.com/goceleris/celeris v1.4.4 + github.com/goceleris/celeris v1.5.3 github.com/klauspost/compress v1.18.6 ) diff --git a/middleware/metrics/go.mod b/middleware/metrics/go.mod index 33d52836..dd707a33 100644 --- a/middleware/metrics/go.mod +++ b/middleware/metrics/go.mod @@ -3,10 +3,10 @@ module github.com/goceleris/celeris/middleware/metrics go 1.26.4 require ( - github.com/goceleris/celeris v1.4.4 + github.com/goceleris/celeris v1.5.3 github.com/prometheus/client_golang v1.23.2 github.com/prometheus/client_model v0.6.2 - github.com/prometheus/common v0.68.1 + github.com/prometheus/common v0.69.0 ) require ( diff --git a/middleware/metrics/go.sum b/middleware/metrics/go.sum index b7ee233c..4baf42e7 100644 --- a/middleware/metrics/go.sum +++ b/middleware/metrics/go.sum @@ -14,8 +14,8 @@ github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.68.1 h1:omjRRl4QP4komogpXuhfeOiisQg7xdy8VM1UY+pStaY= -github.com/prometheus/common v0.68.1/go.mod h1:ZzL3f6u94qUxh9p+tJTrF+FvBS1XXbbRAZCQkytAL0Y= +github.com/prometheus/common v0.69.0 h1:OA85nJQS/T/MaYh/Q2CcgDKSGWqNIgrBDvDH85CuiNk= +github.com/prometheus/common v0.69.0/go.mod h1:ZzL3f6u94qUxh9p+tJTrF+FvBS1XXbbRAZCQkytAL0Y= github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= diff --git a/middleware/otel/go.mod b/middleware/otel/go.mod index fc135432..61423c64 100644 --- a/middleware/otel/go.mod +++ b/middleware/otel/go.mod @@ -3,7 +3,7 @@ module github.com/goceleris/celeris/middleware/otel go 1.26.4 require ( - github.com/goceleris/celeris v1.4.4 + github.com/goceleris/celeris v1.5.3 go.opentelemetry.io/otel v1.44.0 go.opentelemetry.io/otel/metric v1.44.0 go.opentelemetry.io/otel/sdk v1.44.0 diff --git a/middleware/protobuf/go.mod b/middleware/protobuf/go.mod index 948f256a..bad3be60 100644 --- a/middleware/protobuf/go.mod +++ b/middleware/protobuf/go.mod @@ -3,7 +3,7 @@ module github.com/goceleris/celeris/middleware/protobuf go 1.26.4 require ( - github.com/goceleris/celeris v1.4.4 + github.com/goceleris/celeris v1.5.3 google.golang.org/protobuf v1.36.11 ) diff --git a/middleware/secure/config.go b/middleware/secure/config.go index f6151f4c..c630a409 100644 --- a/middleware/secure/config.go +++ b/middleware/secure/config.go @@ -67,12 +67,13 @@ type Config struct { CrossOriginResourcePolicy string `yaml:"cross_origin_resource_policy"` // CrossOriginEmbedderPolicy sets the Cross-Origin-Embedder-Policy header. - // Default: "require-corp". + // Default: "" (NOT emitted — opt-in). // - // WARNING: "require-corp" blocks cross-origin resources (images, scripts, - // etc.) that do not carry a Cross-Origin-Resource-Policy header or valid - // CORS headers. If your application loads third-party assets, use - // "credentialless" or [Suppress] to avoid breakage. + // COEP is off by default because "require-corp" blocks cross-origin + // resources (images, scripts, etc.) that lack a Cross-Origin-Resource-Policy + // or valid CORS headers — enabling it by default silently breaks many sites, + // so like helmet we leave it opt-in. Set "require-corp" or "credentialless" + // to enable. CrossOriginEmbedderPolicy string `yaml:"cross_origin_embedder_policy"` // XDNSPrefetchControl sets the X-DNS-Prefetch-Control header. @@ -88,7 +89,9 @@ type Config struct { OriginAgentCluster string `yaml:"origin_agent_cluster"` // XDownloadOptions sets the X-Download-Options header. - // Default: "noopen". + // Default: "" (NOT emitted — opt-in). This header only affected legacy + // Internet Explorer downloads and is obsolete on modern browsers; set + // "noopen" to restore it. XDownloadOptions string `yaml:"x_download_options"` } @@ -103,11 +106,14 @@ var defaultConfig = Config{ ReferrerPolicy: "strict-origin-when-cross-origin", CrossOriginOpenerPolicy: "same-origin", CrossOriginResourcePolicy: "same-origin", - CrossOriginEmbedderPolicy: "require-corp", + // COEP off by default (opt-in): require-corp breaks cross-origin resource + // loads; enabling it by default is a footgun (matches helmet). #338. + CrossOriginEmbedderPolicy: "", XDNSPrefetchControl: "off", XPermittedCrossDomain: "none", OriginAgentCluster: "?1", - XDownloadOptions: "noopen", + // X-Download-Options off by default (opt-in): legacy IE-only, obsolete. + XDownloadOptions: "", } func applyDefaults(cfg Config) Config { diff --git a/middleware/secure/doc.go b/middleware/secure/doc.go index 156acdb3..7c7025a4 100644 --- a/middleware/secure/doc.go +++ b/middleware/secure/doc.go @@ -14,14 +14,16 @@ // - Referrer-Policy: "strict-origin-when-cross-origin" // - Cross-Origin-Opener-Policy: "same-origin" // - Cross-Origin-Resource-Policy: "same-origin" -// - Cross-Origin-Embedder-Policy: "require-corp" // - X-DNS-Prefetch-Control: "off" // - X-Permitted-Cross-Domain-Policies: "none" // - Origin-Agent-Cluster: "?1" -// - X-Download-Options: "noopen" // -// Content-Security-Policy and Permissions-Policy are only included when -// their respective [Config] fields are non-empty. +// Cross-Origin-Embedder-Policy ("require-corp") and X-Download-Options +// ("noopen") are OPT-IN (off by default, #338): COEP-by-default breaks +// cross-origin resource loads (matching helmet, which leaves it off), and +// X-Download-Options only affected legacy Internet Explorer. Content-Security- +// Policy and Permissions-Policy are likewise only emitted when their [Config] +// fields are non-empty. // // # Usage // @@ -64,8 +66,8 @@ // # CORS Interaction // // When using CORS middleware alongside secure, note that the default -// CrossOriginEmbedderPolicy (require-corp) and CrossOriginResourcePolicy -// (same-origin) block cross-origin resource loading. APIs serving -// cross-origin requests should set CrossOriginResourcePolicy: "cross-origin" -// and CrossOriginEmbedderPolicy: [Suppress]. +// CrossOriginResourcePolicy (same-origin) blocks cross-origin resource +// loading; APIs serving cross-origin requests should set +// CrossOriginResourcePolicy: "cross-origin". COEP is off by default so it no +// longer needs suppressing; enable it explicitly only when isolating the page. package secure diff --git a/middleware/secure/secure_test.go b/middleware/secure/secure_test.go index 67beefd7..abadfee8 100644 --- a/middleware/secure/secure_test.go +++ b/middleware/secure/secure_test.go @@ -28,11 +28,13 @@ func TestDefaultConfigSetsAllHeaders(t *testing.T) { testutil.AssertHeader(t, rec, "referrer-policy", "strict-origin-when-cross-origin") testutil.AssertHeader(t, rec, "cross-origin-opener-policy", "same-origin") testutil.AssertHeader(t, rec, "cross-origin-resource-policy", "same-origin") - testutil.AssertHeader(t, rec, "cross-origin-embedder-policy", "require-corp") + // COEP (require-corp) and X-Download-Options (legacy IE) are OFF by default + // — opt-in (#338); COEP-by-default breaks cross-origin loads. + testutil.AssertNoHeader(t, rec, "cross-origin-embedder-policy") testutil.AssertHeader(t, rec, "x-dns-prefetch-control", "off") testutil.AssertHeader(t, rec, "x-permitted-cross-domain-policies", "none") testutil.AssertHeader(t, rec, "origin-agent-cluster", "?1") - testutil.AssertHeader(t, rec, "x-download-options", "noopen") + testutil.AssertNoHeader(t, rec, "x-download-options") } func TestDefaultConfigOmitsCSPAndPermissionsPolicy(t *testing.T) { @@ -56,8 +58,8 @@ func TestHeaderDefaultsAndOverrides(t *testing.T) { {"xss-protection custom", &Config{XSSProtection: "1; mode=block"}, "x-xss-protection", "1; mode=block"}, {"origin-agent-cluster default", nil, "origin-agent-cluster", "?1"}, {"origin-agent-cluster custom", &Config{OriginAgentCluster: "?0"}, "origin-agent-cluster", "?0"}, - {"x-download-options default", nil, "x-download-options", "noopen"}, - {"x-download-options custom", &Config{XDownloadOptions: "custom"}, "x-download-options", "custom"}, + {"x-download-options custom (opt-in)", &Config{XDownloadOptions: "custom"}, "x-download-options", "custom"}, + {"coep opt-in", &Config{CrossOriginEmbedderPolicy: "require-corp"}, "cross-origin-embedder-policy", "require-corp"}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -275,8 +277,8 @@ func TestApplyDefaultsFillsEmptyFields(t *testing.T) { if cfg.CrossOriginResourcePolicy != "same-origin" { t.Fatalf("CrossOriginResourcePolicy: got %q, want %q", cfg.CrossOriginResourcePolicy, "same-origin") } - if cfg.CrossOriginEmbedderPolicy != "require-corp" { - t.Fatalf("CrossOriginEmbedderPolicy: got %q, want %q", cfg.CrossOriginEmbedderPolicy, "require-corp") + if cfg.CrossOriginEmbedderPolicy != "" { + t.Fatalf("CrossOriginEmbedderPolicy: got %q, want %q (opt-in default, #338)", cfg.CrossOriginEmbedderPolicy, "") } if cfg.XDNSPrefetchControl != "off" { t.Fatalf("XDNSPrefetchControl: got %q, want %q", cfg.XDNSPrefetchControl, "off") @@ -287,8 +289,8 @@ func TestApplyDefaultsFillsEmptyFields(t *testing.T) { if cfg.OriginAgentCluster != "?1" { t.Fatalf("OriginAgentCluster: got %q, want %q", cfg.OriginAgentCluster, "?1") } - if cfg.XDownloadOptions != "noopen" { - t.Fatalf("XDownloadOptions: got %q, want %q", cfg.XDownloadOptions, "noopen") + if cfg.XDownloadOptions != "" { + t.Fatalf("XDownloadOptions: got %q, want %q (opt-in default, #338)", cfg.XDownloadOptions, "") } } @@ -342,14 +344,14 @@ func TestBuildHeadersSkipsEmptyStrings(t *testing.T) { func TestBuildHeadersDefaultCount(t *testing.T) { cfg := applyDefaults(Config{}) headers := buildHeaders(cfg) - // Default config should produce 11 headers (no CSP, no PermissionsPolicy). - // x-content-type-options, x-frame-options, x-xss-protection, - // referrer-policy, cross-origin-opener-policy, cross-origin-resource-policy, - // cross-origin-embedder-policy, x-dns-prefetch-control, x-permitted-cross-domain-policies, - // origin-agent-cluster, x-download-options. - // Note: HSTS is not in buildHeaders (runtime check). - if len(headers) != 11 { - t.Fatalf("expected 11 headers from default config, got %d", len(headers)) + // Default config produces 9 headers (no CSP, no PermissionsPolicy; COEP + + // X-Download-Options are opt-in, #338): x-content-type-options, + // x-frame-options, x-xss-protection, referrer-policy, + // cross-origin-opener-policy, cross-origin-resource-policy, + // x-dns-prefetch-control, x-permitted-cross-domain-policies, + // origin-agent-cluster. Note: HSTS is not in buildHeaders (runtime check). + if len(headers) != 9 { + t.Fatalf("expected 9 headers from default config, got %d", len(headers)) } } @@ -359,9 +361,9 @@ func TestBuildHeadersWithCSPAndPermissions(t *testing.T) { PermissionsPolicy: "camera=()", }) headers := buildHeaders(cfg) - // 11 defaults + CSP + PermissionsPolicy = 13. - if len(headers) != 13 { - t.Fatalf("expected 13 headers, got %d", len(headers)) + // 9 defaults + CSP + PermissionsPolicy = 11. + if len(headers) != 11 { + t.Fatalf("expected 11 headers, got %d", len(headers)) } } diff --git a/probe/probe_caps_linux_test.go b/probe/probe_caps_linux_test.go index fcd2234e..84c949e7 100644 --- a/probe/probe_caps_linux_test.go +++ b/probe/probe_caps_linux_test.go @@ -16,3 +16,29 @@ func getNice(t *testing.T) (int, error) { t.Helper() return unix.Getpriority(unix.PRIO_PROCESS, 0) } + +// TestCheckCapSysNiceIsSideEffectFree guards against a regression of the +// dangerous Setpriority-based probe (finding 5.2): the real read-only +// implementation must not change the process scheduling priority. We +// snapshot the current nice value, run the check, and assert it is +// unchanged. The boolean result itself is environment-dependent and not +// asserted. Linux-only (lives here with getNice so the cross-platform +// probe_test.go compiles on darwin/non-linux). +func TestCheckCapSysNiceIsSideEffectFree(t *testing.T) { + sp := defaultProber() + if sp.CheckCapSysNice == nil { + t.Skip("no CheckCapSysNice on this platform") + } + before, err := getNice(t) + if err != nil { + t.Skipf("cannot read nice value: %v", err) + } + _ = sp.CheckCapSysNice() // result is environment-dependent; we test for side effects + after, err := getNice(t) + if err != nil { + t.Fatalf("cannot read nice value after check: %v", err) + } + if before != after { + t.Fatalf("CheckCapSysNice mutated process nice: before=%d after=%d", before, after) + } +} diff --git a/probe/probe_test.go b/probe/probe_test.go index 8debb548..60944acd 100644 --- a/probe/probe_test.go +++ b/probe/probe_test.go @@ -314,34 +314,6 @@ func TestSQPollGateAtOptionalTier(t *testing.T) { } } -// TestCheckCapSysNiceIsSideEffectFree guards against a regression of the -// dangerous Setpriority-based probe (finding 5.2): the real read-only -// implementation must not change the process scheduling priority. We -// snapshot the current nice value, run the check, and assert it is -// unchanged. The boolean result itself is environment-dependent and not -// asserted. -func TestCheckCapSysNiceIsSideEffectFree(t *testing.T) { - if runtime.GOOS != "linux" { - t.Skip("CAP_SYS_NICE check is linux-only") - } - sp := defaultProber() - if sp.CheckCapSysNice == nil { - t.Skip("no CheckCapSysNice on this platform") - } - before, err := getNice(t) - if err != nil { - t.Skipf("cannot read nice value: %v", err) - } - _ = sp.CheckCapSysNice() // result is environment-dependent; we test for side effects - after, err := getNice(t) - if err != nil { - t.Fatalf("cannot read nice value after check: %v", err) - } - if before != after { - t.Fatalf("CheckCapSysNice mutated process nice: before=%d after=%d", before, after) - } -} - // TestProbeSendfileAndZerocopy pins the sendfile / zerocopy capability // flags (celeris#317). Sendfile is unconditional on Linux (kernel // 2.6.23+, every distro past 2.6.33) and is now actually wired into the diff --git a/protocol/h1/parser.go b/protocol/h1/parser.go index 89b0058e..36e4bf53 100644 --- a/protocol/h1/parser.go +++ b/protocol/h1/parser.go @@ -86,15 +86,12 @@ func (p *Parser) ParseRequest(req *Request) (int, error) { return 0, nil } - remaining := p.buf[p.pos:] - // Quick check: if remaining starts with \r\n, headers are empty (no headers). - // Otherwise use SIMD-accelerated findHeaderEnd to verify \r\n\r\n is present. - if len(remaining) < 2 || remaining[0] != '\r' || remaining[1] != '\n' { - if findHeaderEnd(remaining) < 0 { - return 0, nil - } - } - + // No upfront whole-block findHeaderEnd scan: parseHeaders detects an + // incomplete block itself (a final line with no CRLF yields lineEnd==-1 → + // (false,nil)), and every caller Reset()s parser+req before re-parsing, so + // a partial parse is always retried cleanly. Scanning here first would + // CRLF-walk the same bytes parseHeaders walks again — pure double work on + // the common single-read request. if p.noStringHeaders { req.Headers = req.Headers[:0] } else { diff --git a/protocol/h2/stream/stream.go b/protocol/h2/stream/stream.go index 77a95e56..46cf7b9b 100644 --- a/protocol/h2/stream/stream.go +++ b/protocol/h2/stream/stream.go @@ -314,6 +314,12 @@ func (s *Stream) resetAndPool() { } // ResetH1Stream performs a lightweight per-request reset for H1 stream reuse. +// It clears ONLY the fields populateCachedStream + handleH1Request do not +// unconditionally overwrite on the next request. rawBody is load-bearing: the +// caller installs it only when the request carries a body, so a bodyless GET +// would otherwise inherit the previous request's body. Headers/Method/Path/ +// Scheme/Authority/LazyRawHeaders/IsHEAD/EndStream/ResponseWriter/state are all +// re-set per request (h1.go:852-941), so re-zeroing them here is dead work. func ResetH1Stream(s *Stream) { if s.Data != nil { s.Data.Reset() @@ -321,19 +327,9 @@ func ResetH1Stream(s *Stream) { s.Data = nil } s.rawBody = nil - s.Headers = s.hdrBuf[:0] - s.LazyRawHeaders = nil s.lazyHeadersBuilt = false s.pseudoMaterialized = false - s.Method = "" - s.Path = "" - s.Scheme = "" - s.Authority = "" s.headersSent.Store(false) - s.EndStream = false - s.ResponseWriter = nil - s.IsHEAD = false - s.state.Store(int32(StateIdle)) } // MaterializeHeaders ensures that every header carried by this H1 stream diff --git a/protocol/h2/stream/stream_test.go b/protocol/h2/stream/stream_test.go new file mode 100644 index 00000000..854c6831 --- /dev/null +++ b/protocol/h2/stream/stream_test.go @@ -0,0 +1,48 @@ +package stream + +import "testing" + +// TestResetH1StreamClearsUniqueFields locks in the contract that ResetH1Stream +// owns: the fields the per-request caller does NOT unconditionally overwrite. +// rawBody is the load-bearing one — a bodyless GET that follows a request with +// a body must not inherit the stale body (celeris#346). +func TestResetH1StreamClearsUniqueFields(t *testing.T) { + s := NewH1Stream(1) + + s.SetRawBody([]byte("previous request body")) + s.GetBuf().WriteString("buffered data") + s.lazyHeadersBuilt = true + s.pseudoMaterialized = true + s.headersSent.Store(true) + + ResetH1Stream(s) + + if s.rawBody != nil { + t.Fatalf("rawBody = %q, want nil (stale body would leak into next request)", s.rawBody) + } + if s.Data != nil { + t.Fatalf("Data = %v, want nil (buffer must be returned to the pool)", s.Data) + } + if s.lazyHeadersBuilt { + t.Fatal("lazyHeadersBuilt = true, want false") + } + if s.pseudoMaterialized { + t.Fatal("pseudoMaterialized = true, want false") + } + if s.headersSent.Load() { + t.Fatal("headersSent = true, want false") + } +} + +func BenchmarkResetH1Stream(b *testing.B) { + s := NewH1Stream(1) + body := []byte("hello") + b.ReportAllocs() + for b.Loop() { + s.SetRawBody(body) + s.lazyHeadersBuilt = true + s.pseudoMaterialized = true + s.headersSent.Store(true) + ResetH1Stream(s) + } +} diff --git a/resource/config.go b/resource/config.go index 25b0e2f1..32fffdef 100644 --- a/resource/config.go +++ b/resource/config.go @@ -83,103 +83,6 @@ type Config struct { // from celeris.Config.EnableH2Upgrade (pointer, may be nil) and Protocol. // Always a concrete value after WithDefaults. EnableH2Upgrade bool - // SkipBuiltinScaler suppresses the per-engine dynamic worker scaler - // loop. Set by the adaptive engine when it constructs its sub-engines — - // adaptive runs ONE higher-level scaler that delegates to the active - // sub-engine, so the iouring + epoll built-in scalers must stay quiet - // to avoid two scalers fighting over the same worker pool. - SkipBuiltinScaler bool - // WorkerScaling configures the dynamic worker scaler. As of v1.4.6 - // (issue #281), [celeris.Config.toResourceConfig] sets this to a - // zero-value struct whenever the user-facing field was nil — the - // scaler is therefore DEFAULT-ON for any user constructing the - // server via celeris.New. Direct callers of resource.Config (i.e. - // engine tests + the resource_test suite) still see nil-as-disabled - // to keep the legacy contract for that low-level surface. - // - // When set, the scaler activates and deactivates workers based on - // observed load to keep CQE/event batching density in the sweet - // spot. See WorkerScalingConfig for tuning details. - WorkerScaling *WorkerScalingConfig -} - -// WorkerScalingStrategy selects the seed strategy for the dynamic -// worker scaler. The zero value (ScalingStrategyStartHigh) is the -// recommended default: start at NumWorkers active, scale down once -// load is observably low. This preserves SO_REUSEPORT distribution at -// startup, which the spike-B sweep showed is dramatically better on -// ramp / oscil traffic patterns (+34-78 % across all three engines). -type WorkerScalingStrategy int - -const ( - // ScalingStrategyStartHigh seeds the scaler at NumWorkers active. - // Best for production where traffic ramps from idle and bursts. - // Zero value — selected when the field is unset. - ScalingStrategyStartHigh WorkerScalingStrategy = 0 - // ScalingStrategyStartLow seeds the scaler at MinActive. Best when - // the application has a long idle warmup before any conns arrive - // and saving CPU during that idle period matters more than peak - // throughput on the first burst. - ScalingStrategyStartLow WorkerScalingStrategy = 1 -) - -// WorkerScalingConfig controls the dynamic worker scaler used by the -// iouring, epoll, and adaptive engines. Zero values mean "use the -// scaler's default" — see field comments for what those are. Pass via -// celeris.Config.WorkerScaling to enable; nil disables the scaler -// entirely (the engine runs all configured workers all the time, like -// versions before the scaler was introduced). -// -// The scaler tracks the engine's activeConns counter and adjusts the -// number of "active" workers (workers participating in the SO_REUSEPORT -// group) so that conns / active is roughly TargetConnsPerWorker. Scale-up -// is reactive (next tick after a load increase). Scale-down is hysteretic -// — must observe ScaleDownIdleTicks consecutive ticks below the -// hysteresis threshold before reducing one worker. -type WorkerScalingConfig struct { - // Strategy picks the seed-state strategy. Zero value is - // ScalingStrategyStartHigh, which is the data-validated default for - // most production workloads. See WorkerScalingStrategy for tuning. - Strategy WorkerScalingStrategy - // MinActive is the floor on the active worker count. The scaler will - // never reduce active workers below this. Defaults to max(2, NumCPU/2). - // Set to NumCPU to force the scaler to always run at full capacity - // (effectively a static-w=NumCPU configuration). - MinActive int - // TargetConnsPerWorker is the active-worker scaling target. The scaler - // computes desired = ceil(activeConns / TargetConnsPerWorker), clamps - // to [MinActive, NumWorkers], and steers active toward that. Default 20. - // Higher values keep more conns per worker (better batching, less - // parallelism). Lower values prefer parallelism over batching. - TargetConnsPerWorker int - // Interval controls how often the scaler reevaluates active count. - // Default 250ms. Lower values respond to load changes faster but burn - // more CPU on the controller goroutine. - Interval time.Duration - // ScaleUpStep is the maximum number of workers the scaler will - // resume per tick. Default 2 — wider bursts disrupt SO_REUSEPORT - // load balancing more than they help. Bigger values are tempting on - // SPIKE workloads but produce worse throughput per the v1.4.1 - // SPIKE-test sweep (upStep=4 and upStep=8 both lost to upStep=2). - ScaleUpStep int - // ScaleDownStep is the maximum number of workers the scaler will - // pause per tick when load drops. Default 1 — scale-down too quickly - // and you can't recover throughput when load comes back. - ScaleDownStep int - // ScaleDownHysteresis adds a buffer between desired and active - // before scale-down fires: scale-down only if desired ≤ active - - // ScaleDownHysteresis - 1. Default 1 (so a desired-of-N triggers - // scale-down only when active is N+2 or higher). - ScaleDownHysteresis int - // ScaleDownIdleTicks is how many consecutive sub-threshold ticks - // must pass before a single scale-down step fires. Default 4 - // (= 1 second at the default 250ms interval). Tunes how patient - // the scaler is about temporary lulls: a request-rate dip of one - // tick will not trigger scale-down. - ScaleDownIdleTicks int - // Trace logs every scaler decision (active, desired, idle_ticks). - // Default false. Use when diagnosing scaling behaviour. - Trace bool } // Validate checks all config fields and returns any validation errors. diff --git a/resource/resource.go b/resource/resource.go index 756ed1d6..7325ef43 100644 --- a/resource/resource.go +++ b/resource/resource.go @@ -1,5 +1,29 @@ package resource +// WorkloadHint is an OPTIONAL operator declaration of the expected steady-state +// concurrency, consumed only by the adaptive engine's start-engine decision. +// +// Because established connections cannot migrate between epoll and io_uring, +// the START engine decides keep-alive throughput; and the expected concurrency +// is unknowable at Listen() time (no connections exist yet). This hint is the +// ONLY input that can make the adaptive engine START on io_uring. Absent it, +// the engine defaults to epoll (every server ramps from zero connections — the +// low-concurrency regime where epoll wins on both throughput and tail latency) +// and lets the runtime switch route NEW connections up to io_uring under load. +type WorkloadHint int + +const ( + // WorkloadUnspecified (zero value) leaves the start-engine choice to the + // default policy: start epoll, promote new conns to io_uring under load. + WorkloadUnspecified WorkloadHint = iota + // WorkloadLowConcurrency explicitly declares thin/latency-sensitive traffic + // — start (and stay) on epoll. + WorkloadLowConcurrency + // WorkloadHighConcurrency explicitly declares many h1 keep-alive + // connections per worker — start on io_uring (when kernel + memlock allow). + WorkloadHighConcurrency +) + // Resources allows user overrides of default resource values. // Zero values mean "use engine default". type Resources struct { @@ -13,6 +37,9 @@ type Resources struct { SocketSend int // MaxConns is the max simultaneous connections per worker (0 = unlimited). MaxConns int + // WorkloadHint is an OPTIONAL operator concurrency declaration; the only + // input that can make the adaptive engine START on io_uring (see WorkloadHint). + WorkloadHint WorkloadHint } // ResolvedResources contains the final computed values after applying defaults, diff --git a/router.go b/router.go index a1f6b9e2..50697047 100644 --- a/router.go +++ b/router.go @@ -5,8 +5,15 @@ import ( "log/slog" "slices" "sync" + "sync/atomic" + "time" ) +// nowNano returns the current time in Unix nanoseconds. A package var so the +// adaptive promotion TTL (celeris#364) can be exercised deterministically in +// tests without sleeping. +var nowNano = func() int64 { return time.Now().UnixNano() } + // staticEntry holds the pre-composed handler chain and full path for a fully // static route, enabling O(1) map lookup instead of a trie walk. type staticEntry struct { @@ -95,6 +102,35 @@ type router struct { // decide whether the async dispatch infrastructure is needed at // all (a server with zero async routes keeps the inline fast path). asyncRouteCount int + + // adaptiveRoutes (celeris#356) holds the fullPaths of routes that + // inherited the server-level AsyncHandlers=true default (rather than an + // explicit .Async()). They start INLINE (ring-batched send, the cheap + // path) and are promoted to async dispatch only when an inline run is + // observed to block — so trivial handlers keep io_uring's syscall + // batching while genuinely-blocking handlers still get goroutine + // isolation. Built at registration; read-only while serving. + adaptiveRoutes map[string]bool + // promoted records adaptive fullPaths that have been promoted to async + // after sustained blocking inline runs. sync.Map: concurrent reads on the + // hot path, rare writes at promotion time. + promoted sync.Map + // slowStreak tracks consecutive slow inline runs per adaptive fullPath + // (fullPath -> *atomic.Int32). Hysteresis: a one-off cold/GC outlier must + // not poison a fast route; only a handler that blocks on EVERY run (DB/ + // cache) accumulates adaptivePromoteStreak slow runs and gets promoted. A + // fast run resets the streak. + slowStreak sync.Map + // fastStreak tracks consecutive FAST inline runs per adaptive fullPath + // (fullPath -> *atomic.Int32). Once a route is fast on adaptiveSettleStreak + // consecutive runs it is provably non-blocking and gets SETTLED — future + // requests skip the per-request inline timing (two time.Now() vDSO calls) + // entirely (celeris#361). A slow run resets it. + fastStreak sync.Map + // settled holds adaptive fullPaths proven non-blocking (see fastStreak). + // A settled route is no longer timed/promotable; it runs inline like a + // plain sync route. Explicit .Async()/.Sync() clears it (setAsync). + settled sync.Map } // Route is an opaque handle to a registered route. Use the Name method to @@ -205,6 +241,22 @@ func (r *Route) Sync() *Route { return r.setAsync(false) } +// UsesDriver marks a route whose handler performs a blocking backend round-trip +// via a celeris driver (postgres / redis / memcached) opened WithEngine(srv). +// It is exactly equivalent to .Async() — the route is dispatched off the worker +// so the blocking call parks the handler goroutine on netpoll instead of +// stalling an I/O worker — but reads as intent at the call site and is the +// recommended way to mark driver routes: +// +// srv.GET("/users/:id", getUser).UsesDriver() // == .Async(), clearer intent +// +// The adaptive safety net (Config.AsyncHandlers=true) only auto-promotes +// handlers slower than ~300µs, so a fast localhost driver call (sub-300µs) would +// otherwise keep blocking a worker every request — mark such routes explicitly. +func (r *Route) UsesDriver() *Route { + return r.setAsync(true) +} + // setAsync is the shared implementation of [Route.Async] and [Route.Sync]. // Updates the node, the router's asyncRouteCount, and the static-fast-path // map entry so all three observers see the same async flag. @@ -212,11 +264,28 @@ func (r *Route) setAsync(want bool) *Route { if r.node == nil { return r } - if r.node.async != want && r.router != nil { - if want { - r.router.asyncRouteCount++ - } else if r.router.asyncRouteCount > 0 { - r.router.asyncRouteCount-- + if r.router != nil { + // celeris#356: an explicit .Async()/.Sync() overrides the adaptive + // (server-default) classification — the route is no longer + // inline-first-with-promotion, so drop it from the adaptive set and + // clear any prior promotion. + if r.router.adaptiveRoutes[r.path] { + delete(r.router.adaptiveRoutes, r.path) + r.router.promoted.Delete(r.path) + r.router.settled.Delete(r.path) + r.router.fastStreak.Delete(r.path) + r.router.slowStreak.Delete(r.path) + // Adaptive routes were already counted (they may promote): keep the + // count when the explicit choice is async, drop it when sync. + if !want && r.router.asyncRouteCount > 0 { + r.router.asyncRouteCount-- + } + } else if r.node.async != want { + if want { + r.router.asyncRouteCount++ + } else if r.router.asyncRouteCount > 0 { + r.router.asyncRouteCount-- + } } } r.node.async = want @@ -234,8 +303,102 @@ func (r *Route) setAsync(want bool) *Route { func newRouter() *router { return &router{ - namedRoutes: make(map[string]*Route), + namedRoutes: make(map[string]*Route), + adaptiveRoutes: make(map[string]bool), + } +} + +// isPromoted reports whether an adaptive route (celeris#356) is currently +// promoted to async dispatch. +// +// Promotion is REVERSIBLE (celeris#364): it expires after adaptivePromoteTTL. +// Once expired, the route is dropped from the promoted set and its slow streak +// is cleared, so the next request runs INLINE again and is re-timed — both the +// routing decision (adaptivePromoted) and the per-request timing decision +// (adaptiveLearning) key off this method, so they flip back together. A route +// that is genuinely blocking re-promotes within adaptivePromoteStreak runs; a +// route that was promoted by a transient load/jitter spike (a CPU-bound chain +// whose inline wall-clock briefly crossed the threshold under contention) stays +// inline. Without this, a single spike pinned a route to the slower async path +// until process restart (the intermittent ~32% chain collapse). +// +// The clock is read only for routes actually in the promoted set, so learning +// and settled routes pay nothing here. +func (r *router) isPromoted(fullPath string) bool { + v, ok := r.promoted.Load(fullPath) + if !ok { + return false } + if nowNano()-v.(int64) > int64(adaptivePromoteTTL) { + // Expired: re-enter the learning/inline path to re-evaluate. + r.promoted.Delete(fullPath) + if sv, ok := r.slowStreak.Load(fullPath); ok { + sv.(*atomic.Int32).Store(0) + } + return false + } + return true +} + +// promoteRoute marks an adaptive route as async after sustained blocking inline +// runs, stamped with the promotion time so isPromoted can expire it +// (celeris#364). Re-promotion refreshes the stamp. +func (r *router) promoteRoute(fullPath string) { + r.promoted.Store(fullPath, nowNano()) +} + +// promoteRouteImmediate promotes an adaptive route after a single +// unambiguously-blocking inline run (> adaptiveBlockingThreshold), skipping the +// adaptivePromoteStreak hysteresis. The fast streak is cleared so a burst of +// prior fast runs cannot settle the route once the promotion TTL expires and it +// re-enters the learning path. +func (r *router) promoteRouteImmediate(fullPath string) { + if fv, ok := r.fastStreak.Load(fullPath); ok { + fv.(*atomic.Int32).Store(0) + } + r.promoteRoute(fullPath) +} + +// recordInlineRun feeds one inline-run observation into the adaptive +// classifier (celeris#356). A fast run resets the slow streak; a slow run +// increments it, and once a route is slow on adaptivePromoteStreak CONSECUTIVE +// runs it is promoted to async. The consecutive requirement makes a single cold +// start / GC pause harmless while a genuinely-blocking handler (slow on every +// run) promotes within a handful of requests. +func (r *router) recordInlineRun(fullPath string, slow bool) { + if !slow { + if v, ok := r.slowStreak.Load(fullPath); ok { + v.(*atomic.Int32).Store(0) + } + // celeris#361: a route fast on adaptiveSettleStreak CONSECUTIVE runs is + // provably non-blocking — settle it so adaptiveLearning short-circuits + // and handler.go stops timing every request forever. + fv, _ := r.fastStreak.LoadOrStore(fullPath, new(atomic.Int32)) + if fv.(*atomic.Int32).Add(1) >= adaptiveSettleStreak { + r.settled.Store(fullPath, struct{}{}) + } + return + } + if v, ok := r.fastStreak.Load(fullPath); ok { + v.(*atomic.Int32).Store(0) + } + v, _ := r.slowStreak.LoadOrStore(fullPath, new(atomic.Int32)) + if v.(*atomic.Int32).Add(1) >= adaptivePromoteStreak { + r.promoteRoute(fullPath) + } +} + +// adaptiveLearning reports whether an adaptive route is still being observed — +// i.e. neither settled (proven non-blocking, celeris#361) nor promoted (proven +// blocking, celeris#356). Only learning routes pay the per-request inline +// timing in handler.go; once decided, the hot path skips it. Steady state is a +// single sync.Map.Load (settled), the same lookup cost the prior isPromoted +// check carried, but without the two time.Now() vDSO calls per request. +func (r *router) adaptiveLearning(fullPath string) bool { + if _, ok := r.settled.Load(fullPath); ok { + return false + } + return !r.isPromoted(fullPath) } // addRoute registers a route inheriting the server-level async default. @@ -253,6 +416,15 @@ func (r *router) addRouteWithAsync(method, path string, handlers []HandlerFunc, validatePath(path) async := r.resolveAsync(as) + // celeris#356: a route that inherits the server-level AsyncHandlers=true + // default (rather than an explicit .Async()) starts INLINE and is promoted + // to async only when an inline run is observed to block. Explicit + // .Async(true)/.Async(false) is honored verbatim. + adaptive := as == asyncDefault && r.defaultAsync + if adaptive { + async = false + r.adaptiveRoutes[path] = true + } root := r.getTree(method) if root == nil { @@ -271,7 +443,7 @@ func (r *router) addRouteWithAsync(method, path string, handlers []HandlerFunc, root.async = async route.node = root r.setStaticEntry(method, "/", staticEntry{handlers: handlers, fullPath: "/", async: async}) - if async { + if async || adaptive { r.asyncRouteCount++ } return route @@ -296,7 +468,7 @@ func (r *router) addRouteWithAsync(method, path string, handlers []HandlerFunc, r.setStaticEntry(method, path, staticEntry{handlers: handlers, fullPath: path, async: async}) } - if async { + if async || adaptive { r.asyncRouteCount++ } @@ -321,19 +493,28 @@ func (r *router) routeAsync(method, path string) bool { if idx >= 0 { if m := r.staticRoutes[idx]; m != nil { if e, ok := m[path]; ok { - return e.async + return e.async || r.adaptivePromoted(e.fullPath) } } } else if r.customStatic != nil { if m := r.customStatic[method]; m != nil { if e, ok := m[path]; ok { - return e.async + return e.async || r.adaptivePromoted(e.fullPath) } } } var params Params - _, _, async := r.find(method, path, ¶ms) - return async + _, fullPath, async := r.find(method, path, ¶ms) + return async || r.adaptivePromoted(fullPath) +} + +// adaptivePromoted reports whether an adaptive (server-default-async) route has +// been promoted to async dispatch after a blocking inline run (celeris#356). +// The adaptiveRoutes map is read-only while serving, so the lookup is a plain +// (lock-free) map read followed by a sync.Map load only for adaptive routes; +// non-adaptive configs (no AsyncHandlers default) keep the empty-map fast path. +func (r *router) adaptivePromoted(fullPath string) bool { + return r.adaptiveRoutes[fullPath] && r.isPromoted(fullPath) } // warnDuplicateRoute emits a warning when a route is registered twice for diff --git a/router_async_test.go b/router_async_test.go index 87e03bd4..0185159a 100644 --- a/router_async_test.go +++ b/router_async_test.go @@ -21,16 +21,27 @@ func TestRouteAsync_ServerDefaultSync(t *testing.T) { } } -// TestRouteAsync_ServerDefaultAsync verifies routes inherit an async -// server default (Config.AsyncHandlers=true) when not overridden. -func TestRouteAsync_ServerDefaultAsync(t *testing.T) { +// TestRouteAsync_ServerDefaultAdaptive verifies the celeris#356 contract: a +// route that inherits the AsyncHandlers=true default (not an explicit .Async()) +// is ADAPTIVE — it starts INLINE (routeAsync=false, the ring-batched fast path) +// and is promoted to async only after a blocking inline run. hasAsyncRoutes +// stays true because adaptive routes may promote and need the async infra. +func TestRouteAsync_ServerDefaultAdaptive(t *testing.T) { s := New(Config{AsyncHandlers: true}) s.GET("/ping", noopHandler) - if !s.router.routeAsync("GET", "/ping") { - t.Fatal("route should inherit async server default") + if s.router.routeAsync("GET", "/ping") { + t.Fatal("adaptive route should start INLINE (not async) until it blocks") + } + if !s.router.adaptiveRoutes["/ping"] { + t.Fatal("/ping should be registered adaptive under AsyncHandlers=true") } if !s.router.hasAsyncRoutes() { - t.Fatal("hasAsyncRoutes should be true when default is async") + t.Fatal("hasAsyncRoutes should be true (adaptive routes may promote)") + } + // A blocking inline run promotes the route; it then resolves async. + s.router.promoteRoute("/ping") + if !s.router.routeAsync("GET", "/ping") { + t.Fatal("after promotion the adaptive route should resolve async") } } @@ -51,18 +62,47 @@ func TestRouteAsync_RouteOverrideOn(t *testing.T) { } } -// TestRouteAsync_RouteOverrideOff forces a single route sync on an -// async-default server. +// TestRouteAsync_RouteOverrideOff verifies that on an async-default server, an +// inherited route is adaptive (inline-first, celeris#356) while an explicit +// .Async(false) route is hard-sync and never adaptive. func TestRouteAsync_RouteOverrideOff(t *testing.T) { s := New(Config{AsyncHandlers: true}) - s.GET("/db", noopHandler) - s.GET("/cached", noopHandler).Async(false) - if !s.router.routeAsync("GET", "/db") { - t.Fatal("/db should inherit async default") + s.GET("/db", noopHandler) // inherits default → adaptive + s.GET("/cached", noopHandler).Async(false) // explicit sync → never adaptive + if s.router.routeAsync("GET", "/db") { + t.Fatal("/db inherits the async default → adaptive, starts inline") + } + if !s.router.adaptiveRoutes["/db"] { + t.Fatal("/db should be adaptive") } if s.router.routeAsync("GET", "/cached") { t.Fatal("/cached should be forced sync via Async(false)") } + if s.router.adaptiveRoutes["/cached"] { + t.Fatal("/cached is explicitly sync → must not be adaptive") + } +} + +// TestRouteAsync_ExplicitAsyncOptsOutOfAdaptive is the celeris#356 +// no-regression guard for blocking handlers. A route explicitly marked +// .Async() on an async-default server (exactly how the probatorium driver +// routes — /cache, /db, /mc — register) must be hard-async, NOT adaptive: +// it must never enter the inline-first window, so a blocking handler never +// runs inline and stalls a worker. This is what makes #356 safe to ship +// without a driver-backend regression: trivial routes inherit (adaptive → +// inline win) while explicitly-async blocking routes stay always-async. +func TestRouteAsync_ExplicitAsyncOptsOutOfAdaptive(t *testing.T) { + s := New(Config{AsyncHandlers: true}) + s.GET("/cache/:key", noopHandler).Async() // blocking driver route + if s.router.adaptiveRoutes["/cache/:key"] { + t.Fatal("explicit .Async() route must be removed from the adaptive set") + } + if _, promoted := s.router.promoted.Load("/cache/:key"); promoted { + t.Fatal("explicit .Async() route must carry no promotion state") + } + if !s.router.routeAsync("GET", "/cache/42") { + t.Fatal("explicit .Async() route must resolve hard-async (never inline)") + } } // TestRouteAsync_GroupInherit verifies group-level Async applies to its @@ -170,3 +210,177 @@ func TestRouteAsync_ResolverInterface(t *testing.T) { t.Fatal("adapter RouteAsync should report unmatched sync") } } + +// TestRouteAsync_AdaptiveHysteresis verifies celeris#356 promotion hysteresis: +// a single slow inline run (cold start / GC) must NOT promote a route, but +// adaptivePromoteStreak consecutive slow runs (a genuinely-blocking handler) +// must. A fast run resets the streak. +func TestRouteAsync_AdaptiveHysteresis(t *testing.T) { + s := New(Config{AsyncHandlers: true}) + s.GET("/h", noopHandler) + rt := s.router + // One slow outlier, then a fast run → no promotion. + rt.recordInlineRun("/h", true) + rt.recordInlineRun("/h", false) + if rt.routeAsync("GET", "/h") { + t.Fatal("a single slow run must not promote (cold-start hysteresis)") + } + // Sustained slowness (blocking handler) → promotion after the streak. + for i := 0; i < adaptivePromoteStreak; i++ { + rt.recordInlineRun("/h", true) + } + if !rt.routeAsync("GET", "/h") { + t.Fatalf("%d consecutive slow runs must promote to async", adaptivePromoteStreak) + } +} + +// TestRouteAsync_AdaptiveSettles verifies celeris#361: a route fast on +// adaptiveSettleStreak CONSECUTIVE inline runs SETTLES (leaves the timed +// learning path so the hot loop stops paying two time.Now() per request), and a +// slow run before the streak completes resets it. A settled route still runs +// inline (not async). +func TestRouteAsync_AdaptiveSettles(t *testing.T) { + s := New(Config{AsyncHandlers: true}) + route := s.GET("/s", noopHandler) + rt := s.router + + // One short of the streak → still learning. + for i := 0; i < adaptiveSettleStreak-1; i++ { + rt.recordInlineRun("/s", false) + } + if !rt.adaptiveLearning("/s") { + t.Fatal("route must still be learning before the settle streak completes") + } + + // A slow run resets the fast streak; the next near-full run must NOT settle. + rt.recordInlineRun("/s", true) + for i := 0; i < adaptiveSettleStreak-1; i++ { + rt.recordInlineRun("/s", false) + } + if !rt.adaptiveLearning("/s") { + t.Fatal("a slow run must reset the fast streak — route not settled yet") + } + + // One more fast run completes the streak → settle. + rt.recordInlineRun("/s", false) + if rt.adaptiveLearning("/s") { + t.Fatalf("route must settle after %d consecutive fast runs", adaptiveSettleStreak) + } + if rt.routeAsync("GET", "/s") { + t.Fatal("a settled route runs INLINE, not async") + } + + // An explicit override clears the settled state (setAsync). + route.Sync() + if _, settled := rt.settled.Load("/s"); settled { + t.Fatal("explicit .Sync() must clear the settled state") + } +} + +// stubNowNano installs a controllable clock for the adaptive promotion TTL +// (celeris#364) and returns a pointer to advance it plus a restore func. +func stubNowNano() (clock *int64, restore func()) { + old := nowNano + var c int64 + nowNano = func() int64 { return c } + return &c, func() { nowNano = old } +} + +// TestRouteAsync_PromotionExpires verifies celeris#364: promotion is reversible. +// A route promoted by a transient spike must de-promote after adaptivePromoteTTL +// and run inline again, and the de-promotion must reset the slow streak so a +// single later slow run does not immediately re-promote. +func TestRouteAsync_PromotionExpires(t *testing.T) { + clock, restore := stubNowNano() + defer restore() + + s := New(Config{AsyncHandlers: true}) + s.GET("/d", noopHandler) + rt := s.router + + for i := 0; i < adaptivePromoteStreak; i++ { + rt.recordInlineRun("/d", true) + } + if !rt.routeAsync("GET", "/d") { + t.Fatal("expected promotion after the slow streak") + } + + // Within the TTL → still promoted. + *clock += int64(adaptivePromoteTTL) - 1 + if !rt.routeAsync("GET", "/d") { + t.Fatal("promotion must persist within the TTL") + } + + // Past the TTL → de-promoted, route runs inline again. + *clock += 2 + if rt.routeAsync("GET", "/d") { + t.Fatal("promotion must expire after the TTL (route runs inline again)") + } + if _, ok := rt.promoted.Load("/d"); ok { + t.Fatal("expired promotion must be removed from the promoted set") + } + + // The slow streak was reset: one slow run must NOT immediately re-promote. + rt.recordInlineRun("/d", true) + if rt.routeAsync("GET", "/d") { + t.Fatal("de-promotion must reset the slow streak (one slow run must not re-promote)") + } +} + +// TestRouteAsync_DePromotedRouteCanSettle verifies that after a falsely-promoted +// route de-promotes, sustained fast runs SETTLE it (proven non-blocking, inline +// forever) — i.e. re-evaluation works end to end. +func TestRouteAsync_DePromotedRouteCanSettle(t *testing.T) { + clock, restore := stubNowNano() + defer restore() + + s := New(Config{AsyncHandlers: true}) + s.GET("/d", noopHandler) + rt := s.router + + for i := 0; i < adaptivePromoteStreak; i++ { + rt.recordInlineRun("/d", true) + } + *clock += int64(adaptivePromoteTTL) + 1 + if rt.routeAsync("GET", "/d") { + t.Fatal("expected de-promotion at expiry") + } + + for i := 0; i < adaptiveSettleStreak; i++ { + rt.recordInlineRun("/d", false) + } + if rt.adaptiveLearning("/d") { + t.Fatal("a de-promoted route that runs fast must settle (inline forever)") + } + if rt.routeAsync("GET", "/d") { + t.Fatal("a settled route runs inline, not async") + } +} + +// TestRouteAsync_RePromotesAfterExpiryWhenBlocking verifies a genuinely-blocking +// adaptive route re-promotes after the TTL re-evaluation — de-promotion must not +// pin a blocking handler to the inline path. +func TestRouteAsync_RePromotesAfterExpiryWhenBlocking(t *testing.T) { + clock, restore := stubNowNano() + defer restore() + + s := New(Config{AsyncHandlers: true}) + s.GET("/b", noopHandler) + rt := s.router + + for i := 0; i < adaptivePromoteStreak; i++ { + rt.recordInlineRun("/b", true) + } + *clock += int64(adaptivePromoteTTL) + 1 + if rt.routeAsync("GET", "/b") { + t.Fatal("expected de-promotion at expiry") + } + + // Still blocking → re-promotes after a fresh streak. + for i := 0; i < adaptivePromoteStreak; i++ { + rt.recordInlineRun("/b", true) + } + if !rt.routeAsync("GET", "/b") { + t.Fatal("a still-blocking route must re-promote after expiry") + } +} diff --git a/server.go b/server.go index 6cf14e09..515f1e99 100644 --- a/server.go +++ b/server.go @@ -21,7 +21,7 @@ import ( ) // Version is the semantic version of the celeris module. -const Version = "1.5.0" +const Version = "1.5.3" // ErrAlreadyStarted is returned when Start or StartWithContext is called on a // server that is already running. @@ -476,7 +476,16 @@ func (s *Server) EventLoopProvider() engine.EventLoopProvider { // (their net.TCPConn goroutines keep running regardless of which // sub-engine is active). func (s *Server) AsyncHandlers() bool { - if !s.config.AsyncHandlers { + // Report the EFFECTIVE async state, not the raw Config flag: the server + // stands up async-dispatch infrastructure when the server-level default is + // set OR any route opted in via .Async() (doPrepare flips the engine cfg the + // same way via hasAsyncRoutes). WithEngine drivers consult this to pick their + // netpoll-park fast path; keying on the raw config alone put a driver called + // from a per-route-.Async() handler on the slower mini-loop path — the + // documented footgun. NOTE: hasAsyncRoutes reflects routes registered so far, + // so open WithEngine drivers AFTER registering .Async() routes (or set + // Config.AsyncHandlers=true) to be order-independent. + if !s.config.AsyncHandlers && !s.router.hasAsyncRoutes() { return false } switch s.config.Engine { diff --git a/server_test.go b/server_test.go index 73fa0538..aa98294e 100644 --- a/server_test.go +++ b/server_test.go @@ -11,7 +11,6 @@ import ( "github.com/goceleris/celeris/engine" "github.com/goceleris/celeris/protocol/h2/stream" - "github.com/goceleris/celeris/resource" ) func TestServerEngineInfo(t *testing.T) { @@ -1009,43 +1008,6 @@ func TestConfigZeroValuesNotMapped(t *testing.T) { } } -// TestConfigWorkerScalingDefaultOn pins the v1.4.6 default-on contract -// for issue #281: nil Config.WorkerScaling resolves to a non-nil, -// zero-value resource.WorkerScalingConfig — the scaler activates -// automatically with the data-validated defaults -// (Strategy=StartHigh, MinActive=max(2,NumCPU/2), -// TargetConnsPerWorker=20, Interval=250ms, ScaleUpStep=2, -// ScaleDownStep=1, ScaleDownHysteresis=1, ScaleDownIdleTicks=4). -// -// This brings the "just-works" public design intent (Adaptive engine, -// Auto protocol) into alignment for the third major default. -func TestConfigWorkerScalingDefaultOn(t *testing.T) { - cfg := Config{Addr: ":8080"} // no WorkerScaling field set - rc := cfg.toResourceConfig() - if rc.WorkerScaling == nil { - t.Fatal("nil Config.WorkerScaling did not resolve to a non-nil resource.WorkerScalingConfig — scaler default-on regressed") - } - if *rc.WorkerScaling != (resource.WorkerScalingConfig{}) { - t.Fatalf("expected zero-value WorkerScalingConfig (so scaler picks data-validated defaults), got %+v", *rc.WorkerScaling) - } -} - -// TestConfigWorkerScalingExplicitPreserved pins that user-supplied -// values are passed through verbatim: the default-on path only fires -// when the user did NOT configure the scaler. -func TestConfigWorkerScalingExplicitPreserved(t *testing.T) { - want := &resource.WorkerScalingConfig{ - Strategy: resource.ScalingStrategyStartLow, - MinActive: 7, - TargetConnsPerWorker: 99, - } - cfg := Config{Addr: ":8080", WorkerScaling: want} - rc := cfg.toResourceConfig() - if rc.WorkerScaling != want { - t.Fatalf("explicit WorkerScaling pointer was not preserved: got %p, want %p", rc.WorkerScaling, want) - } -} - // Phase 2.1 tests func TestRouteUseMiddleware(t *testing.T) { diff --git a/test/drivercmp/memcached/go.mod b/test/drivercmp/memcached/go.mod index 144cb857..df597ee0 100644 --- a/test/drivercmp/memcached/go.mod +++ b/test/drivercmp/memcached/go.mod @@ -3,7 +3,7 @@ module github.com/goceleris/celeris/test/drivercmp/memcached go 1.26.4 require ( - github.com/bradfitz/gomemcache v0.0.0-20250403215159-8d39553ac7cf + github.com/bradfitz/gomemcache v0.0.0-20260422231931-4d751bb6e37c github.com/goceleris/celeris v0.0.0 ) diff --git a/test/drivercmp/memcached/go.sum b/test/drivercmp/memcached/go.sum index 581e931c..0f9dd4b5 100644 --- a/test/drivercmp/memcached/go.sum +++ b/test/drivercmp/memcached/go.sum @@ -1,4 +1,4 @@ -github.com/bradfitz/gomemcache v0.0.0-20250403215159-8d39553ac7cf h1:TqhNAT4zKbTdLa62d2HDBFdvgSbIGB3eJE8HqhgiL9I= -github.com/bradfitz/gomemcache v0.0.0-20250403215159-8d39553ac7cf/go.mod h1:r5xuitiExdLAJ09PR7vBVENGvp4ZuTBeWTGtxuX3K+c= +github.com/bradfitz/gomemcache v0.0.0-20260422231931-4d751bb6e37c h1:6Gpm9YYUEQx2T9zMsYolQhr6sjwwGtFitSA0pQsa7a8= +github.com/bradfitz/gomemcache v0.0.0-20260422231931-4d751bb6e37c/go.mod h1:r5xuitiExdLAJ09PR7vBVENGvp4ZuTBeWTGtxuX3K+c= golang.org/x/sys v0.46.0 h1:noSf2Fq6F8DBgS+LysIkx7rIExoNHJsxOAtPp4rthXw= golang.org/x/sys v0.46.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= diff --git a/test/perfmatrix/go.mod b/test/perfmatrix/go.mod index 954c9d03..2caad41f 100644 --- a/test/perfmatrix/go.mod +++ b/test/perfmatrix/go.mod @@ -4,12 +4,12 @@ go 1.26.4 require ( github.com/HdrHistogram/hdrhistogram-go v1.2.0 - github.com/bradfitz/gomemcache v0.0.0-20250403215159-8d39553ac7cf + github.com/bradfitz/gomemcache v0.0.0-20260422231931-4d751bb6e37c github.com/cloudwego/hertz v0.10.5 github.com/gin-gonic/gin v1.12.0 github.com/go-chi/chi/v5 v5.3.0 github.com/goceleris/celeris v1.4.15 - github.com/goceleris/loadgen v1.4.8 + github.com/goceleris/loadgen v1.4.9 github.com/gofiber/fiber/v3 v3.3.0 github.com/google/uuid v1.6.0 github.com/gorilla/sessions v1.4.0 diff --git a/test/perfmatrix/go.sum b/test/perfmatrix/go.sum index efbc701a..b04f85db 100644 --- a/test/perfmatrix/go.sum +++ b/test/perfmatrix/go.sum @@ -18,8 +18,8 @@ github.com/andybalholm/brotli v1.2.1 h1:R+f5xP285VArJDRgowrfb9DqL18yVK0gKAW/F+eT github.com/andybalholm/brotli v1.2.1/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY= github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk= github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4= -github.com/bradfitz/gomemcache v0.0.0-20250403215159-8d39553ac7cf h1:TqhNAT4zKbTdLa62d2HDBFdvgSbIGB3eJE8HqhgiL9I= -github.com/bradfitz/gomemcache v0.0.0-20250403215159-8d39553ac7cf/go.mod h1:r5xuitiExdLAJ09PR7vBVENGvp4ZuTBeWTGtxuX3K+c= +github.com/bradfitz/gomemcache v0.0.0-20260422231931-4d751bb6e37c h1:6Gpm9YYUEQx2T9zMsYolQhr6sjwwGtFitSA0pQsa7a8= +github.com/bradfitz/gomemcache v0.0.0-20260422231931-4d751bb6e37c/go.mod h1:r5xuitiExdLAJ09PR7vBVENGvp4ZuTBeWTGtxuX3K+c= github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= @@ -76,8 +76,8 @@ github.com/goccy/go-json v0.10.5 h1:Fq85nIqj+gXn/S5ahsiTlK3TmC85qgirsdTP/+DeaC4= github.com/goccy/go-json v0.10.5/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M= github.com/goccy/go-yaml v1.19.2 h1:PmFC1S6h8ljIz6gMRBopkjP1TVT7xuwrButHID66PoM= github.com/goccy/go-yaml v1.19.2/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= -github.com/goceleris/loadgen v1.4.8 h1:r162NrVoLxuCQ3IlhWp+nqAgeFikHImlUSagJbP98m8= -github.com/goceleris/loadgen v1.4.8/go.mod h1:BtjUHc0ULnqa2LsSoJNzDdBt05xUx5jajeF6XnJfFJA= +github.com/goceleris/loadgen v1.4.9 h1:Kd/AmLHP520Su3azQ9tCNoc6tsaeEf7Nx8ECr4AdYfg= +github.com/goceleris/loadgen v1.4.9/go.mod h1:Olg2awQufUnRemRlCvFPFL6Ww3byUd+UvZYQAMJm6Co= github.com/gofiber/fiber/v3 v3.3.0 h1:QBd3sYCqdy6Qs5gJYzSw4I4SbqL204jPqpdub/ueiw8= github.com/gofiber/fiber/v3 v3.3.0/go.mod h1:YH7/TAoRaU4kF8slDCtQuFJ1NzC+3MtxUI4KfvQtaIA= github.com/gofiber/schema v1.7.1 h1:oSJBKdgP8JeIME4TQSAqlNKTU2iBB+2RNmKi8Nsc+TI=