From 8b176c2b6799fd4a44350d8d4209d5178c969459 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Tue, 28 Apr 2026 17:26:40 -0700 Subject: [PATCH 1/3] test(uffd): add deterministic stale-source / madvise-deadlock / faulted-short-circuit race tests Three race tests built on the unix-socket RPC harness and the test-only fault-barrier hooks. None use sleeps, retries, or soak loops - each test installs explicit barriers on the child's worker goroutine, drives the racing kernel operation from the parent, and asserts on a concrete post-state. - TestStaleSourceRaceMissingAndRemove: regression test for the stale-source bug. Plants a non-zero sentinel into the source page, parks the worker via barrierBeforeRLock, fires madvise, waits for the REMOVE batch to commit, releases the worker, then asserts the page is zero-filled. INTENTIONALLY FAILS on this PR with `page 1 first byte: want 0 ... got 0xc3` - the worker captured `source = u.src` in the parent loop before the REMOVE landed and UFFDIO_COPY'd the planted sentinel into the page after the kernel had MADV_DONTNEED'd it. PR #4 (#2512) makes this pass by re-reading state inside the worker under settleRequests.RLock. - TestNoMadviseDeadlockWithInflightCopy: liveness regression test. Parks the worker via barrierBeforeFaultPage (holding RLock), fires madvise, asserts madvise returns within 2s. Passes today; protects against any future change that accidentally couples readEvents to settleRequests. - TestFaultedShortCircuitOrdering: smoke test on the REMOVE-then- pagefault batch ordering using the gated harness. Pins the invariant that REMOVE batches drain before pagefault dispatch in a single Serve iteration. Test infrastructure additions: - testHandler.installFaultBarrier / waitFaultHeld / releaseFault convenience wrappers around the Service.* RPCs from PR #1. - testConfig.sourcePatcher hook so race tests can plant a deterministic sentinel into the random source data BEFORE the content file is written, without depending on the happenstance value of any randomly-generated byte. ALL OTHER TESTS in the package still pass on this PR; only the three sub-tests of TestStaleSourceRaceMissingAndRemove fail (the bug demonstration). --- .../uffd/userfaultfd/harness_parent_test.go | 4 + .../sandbox/uffd/userfaultfd/helpers_test.go | 33 ++ .../pkg/sandbox/uffd/userfaultfd/race_test.go | 398 ++++++++++++++++++ 3 files changed, 435 insertions(+) create mode 100644 packages/orchestrator/pkg/sandbox/uffd/userfaultfd/race_test.go diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go index 47f62b4fb3..77b8c357a6 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/harness_parent_test.go @@ -64,6 +64,10 @@ func configureCrossProcessTest(ctx context.Context, t *testing.T, tt testConfig) data := RandomPages(tt.pagesize, tt.numberOfPages) + if tt.sourcePatcher != nil { + tt.sourcePatcher(data.Content()) + } + size, err := data.Size() require.NoError(t, err) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go index fd753dac3c..095b22af58 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go @@ -80,6 +80,14 @@ type testConfig struct { gated bool // barriers enables the per-worker fault hook (race tests only). barriers bool + // sourcePatcher, if non-nil, is invoked on the random source data + // AFTER it's generated but BEFORE it's written to the on-disk + // content file the child reads. Race tests use it to plant a + // deterministic sentinel into the source so post-test assertions + // can distinguish "post-fix zero-fault" from "pre-fix UFFDIO_COPY + // of stale src bytes" without depending on randomly-generated + // byte values. + sourcePatcher func([]byte) } type operationMode uint32 @@ -158,6 +166,31 @@ func (h *testHandler) pageStates() (handlerPageStates, error) { return states, nil } +// installFaultBarrier parks the next worker that hits `phase` for +// `addr`. Returns a token to be passed to waitFaultHeld / releaseFault. +func (h *testHandler) installFaultBarrier(_ context.Context, addr uintptr, phase faultPhase) (uint64, error) { + return h.client.InstallBarrier(addr, testharness.Point(phase)) +} + +// waitFaultHeld blocks until the child reports a worker has reached the +// barrier identified by token, or ctx is cancelled. net/rpc's Call +// doesn't take a context, so we run it in a goroutine and race it. +func (h *testHandler) waitFaultHeld(ctx context.Context, token uint64) error { + errCh := make(chan error, 1) + go func() { errCh <- h.client.WaitFaultHeld(token) }() + select { + case err := <-errCh: + return err + case <-ctx.Done(): + return ctx.Err() + } +} + +// releaseFault releases a parked worker so it proceeds past the barrier. +func (h *testHandler) releaseFault(_ context.Context, token uint64) error { + return h.client.ReleaseFault(token) +} + func (h *testHandler) executeAll(t *testing.T, operations []operation) { t.Helper() diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/race_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/race_test.go new file mode 100644 index 0000000000..4605b96ec4 --- /dev/null +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/race_test.go @@ -0,0 +1,398 @@ +package userfaultfd + +import ( + "context" + "fmt" + "testing" + "time" + "unsafe" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "golang.org/x/sys/unix" + + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/uffd/testutils" + "github.com/e2b-dev/infra/packages/shared/pkg/storage/header" +) + +// raceHappyPathBudget bounds every race test in this file. The whole +// point of these tests is that they detect a regression as a fast, +// targeted assertion rather than as a CI -timeout 30m hang. None of +// these tests should approach this budget on a healthy build. +const raceHappyPathBudget = 5 * time.Second + +// barrierArrivalDeadline is how long the test will wait for a worker +// to reach an installed barrier. The hook fires the first thing in +// the worker goroutine, so on a healthy build it's a sub-millisecond +// rendezvous over the unix-socket RPC. Anything approaching this +// deadline means the handler dispatch is wedged. +const barrierArrivalDeadline = 2 * time.Second + +// madviseBudget is how long we allow MADV_DONTNEED to spend in the +// kernel after we've parked a worker mid-handler. The fix guarantees +// madvise unblocks as soon as the handler drains the REMOVE event +// from the uffd fd, regardless of any worker holding RLock — +// readEvents requires no lock. +const madviseBudget = 2 * time.Second + +// withRaceContext bounds a single race test to raceHappyPathBudget, +// failing with a clear "deadlock" message if the budget is exceeded. +func withRaceContext(t *testing.T, body func(ctx context.Context)) { + t.Helper() + + ctx, cancel := context.WithTimeout(t.Context(), raceHappyPathBudget) + defer cancel() + + done := make(chan struct{}) + go func() { + defer close(done) + body(ctx) + }() + + select { + case <-done: + case <-ctx.Done(): + t.Fatalf("race test exceeded happy-path budget of %s — handler is wedged", raceHappyPathBudget) + } +} + +// TestStaleSourceRaceMissingAndRemove is the deterministic regression +// test for the production fix in Serve(): +// +// - Pre-fix the parent serve loop captured `state == missing` and +// `source = u.src` BEFORE handing the work to a worker goroutine. +// A REMOVE event for the same page that arrived between then and +// the worker actually running would silently leave the worker +// with a stale `source = u.src` snapshot, which it would then +// UFFDIO_COPY into the page that the kernel had just unmapped. +// +// - Post-fix the worker reads pageTracker state INSIDE the +// goroutine, under settleRequests.RLock, atomically with the +// decision of which `source` to use. +// +// The test installs a barrierBeforeRLock on page X (so the worker +// for X parks before it can read state), triggers a MISSING-write +// fault on X from the parent, waits for the worker to park, fires +// MADV_DONTNEED on X (which can take settleRequests.Lock immediately +// — no worker holds RLock), and then releases the worker. After +// release the worker, post-fix, observes state=removed under RLock +// and zero-faults; pre-fix it would have UFFDIO_COPY'd the planted +// sentinel byte from u.src. A direct read of the page contents +// distinguishes the two outcomes deterministically. +func TestStaleSourceRaceMissingAndRemove(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + pagesize uint64 + }{ + {name: "4k", pagesize: header.PageSize}, + {name: "hugepage", pagesize: header.HugepageSize}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + withRaceContext(t, func(ctx context.Context) { + // Plant a deterministic, non-zero sentinel as the + // first byte of the source data for the page we'll + // race on. Pre-fix, the worker would UFFDIO_COPY this + // sentinel into the page after the REMOVE has already + // unmapped it. Post-fix the worker reads + // state == removed under RLock and zero-fills. + const sentinel = byte(0xC3) + const pageIdx = 1 + pageOffset := int64(pageIdx) * int64(tt.pagesize) + + cfg := testConfig{ + pagesize: tt.pagesize, + numberOfPages: 4, + barriers: true, + removeEnabled: true, + sourcePatcher: func(content []byte) { + content[pageOffset] = sentinel + }, + } + + h, err := configureCrossProcessTest(ctx, t, cfg) + require.NoError(t, err) + + memStart := uintptr(unsafe.Pointer(&(*h.memoryArea)[0])) + addr := memStart + uintptr(pageIdx)*uintptr(tt.pagesize) + + token, err := h.installFaultBarrier(ctx, addr, faultPhaseBeforeRLock) + require.NoError(t, err) + + // Trigger a READ fault (NOT a write — a write would + // overwrite the very byte we want to inspect to + // distinguish the two outcomes). h.executeRead does + // the touch + content check; we run it in a goroutine + // because it blocks on the fault until we release the + // barrier. + readErrCh := make(chan error, 1) + go func() { + readErrCh <- h.executeRead(ctx, operation{offset: pageOffset, mode: operationModeRead}) + }() + + // Wait for the worker for `addr` to park at the + // pre-RLock barrier. + waitCtx, waitCancel := context.WithTimeout(ctx, barrierArrivalDeadline) + err = h.waitFaultHeld(waitCtx, token) + waitCancel() + require.NoError(t, err, "worker for page %d (addr %#x) did not park at barrier", pageIdx, addr) + + // Fire MADV_DONTNEED on the same page from the + // parent. The serve loop can take Lock immediately + // because the parked worker has not yet acquired + // RLock. + err = h.executeRemove(operation{offset: pageOffset, mode: operationModeRemove}) + require.NoError(t, err, "MADV_DONTNEED on page %d did not return — handler dispatch wedged", pageIdx) + + // Wait for the handler to commit setState(removed). + // A tight poll loop with a hard deadline is used + // rather than a sleep — the transition is + // microseconds in the happy path. + require.NoError(t, waitForState(ctx, h, uint64(pageOffset), removed, barrierArrivalDeadline), + "handler did not transition page %d to `removed` after MADV_DONTNEED", pageIdx) + + // Release the parked worker. Post-fix it will + // observe state == removed and zero-fault; pre-fix + // it would proceed with the captured stale source. + require.NoError(t, h.releaseFault(ctx, token)) + + select { + case err := <-readErrCh: + // Pre-fix: executeRead's bytes.Equal succeeds + // (page contains src bytes), so err == nil but + // the page is observably wrong. Post-fix: + // bytes.Equal fails (page is zero-filled), so + // err != nil. We use the page-content assertion + // below instead of relying on this side-channel. + _ = err + case <-ctx.Done(): + t.Fatalf("read of page %d did not unblock after barrier release", pageIdx) + } + + // THE bug-detection assertion: post-fix the page + // MUST be zero-filled. Pre-fix the worker + // UFFDIO_COPY'd the planted sentinel. + page := (*h.memoryArea)[pageOffset : pageOffset+int64(tt.pagesize)] + assert.Equalf(t, byte(0), page[0], + "page %d first byte: want 0 (post-fix zero-fault for `removed` state), got %#x — "+ + "if this equals the sentinel %#x, the worker used a stale `source = u.src` snapshot (regression)", + pageIdx, page[0], sentinel, + ) + + // Sanity: verify with /proc/self/pagemap that the + // page is in fact present after the racing read was + // served (worker re-mapped it as zero). + pagemap, err := testutils.NewPagemapReader() + require.NoError(t, err) + defer pagemap.Close() + entry, err := pagemap.ReadEntry(addr) + require.NoError(t, err) + assert.True(t, entry.IsPresent(), "page %d should be present after the racing read", pageIdx) + }) + }) + } +} + +// TestNoMadviseDeadlockWithInflightCopy is a liveness regression test +// for the user-visible symptom that originally surfaced the stale- +// source race: the orchestrator's parent madvise(MADV_DONTNEED) +// blocking forever because the UFFD handler loop was wedged behind a +// worker. +// +// The harness parks the worker AFTER it has taken settleRequests.RLock +// AND captured `source` (i.e. as if its UFFDIO_COPY was in flight). +// From the parent we then issue MADV_DONTNEED on the same page and +// require that madvise returns within `madviseBudget`. madvise +// unblocks as soon as the handler's readEvents drains the REMOVE +// event, and readEvents requires no lock — so any future change that +// accidentally couples readEvents to settleRequests fails this test +// at the `madviseBudget` boundary instead of as a 30-minute CI +// timeout. +func TestNoMadviseDeadlockWithInflightCopy(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + pagesize uint64 + }{ + {name: "4k", pagesize: header.PageSize}, + {name: "hugepage", pagesize: header.HugepageSize}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + withRaceContext(t, func(ctx context.Context) { + cfg := testConfig{ + pagesize: tt.pagesize, + numberOfPages: 4, + barriers: true, + removeEnabled: true, + } + + h, err := configureCrossProcessTest(ctx, t, cfg) + require.NoError(t, err) + + const pageIdx = 2 + pageOffset := int64(pageIdx) * int64(tt.pagesize) + + memStart := uintptr(unsafe.Pointer(&(*h.memoryArea)[0])) + addr := memStart + uintptr(pageIdx)*uintptr(tt.pagesize) + + token, err := h.installFaultBarrier(ctx, addr, faultPhaseBeforeFaultPage) + require.NoError(t, err) + + writeErrCh := make(chan error, 1) + go func() { + writeErrCh <- h.executeWrite(ctx, operation{offset: pageOffset, mode: operationModeWrite}) + }() + + waitCtx, waitCancel := context.WithTimeout(ctx, barrierArrivalDeadline) + err = h.waitFaultHeld(waitCtx, token) + waitCancel() + require.NoError(t, err, "worker for page %d (addr %#x) did not park at pre-COPY barrier", pageIdx, addr) + + // Worker is parked AFTER RLock. Issue MADV_DONTNEED + // on the same page from the parent. The handler's + // readEvents must drain the REMOVE event (so madvise + // returns) even while the worker holds RLock. + madviseDone := make(chan error, 1) + go func() { + madviseDone <- unix.Madvise((*h.memoryArea)[pageOffset:pageOffset+int64(tt.pagesize)], unix.MADV_DONTNEED) + }() + + select { + case err := <-madviseDone: + require.NoError(t, err) + case <-time.After(madviseBudget): + _ = h.releaseFault(ctx, token) + <-writeErrCh + t.Fatalf("DEADLOCK: madvise(MADV_DONTNEED) on page %d did not return within %s "+ + "while a worker was parked holding settleRequests.RLock — readEvents must not require any lock", + pageIdx, madviseBudget) + } + + require.NoError(t, h.releaseFault(ctx, token)) + + select { + case err := <-writeErrCh: + require.NoError(t, err) + case <-ctx.Done(): + t.Fatalf("user-side write of page %d did not unblock after barrier release", pageIdx) + } + }) + }) + } +} + +// TestFaultedShortCircuitOrdering uses the gated harness to +// deterministically queue a WRITE pagefault for a fresh page AND a +// REMOVE for an already-faulted page in the SAME serve-loop +// iteration. After resume, the post-batch state is asserted: the +// REMOVE'd page is `removed` and the racing-write page is `faulted`. +// +// Both pre-fix and post-fix code reach the same end state for this +// scenario (REMOVE batch runs before the pagefault dispatch loop in +// every Serve iteration). This test guards the batch-processing +// invariant itself: any future change that, for example, dispatched +// pagefaults before draining REMOVEs would fail this test as a +// concrete state-mismatch assertion rather than a 30-minute hang. +// +//nolint:paralleltest,tparallel // serialised: a paused gated handler keeps a faulting goroutine suspended in the kernel pagefault path; a STW GC pause from another parallel test would wait forever for that goroutine to reach a safe point. +func TestFaultedShortCircuitOrdering(t *testing.T) { + tests := []struct { + name string + pagesize uint64 + }{ + {name: "4k", pagesize: header.PageSize}, + {name: "hugepage", pagesize: header.HugepageSize}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { //nolint:paralleltest // see test-level comment + withRaceContext(t, func(ctx context.Context) { + cfg := testConfig{ + pagesize: tt.pagesize, + numberOfPages: 2, + gated: true, + removeEnabled: true, + operations: []operation{ + {offset: 0, mode: operationModeRead}, + {mode: operationModeServePause}, + {offset: 0, mode: operationModeRemove, async: true}, + {mode: operationModeSleep}, + {offset: int64(tt.pagesize), mode: operationModeWrite, async: true}, + {mode: operationModeSleep}, + {mode: operationModeServeResume}, + }, + } + + h, err := configureCrossProcessTest(ctx, t, cfg) + require.NoError(t, err) + + h.executeAll(t, cfg.operations) //nolint:contextcheck // executeAll uses t.Context() per-op for the bounded race wrapper above + + states, err := h.pageStates() + require.NoError(t, err) + + assert.Contains(t, states.removed, uint(0), + "page 0 should be `removed` after REMOVE batch (got removed=%v faulted=%v)", + states.removed, states.faulted, + ) + assert.Contains(t, states.faulted, uint(tt.pagesize), + "page 1 (offset %d) should be `faulted` after the racing write was served (got removed=%v faulted=%v)", + tt.pagesize, states.removed, states.faulted, + ) + }) + }) + } +} + +// waitForState polls the child's PageStates RPC until the page at +// the given offset reaches `want` or `deadline` elapses. Each RPC +// round-trip is microseconds-to-low-milliseconds; we yield with a +// small sleep between polls so the harness doesn't burn an entire +// CPU on tight-loop encoding while the rest of the suite is also +// running cross-process tests. +func waitForState(ctx context.Context, h *testHandler, offset uint64, want pageState, deadline time.Duration) error { + const pollInterval = 1 * time.Millisecond + + end := time.Now().Add(deadline) + for { + states, err := h.pageStates() + if err != nil { + return err + } + + var bucket []uint + switch want { + case removed: + bucket = states.removed + case faulted: + bucket = states.faulted + } + + for _, off := range bucket { + if uint64(off) == offset { + return nil + } + } + + if time.Now().After(end) { + return fmt.Errorf("page state at offset %d: want %d after %s — last seen removed=%v faulted=%v", + offset, want, deadline, states.removed, states.faulted) + } + + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(pollInterval): + } + } +} From 3047e695c73e1145bb4e9ef203ed4bd6c93cd490 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Sat, 2 May 2026 03:19:17 -0700 Subject: [PATCH 2/3] chore(uffd): address review feedback and trim comments - waitForState: add default case to avoid silent busy-poll on unrecognised pageState values. - TestFaultedShortCircuitOrdering: rewrite docstring to accurately describe coverage (disjoint-page end-state check, not an ordering invariant guard; same-page ordering is covered by TestStaleSource...). - TestStaleSourceRaceMissingAndRemove: fix "MISSING-write fault" stale docstring to "MISSING (READ) fault", note both variants fail until #2512. - Trim verbose multi-line constant and helper comments down to load-bearing WHY. --- .../sandbox/uffd/userfaultfd/helpers_test.go | 13 +- .../pkg/sandbox/uffd/userfaultfd/race_test.go | 166 +++++------------- 2 files changed, 48 insertions(+), 131 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go index 095b22af58..36177171ab 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/helpers_test.go @@ -80,13 +80,9 @@ type testConfig struct { gated bool // barriers enables the per-worker fault hook (race tests only). barriers bool - // sourcePatcher, if non-nil, is invoked on the random source data - // AFTER it's generated but BEFORE it's written to the on-disk - // content file the child reads. Race tests use it to plant a - // deterministic sentinel into the source so post-test assertions - // can distinguish "post-fix zero-fault" from "pre-fix UFFDIO_COPY - // of stale src bytes" without depending on randomly-generated - // byte values. + // sourcePatcher is invoked on the random source data after it's + // generated but before it's written to the content file the child + // reads. Race tests use it to plant a deterministic sentinel. sourcePatcher func([]byte) } @@ -166,8 +162,6 @@ func (h *testHandler) pageStates() (handlerPageStates, error) { return states, nil } -// installFaultBarrier parks the next worker that hits `phase` for -// `addr`. Returns a token to be passed to waitFaultHeld / releaseFault. func (h *testHandler) installFaultBarrier(_ context.Context, addr uintptr, phase faultPhase) (uint64, error) { return h.client.InstallBarrier(addr, testharness.Point(phase)) } @@ -186,7 +180,6 @@ func (h *testHandler) waitFaultHeld(ctx context.Context, token uint64) error { } } -// releaseFault releases a parked worker so it proceeds past the barrier. func (h *testHandler) releaseFault(_ context.Context, token uint64) error { return h.client.ReleaseFault(token) } diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/race_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/race_test.go index 4605b96ec4..ebac540d0d 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/race_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/race_test.go @@ -15,28 +15,16 @@ import ( "github.com/e2b-dev/infra/packages/shared/pkg/storage/header" ) -// raceHappyPathBudget bounds every race test in this file. The whole -// point of these tests is that they detect a regression as a fast, -// targeted assertion rather than as a CI -timeout 30m hang. None of -// these tests should approach this budget on a healthy build. -const raceHappyPathBudget = 5 * time.Second - -// barrierArrivalDeadline is how long the test will wait for a worker -// to reach an installed barrier. The hook fires the first thing in -// the worker goroutine, so on a healthy build it's a sub-millisecond -// rendezvous over the unix-socket RPC. Anything approaching this -// deadline means the handler dispatch is wedged. -const barrierArrivalDeadline = 2 * time.Second - -// madviseBudget is how long we allow MADV_DONTNEED to spend in the -// kernel after we've parked a worker mid-handler. The fix guarantees -// madvise unblocks as soon as the handler drains the REMOVE event -// from the uffd fd, regardless of any worker holding RLock — -// readEvents requires no lock. -const madviseBudget = 2 * time.Second - -// withRaceContext bounds a single race test to raceHappyPathBudget, -// failing with a clear "deadlock" message if the budget is exceeded. +// Bounded budgets so a regression surfaces as a fast assertion, not a +// 30-minute CI hang. madviseBudget is the load-bearing one: madvise must +// return as soon as the handler drains the REMOVE event, which requires +// no lock — coupling readEvents to settleRequests would push us past it. +const ( + raceHappyPathBudget = 5 * time.Second + barrierArrivalDeadline = 2 * time.Second + madviseBudget = 2 * time.Second +) + func withRaceContext(t *testing.T, body func(ctx context.Context)) { t.Helper() @@ -56,29 +44,17 @@ func withRaceContext(t *testing.T, body func(ctx context.Context)) { } } -// TestStaleSourceRaceMissingAndRemove is the deterministic regression -// test for the production fix in Serve(): -// -// - Pre-fix the parent serve loop captured `state == missing` and -// `source = u.src` BEFORE handing the work to a worker goroutine. -// A REMOVE event for the same page that arrived between then and -// the worker actually running would silently leave the worker -// with a stale `source = u.src` snapshot, which it would then -// UFFDIO_COPY into the page that the kernel had just unmapped. +// TestStaleSourceRaceMissingAndRemove deterministically reproduces the +// stale-source race: a worker that captured state == missing in the +// parent loop must not UFFDIO_COPY u.src after a concurrent REMOVE has +// transitioned the page to removed. The test plants a non-zero +// sentinel into source data, parks the worker at faultPhaseBeforeRLock, +// fires MADV_DONTNEED on the same page, releases the worker, and +// asserts the resulting page is zero-filled (regression: page[0] +// equals the sentinel). // -// - Post-fix the worker reads pageTracker state INSIDE the -// goroutine, under settleRequests.RLock, atomically with the -// decision of which `source` to use. -// -// The test installs a barrierBeforeRLock on page X (so the worker -// for X parks before it can read state), triggers a MISSING-write -// fault on X from the parent, waits for the worker to park, fires -// MADV_DONTNEED on X (which can take settleRequests.Lock immediately -// — no worker holds RLock), and then releases the worker. After -// release the worker, post-fix, observes state=removed under RLock -// and zero-faults; pre-fix it would have UFFDIO_COPY'd the planted -// sentinel byte from u.src. A direct read of the page contents -// distinguishes the two outcomes deterministically. +// Both variants fail until the fix in #2512 lands — the failure is +// intentional and demonstrates the stale-source bug. func TestStaleSourceRaceMissingAndRemove(t *testing.T) { t.Parallel() @@ -95,12 +71,6 @@ func TestStaleSourceRaceMissingAndRemove(t *testing.T) { t.Parallel() withRaceContext(t, func(ctx context.Context) { - // Plant a deterministic, non-zero sentinel as the - // first byte of the source data for the page we'll - // race on. Pre-fix, the worker would UFFDIO_COPY this - // sentinel into the page after the REMOVE has already - // unmapped it. Post-fix the worker reads - // state == removed under RLock and zero-fills. const sentinel = byte(0xC3) const pageIdx = 1 pageOffset := int64(pageIdx) * int64(tt.pagesize) @@ -124,69 +94,43 @@ func TestStaleSourceRaceMissingAndRemove(t *testing.T) { token, err := h.installFaultBarrier(ctx, addr, faultPhaseBeforeRLock) require.NoError(t, err) - // Trigger a READ fault (NOT a write — a write would - // overwrite the very byte we want to inspect to - // distinguish the two outcomes). h.executeRead does - // the touch + content check; we run it in a goroutine - // because it blocks on the fault until we release the - // barrier. + // READ, not write: a write would overwrite the byte + // we read below to distinguish the two outcomes. readErrCh := make(chan error, 1) go func() { readErrCh <- h.executeRead(ctx, operation{offset: pageOffset, mode: operationModeRead}) }() - // Wait for the worker for `addr` to park at the - // pre-RLock barrier. waitCtx, waitCancel := context.WithTimeout(ctx, barrierArrivalDeadline) err = h.waitFaultHeld(waitCtx, token) waitCancel() require.NoError(t, err, "worker for page %d (addr %#x) did not park at barrier", pageIdx, addr) - // Fire MADV_DONTNEED on the same page from the - // parent. The serve loop can take Lock immediately - // because the parked worker has not yet acquired - // RLock. err = h.executeRemove(operation{offset: pageOffset, mode: operationModeRemove}) require.NoError(t, err, "MADV_DONTNEED on page %d did not return — handler dispatch wedged", pageIdx) - // Wait for the handler to commit setState(removed). - // A tight poll loop with a hard deadline is used - // rather than a sleep — the transition is - // microseconds in the happy path. require.NoError(t, waitForState(ctx, h, uint64(pageOffset), removed, barrierArrivalDeadline), "handler did not transition page %d to `removed` after MADV_DONTNEED", pageIdx) - // Release the parked worker. Post-fix it will - // observe state == removed and zero-fault; pre-fix - // it would proceed with the captured stale source. require.NoError(t, h.releaseFault(ctx, token)) select { - case err := <-readErrCh: - // Pre-fix: executeRead's bytes.Equal succeeds - // (page contains src bytes), so err == nil but - // the page is observably wrong. Post-fix: - // bytes.Equal fails (page is zero-filled), so - // err != nil. We use the page-content assertion - // below instead of relying on this side-channel. - _ = err + case <-readErrCh: + // Pre-fix the read sees src bytes (err == nil); post-fix + // it sees zeros (err != nil). The page-content assertion + // below is the bug-detection path; the read just + // completes the fault. case <-ctx.Done(): t.Fatalf("read of page %d did not unblock after barrier release", pageIdx) } - // THE bug-detection assertion: post-fix the page - // MUST be zero-filled. Pre-fix the worker - // UFFDIO_COPY'd the planted sentinel. page := (*h.memoryArea)[pageOffset : pageOffset+int64(tt.pagesize)] assert.Equalf(t, byte(0), page[0], - "page %d first byte: want 0 (post-fix zero-fault for `removed` state), got %#x — "+ - "if this equals the sentinel %#x, the worker used a stale `source = u.src` snapshot (regression)", + "page %d first byte: want 0 (zero-fault for `removed`), got %#x — "+ + "if this equals the sentinel %#x, the worker used a stale source (regression)", pageIdx, page[0], sentinel, ) - // Sanity: verify with /proc/self/pagemap that the - // page is in fact present after the racing read was - // served (worker re-mapped it as zero). pagemap, err := testutils.NewPagemapReader() require.NoError(t, err) defer pagemap.Close() @@ -198,21 +142,11 @@ func TestStaleSourceRaceMissingAndRemove(t *testing.T) { } } -// TestNoMadviseDeadlockWithInflightCopy is a liveness regression test -// for the user-visible symptom that originally surfaced the stale- -// source race: the orchestrator's parent madvise(MADV_DONTNEED) -// blocking forever because the UFFD handler loop was wedged behind a -// worker. -// -// The harness parks the worker AFTER it has taken settleRequests.RLock -// AND captured `source` (i.e. as if its UFFDIO_COPY was in flight). -// From the parent we then issue MADV_DONTNEED on the same page and -// require that madvise returns within `madviseBudget`. madvise -// unblocks as soon as the handler's readEvents drains the REMOVE -// event, and readEvents requires no lock — so any future change that -// accidentally couples readEvents to settleRequests fails this test -// at the `madviseBudget` boundary instead of as a 30-minute CI -// timeout. +// TestNoMadviseDeadlockWithInflightCopy is the liveness guard for +// MADV_DONTNEED while a worker holds settleRequests.RLock. madvise +// must return within madviseBudget because readEvents drains REMOVE +// events without taking any lock — any future change that couples +// readEvents to settleRequests fails this test at the budget boundary. func TestNoMadviseDeadlockWithInflightCopy(t *testing.T) { t.Parallel() @@ -258,10 +192,7 @@ func TestNoMadviseDeadlockWithInflightCopy(t *testing.T) { waitCancel() require.NoError(t, err, "worker for page %d (addr %#x) did not park at pre-COPY barrier", pageIdx, addr) - // Worker is parked AFTER RLock. Issue MADV_DONTNEED - // on the same page from the parent. The handler's - // readEvents must drain the REMOVE event (so madvise - // returns) even while the worker holds RLock. + // Worker is parked holding RLock; madvise must still complete. madviseDone := make(chan error, 1) go func() { madviseDone <- unix.Madvise((*h.memoryArea)[pageOffset:pageOffset+int64(tt.pagesize)], unix.MADV_DONTNEED) @@ -291,18 +222,13 @@ func TestNoMadviseDeadlockWithInflightCopy(t *testing.T) { } } -// TestFaultedShortCircuitOrdering uses the gated harness to -// deterministically queue a WRITE pagefault for a fresh page AND a -// REMOVE for an already-faulted page in the SAME serve-loop -// iteration. After resume, the post-batch state is asserted: the -// REMOVE'd page is `removed` and the racing-write page is `faulted`. -// -// Both pre-fix and post-fix code reach the same end state for this -// scenario (REMOVE batch runs before the pagefault dispatch loop in -// every Serve iteration). This test guards the batch-processing -// invariant itself: any future change that, for example, dispatched -// pagefaults before draining REMOVEs would fail this test as a -// concrete state-mismatch assertion rather than a 30-minute hang. +// TestFaultedShortCircuitOrdering is an end-state sanity check for a +// REMOVE + PAGEFAULT batch on disjoint pages: page 0 (already faulted) +// is REMOVE'd, page 1 (missing) gets a write fault, and after resume +// page 0 must be `removed` and page 1 must be `faulted`. The two +// orderings happen to commute on disjoint pages, so this test does +// not by itself prove drain-order; same-page ordering is covered by +// TestStaleSourceRaceMissingAndRemove. // //nolint:paralleltest,tparallel // serialised: a paused gated handler keeps a faulting goroutine suspended in the kernel pagefault path; a STW GC pause from another parallel test would wait forever for that goroutine to reach a safe point. func TestFaultedShortCircuitOrdering(t *testing.T) { @@ -355,11 +281,7 @@ func TestFaultedShortCircuitOrdering(t *testing.T) { } // waitForState polls the child's PageStates RPC until the page at -// the given offset reaches `want` or `deadline` elapses. Each RPC -// round-trip is microseconds-to-low-milliseconds; we yield with a -// small sleep between polls so the harness doesn't burn an entire -// CPU on tight-loop encoding while the rest of the suite is also -// running cross-process tests. +// `offset` reaches `want` or `deadline` elapses. func waitForState(ctx context.Context, h *testHandler, offset uint64, want pageState, deadline time.Duration) error { const pollInterval = 1 * time.Millisecond @@ -376,6 +298,8 @@ func waitForState(ctx context.Context, h *testHandler, offset uint64, want pageS bucket = states.removed case faulted: bucket = states.faulted + default: + return fmt.Errorf("waitForState: unsupported want=%d", want) } for _, off := range bucket { From 4e7a838fefaa736b658c7e8fb91d9ca179f8c368 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Sat, 2 May 2026 03:52:01 -0700 Subject: [PATCH 3/3] fix(uffd-tests): increase raceHappyPathBudget to 30s for CI hugepage setup latency --- packages/orchestrator/pkg/sandbox/uffd/userfaultfd/race_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/race_test.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/race_test.go index ebac540d0d..2f0b1323a7 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/race_test.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/race_test.go @@ -20,7 +20,7 @@ import ( // return as soon as the handler drains the REMOVE event, which requires // no lock — coupling readEvents to settleRequests would push us past it. const ( - raceHappyPathBudget = 5 * time.Second + raceHappyPathBudget = 30 * time.Second barrierArrivalDeadline = 2 * time.Second madviseBudget = 2 * time.Second )