From f5bc4c7b8c70ff1bf9577bc8611beb016af174c0 Mon Sep 17 00:00:00 2001 From: Pritom14 Date: Sat, 30 May 2026 16:02:07 +0530 Subject: [PATCH 01/10] feat(backend): SQLite storage layer + CDC pipeline, LCM/reaper wiring Add the two real outbound adapters that replace the in-memory fakeStore: internal/storage/sqlite (persistence satisfying ports.LifecycleStore) and internal/cdc (transactional-outbox publisher, JSONL delivery, durable consumer). Wire them into main.go alongside the Lifecycle Manager and reaper so the write path is live end-to-end: LCM.Upsert -> store -> outbox -> JSONL -> broadcaster. Storage (internal/storage/sqlite): - modernc.org/sqlite (pure Go, no CGO) for clean cross-compile; goose embedded migrations; sqlc-generated typed queries under gen/. - Atomic Upsert: session row + change_log + outbox written in one tx. - revision is an optimistic-concurrency (CAS) check: insert requires revision 0 and persists 1; update requires loaded revision == stored and bumps +1; zero rows affected returns a revision-mismatch error. - Metadata is an opaque map in session_metadata, off the CDC path. - Durable reaction_trackers (fixes the in-memory-only escalation budget that re-fired human pages on restart). CDC (internal/cdc): - Publisher drains the outbox to a JSONL log; size-based rotation with a reset marker. - Consumer tails via byte cursor, detects rotation (os.SameFile), resyncs from a full-state snapshot on gaps, and tracks a durable consumer_offsets cursor. - Janitor reclaims acknowledged outbox rows. - Broadcaster is the in-process fan-out port the FE transport will subscribe to (WS/SSE wiring deferred). Composition root (main.go + *_wiring.go): - startCDC stands up publisher/consumer/janitor + broadcaster. - startLifecycle constructs the LCM, makes escalation budgets durable via WithReactionStore, teaches it to enumerate sessions via WithSessionLister, and starts the reaper. 
- Notifier, AgentMessenger, and the reaper's runtime registry are TEMPORARY no-op/empty stubs (lifecycle_wiring.go) with TODO markers; see the PR description for how to fill them in. Tests: contract-parity, revision CAS, outbox atomicity, CDC ordering and idempotency, rotation/resync, janitor vacuum, reaction durability across a simulated restart, and composition-root adapters. gofmt/build/vet clean and go test -race ./... green. --- .gitignore | 9 + backend/cdc_wiring.go | 143 ++++++++ backend/go.mod | 24 +- backend/go.sum | 66 ++++ backend/internal/cdc/broadcast.go | 44 +++ backend/internal/cdc/cdc_integration_test.go | 256 +++++++++++++++ backend/internal/cdc/consumer.go | 221 +++++++++++++ backend/internal/cdc/event.go | 32 ++ backend/internal/cdc/janitor.go | 84 +++++ backend/internal/cdc/jsonl.go | 109 +++++++ backend/internal/cdc/publisher.go | 115 +++++++ backend/internal/config/config.go | 24 ++ backend/internal/lifecycle/manager.go | 7 +- .../internal/lifecycle/manager_parity_test.go | 144 ++++++++ .../lifecycle/reaction_durability_test.go | 140 ++++++++ backend/internal/lifecycle/reaction_store.go | 94 ++++++ backend/internal/lifecycle/reactions.go | 30 +- backend/internal/storage/sqlite/cdc_store.go | 104 ++++++ backend/internal/storage/sqlite/db.go | 63 ++++ .../internal/storage/sqlite/gen/cdc.sql.go | 199 ++++++++++++ backend/internal/storage/sqlite/gen/db.go | 31 ++ .../storage/sqlite/gen/metadata.sql.go | 59 ++++ backend/internal/storage/sqlite/gen/models.go | 74 +++++ .../internal/storage/sqlite/gen/querier.go | 42 +++ .../storage/sqlite/gen/reactions.sql.go | 100 ++++++ .../storage/sqlite/gen/sessions.sql.go | 307 ++++++++++++++++++ backend/internal/storage/sqlite/mapping.go | 129 ++++++++ .../storage/sqlite/migrations/0001_init.sql | 109 +++++++ .../internal/storage/sqlite/queries/cdc.sql | 42 +++ .../storage/sqlite/queries/metadata.sql | 7 + .../storage/sqlite/queries/reactions.sql | 18 + .../storage/sqlite/queries/sessions.sql | 58 ++++ 
.../internal/storage/sqlite/reaction_store.go | 80 +++++ backend/internal/storage/sqlite/spike_test.go | 92 ++++++ backend/internal/storage/sqlite/store.go | 118 +++++++ backend/internal/storage/sqlite/store_test.go | 256 +++++++++++++++ backend/internal/storage/sqlite/upsert.go | 113 +++++++ backend/lifecycle_wiring.go | 126 +++++++ backend/main.go | 44 +++ backend/main_test.go | 134 ++++++++ backend/sqlc.yaml | 13 + 41 files changed, 3849 insertions(+), 11 deletions(-) create mode 100644 backend/cdc_wiring.go create mode 100644 backend/internal/cdc/broadcast.go create mode 100644 backend/internal/cdc/cdc_integration_test.go create mode 100644 backend/internal/cdc/consumer.go create mode 100644 backend/internal/cdc/event.go create mode 100644 backend/internal/cdc/janitor.go create mode 100644 backend/internal/cdc/jsonl.go create mode 100644 backend/internal/cdc/publisher.go create mode 100644 backend/internal/lifecycle/manager_parity_test.go create mode 100644 backend/internal/lifecycle/reaction_durability_test.go create mode 100644 backend/internal/lifecycle/reaction_store.go create mode 100644 backend/internal/storage/sqlite/cdc_store.go create mode 100644 backend/internal/storage/sqlite/db.go create mode 100644 backend/internal/storage/sqlite/gen/cdc.sql.go create mode 100644 backend/internal/storage/sqlite/gen/db.go create mode 100644 backend/internal/storage/sqlite/gen/metadata.sql.go create mode 100644 backend/internal/storage/sqlite/gen/models.go create mode 100644 backend/internal/storage/sqlite/gen/querier.go create mode 100644 backend/internal/storage/sqlite/gen/reactions.sql.go create mode 100644 backend/internal/storage/sqlite/gen/sessions.sql.go create mode 100644 backend/internal/storage/sqlite/mapping.go create mode 100644 backend/internal/storage/sqlite/migrations/0001_init.sql create mode 100644 backend/internal/storage/sqlite/queries/cdc.sql create mode 100644 backend/internal/storage/sqlite/queries/metadata.sql create mode 100644 
backend/internal/storage/sqlite/queries/reactions.sql create mode 100644 backend/internal/storage/sqlite/queries/sessions.sql create mode 100644 backend/internal/storage/sqlite/reaction_store.go create mode 100644 backend/internal/storage/sqlite/spike_test.go create mode 100644 backend/internal/storage/sqlite/store.go create mode 100644 backend/internal/storage/sqlite/store_test.go create mode 100644 backend/internal/storage/sqlite/upsert.go create mode 100644 backend/lifecycle_wiring.go create mode 100644 backend/main_test.go create mode 100644 backend/sqlc.yaml diff --git a/.gitignore b/.gitignore index e5ea212a..425b31d7 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,15 @@ vendor/ /backend/backend agent-orchestrator.yaml +# Backend runtime data artifacts (SQLite store + WAL, CDC event log). +# Created at AO_DATA_DIR (outside the repo by default); ignored here so a +# data dir pointed at the tree never gets committed. +*.db +*.db-shm +*.db-wal +session-events.jsonl +session-events.jsonl.* + # Environment .env .env.* diff --git a/backend/cdc_wiring.go b/backend/cdc_wiring.go new file mode 100644 index 00000000..89997e7d --- /dev/null +++ b/backend/cdc_wiring.go @@ -0,0 +1,143 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "log/slog" + "path/filepath" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/cdc" + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" +) + +// cdcConsumerName is the durable consumer_offsets key for the in-process FE +// broadcast consumer. A second transport (e.g. a cloud relay) would use its own +// key so each tracks an independent cursor. +const cdcConsumerName = "fe-broadcast" + +// cdcPipeline owns the running CDC goroutines and the broadcaster the FE +// transport subscribes to. 
It is the durable change-delivery substrate: the +// publisher drains the outbox to JSONL, the consumer tails the log and fans out +// through the broadcaster, and the janitor reclaims acknowledged outbox rows. +type cdcPipeline struct { + Broadcaster *cdc.Broadcaster + log *cdc.Log + dones []<-chan struct{} +} + +// startCDC opens the JSONL log and starts the publisher, consumer, and janitor +// against store, returning a handle whose Stop waits for the goroutines to +// drain after ctx is cancelled. The goroutines stop when ctx is cancelled. +func startCDC(ctx context.Context, store *sqlite.Store, dataDir string, logger *slog.Logger) (*cdcPipeline, error) { + log, err := cdc.OpenLog(dataDir, 0) + if err != nil { + return nil, fmt.Errorf("open cdc log: %w", err) + } + + bcast := cdc.NewBroadcaster() + logPath := filepath.Join(dataDir, cdc.LogFileName) + + pub := cdc.NewPublisher(outboxAdapter{store}, log, cdc.PublisherConfig{Logger: logger}) + con := cdc.NewConsumer(cdcConsumerName, logPath, store, bcast, cdc.ConsumerConfig{ + Snapshot: snapshotSource{store}, + Logger: logger, + }) + jan := cdc.NewJanitor(store, cdc.JanitorConfig{Logger: logger}) + + conDone, err := con.Start(ctx) + if err != nil { + log.Close() + return nil, fmt.Errorf("start cdc consumer: %w", err) + } + + return &cdcPipeline{ + Broadcaster: bcast, + log: log, + dones: []<-chan struct{}{pub.Start(ctx), conDone, jan.Start(ctx)}, + }, nil +} + +// Stop waits for every CDC goroutine to exit (the caller must have cancelled the +// ctx passed to startCDC) and closes the log file. +func (p *cdcPipeline) Stop() error { + for _, d := range p.dones { + <-d + } + return p.log.Close() +} + +// outboxAdapter bridges *sqlite.Store's outbox methods to cdc.OutboxStore, +// mapping the storage-native OutboxEvent to the transport's PendingEvent. (The +// offset and vacuum contracts need no adapter — *sqlite.Store satisfies +// cdc.OffsetStore and cdc.Vacuum directly.) 
+type outboxAdapter struct{ store *sqlite.Store } + +func (a outboxAdapter) ListUnsent(ctx context.Context, limit int) ([]cdc.PendingEvent, error) { + evs, err := a.store.ListUnsent(ctx, limit) + if err != nil { + return nil, err + } + out := make([]cdc.PendingEvent, len(evs)) + for i, e := range evs { + out[i] = cdc.PendingEvent{ + OutboxID: e.OutboxID, + Event: cdc.Event{ + Seq: e.Seq, + SessionID: e.SessionID, + EventType: e.EventType, + Revision: e.Revision, + Payload: e.Payload, + CreatedAt: e.CreatedAt, + }, + } + } + return out, nil +} + +func (a outboxAdapter) MarkSent(ctx context.Context, id int64, at time.Time) error { + return a.store.MarkSent(ctx, id, at) +} + +func (a outboxAdapter) MarkFailed(ctx context.Context, id int64, msg string) error { + return a.store.MarkFailed(ctx, id, msg) +} + +// snapshotSource rebuilds current state from the sessions table after a +// log-rotation gap, emitting one full-state event per session. Each event +// carries the change_log high-water seq so the consumer resumes its cursor +// there; the payload mirrors the canonical change_log payload (metadata +// excluded, version stamped) so subscribers parse snapshot and live events the +// same way. 
+type snapshotSource struct{ store *sqlite.Store } + +func (s snapshotSource) Snapshot(ctx context.Context) ([]cdc.Event, int64, error) { + maxSeq, err := s.store.MaxChangeLogSeq(ctx) // read the high-water mark before the rows: a concurrent write is then re-delivered, never skipped + if err != nil { + return nil, 0, err + } + recs, err := s.store.ListAll(ctx) + if err != nil { + return nil, 0, err + } + events := make([]cdc.Event, 0, len(recs)) + for _, r := range recs { + r.Lifecycle.Version = domain.LifecycleVersion + r.Metadata = nil + blob, err := json.Marshal(r) + if err != nil { + return nil, 0, fmt.Errorf("marshal snapshot %s: %w", r.ID, err) + } + events = append(events, cdc.Event{ + Seq: maxSeq, + SessionID: string(r.ID), + EventType: "session_snapshot", + Revision: int64(r.Lifecycle.Revision), + Payload: string(blob), + CreatedAt: r.UpdatedAt, + }) + } + return events, maxSeq, nil +} diff --git a/backend/go.mod b/backend/go.mod index 311cea28..88ca590c 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -1,5 +1,25 @@ module github.com/aoagents/agent-orchestrator/backend -go 1.22 +go 1.25.7 -require github.com/go-chi/chi/v5 v5.1.0 +require ( + github.com/go-chi/chi/v5 v5.1.0 + github.com/pressly/goose/v3 v3.27.1 + modernc.org/sqlite v1.51.0 +) + +require ( + github.com/dustin/go-humanize v1.0.1 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/mattn/go-isatty v0.0.21 // indirect + github.com/mfridman/interpolate v0.0.2 // indirect + github.com/ncruces/go-strftime v1.0.0 // indirect + github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect + github.com/sethvargo/go-retry v0.3.0 // indirect + go.uber.org/multierr v1.11.0 // indirect + golang.org/x/sync v0.20.0 // indirect + golang.org/x/sys v0.43.0 // indirect + modernc.org/libc v1.72.3 // indirect + modernc.org/mathutil v1.7.1 // indirect + modernc.org/memory v1.11.0 // indirect +) diff --git a/backend/go.sum b/backend/go.sum index 823cdbb1..89f83929 100644 --- a/backend/go.sum +++ b/backend/go.sum @@ -1,2 +1,68 @@ +github.com/davecgh/go-spew v1.1.1 
h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= +github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/go-chi/chi/v5 v5.1.0 h1:acVI1TYaD+hhedDJ3r54HyA6sExp3HfXq7QWEEY/xMw= github.com/go-chi/chi/v5 v5.1.0/go.mod h1:DslCQbL2OYiznFReuXYUmQ2hGd1aDpCnlMNITLSKoi8= +github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs= +github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= +github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= +github.com/mattn/go-isatty v0.0.21 h1:xYae+lCNBP7QuW4PUnNG61ffM4hVIfm+zUzDuSzYLGs= +github.com/mattn/go-isatty v0.0.21/go.mod h1:ZXfXG4SQHsB/w3ZeOYbR0PrPwLy+n6xiMrJlRFqopa4= +github.com/mfridman/interpolate v0.0.2 h1:pnuTK7MQIxxFz1Gr+rjSIx9u7qVjf5VOoM/u6BbAxPY= +github.com/mfridman/interpolate v0.0.2/go.mod h1:p+7uk6oE07mpE/Ik1b8EckO0O4ZXiGAfshKBWLUM9Xg= +github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w= +github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pressly/goose/v3 v3.27.1 h1:6uEvcprBybDmW4hcz3gYujhARhye+GoWKhEWyzD5sh4= +github.com/pressly/goose/v3 v3.27.1/go.mod h1:maruOxsPnIG2yHHyo8UqKWXYKFcH7Q76csUV7+7KYoM= +github.com/remyoudompheng/bigfft 
v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= +github.com/sethvargo/go-retry v0.3.0 h1:EEt31A35QhrcRZtrYFDTBg91cqZVnFL2navjDrah2SE= +github.com/sethvargo/go-retry v0.3.0/go.mod h1:mNX17F0C/HguQMyMyJxcnU471gOZGxCLyYaFyAZraas= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8= +golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= +golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI= +golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= +golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +modernc.org/cc/v4 v4.28.2 h1:3tQ0lf2ADtoby2EtSP+J7IE2SHwEJdP8ioR59wx7XpY= +modernc.org/cc/v4 v4.28.2/go.mod h1:OnovgIhbbMXMu1aISnJ0wvVD1KnW+cAUJkIrAWh+kVI= +modernc.org/ccgo/v4 v4.34.0 h1:yRLPFZieg532OT4rp4JFNIVcquwalMX26G95WQDqwCQ= +modernc.org/ccgo/v4 v4.34.0/go.mod h1:AS5WYMyBakQ+fhsHhtP8mWB82KTGPkNNJDGfGQCe0/A= +modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM= +modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU= 
+modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI= +modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito= +modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo= +modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY= +modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks= +modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI= +modernc.org/libc v1.72.3 h1:ZnDF4tXn4NBXFutMMQC4vtbTFSXhhKzR73fv0beZEAU= +modernc.org/libc v1.72.3/go.mod h1:dn0dZNnnn1clLyvRxLxYExxiKRZIRENOfqQ8XEeg4Qs= +modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU= +modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg= +modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI= +modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw= +modernc.org/opt v0.2.0 h1:tGyef5ApycA7FSEOMraay9SaTk5zmbx7Tu+cJs4QKZg= +modernc.org/opt v0.2.0/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns= +modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w= +modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE= +modernc.org/sqlite v1.51.0 h1:aH/MMSoayAIhozZ7uJbVTT9QO/VhzBf0J9tymmmuC/U= +modernc.org/sqlite v1.51.0/go.mod h1:tcNzv5p84E0skkmJn038y+hWJbLQXQqEnQfeh5r2JLM= +modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0= +modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A= +modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= +modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= diff --git a/backend/internal/cdc/broadcast.go b/backend/internal/cdc/broadcast.go new file mode 100644 index 00000000..a7458e38 --- /dev/null +++ b/backend/internal/cdc/broadcast.go @@ -0,0 +1,44 @@ +package cdc + +import "sync" + +// Broadcaster is the in-process 
fan-out the consumer feeds. Subscribers (the +// WS/SSE transport, wired in the frontend task) register a callback; every +// consumed Event is delivered to all current subscribers. It is the single +// seam between the CDC pipeline and live delivery, so the transport can be +// built and swapped without touching the pipeline. +type Broadcaster struct { + mu sync.RWMutex + nextID int + subs map[int]func(Event) +} + +// NewBroadcaster returns an empty Broadcaster ready for subscriptions. +func NewBroadcaster() *Broadcaster { + return &Broadcaster{subs: map[int]func(Event){}} +} + +// Subscribe registers fn and returns an unsubscribe function. fn is called +// synchronously from the consumer loop, so it must not block; a transport that +// needs buffering should push onto its own channel inside fn. +func (b *Broadcaster) Subscribe(fn func(Event)) (unsubscribe func()) { + b.mu.Lock() + id := b.nextID + b.nextID++ + b.subs[id] = fn + b.mu.Unlock() + return func() { + b.mu.Lock() + delete(b.subs, id) + b.mu.Unlock() + } +} + +// Publish delivers e to every current subscriber. +func (b *Broadcaster) Publish(e Event) { + b.mu.RLock() + defer b.mu.RUnlock() + for _, fn := range b.subs { + fn(e) + } +} diff --git a/backend/internal/cdc/cdc_integration_test.go b/backend/internal/cdc/cdc_integration_test.go new file mode 100644 index 00000000..9390afe0 --- /dev/null +++ b/backend/internal/cdc/cdc_integration_test.go @@ -0,0 +1,256 @@ +package cdc_test + +import ( + "context" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/cdc" + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" +) + +// outboxAdapter bridges sqlite.Store's outbox methods to cdc.OutboxStore. This +// is the same glue the composition root (main.go) installs. 
+type outboxAdapter struct{ s *sqlite.Store } + +func (a outboxAdapter) ListUnsent(ctx context.Context, limit int) ([]cdc.PendingEvent, error) { + evs, err := a.s.ListUnsent(ctx, limit) + if err != nil { + return nil, err + } + out := make([]cdc.PendingEvent, len(evs)) + for i, e := range evs { + out[i] = cdc.PendingEvent{ + OutboxID: e.OutboxID, + Event: cdc.Event{ + Seq: e.Seq, + SessionID: e.SessionID, + EventType: e.EventType, + Revision: e.Revision, + Payload: e.Payload, + CreatedAt: e.CreatedAt, + }, + } + } + return out, nil +} + +func (a outboxAdapter) MarkSent(ctx context.Context, id int64, at time.Time) error { + return a.s.MarkSent(ctx, id, at) +} +func (a outboxAdapter) MarkFailed(ctx context.Context, id int64, msg string) error { + return a.s.MarkFailed(ctx, id, msg) +} + +func newStore(t *testing.T) *sqlite.Store { + t.Helper() + db, err := sqlite.Open(t.TempDir()) + if err != nil { + t.Fatalf("open: %v", err) + } + t.Cleanup(func() { db.Close() }) + return sqlite.NewStore(db) +} + +func rec(id string) domain.SessionRecord { + now := time.Now().UTC() + return domain.SessionRecord{ + ID: domain.SessionID(id), ProjectID: "p", Kind: domain.KindWorker, CreatedAt: now, UpdatedAt: now, + Lifecycle: domain.CanonicalSessionLifecycle{ + Session: domain.SessionSubstate{State: domain.SessionWorking, Reason: domain.ReasonTaskInProgress}, + PR: domain.PRSubstate{State: domain.PRNone, Reason: domain.PRReasonNotCreated}, + Runtime: domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning}, + Activity: domain.ActivitySubstate{State: domain.ActivityActive, LastActivityAt: now, Source: domain.SourceNative}, + }, + } +} + +func TestEndToEndPublishConsume(t *testing.T) { + ctx := context.Background() + store := newStore(t) + dir := t.TempDir() + log, err := cdc.OpenLog(dir, 0) + if err != nil { + t.Fatal(err) + } + defer log.Close() + + // Three canonical writes => three outbox rows, seq 1..3. 
+ r := rec("s1") + if err := store.Upsert(ctx, r, ports.EventSessionCreated); err != nil { + t.Fatal(err) + } + r.Lifecycle.Revision = 1 + if err := store.Upsert(ctx, r, ports.EventSessionStateChanged); err != nil { + t.Fatal(err) + } + r.Lifecycle.Revision = 2 + if err := store.Upsert(ctx, r, ports.EventSessionStateChanged); err != nil { + t.Fatal(err) + } + + pub := cdc.NewPublisher(outboxAdapter{store}, log, cdc.PublisherConfig{}) + if err := pub.Drain(ctx); err != nil { + t.Fatalf("drain: %v", err) + } + + var got []cdc.Event + bc := cdc.NewBroadcaster() + bc.Subscribe(func(e cdc.Event) { got = append(got, e) }) + + con := cdc.NewConsumer("fe", dir+"/"+cdc.LogFileName, store, bc, cdc.ConsumerConfig{}) + if _, err := con.Start(ctx); err != nil { + t.Fatal(err) + } + // Drive one poll synchronously instead of waiting on the goroutine. + if err := con.Poll(ctx); err != nil { + t.Fatalf("poll: %v", err) + } + + if len(got) != 3 { + t.Fatalf("delivered %d events, want 3", len(got)) + } + for i, e := range got { + if e.Seq != int64(i+1) { + t.Fatalf("event %d has seq %d, want %d", i, e.Seq, i+1) + } + } + if got[0].EventType != string(ports.EventSessionCreated) { + t.Fatalf("first event type = %q", got[0].EventType) + } + + // Idempotency: a second poll with no new bytes delivers nothing more. + if err := con.Poll(ctx); err != nil { + t.Fatal(err) + } + if len(got) != 3 { + t.Fatalf("re-poll delivered extra events: %d", len(got)) + } + + // Offset persisted at seq 3. + off, _ := store.GetOffset(ctx, "fe") + if off != 3 { + t.Fatalf("offset = %d, want 3", off) + } + + // Janitor: consumer ACKed 3, so sent rows with seq < 3 are reclaimed. 
+ jan := cdc.NewJanitor(store, cdc.JanitorConfig{}) + deleted, err := jan.Sweep(ctx) + if err != nil { + t.Fatal(err) + } + if deleted != 2 { + t.Fatalf("janitor deleted %d, want 2 (seq 1,2 < watermark 3)", deleted) + } +} + +func TestConsumerRestartSkipsDelivered(t *testing.T) { + ctx := context.Background() + store := newStore(t) + dir := t.TempDir() + log, _ := cdc.OpenLog(dir, 0) + defer log.Close() + + if err := store.Upsert(ctx, rec("s1"), ports.EventSessionCreated); err != nil { + t.Fatal(err) + } + pub := cdc.NewPublisher(outboxAdapter{store}, log, cdc.PublisherConfig{}) + if err := pub.Drain(ctx); err != nil { + t.Fatal(err) + } + + // Pre-seed the durable offset as if a prior consumer already delivered seq 1. + if err := store.SetOffset(ctx, "fe", 1, time.Now().UTC()); err != nil { + t.Fatal(err) + } + + var got []cdc.Event + bc := cdc.NewBroadcaster() + bc.Subscribe(func(e cdc.Event) { got = append(got, e) }) + con := cdc.NewConsumer("fe", dir+"/"+cdc.LogFileName, store, bc, cdc.ConsumerConfig{}) + if _, err := con.Start(ctx); err != nil { + t.Fatal(err) + } + if err := con.Poll(ctx); err != nil { + t.Fatal(err) + } + if len(got) != 0 { + t.Fatalf("restart re-delivered already-acked events: %d", len(got)) + } +} + +// fakeSnapshot stands in for the sessions-table snapshot source on resync. +type fakeSnapshot struct { + events []cdc.Event + maxSeq int64 +} + +func (f fakeSnapshot) Snapshot(context.Context) ([]cdc.Event, int64, error) { + return f.events, f.maxSeq, nil +} + +func TestRotationTriggersResync(t *testing.T) { + ctx := context.Background() + store := newStore(t) + dir := t.TempDir() + // Tiny cap so a couple of writes force a rotation. 
+ log, err := cdc.OpenLog(dir, 80) + if err != nil { + t.Fatal(err) + } + defer log.Close() + + var got []cdc.Event + bc := cdc.NewBroadcaster() + bc.Subscribe(func(e cdc.Event) { got = append(got, e) }) + + snap := fakeSnapshot{events: []cdc.Event{{Seq: 5, SessionID: "s1", EventType: "session_updated"}}, maxSeq: 5} + con := cdc.NewConsumer("fe", dir+"/"+cdc.LogFileName, store, bc, cdc.ConsumerConfig{Snapshot: snap}) + if _, err := con.Start(ctx); err != nil { + t.Fatal(err) + } + + pub := cdc.NewPublisher(outboxAdapter{store}, log, cdc.PublisherConfig{}) + + // First write + drain + poll: consumer reads it and advances its cursor. + if err := store.Upsert(ctx, rec("s1"), ports.EventSessionCreated); err != nil { + t.Fatal(err) + } + if err := pub.Drain(ctx); err != nil { + t.Fatal(err) + } + if err := con.Poll(ctx); err != nil { + t.Fatal(err) + } + cursorBefore := len(got) + + // Force rotation by writing past the cap, then poll: the file shrank, so the + // consumer must resync from the snapshot source. + r := rec("s1") + r.Lifecycle.Revision = 1 + if err := store.Upsert(ctx, r, ports.EventSessionStateChanged); err != nil { + t.Fatal(err) + } + if err := pub.Drain(ctx); err != nil { + t.Fatal(err) + } + if err := con.Poll(ctx); err != nil { + t.Fatal(err) + } + + if len(got) <= cursorBefore { + t.Fatal("expected resync to deliver the snapshot event") + } + // The snapshot event (seq 5) must be among the delivered events. 
+ var sawSnapshot bool + for _, e := range got { + if e.Seq == 5 { + sawSnapshot = true + } + } + if !sawSnapshot { + t.Fatalf("resync did not deliver snapshot event; got %+v", got) + } +} diff --git a/backend/internal/cdc/consumer.go b/backend/internal/cdc/consumer.go new file mode 100644 index 00000000..00edb0f1 --- /dev/null +++ b/backend/internal/cdc/consumer.go @@ -0,0 +1,221 @@ +package cdc + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "log/slog" + "os" + "time" +) + +// DefaultPollInterval is how often the consumer checks the log for new bytes. +// Polling (rather than fs-notify) keeps the consumer dependency-free; at this +// cadence live updates stay well under a human-perceptible delay. +const DefaultPollInterval = 100 * time.Millisecond + +// OffsetStore persists the consumer's durable seq cursor (at-least-once). +type OffsetStore interface { + GetOffset(ctx context.Context, consumer string) (int64, error) + SetOffset(ctx context.Context, consumer string, seq int64, at time.Time) error +} + +// SnapshotSource rebuilds current state from the source of truth (the sessions +// table) after a rotation gap, where log lines for unconsumed-but-already-sent +// events were truncated away. It returns one Event per live session plus the +// MAX(change_log seq) the snapshot corresponds to, so the consumer can resume. +type SnapshotSource interface { + Snapshot(ctx context.Context) (events []Event, maxSeq int64, err error) +} + +// Consumer tails the JSONL log, deduplicates by seq, and fans each new event +// out through the Broadcaster, persisting its durable offset as it goes. 
+type Consumer struct { + name string + path string + offsets OffsetStore + bcast *Broadcaster + snapshot SnapshotSource + interval time.Duration + clock func() time.Time + logger *slog.Logger + + cursor int64 // byte offset into the log + lastSeq int64 // highest seq delivered + prevInfo os.FileInfo // identity of the file last polled (rotation detection) +} + +// ConsumerConfig holds optional knobs and the snapshot source. +type ConsumerConfig struct { + Snapshot SnapshotSource + Interval time.Duration + Clock func() time.Time + Logger *slog.Logger +} + +// NewConsumer constructs a Consumer named name (the consumer_offsets key) over +// the log at path, fanning out through bcast and persisting offsets via offsets. +func NewConsumer(name, path string, offsets OffsetStore, bcast *Broadcaster, cfg ConsumerConfig) *Consumer { + c := &Consumer{ + name: name, + path: path, + offsets: offsets, + bcast: bcast, + snapshot: cfg.Snapshot, + interval: cfg.Interval, + clock: cfg.Clock, + logger: cfg.Logger, + } + if c.interval <= 0 { + c.interval = DefaultPollInterval + } + if c.clock == nil { + c.clock = time.Now + } + if c.logger == nil { + c.logger = slog.Default() + } + return c +} + +// Start loads the durable offset and runs the poll loop until ctx is cancelled; +// the returned channel closes when the loop has exited. +func (c *Consumer) Start(ctx context.Context) (<-chan struct{}, error) { + seq, err := c.offsets.GetOffset(ctx, c.name) + if err != nil { + return nil, fmt.Errorf("load consumer offset: %w", err) + } + c.lastSeq = seq + + done := make(chan struct{}) + go func() { + defer close(done) + t := time.NewTicker(c.interval) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + if err := c.Poll(ctx); err != nil { + c.logger.Error("cdc consumer: poll failed", "err", err) + } + } + } + }() + return done, nil +} + +// Poll reads any new bytes since the last cursor and delivers complete lines. 
It +// detects rotation (the file was replaced or shrank below the cursor) and +// resyncs from the DB snapshot before resuming. +func (c *Consumer) Poll(ctx context.Context) error { + f, err := os.Open(c.path) + if err != nil { + if os.IsNotExist(err) { + return nil // publisher has not created the log yet + } + return fmt.Errorf("open cdc log: %w", err) + } + defer f.Close() + + info, err := f.Stat() + if err != nil { + return fmt.Errorf("stat cdc log: %w", err) + } + size := info.Size() + + rotated := (c.prevInfo != nil && !os.SameFile(c.prevInfo, info)) || size < c.cursor + if rotated { + // The previous file's bytes are void. Resync from the DB snapshot (if + // wired), then resume reading the fresh file from the top. + if err := c.resync(ctx); err != nil { + return err + } + c.cursor = 0 + } + c.prevInfo = info // recorded only after a successful resync so a failed one is retried on the next poll + if size == c.cursor { + return nil + } + + if _, err := f.Seek(c.cursor, io.SeekStart); err != nil { + return fmt.Errorf("seek cdc log: %w", err) + } + data, err := io.ReadAll(f) + if err != nil { + return fmt.Errorf("read cdc log: %w", err) + } + + consumed, maxSeq := c.processLines(data) + c.cursor += int64(consumed) + + if maxSeq > c.lastSeq { + c.lastSeq = maxSeq + if err := c.offsets.SetOffset(ctx, c.name, c.lastSeq, c.clock().UTC()); err != nil { + return fmt.Errorf("persist consumer offset: %w", err) + } + } + return nil +} + +// processLines delivers each complete (newline-terminated) line, skipping reset +// markers and any event whose seq was already delivered. It returns the number +// of bytes consumed (only complete lines) and the highest seq seen. 
+func (c *Consumer) processLines(data []byte) (consumed int, maxSeq int64) {
+	maxSeq = c.lastSeq
+	for {
+		nl := bytes.IndexByte(data[consumed:], '\n')
+		if nl < 0 {
+			return consumed, maxSeq // partial trailing line: leave for next poll
+		}
+		line := data[consumed : consumed+nl]
+		consumed += nl + 1
+
+		if isResetMarker(line) {
+			continue
+		}
+		var e Event
+		if err := json.Unmarshal(line, &e); err != nil {
+			c.logger.Error("cdc consumer: bad line skipped", "err", err)
+			continue
+		}
+		// Compare against the running maximum, not c.lastSeq: c.lastSeq is only
+		// advanced by Poll after the whole batch, so a line the publisher
+		// re-appended (append succeeded, mark-sent failed) would otherwise be
+		// delivered twice when both copies land in the same read.
+		if e.Seq <= maxSeq {
+			continue // idempotent: already delivered
+		}
+		c.bcast.Publish(e)
+		maxSeq = e.Seq
+	}
+}
+
+// resync replays the snapshot source after a rotation: every event it returns
+// is published, and lastSeq is raised to the snapshot's maxSeq (and persisted)
+// when that is higher than what was already delivered. It is a no-op when no
+// snapshot source was configured.
+func (c *Consumer) resync(ctx context.Context) error {
+	if c.snapshot == nil {
+		return nil
+	}
+	events, maxSeq, err := c.snapshot.Snapshot(ctx)
+	if err != nil {
+		return fmt.Errorf("cdc consumer resync: %w", err)
+	}
+	for _, e := range events {
+		c.bcast.Publish(e)
+	}
+	if maxSeq > c.lastSeq {
+		c.lastSeq = maxSeq
+		if err := c.offsets.SetOffset(ctx, c.name, c.lastSeq, c.clock().UTC()); err != nil {
+			return fmt.Errorf("persist offset after resync: %w", err)
+		}
+	}
+	return nil
+}
+
+// isResetMarker reports whether line decodes as a JSON object whose "type"
+// field is "reset". Lines that are not valid JSON report false.
+func isResetMarker(line []byte) bool {
+	var m resetMarker
+	if err := json.Unmarshal(line, &m); err != nil {
+		return false
+	}
+	return m.Type == "reset"
+}
diff --git a/backend/internal/cdc/event.go b/backend/internal/cdc/event.go
new file mode 100644
index 00000000..b0eddf98
--- /dev/null
+++ b/backend/internal/cdc/event.go
@@ -0,0 +1,32 @@
+// Package cdc is the change-data-capture pipeline that turns the storage layer's
+// transactional outbox into a durable, ordered event stream for the frontend.
+//
+// The flow: the publisher drains the SQLite outbox (sent=0, seq order) and
+// appends each change as one JSON line to a rotating log file. The consumer
+// tails that file by byte offset, deduplicates by seq against its durable
+// per-consumer offset, and fans each
+// change out through the Broadcaster to in-process subscribers (the WS/SSE
+// transport, wired later).
+// The janitor reclaims outbox rows every consumer has
+// acknowledged. Delivery is at-least-once; seq is the idempotency key.
+package cdc
+
+import "time"
+
+// Event is one change-data-capture record. It is the JSONL line shape and the
+// value handed to Broadcaster subscribers. Seq is the monotonic ordering and
+// idempotency key (the change_log seq).
+type Event struct {
+	Seq       int64     `json:"seq"`
+	SessionID string    `json:"sessionId"`
+	EventType string    `json:"eventType"`
+	Revision  int64     `json:"revision"`
+	Payload   string    `json:"payload"`
+	CreatedAt time.Time `json:"createdAt"`
+}
+
+// resetMarker is written as the first line of a freshly rotated log file. It
+// marks, for anyone reading the file, that byte offsets into the previous file
+// are void. The consumer in this package does not act on the marker itself:
+// processLines skips it, and the snapshot resync is triggered by Poll noticing
+// that the file's identity or size changed.
+type resetMarker struct {
+	Type      string    `json:"type"` // always "reset"
+	RotatedAt time.Time `json:"rotatedAt"`
+}
diff --git a/backend/internal/cdc/janitor.go b/backend/internal/cdc/janitor.go
new file mode 100644
index 00000000..3968b2cf
--- /dev/null
+++ b/backend/internal/cdc/janitor.go
@@ -0,0 +1,84 @@
+package cdc
+
+import (
+	"context"
+	"log/slog"
+	"time"
+)
+
+// DefaultJanitorInterval is the outbox-vacuum cadence.
+const DefaultJanitorInterval = 60 * time.Second
+
+// Vacuum is the janitor's view of storage: the safe deletion watermark and the
+// delete itself.
+type Vacuum interface {
+	// MinConsumerOffset returns the watermark Sweep deletes below; Sweep treats
+	// a value <= 0 as "nothing is safe to delete yet".
+	MinConsumerOffset(ctx context.Context) (int64, error)
+	// DeleteSentOutboxBelow removes outbox rows below seq and returns how many
+	// were removed.
+	DeleteSentOutboxBelow(ctx context.Context, seq int64) (int64, error)
+}
+
+// Janitor reclaims delivered outbox rows every consumer has acknowledged.
+//
+// Watermark: MIN(consumer_offsets.last_seq). Rows with seq < watermark are sent
+// AND past every consumer's cursor, so they are safe to drop. When the watermark
+// is 0 (a consumer exists but has acknowledged nothing, or none is registered
+// yet) the janitor deletes nothing — it never races ahead of a consumer that
+// has not yet read an event.
+// change_log is never touched: it is the durable
+// history and the snapshot-resync floor.
+type Janitor struct {
+	store    Vacuum
+	interval time.Duration
+	logger   *slog.Logger
+}
+
+// JanitorConfig holds optional knobs; zero values fall back to defaults
+// (DefaultJanitorInterval and slog.Default()).
+type JanitorConfig struct {
+	Interval time.Duration
+	Logger   *slog.Logger
+}
+
+// NewJanitor constructs a Janitor over store. It performs no I/O.
+func NewJanitor(store Vacuum, cfg JanitorConfig) *Janitor {
+	j := &Janitor{store: store, interval: cfg.Interval, logger: cfg.Logger}
+	if j.interval <= 0 {
+		j.interval = DefaultJanitorInterval
+	}
+	if j.logger == nil {
+		j.logger = slog.Default()
+	}
+	return j
+}
+
+// Start runs the vacuum loop until ctx is cancelled; the returned channel closes
+// when the loop has exited. The first sweep runs one interval after Start (the
+// loop is ticker-driven); a failing sweep is logged and the loop continues.
+func (j *Janitor) Start(ctx context.Context) <-chan struct{} {
+	done := make(chan struct{})
+	go func() {
+		defer close(done)
+		t := time.NewTicker(j.interval)
+		defer t.Stop()
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case <-t.C:
+				if _, err := j.Sweep(ctx); err != nil {
+					j.logger.Error("cdc janitor: sweep failed", "err", err)
+				}
+			}
+		}
+	}()
+	return done
+}
+
+// Sweep deletes delivered outbox rows below the safe watermark and returns the
+// number removed. A watermark <= 0 returns (0, nil) without issuing a delete;
+// errors from the store are returned unwrapped.
+func (j *Janitor) Sweep(ctx context.Context) (int64, error) {
+	watermark, err := j.store.MinConsumerOffset(ctx)
+	if err != nil {
+		return 0, err
+	}
+	if watermark <= 0 {
+		return 0, nil
+	}
+	return j.store.DeleteSentOutboxBelow(ctx, watermark)
+}
diff --git a/backend/internal/cdc/jsonl.go b/backend/internal/cdc/jsonl.go
new file mode 100644
index 00000000..74dc0695
--- /dev/null
+++ b/backend/internal/cdc/jsonl.go
@@ -0,0 +1,109 @@
+package cdc
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"sync"
+	"time"
+)
+
+// LogFileName is the active CDC log under the data dir.
+const LogFileName = "session-events.jsonl"
+
+// DefaultMaxBytes is the size at which the log rotates (1 MiB).
+const DefaultMaxBytes int64 = 1 << 20
+
+// Log is the append-only JSONL sink the publisher writes to. When it grows past
+// maxBytes it rotates: the active file is renamed aside and a fresh file is
+// started whose first line is a reset marker (see rotateLocked). The consumer
+// treats the changed file as "resync from the DB snapshot", so the log itself
+// is not the durable source of truth (SQLite is).
+//
+// mu guards f and size; size mirrors the active file's length so the rotation
+// check needs no stat call.
+type Log struct {
+	mu       sync.Mutex
+	path     string
+	maxBytes int64
+	f        *os.File
+	size     int64
+}
+
+// OpenLog opens (creating if absent) the JSONL log in dir. maxBytes <= 0 uses
+// DefaultMaxBytes. The file is opened append-only and its current length seeds
+// the size counter, so an existing log is continued rather than overwritten.
+// dir itself is not created here.
+func OpenLog(dir string, maxBytes int64) (*Log, error) {
+	if maxBytes <= 0 {
+		maxBytes = DefaultMaxBytes
+	}
+	path := filepath.Join(dir, LogFileName)
+	f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o644)
+	if err != nil {
+		return nil, fmt.Errorf("open cdc log: %w", err)
+	}
+	info, err := f.Stat()
+	if err != nil {
+		f.Close()
+		return nil, fmt.Errorf("stat cdc log: %w", err)
+	}
+	return &Log{path: path, maxBytes: maxBytes, f: f, size: info.Size()}, nil
+}
+
+// Append writes one event as a JSON line, flushing to disk. It rotates first if
+// the file is already at/over the size cap, so a single oversized burst still
+// lands in a fresh segment. The cap is checked before the write, so the file
+// may exceed maxBytes by the length of the last line written.
+func (l *Log) Append(e Event) error {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	if l.size >= l.maxBytes {
+		if err := l.rotateLocked(); err != nil {
+			return err
+		}
+	}
+	return l.writeLocked(e)
+}
+
+// writeLocked marshals v, appends it as one newline-terminated line and syncs
+// the file. The caller must hold l.mu. Bytes that reached the file are counted
+// in size even when the write itself reports an error.
+func (l *Log) writeLocked(v any) error {
+	line, err := json.Marshal(v)
+	if err != nil {
+		return fmt.Errorf("marshal cdc line: %w", err)
+	}
+	line = append(line, '\n')
+	n, err := l.f.Write(line)
+	l.size += int64(n)
+	if err != nil {
+		return fmt.Errorf("write cdc line: %w", err)
+	}
+	if err := l.f.Sync(); err != nil {
+		return fmt.Errorf("sync cdc log: %w", err)
+	}
+	return nil
+}
+
+// rotateLocked renames the active file aside and starts a fresh one whose first
+// line is a reset marker.
+// Renaming (not truncating in place) gives the file a
+// new identity, so a polling consumer reliably detects rotation via
+// os.SameFile even if the fresh file grows past its old byte cursor between
+// polls. The consumer then resyncs from the DB snapshot.
+//
+// The caller must hold l.mu. Rotation is retryable: a failed rename or reopen
+// never leaves l.f pointing at a closed handle. If the rename fails the old
+// file is reopened for append and size is left untouched, so the next Append
+// attempts the rotation again; if the reopen fails l.f is nil and the next
+// attempt skips the close and goes straight to the rename/reopen.
+func (l *Log) rotateLocked() error {
+	if l.f != nil {
+		err := l.f.Close()
+		l.f = nil // the handle is unusable after Close, whether or not it errored
+		if err != nil {
+			return fmt.Errorf("close cdc log for rotate: %w", err)
+		}
+	}
+	archive := l.path + ".1"
+	_ = os.Remove(archive) // best-effort: history lives in SQLite, not the log
+	renameErr := os.Rename(l.path, archive)
+	if os.IsNotExist(renameErr) {
+		// Nothing left to move aside (an earlier attempt already archived it,
+		// or the file was removed): just start the fresh segment.
+		renameErr = nil
+	}
+	f, err := os.OpenFile(l.path, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o644)
+	if err != nil {
+		return fmt.Errorf("reopen cdc log after rotate: %w", err)
+	}
+	l.f = f
+	if renameErr != nil {
+		// Still the old, oversized file: keep size so rotation is retried.
+		return fmt.Errorf("rotate cdc log: %w", renameErr)
+	}
+	l.size = 0
+	return l.writeLocked(resetMarker{Type: "reset", RotatedAt: time.Now().UTC()})
+}
+
+// Close closes the underlying file. It returns nil when a failed rotation has
+// already released the handle.
+func (l *Log) Close() error {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+	if l.f == nil {
+		return nil
+	}
+	return l.f.Close()
+}
diff --git a/backend/internal/cdc/publisher.go b/backend/internal/cdc/publisher.go
new file mode 100644
index 00000000..3283a236
--- /dev/null
+++ b/backend/internal/cdc/publisher.go
@@ -0,0 +1,115 @@
+package cdc
+
+import (
+	"context"
+	"log/slog"
+	"time"
+)
+
+// DefaultPublishInterval is the outbox drain cadence.
+const DefaultPublishInterval = 50 * time.Millisecond
+
+// DefaultBatchSize bounds how many outbox rows one drain pass handles.
+const DefaultBatchSize = 256
+
+// PendingEvent is an undelivered outbox row paired with its CDC event payload.
+type PendingEvent struct {
+	OutboxID int64
+	Event
+}
+
+// OutboxStore is the publisher's view of the storage layer: read undelivered
+// rows in seq order, then mark each delivered or failed.
+type OutboxStore interface {
+	// ListUnsent returns at most limit undelivered rows; Drain appends them to
+	// the log in the order returned.
+	ListUnsent(ctx context.Context, limit int) ([]PendingEvent, error)
+	// MarkSent records that the row was appended to the log at time at.
+	MarkSent(ctx context.Context, outboxID int64, at time.Time) error
+	// MarkFailed records errMsg against a row whose append failed.
+	MarkFailed(ctx context.Context, outboxID int64, errMsg string) error
+}
+
+// Publisher drains the outbox into the JSONL log on a fixed cadence.
+type Publisher struct {
+	src      OutboxStore      // where undelivered rows come from
+	log      *Log             // JSONL sink each row is appended to
+	interval time.Duration    // drain cadence
+	batch    int              // max rows requested per drain pass
+	clock    func() time.Time // stamps the sent-at time passed to MarkSent
+	logger   *slog.Logger
+}
+
+// PublisherConfig holds optional knobs; zero values fall back to defaults
+// (DefaultPublishInterval, DefaultBatchSize, time.Now, slog.Default()).
+type PublisherConfig struct {
+	Interval time.Duration
+	Batch    int
+	Clock    func() time.Time
+	Logger   *slog.Logger
+}
+
+// NewPublisher constructs a Publisher over src and log. It performs no I/O.
+func NewPublisher(src OutboxStore, log *Log, cfg PublisherConfig) *Publisher {
+	p := &Publisher{
+		src:      src,
+		log:      log,
+		interval: cfg.Interval,
+		batch:    cfg.Batch,
+		clock:    cfg.Clock,
+		logger:   cfg.Logger,
+	}
+	// Fill in defaults for anything the caller left unset.
+	if p.interval <= 0 {
+		p.interval = DefaultPublishInterval
+	}
+	if p.batch <= 0 {
+		p.batch = DefaultBatchSize
+	}
+	if p.clock == nil {
+		p.clock = time.Now
+	}
+	if p.logger == nil {
+		p.logger = slog.Default()
+	}
+	return p
+}
+
+// Start runs the drain loop until ctx is cancelled; the returned channel closes
+// when the loop has exited. The first drain runs one interval after Start (the
+// loop is ticker-driven); a failing drain is logged and the loop continues.
+func (p *Publisher) Start(ctx context.Context) <-chan struct{} {
+	done := make(chan struct{})
+	go func() {
+		defer close(done)
+		t := time.NewTicker(p.interval)
+		defer t.Stop()
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case <-t.C:
+				if err := p.Drain(ctx); err != nil {
+					p.logger.Error("cdc publisher: drain failed", "err", err)
+				}
+			}
+		}
+	}()
+	return done
+}
+
+// Drain runs one pass: append each undelivered row to the log in seq order,
+// marking it sent. A write failure stops the pass (the row is marked failed and
+// retried next tick) so ordering is never violated by skipping ahead.
+func (p *Publisher) Drain(ctx context.Context) error { + pending, err := p.src.ListUnsent(ctx, p.batch) + if err != nil { + return err + } + for _, pe := range pending { + if err := p.log.Append(pe.Event); err != nil { + p.logger.Error("cdc publisher: append failed", "outboxId", pe.OutboxID, "seq", pe.Seq, "err", err) + if merr := p.src.MarkFailed(ctx, pe.OutboxID, err.Error()); merr != nil { + p.logger.Error("cdc publisher: mark failed errored", "outboxId", pe.OutboxID, "err", merr) + } + return nil + } + if err := p.src.MarkSent(ctx, pe.OutboxID, p.clock().UTC()); err != nil { + return err + } + } + return nil +} diff --git a/backend/internal/config/config.go b/backend/internal/config/config.go index d6765dba..68aab00e 100644 --- a/backend/internal/config/config.go +++ b/backend/internal/config/config.go @@ -47,6 +47,9 @@ type Config struct { // RunFilePath is where the PID + port handshake file (running.json) is // written so the Electron supervisor can discover and reap the daemon. RunFilePath string + // DataDir is the directory holding durable state (the SQLite database and + // the CDC JSONL log). It is created on first use by the storage layer. + DataDir string } // Addr returns the host:port the HTTP server binds. It uses net.JoinHostPort so @@ -65,6 +68,7 @@ func (c Config) Addr() string { // AO_REQUEST_TIMEOUT per-request timeout (Go duration > 0, default 60s) // AO_SHUTDOWN_TIMEOUT shutdown deadline (Go duration > 0, default 10s) // AO_RUN_FILE running.json path (default /running.json) +// AO_DATA_DIR durable state dir (default /data) // // The bind host is not configurable: the daemon is loopback-only by design. 
func Load() (Config, error) { @@ -108,6 +112,12 @@ func Load() (Config, error) { } cfg.RunFilePath = runFile + dataDir, err := resolveDataDir() + if err != nil { + return Config{}, err + } + cfg.DataDir = dataDir + return cfg, nil } @@ -138,3 +148,17 @@ func resolveRunFilePath() (string, error) { } return filepath.Join(dir, "agent-orchestrator", "running.json"), nil } + +// resolveDataDir picks where durable state (SQLite DB, CDC JSONL) lives. An +// explicit AO_DATA_DIR wins; otherwise it sits under the per-user state +// directory alongside running.json. +func resolveDataDir() (string, error) { + if p, ok := os.LookupEnv("AO_DATA_DIR"); ok && p != "" { + return p, nil + } + dir, err := os.UserConfigDir() + if err != nil { + return "", fmt.Errorf("resolve state dir: %w", err) + } + return filepath.Join(dir, "agent-orchestrator", "data"), nil +} diff --git a/backend/internal/lifecycle/manager.go b/backend/internal/lifecycle/manager.go index b5751e86..54e6887f 100644 --- a/backend/internal/lifecycle/manager.go +++ b/backend/internal/lifecycle/manager.go @@ -53,6 +53,11 @@ type Manager struct { trackerMu sync.Mutex clock func() time.Time + // reactionStore, when wired via WithReactionStore, makes the trackers map a + // write-through cache over durable rows so a restart does not re-fire an + // already-escalated human page. nil keeps the in-memory-only default. + reactionStore ReactionStore + // sessionLister returns every session known to persistence so RunningSessions // can filter by runtime axis without coupling the LCM to a cross-project // store API the Tom-store does not yet expose. The daemon (lane #10) injects @@ -423,7 +428,7 @@ func (m *Manager) OnKillRequested(ctx context.Context, id domain.SessionID, r po // A kill is terminal but bypasses react()'s incident-over cleanup (it fires // no reaction). Drop any escalation trackers here so a later duration-based // TickEscalations can't emit reaction.escalated for a dead session. 
- m.clearSessionTrackers(id) + m.clearSessionTrackers(ctx, id) return nil } diff --git a/backend/internal/lifecycle/manager_parity_test.go b/backend/internal/lifecycle/manager_parity_test.go new file mode 100644 index 00000000..146dcc16 --- /dev/null +++ b/backend/internal/lifecycle/manager_parity_test.go @@ -0,0 +1,144 @@ +package lifecycle + +import ( + "context" + "testing" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" +) + +// TestStoreParity is the key contract test from the plan: it drives the REAL +// Lifecycle Manager through identical operation sequences against the in-memory +// fakeStore (the authoritative store semantics) and the SQLite-backed Store, +// then asserts the resulting canonical lifecycle is byte-identical. If the +// SQLite adapter honored the port exactly, the two managers cannot diverge. +// +// Both stores are seeded the same way (via the public Upsert insert path, so +// both start at revision 1) — this makes revision numbers, not just states, +// directly comparable. 
+func TestStoreParity(t *testing.T) { + seed := lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive) + seed.Activity = domain.ActivitySubstate{State: domain.ActivityActive, LastActivityAt: t0, Source: domain.SourceNative} + + cases := []struct { + name string + ops []func(*Manager) error + }{ + { + name: "runtime dead then activity signal", + ops: []func(*Manager) error{ + func(m *Manager) error { + return m.ApplyRuntimeObservation(context.Background(), sid, ports.RuntimeFacts{ + RuntimeState: ports.RuntimeProbeDead, ProcessState: ports.ProcessProbeDead, ObservedAt: t0, + }) + }, + func(m *Manager) error { + return m.ApplyActivitySignal(context.Background(), sid, ports.ActivitySignal{ + State: ports.SignalValid, Activity: domain.ActivityActive, Timestamp: t0, Source: domain.SourceHook, + }) + }, + }, + }, + { + name: "scm pr open then changes requested", + ops: []func(*Manager) error{ + func(m *Manager) error { + return m.ApplySCMObservation(context.Background(), sid, ports.SCMFacts{ + Fetched: true, PRState: domain.PROpen, PRNumber: 7, PRURL: "http://x/7", + }) + }, + }, + }, + { + name: "kill request terminates", + ops: []func(*Manager) error{ + func(m *Manager) error { + return m.OnKillRequested(context.Background(), sid, ports.KillReason{Kind: ports.KillManual, Detail: "x"}) + }, + }, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + fakeMgr, fakeS := newManager() + sqlMgr, sqlS := newSQLiteManager(t) + + seedViaUpsert(t, fakeS, seed) + seedViaUpsert(t, sqlS, seed) + + for i, op := range tc.ops { + errF := op(fakeMgr) + errS := op(sqlMgr) + if (errF == nil) != (errS == nil) { + t.Fatalf("op %d error divergence: fake=%v sqlite=%v", i, errF, errS) + } + } + + fl, okF, _ := fakeS.Load(context.Background(), sid) + sl, okS, _ := sqlS.Load(context.Background(), sid) + if okF != okS { + t.Fatalf("presence divergence: fake=%v sqlite=%v", okF, okS) + } + assertLifecycleEqual(t, fl, sl) + }) + } +} + +func 
newSQLiteManager(t *testing.T) (*Manager, *sqlite.Store) { + t.Helper() + db, err := sqlite.Open(t.TempDir()) + if err != nil { + t.Fatalf("open sqlite: %v", err) + } + t.Cleanup(func() { db.Close() }) + store := sqlite.NewStore(db) + return New(store, &recordingNotifier{}, &recordingMessenger{}), store +} + +func seedViaUpsert(t *testing.T, store ports.LifecycleStore, l domain.CanonicalSessionLifecycle) { + t.Helper() + rec := domain.SessionRecord{ + ID: sid, + ProjectID: "proj", + Kind: domain.KindWorker, + CreatedAt: t0, + UpdatedAt: t0, + Lifecycle: l, + } + if err := store.Upsert(context.Background(), rec, ports.EventSessionCreated); err != nil { + t.Fatalf("seed upsert: %v", err) + } +} + +func assertLifecycleEqual(t *testing.T, a, b domain.CanonicalSessionLifecycle) { + t.Helper() + if a.Revision != b.Revision { + t.Errorf("revision: fake=%d sqlite=%d", a.Revision, b.Revision) + } + if a.Session != b.Session { + t.Errorf("session: fake=%+v sqlite=%+v", a.Session, b.Session) + } + if a.PR != b.PR { + t.Errorf("pr: fake=%+v sqlite=%+v", a.PR, b.PR) + } + if a.Runtime != b.Runtime { + t.Errorf("runtime: fake=%+v sqlite=%+v", a.Runtime, b.Runtime) + } + if a.Activity.State != b.Activity.State || a.Activity.Source != b.Activity.Source || + !a.Activity.LastActivityAt.Equal(b.Activity.LastActivityAt) { + t.Errorf("activity: fake=%+v sqlite=%+v", a.Activity, b.Activity) + } + switch { + case a.Detecting == nil && b.Detecting == nil: + case a.Detecting == nil || b.Detecting == nil: + t.Errorf("detecting presence: fake=%v sqlite=%v", a.Detecting, b.Detecting) + default: + if a.Detecting.Attempts != b.Detecting.Attempts || a.Detecting.EvidenceHash != b.Detecting.EvidenceHash || + !a.Detecting.StartedAt.Equal(b.Detecting.StartedAt) { + t.Errorf("detecting: fake=%+v sqlite=%+v", a.Detecting, b.Detecting) + } + } +} diff --git a/backend/internal/lifecycle/reaction_durability_test.go b/backend/internal/lifecycle/reaction_durability_test.go new file mode 100644 index 
00000000..1866c8c9 --- /dev/null +++ b/backend/internal/lifecycle/reaction_durability_test.go @@ -0,0 +1,140 @@ +package lifecycle + +import ( + "context" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" +) + +// reactionStoreAdapter bridges the concrete *sqlite.Store to the lifecycle +// package's ReactionStore interface (string/row types <-> domain types). This is +// the same glue the composition root installs. +type reactionStoreAdapter struct{ s *sqlite.Store } + +func (a reactionStoreAdapter) LoadReactionTrackers(ctx context.Context) ([]PersistedTracker, error) { + rows, err := a.s.ListReactionTrackers(ctx) + if err != nil { + return nil, err + } + out := make([]PersistedTracker, len(rows)) + for i, r := range rows { + out[i] = PersistedTracker{ + SessionID: domain.SessionID(r.SessionID), + Key: r.ReactionKey, + Attempts: r.Attempts, + Escalated: r.Escalated, + FirstAttemptAt: r.FirstAttemptAt, + ProjectID: domain.ProjectID(r.ProjectID), + } + } + return out, nil +} + +func (a reactionStoreAdapter) SaveReactionTracker(ctx context.Context, t PersistedTracker) error { + return a.s.SaveReactionTracker(ctx, sqlite.ReactionTrackerRow{ + SessionID: string(t.SessionID), + ReactionKey: t.Key, + Attempts: t.Attempts, + Escalated: t.Escalated, + FirstAttemptAt: t.FirstAttemptAt, + ProjectID: string(t.ProjectID), + }) +} + +func (a reactionStoreAdapter) DeleteReactionTracker(ctx context.Context, id domain.SessionID, key string) error { + return a.s.DeleteReactionTracker(ctx, string(id), key) +} + +func (a reactionStoreAdapter) DeleteSessionReactionTrackers(ctx context.Context, id domain.SessionID) error { + return a.s.DeleteSessionReactionTrackers(ctx, string(id)) +} + +// TestReaction_DurabilitySurvivesRestart is the plan's reaction_trackers +// durability check: once a reaction has 
escalated, a daemon restart (a fresh +// Manager hydrated from the same store) must NOT re-fire the human page — the +// exact failure the in-memory-only version had. +func TestReaction_DurabilitySurvivesRestart(t *testing.T) { + db, err := sqlite.Open(t.TempDir()) + if err != nil { + t.Fatalf("open sqlite: %v", err) + } + t.Cleanup(func() { db.Close() }) + store := sqlite.NewStore(db) + adapter := reactionStoreAdapter{store} + + // --- first process lifetime: drive ci-failed to escalation --- + notf1 := &recordingNotifier{} + m1 := New(store, notf1, &recordingMessenger{}) + m1.clock = func() time.Time { return t0 } + if err := m1.WithReactionStore(context.Background(), adapter); err != nil { + t.Fatalf("hydrate m1: %v", err) + } + seedViaUpsert(t, store, lcOpenPR(domain.PRReasonReviewPending)) + + // ci-failed: retries 2, persistent → escalate on the third failure. + for i := 0; i < 4; i++ { + failCI(t, m1) + pendingCI(t, m1) + } + if c := notifyCount(notf1, "reaction.escalated"); c != 1 { + t.Fatalf("precondition: want one escalation in first lifetime, got %d", c) + } + + // --- simulated restart: a fresh Manager hydrated from the same store --- + notf2 := &recordingNotifier{} + msgr2 := &recordingMessenger{} + m2 := New(store, notf2, msgr2) + m2.clock = func() time.Time { return t0 } + if err := m2.WithReactionStore(context.Background(), adapter); err != nil { + t.Fatalf("hydrate m2: %v", err) + } + + // The ci-failed tracker rehydrates with escalated=true, so further failures + // are silenced: no new send-to-agent, no re-escalation. 
+ failCI(t, m2) + if c := notifyCount(notf2, "reaction.escalated"); c != 0 { + t.Errorf("restart re-fired an already-escalated page: got %d escalations", c) + } + if len(msgr2.sent) != 0 { + t.Errorf("restart re-sent to agent despite escalated budget: got %d sends", len(msgr2.sent)) + } +} + +// TestReaction_DurabilityClearsOnIncidentOver proves the durable rows are +// removed when an incident resolves, so a later unrelated incident starts from a +// fresh budget rather than a stale escalated=true. +func TestReaction_DurabilityClearsOnIncidentOver(t *testing.T) { + db, err := sqlite.Open(t.TempDir()) + if err != nil { + t.Fatalf("open sqlite: %v", err) + } + t.Cleanup(func() { db.Close() }) + store := sqlite.NewStore(db) + adapter := reactionStoreAdapter{store} + + m := New(store, &recordingNotifier{}, &recordingMessenger{}) + m.clock = func() time.Time { return t0 } + if err := m.WithReactionStore(context.Background(), adapter); err != nil { + t.Fatalf("hydrate: %v", err) + } + seedViaUpsert(t, store, lcOpenPR(domain.PRReasonReviewPending)) + + failCI(t, m) + if rows, _ := store.ListReactionTrackers(context.Background()); len(rows) == 0 { + t.Fatalf("precondition: expected a persisted ci-failed tracker") + } + + // Approved+green ends the incident → recovered() clears every tracker. 
+ if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ + Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewApproved, CISummary: ports.CIPassing, PRNumber: 7, + }); err != nil { + t.Fatalf("recover: %v", err) + } + if rows, _ := store.ListReactionTrackers(context.Background()); len(rows) != 0 { + t.Errorf("incident-over must clear durable trackers, got %d rows", len(rows)) + } +} diff --git a/backend/internal/lifecycle/reaction_store.go b/backend/internal/lifecycle/reaction_store.go new file mode 100644 index 00000000..f8da7415 --- /dev/null +++ b/backend/internal/lifecycle/reaction_store.go @@ -0,0 +1,94 @@ +package lifecycle + +// reaction_store.go is the optional durability seam for the escalation engine. +// By default the Manager keeps escalation budgets in memory only (a restart +// resets them, which costs at most a few extra agent retries — never a missed +// human page). When a ReactionStore is wired via WithReactionStore the in-memory +// map becomes a write-through cache over durable rows, so a restart does NOT +// re-fire an already-escalated human notification. +// +// The interface uses lifecycle-local types so the package stays free of any +// storage dependency; the composition root adapts the concrete store to it +// (mirroring the cdc.OutboxStore adapter). + +import ( + "context" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" +) + +// PersistedTracker is the durable form of one (session,reaction) escalation +// budget — the storage-facing mirror of the in-memory reactionTracker. +type PersistedTracker struct { + SessionID domain.SessionID + Key string + Attempts int + Escalated bool + FirstAttemptAt time.Time + ProjectID domain.ProjectID +} + +// ReactionStore persists escalation budgets so they survive a daemon restart. 
+type ReactionStore interface {
+	// LoadReactionTrackers returns every persisted tracker; WithReactionStore
+	// calls it once to hydrate the in-memory map.
+	LoadReactionTrackers(ctx context.Context) ([]PersistedTracker, error)
+	// SaveReactionTracker writes one tracker's current state.
+	SaveReactionTracker(ctx context.Context, t PersistedTracker) error
+	// DeleteReactionTracker removes the tracker for one (session, key) pair.
+	DeleteReactionTracker(ctx context.Context, id domain.SessionID, key string) error
+	// DeleteSessionReactionTrackers removes every tracker of one session.
+	DeleteSessionReactionTrackers(ctx context.Context, id domain.SessionID) error
+}
+
+// WithReactionStore makes escalation budgets durable: it hydrates the in-memory
+// trackers from rs and turns on write-through for subsequent mutations. Like
+// WithSessionLister it must be called BEFORE any reaper or Apply* dispatch
+// starts, since it populates the tracker map without holding trackerMu against
+// concurrent reactors. A hydration error is returned so the caller can decide
+// whether to proceed with an empty (in-memory) budget set.
+//
+// Note that rs is installed before the load is attempted, so when an error is
+// returned the tracker map is left as it was but write-through to rs remains
+// enabled for later mutations.
+func (m *Manager) WithReactionStore(ctx context.Context, rs ReactionStore) error {
+	m.reactionStore = rs
+	rows, err := rs.LoadReactionTrackers(ctx)
+	if err != nil {
+		return err
+	}
+	// Rebuild one in-memory tracker per persisted row, keyed by (session, key).
+	for _, r := range rows {
+		m.trackers[trackerKey{id: r.SessionID, key: reactionKey(r.Key)}] = &reactionTracker{
+			attempts:       r.Attempts,
+			escalated:      r.Escalated,
+			firstAttemptAt: r.FirstAttemptAt,
+			projectID:      r.ProjectID,
+		}
+	}
+	return nil
+}
+
+// persistTracker write-throughs one tracker's current state. Best-effort: a
+// failed write degrades durability to the in-memory default (a restart may
+// re-fire one page), so it must not break the synchronous dispatch path. The
+// snapshot is taken by the caller under trackerMu and passed by value here so no
+// DB I/O happens while the lock is held.
+func (m *Manager) persistTracker(ctx context.Context, id domain.SessionID, key reactionKey, snap reactionTracker) { + if m.reactionStore == nil { + return + } + _ = m.reactionStore.SaveReactionTracker(ctx, PersistedTracker{ + SessionID: id, + Key: string(key), + Attempts: snap.attempts, + Escalated: snap.escalated, + FirstAttemptAt: snap.firstAttemptAt, + ProjectID: snap.projectID, + }) +} + +func (m *Manager) deletePersistedTracker(ctx context.Context, id domain.SessionID, key reactionKey) { + if m.reactionStore == nil { + return + } + _ = m.reactionStore.DeleteReactionTracker(ctx, id, string(key)) +} + +func (m *Manager) deletePersistedSessionTrackers(ctx context.Context, id domain.SessionID) { + if m.reactionStore == nil { + return + } + _ = m.reactionStore.DeleteSessionReactionTrackers(ctx, id) +} diff --git a/backend/internal/lifecycle/reactions.go b/backend/internal/lifecycle/reactions.go index 26dea562..ac4de400 100644 --- a/backend/internal/lifecycle/reactions.go +++ b/backend/internal/lifecycle/reactions.go @@ -233,14 +233,14 @@ func (m *Manager) react(ctx context.Context, id domain.SessionID, tr *transition // transition is typically review_pending->approved (beforeKey empty), so // clearing only beforeKey would leak the ci-failed tracker and leave its // escalated=true to silence a future regression. Clear them all. - m.clearSessionTrackers(id) + m.clearSessionTrackers(ctx, id) case hadBefore && (!hasAfter || changed): // Within an unresolved open PR: a normal tracker resets when its state is // left. A persistent one (ci-failed) is NOT cleared here — it must survive // the ambiguous review_pending limbo (the fail->pending->fail flap, §4.2); // it only resets via the recovery/incident-over branch above. 
if !defaultReactions[beforeKey].persistent { - m.clearTracker(id, beforeKey) + m.clearTracker(ctx, id, beforeKey) } } @@ -324,13 +324,21 @@ func (m *Manager) sendToAgent(ctx context.Context, id domain.SessionID, projectI tk.firstAttemptAt = now } tk.attempts++ - if shouldEscalate(tk, cfg, now) { + escalateNow := shouldEscalate(tk, cfg, now) + if escalateNow { tk.escalated = true - m.trackerMu.Unlock() - return m.escalate(ctx, id, tk.projectID, key) } + snap := *tk m.trackerMu.Unlock() + // Write through the new budget (incl. escalated) before dispatching, so a + // crash between persist and notify re-fires at most the same page on restart. + m.persistTracker(ctx, id, key, snap) + + if escalateNow { + return m.escalate(ctx, id, snap.projectID, key) + } + if err := m.messenger.Send(ctx, id, composeMessage(cfg, rc)); err != nil { // A delivery failure must not consume escalation budget: roll this // attempt back so the next relevant transition retries from the same @@ -341,7 +349,9 @@ func (m *Manager) sendToAgent(ctx context.Context, id domain.SessionID, projectI if freshFirst { tk.firstAttemptAt = time.Time{} } + rolled := *tk m.trackerMu.Unlock() + m.persistTracker(ctx, id, key, rolled) return err } return nil @@ -393,16 +403,17 @@ func (m *Manager) trackerFor(id domain.SessionID, key reactionKey) *reactionTrac return tk } -func (m *Manager) clearTracker(id domain.SessionID, key reactionKey) { +func (m *Manager) clearTracker(ctx context.Context, id domain.SessionID, key reactionKey) { m.trackerMu.Lock() delete(m.trackers, trackerKey{id: id, key: key}) m.trackerMu.Unlock() + m.deletePersistedTracker(ctx, id, key) } // clearSessionTrackers drops every tracker for a session — used when its // incident is over, so no budget (and no stale escalated=true) survives into a // later unrelated incident. 
-func (m *Manager) clearSessionTrackers(id domain.SessionID) { +func (m *Manager) clearSessionTrackers(ctx context.Context, id domain.SessionID) { m.trackerMu.Lock() for k := range m.trackers { if k.id == id { @@ -410,6 +421,7 @@ func (m *Manager) clearSessionTrackers(id domain.SessionID) { } } m.trackerMu.Unlock() + m.deletePersistedSessionTrackers(ctx, id) } // TickEscalations fires the duration-based escalations the synchronous LCM @@ -421,6 +433,7 @@ func (m *Manager) TickEscalations(ctx context.Context, now time.Time) error { id domain.SessionID projectID domain.ProjectID key reactionKey + snap reactionTracker } var fire []due @@ -432,12 +445,13 @@ func (m *Manager) TickEscalations(ctx context.Context, now time.Time) error { cfg := defaultReactions[k.key] if cfg.escalateAfter > 0 && !tk.firstAttemptAt.IsZero() && now.Sub(tk.firstAttemptAt) >= cfg.escalateAfter { tk.escalated = true - fire = append(fire, due{id: k.id, projectID: tk.projectID, key: k.key}) + fire = append(fire, due{id: k.id, projectID: tk.projectID, key: k.key, snap: *tk}) } } m.trackerMu.Unlock() for _, d := range fire { + m.persistTracker(ctx, d.id, d.key, d.snap) if err := m.escalate(ctx, d.id, d.projectID, d.key); err != nil { return err } diff --git a/backend/internal/storage/sqlite/cdc_store.go b/backend/internal/storage/sqlite/cdc_store.go new file mode 100644 index 00000000..3386f988 --- /dev/null +++ b/backend/internal/storage/sqlite/cdc_store.go @@ -0,0 +1,104 @@ +package sqlite + +import ( + "context" + "database/sql" + "errors" + "fmt" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" +) + +// OutboxEvent is a single undelivered change, joined from outbox + change_log. +// It is the unit the CDC publisher drains to JSONL. +type OutboxEvent struct { + OutboxID int64 + Seq int64 + SessionID string + EventType string + Revision int64 + Payload string + CreatedAt time.Time +} + +// ListUnsent returns up to limit undelivered events in seq order. 
+func (s *Store) ListUnsent(ctx context.Context, limit int) ([]OutboxEvent, error) {
+	// NOTE(review): limit goes straight into the query's LIMIT clause, and SQLite
+	// treats a negative LIMIT as "no upper bound" — confirm callers always pass a
+	// positive batch size.
+	rows, err := s.q.ListUnsentOutbox(ctx, int64(limit))
+	if err != nil {
+		return nil, fmt.Errorf("list unsent outbox: %w", err)
+	}
+	// Flatten the joined outbox + change_log rows into the publisher-facing shape.
+	out := make([]OutboxEvent, 0, len(rows))
+	for _, r := range rows {
+		out = append(out, OutboxEvent{
+			OutboxID:  r.ID,
+			Seq:       r.ChangeLogSeq,
+			SessionID: r.SessionID,
+			EventType: r.EventType,
+			Revision:  r.Revision,
+			Payload:   r.Payload,
+			CreatedAt: r.CreatedAt,
+		})
+	}
+	return out, nil
+}
+
+// MarkSent flags an outbox row delivered and records at as its sent_at time.
+// An unknown outboxID is not reported: the underlying UPDATE simply matches no
+// rows. Any store error is wrapped with the outbox id for context.
+func (s *Store) MarkSent(ctx context.Context, outboxID int64, at time.Time) error {
+	err := s.q.MarkOutboxSent(ctx, gen.MarkOutboxSentParams{
+		SentAt: sql.NullTime{Time: at, Valid: true},
+		ID:     outboxID,
+	})
+	if err != nil {
+		return fmt.Errorf("mark outbox %d sent: %w", outboxID, err)
+	}
+	return nil
+}
+
+// MarkFailed bumps the attempt count and records the last error for an outbox row.
+// An unknown outboxID is not reported: the underlying UPDATE simply matches no
+// rows. Any store error is wrapped with the outbox id for context.
+func (s *Store) MarkFailed(ctx context.Context, outboxID int64, errMsg string) error {
+	err := s.q.MarkOutboxFailed(ctx, gen.MarkOutboxFailedParams{LastError: errMsg, ID: outboxID})
+	if err != nil {
+		return fmt.Errorf("mark outbox %d failed: %w", outboxID, err)
+	}
+	return nil
+}
+
+// GetOffset returns a consumer's last acknowledged seq (0 if it has none).
+func (s *Store) GetOffset(ctx context.Context, consumer string) (int64, error) {
+	seq, err := s.q.GetConsumerOffset(ctx, consumer)
+	// A consumer with no stored row has simply not acknowledged anything yet.
+	if errors.Is(err, sql.ErrNoRows) {
+		return 0, nil
+	}
+	if err != nil {
+		return 0, fmt.Errorf("get consumer offset %s: %w", consumer, err)
+	}
+	return seq, nil
+}
+
+// SetOffset durably records a consumer's acknowledged seq, inserting the row on
+// first use and overwriting it afterwards. Any store error is wrapped with the
+// consumer name for context.
+func (s *Store) SetOffset(ctx context.Context, consumer string, seq int64, at time.Time) error {
+	err := s.q.UpsertConsumerOffset(ctx, gen.UpsertConsumerOffsetParams{
+		Consumer:  consumer,
+		LastSeq:   seq,
+		UpdatedAt: at,
+	})
+	if err != nil {
+		return fmt.Errorf("set consumer offset %s: %w", consumer, err)
+	}
+	return nil
+}
+
+// MaxChangeLogSeq returns the highest change_log seq (0 if empty). Used by the
+// consumer to resume after a snapshot resync.
+func (s *Store) MaxChangeLogSeq(ctx context.Context) (int64, error) {
+	v, err := s.q.MaxChangeLogSeq(ctx)
+	if err != nil {
+		return 0, fmt.Errorf("max change_log seq: %w", err)
+	}
+	return v, nil
+}
+
+// MinConsumerOffset returns the lowest acknowledged seq across all consumers
+// (0 if none). The janitor uses it as the safe outbox-deletion watermark.
+func (s *Store) MinConsumerOffset(ctx context.Context) (int64, error) {
+	v, err := s.q.MinConsumerOffset(ctx)
+	if err != nil {
+		return 0, fmt.Errorf("min consumer offset: %w", err)
+	}
+	return v, nil
+}
+
+// DeleteSentOutboxBelow removes delivered outbox rows whose seq is strictly
+// below the watermark, returning the number removed. Rows not yet marked sent
+// are never deleted, whatever their seq. Any store error is wrapped with the
+// watermark for context, and the count is 0 on error.
+func (s *Store) DeleteSentOutboxBelow(ctx context.Context, seq int64) (int64, error) {
+	n, err := s.q.DeleteSentOutboxBelow(ctx, seq)
+	if err != nil {
+		return 0, fmt.Errorf("delete sent outbox below seq %d: %w", seq, err)
+	}
+	return n, nil
+}
diff --git a/backend/internal/storage/sqlite/db.go b/backend/internal/storage/sqlite/db.go new file mode 100644 index 00000000..78eb3ae9 --- /dev/null +++ b/backend/internal/storage/sqlite/db.go @@ -0,0 +1,63 @@ +// Package sqlite is the durable persistence adapter behind ports.LifecycleStore. +// It owns the SQLite schema (goose migrations), the revision-CAS upsert, and the +// transactional outbox (one txn writes the session row, a change_log entry, and +// the outbox row that the CDC publisher later drains to JSONL). +package sqlite + +import ( + "database/sql" + "embed" + "fmt" + "os" + "path/filepath" + + "github.com/pressly/goose/v3" + _ "modernc.org/sqlite" +) + +//go:embed migrations/*.sql +var migrationsFS embed.FS + +// pragmas are applied on every connection open. WAL + NORMAL gives concurrent +// reads alongside the single writer; busy_timeout absorbs brief writer +// contention; foreign_keys enforces the session_metadata cascade.
+const pragmas = "?_pragma=journal_mode(WAL)" +
+	"&_pragma=busy_timeout(5000)" +
+	"&_pragma=foreign_keys(ON)" +
+	"&_pragma=synchronous(NORMAL)"
+
+// Open opens (creating if absent) the SQLite database under dataDir, applies the
+// connection pragmas, and runs all goose migrations up. The returned *sql.DB is
+// safe for the single-writer / many-reader workload the LCM and readers impose.
+//
+// The pool is capped at one connection, so every statement — reads included —
+// is serialized through it. On any failure the returned *sql.DB is nil; if the
+// migrations fail the handle is closed before returning, and a close failure is
+// reported alongside the migration error rather than dropped.
+func Open(dataDir string) (*sql.DB, error) {
+	if err := os.MkdirAll(dataDir, 0o755); err != nil {
+		return nil, fmt.Errorf("create data dir: %w", err)
+	}
+	// NOTE(review): the path is concatenated into the DSN unescaped, so a dataDir
+	// containing '?' would presumably be mis-split from the pragma query string —
+	// confirm against the driver's DSN parsing if arbitrary paths are allowed.
+	dsn := "file:" + filepath.Join(dataDir, "ao.db") + pragmas
+	db, err := sql.Open("sqlite", dsn)
+	if err != nil {
+		return nil, fmt.Errorf("open sqlite: %w", err)
+	}
+	// Single writer: serialize all access through one connection so WAL's
+	// single-writer rule is never violated by the pool handing out a second
+	// writable conn mid-transaction.
+	db.SetMaxOpenConns(1)
+
+	if err := migrate(db); err != nil {
+		// Do not leak the handle on a failed migration. The migration error stays
+		// the wrapped cause; a close failure is appended as detail.
+		if cerr := db.Close(); cerr != nil {
+			return nil, fmt.Errorf("%w (closing db after failed migration: %v)", err, cerr)
+		}
+		return nil, err
+	}
+	return db, nil
+}
+
+// migrate runs every embedded goose migration up against db. The goose setters
+// used here are package-level calls, so they configure goose for the whole
+// process, not just this database handle.
+func migrate(db *sql.DB) error {
+	goose.SetBaseFS(migrationsFS)
+	goose.SetLogger(goose.NopLogger())
+	if err := goose.SetDialect("sqlite3"); err != nil {
+		return fmt.Errorf("set goose dialect: %w", err)
+	}
+	// "migrations" is the directory inside migrationsFS, matching the embed pattern.
+	if err := goose.Up(db, "migrations"); err != nil {
+		return fmt.Errorf("run migrations: %w", err)
+	}
+	return nil
+}
diff --git a/backend/internal/storage/sqlite/gen/cdc.sql.go b/backend/internal/storage/sqlite/gen/cdc.sql.go new file mode 100644 index 00000000..c2eedc8c --- /dev/null +++ b/backend/internal/storage/sqlite/gen/cdc.sql.go @@ -0,0 +1,199 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.31.1 +// source: cdc.sql + +package gen + +import ( + "context" + "database/sql" + "time" +) + +const deleteSentOutboxBelow = `-- name: DeleteSentOutboxBelow :execrows +DELETE FROM outbox WHERE sent = 1 AND change_log_seq < ?
+` + +func (q *Queries) DeleteSentOutboxBelow(ctx context.Context, changeLogSeq int64) (int64, error) { + result, err := q.db.ExecContext(ctx, deleteSentOutboxBelow, changeLogSeq) + if err != nil { + return 0, err + } + return result.RowsAffected() +} + +const getConsumerOffset = `-- name: GetConsumerOffset :one +SELECT last_seq FROM consumer_offsets WHERE consumer = ? +` + +func (q *Queries) GetConsumerOffset(ctx context.Context, consumer string) (int64, error) { + row := q.db.QueryRowContext(ctx, getConsumerOffset, consumer) + var last_seq int64 + err := row.Scan(&last_seq) + return last_seq, err +} + +const insertChangeLog = `-- name: InsertChangeLog :one +INSERT INTO change_log (session_id, event_type, revision, payload, created_at) +VALUES (?, ?, ?, ?, ?) +RETURNING seq +` + +type InsertChangeLogParams struct { + SessionID string + EventType string + Revision int64 + Payload string + CreatedAt time.Time +} + +// Appends a canonical-write record and returns its monotonic seq so the same +// transaction can thread it into the outbox row. +func (q *Queries) InsertChangeLog(ctx context.Context, arg InsertChangeLogParams) (int64, error) { + row := q.db.QueryRowContext(ctx, insertChangeLog, + arg.SessionID, + arg.EventType, + arg.Revision, + arg.Payload, + arg.CreatedAt, + ) + var seq int64 + err := row.Scan(&seq) + return seq, err +} + +const insertOutbox = `-- name: InsertOutbox :exec +INSERT INTO outbox (change_log_seq, created_at) +VALUES (?, ?) 
+` + +type InsertOutboxParams struct { + ChangeLogSeq int64 + CreatedAt time.Time +} + +func (q *Queries) InsertOutbox(ctx context.Context, arg InsertOutboxParams) error { + _, err := q.db.ExecContext(ctx, insertOutbox, arg.ChangeLogSeq, arg.CreatedAt) + return err +} + +const listUnsentOutbox = `-- name: ListUnsentOutbox :many +SELECT o.id, o.change_log_seq, o.attempts, + c.session_id, c.event_type, c.revision, c.payload, c.created_at +FROM outbox o +JOIN change_log c ON c.seq = o.change_log_seq +WHERE o.sent = 0 +ORDER BY o.change_log_seq +LIMIT ? +` + +type ListUnsentOutboxRow struct { + ID int64 + ChangeLogSeq int64 + Attempts int64 + SessionID string + EventType string + Revision int64 + Payload string + CreatedAt time.Time +} + +func (q *Queries) ListUnsentOutbox(ctx context.Context, limit int64) ([]ListUnsentOutboxRow, error) { + rows, err := q.db.QueryContext(ctx, listUnsentOutbox, limit) + if err != nil { + return nil, err + } + defer rows.Close() + items := []ListUnsentOutboxRow{} + for rows.Next() { + var i ListUnsentOutboxRow + if err := rows.Scan( + &i.ID, + &i.ChangeLogSeq, + &i.Attempts, + &i.SessionID, + &i.EventType, + &i.Revision, + &i.Payload, + &i.CreatedAt, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const markOutboxFailed = `-- name: MarkOutboxFailed :exec +UPDATE outbox SET attempts = attempts + 1, last_error = ? WHERE id = ? +` + +type MarkOutboxFailedParams struct { + LastError string + ID int64 +} + +func (q *Queries) MarkOutboxFailed(ctx context.Context, arg MarkOutboxFailedParams) error { + _, err := q.db.ExecContext(ctx, markOutboxFailed, arg.LastError, arg.ID) + return err +} + +const markOutboxSent = `-- name: MarkOutboxSent :exec +UPDATE outbox SET sent = 1, sent_at = ? WHERE id = ? 
+` + +type MarkOutboxSentParams struct { + SentAt sql.NullTime + ID int64 +} + +func (q *Queries) MarkOutboxSent(ctx context.Context, arg MarkOutboxSentParams) error { + _, err := q.db.ExecContext(ctx, markOutboxSent, arg.SentAt, arg.ID) + return err +} + +const maxChangeLogSeq = `-- name: MaxChangeLogSeq :one +SELECT CAST(COALESCE(MAX(seq), 0) AS INTEGER) FROM change_log +` + +func (q *Queries) MaxChangeLogSeq(ctx context.Context) (int64, error) { + row := q.db.QueryRowContext(ctx, maxChangeLogSeq) + var column_1 int64 + err := row.Scan(&column_1) + return column_1, err +} + +const minConsumerOffset = `-- name: MinConsumerOffset :one +SELECT CAST(COALESCE(MIN(last_seq), 0) AS INTEGER) FROM consumer_offsets +` + +func (q *Queries) MinConsumerOffset(ctx context.Context) (int64, error) { + row := q.db.QueryRowContext(ctx, minConsumerOffset) + var column_1 int64 + err := row.Scan(&column_1) + return column_1, err +} + +const upsertConsumerOffset = `-- name: UpsertConsumerOffset :exec +INSERT INTO consumer_offsets (consumer, last_seq, updated_at) +VALUES (?, ?, ?) +ON CONFLICT (consumer) DO UPDATE SET last_seq = excluded.last_seq, updated_at = excluded.updated_at +` + +type UpsertConsumerOffsetParams struct { + Consumer string + LastSeq int64 + UpdatedAt time.Time +} + +func (q *Queries) UpsertConsumerOffset(ctx context.Context, arg UpsertConsumerOffsetParams) error { + _, err := q.db.ExecContext(ctx, upsertConsumerOffset, arg.Consumer, arg.LastSeq, arg.UpdatedAt) + return err +} diff --git a/backend/internal/storage/sqlite/gen/db.go b/backend/internal/storage/sqlite/gen/db.go new file mode 100644 index 00000000..b6fcf6be --- /dev/null +++ b/backend/internal/storage/sqlite/gen/db.go @@ -0,0 +1,31 @@ +// Code generated by sqlc. DO NOT EDIT. 
+// versions: +// sqlc v1.31.1 + +package gen + +import ( + "context" + "database/sql" +) + +type DBTX interface { + ExecContext(context.Context, string, ...interface{}) (sql.Result, error) + PrepareContext(context.Context, string) (*sql.Stmt, error) + QueryContext(context.Context, string, ...interface{}) (*sql.Rows, error) + QueryRowContext(context.Context, string, ...interface{}) *sql.Row +} + +func New(db DBTX) *Queries { + return &Queries{db: db} +} + +type Queries struct { + db DBTX +} + +func (q *Queries) WithTx(tx *sql.Tx) *Queries { + return &Queries{ + db: tx, + } +} diff --git a/backend/internal/storage/sqlite/gen/metadata.sql.go b/backend/internal/storage/sqlite/gen/metadata.sql.go new file mode 100644 index 00000000..96510eb8 --- /dev/null +++ b/backend/internal/storage/sqlite/gen/metadata.sql.go @@ -0,0 +1,59 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.31.1 +// source: metadata.sql + +package gen + +import ( + "context" +) + +const getMetadata = `-- name: GetMetadata :many +SELECT key, value FROM session_metadata WHERE session_id = ? +` + +type GetMetadataRow struct { + Key string + Value string +} + +func (q *Queries) GetMetadata(ctx context.Context, sessionID string) ([]GetMetadataRow, error) { + rows, err := q.db.QueryContext(ctx, getMetadata, sessionID) + if err != nil { + return nil, err + } + defer rows.Close() + items := []GetMetadataRow{} + for rows.Next() { + var i GetMetadataRow + if err := rows.Scan(&i.Key, &i.Value); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const upsertMetadata = `-- name: UpsertMetadata :exec +INSERT INTO session_metadata (session_id, key, value) +VALUES (?, ?, ?) 
+ON CONFLICT (session_id, key) DO UPDATE SET value = excluded.value +` + +type UpsertMetadataParams struct { + SessionID string + Key string + Value string +} + +func (q *Queries) UpsertMetadata(ctx context.Context, arg UpsertMetadataParams) error { + _, err := q.db.ExecContext(ctx, upsertMetadata, arg.SessionID, arg.Key, arg.Value) + return err +} diff --git a/backend/internal/storage/sqlite/gen/models.go b/backend/internal/storage/sqlite/gen/models.go new file mode 100644 index 00000000..210fe245 --- /dev/null +++ b/backend/internal/storage/sqlite/gen/models.go @@ -0,0 +1,74 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.31.1 + +package gen + +import ( + "database/sql" + "time" +) + +type ChangeLog struct { + Seq int64 + SessionID string + EventType string + Revision int64 + Payload string + CreatedAt time.Time +} + +type ConsumerOffset struct { + Consumer string + LastSeq int64 + UpdatedAt time.Time +} + +type Outbox struct { + ID int64 + ChangeLogSeq int64 + Sent int64 + SentAt sql.NullTime + Attempts int64 + LastError string + CreatedAt time.Time +} + +type ReactionTracker struct { + SessionID string + ReactionKey string + Attempts int64 + Escalated int64 + FirstAttemptAt sql.NullTime + ProjectID string +} + +type Session struct { + ID string + ProjectID string + IssueID string + Kind string + CreatedAt time.Time + UpdatedAt time.Time + Revision int64 + SessionState string + SessionReason string + PrState string + PrReason string + PrNumber int64 + PrUrl string + RuntimeState string + RuntimeReason string + ActivityState string + ActivityLastAt time.Time + ActivitySource string + DetectingAttempts sql.NullInt64 + DetectingStartedAt sql.NullTime + DetectingEvidenceHash sql.NullString +} + +type SessionMetadatum struct { + SessionID string + Key string + Value string +} diff --git a/backend/internal/storage/sqlite/gen/querier.go b/backend/internal/storage/sqlite/gen/querier.go new file mode 100644 index 00000000..074fe053 --- /dev/null +++ 
b/backend/internal/storage/sqlite/gen/querier.go @@ -0,0 +1,42 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.31.1 + +package gen + +import ( + "context" +) + +type Querier interface { + DeleteReactionTracker(ctx context.Context, arg DeleteReactionTrackerParams) error + DeleteSentOutboxBelow(ctx context.Context, changeLogSeq int64) (int64, error) + DeleteSessionReactionTrackers(ctx context.Context, sessionID string) error + GetConsumerOffset(ctx context.Context, consumer string) (int64, error) + GetMetadata(ctx context.Context, sessionID string) ([]GetMetadataRow, error) + GetSession(ctx context.Context, id string) (Session, error) + GetSessionRevision(ctx context.Context, id string) (int64, error) + // Appends a canonical-write record and returns its monotonic seq so the same + // transaction can thread it into the outbox row. + InsertChangeLog(ctx context.Context, arg InsertChangeLogParams) (int64, error) + InsertOutbox(ctx context.Context, arg InsertOutboxParams) error + // CAS insert: only succeeds for a brand-new id. Incoming revision must be 0; + // the row is persisted at revision 1. + InsertSession(ctx context.Context, arg InsertSessionParams) (int64, error) + ListAllSessions(ctx context.Context) ([]Session, error) + ListReactionTrackers(ctx context.Context) ([]ReactionTracker, error) + ListSessionsByProject(ctx context.Context, projectID string) ([]Session, error) + ListUnsentOutbox(ctx context.Context, limit int64) ([]ListUnsentOutboxRow, error) + MarkOutboxFailed(ctx context.Context, arg MarkOutboxFailedParams) error + MarkOutboxSent(ctx context.Context, arg MarkOutboxSentParams) error + MaxChangeLogSeq(ctx context.Context) (int64, error) + MinConsumerOffset(ctx context.Context) (int64, error) + // CAS update: succeeds only when the stored revision equals the caller's loaded + // revision (@expected_revision). 0 rows affected => revision mismatch. 
+ UpdateSessionCAS(ctx context.Context, arg UpdateSessionCASParams) (int64, error) + UpsertConsumerOffset(ctx context.Context, arg UpsertConsumerOffsetParams) error + UpsertMetadata(ctx context.Context, arg UpsertMetadataParams) error + UpsertReactionTracker(ctx context.Context, arg UpsertReactionTrackerParams) error +} + +var _ Querier = (*Queries)(nil) diff --git a/backend/internal/storage/sqlite/gen/reactions.sql.go b/backend/internal/storage/sqlite/gen/reactions.sql.go new file mode 100644 index 00000000..dc7b01c2 --- /dev/null +++ b/backend/internal/storage/sqlite/gen/reactions.sql.go @@ -0,0 +1,100 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.31.1 +// source: reactions.sql + +package gen + +import ( + "context" + "database/sql" +) + +const deleteReactionTracker = `-- name: DeleteReactionTracker :exec +DELETE FROM reaction_trackers WHERE session_id = ? AND reaction_key = ? +` + +type DeleteReactionTrackerParams struct { + SessionID string + ReactionKey string +} + +func (q *Queries) DeleteReactionTracker(ctx context.Context, arg DeleteReactionTrackerParams) error { + _, err := q.db.ExecContext(ctx, deleteReactionTracker, arg.SessionID, arg.ReactionKey) + return err +} + +const deleteSessionReactionTrackers = `-- name: DeleteSessionReactionTrackers :exec +DELETE FROM reaction_trackers WHERE session_id = ? 
+` + +func (q *Queries) DeleteSessionReactionTrackers(ctx context.Context, sessionID string) error { + _, err := q.db.ExecContext(ctx, deleteSessionReactionTrackers, sessionID) + return err +} + +const listReactionTrackers = `-- name: ListReactionTrackers :many +SELECT session_id, reaction_key, attempts, escalated, first_attempt_at, project_id +FROM reaction_trackers +` + +func (q *Queries) ListReactionTrackers(ctx context.Context) ([]ReactionTracker, error) { + rows, err := q.db.QueryContext(ctx, listReactionTrackers) + if err != nil { + return nil, err + } + defer rows.Close() + items := []ReactionTracker{} + for rows.Next() { + var i ReactionTracker + if err := rows.Scan( + &i.SessionID, + &i.ReactionKey, + &i.Attempts, + &i.Escalated, + &i.FirstAttemptAt, + &i.ProjectID, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const upsertReactionTracker = `-- name: UpsertReactionTracker :exec +INSERT INTO reaction_trackers (session_id, reaction_key, attempts, escalated, first_attempt_at, project_id) +VALUES (?, ?, ?, ?, ?, ?) 
+ON CONFLICT (session_id, reaction_key) DO UPDATE SET + attempts = excluded.attempts, + escalated = excluded.escalated, + first_attempt_at = excluded.first_attempt_at, + project_id = excluded.project_id +` + +type UpsertReactionTrackerParams struct { + SessionID string + ReactionKey string + Attempts int64 + Escalated int64 + FirstAttemptAt sql.NullTime + ProjectID string +} + +func (q *Queries) UpsertReactionTracker(ctx context.Context, arg UpsertReactionTrackerParams) error { + _, err := q.db.ExecContext(ctx, upsertReactionTracker, + arg.SessionID, + arg.ReactionKey, + arg.Attempts, + arg.Escalated, + arg.FirstAttemptAt, + arg.ProjectID, + ) + return err +} diff --git a/backend/internal/storage/sqlite/gen/sessions.sql.go b/backend/internal/storage/sqlite/gen/sessions.sql.go new file mode 100644 index 00000000..00d97ad6 --- /dev/null +++ b/backend/internal/storage/sqlite/gen/sessions.sql.go @@ -0,0 +1,307 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.31.1 +// source: sessions.sql + +package gen + +import ( + "context" + "database/sql" + "time" +) + +const getSession = `-- name: GetSession :one +SELECT id, project_id, issue_id, kind, created_at, updated_at, revision, session_state, session_reason, pr_state, pr_reason, pr_number, pr_url, runtime_state, runtime_reason, activity_state, activity_last_at, activity_source, detecting_attempts, detecting_started_at, detecting_evidence_hash FROM sessions WHERE id = ? 
+` + +func (q *Queries) GetSession(ctx context.Context, id string) (Session, error) { + row := q.db.QueryRowContext(ctx, getSession, id) + var i Session + err := row.Scan( + &i.ID, + &i.ProjectID, + &i.IssueID, + &i.Kind, + &i.CreatedAt, + &i.UpdatedAt, + &i.Revision, + &i.SessionState, + &i.SessionReason, + &i.PrState, + &i.PrReason, + &i.PrNumber, + &i.PrUrl, + &i.RuntimeState, + &i.RuntimeReason, + &i.ActivityState, + &i.ActivityLastAt, + &i.ActivitySource, + &i.DetectingAttempts, + &i.DetectingStartedAt, + &i.DetectingEvidenceHash, + ) + return i, err +} + +const getSessionRevision = `-- name: GetSessionRevision :one +SELECT revision FROM sessions WHERE id = ? +` + +func (q *Queries) GetSessionRevision(ctx context.Context, id string) (int64, error) { + row := q.db.QueryRowContext(ctx, getSessionRevision, id) + var revision int64 + err := row.Scan(&revision) + return revision, err +} + +const insertSession = `-- name: InsertSession :execrows +INSERT INTO sessions ( + id, project_id, issue_id, kind, created_at, updated_at, + revision, + session_state, session_reason, + pr_state, pr_reason, pr_number, pr_url, + runtime_state, runtime_reason, + activity_state, activity_last_at, activity_source, + detecting_attempts, detecting_started_at, detecting_evidence_hash +) VALUES ( + ?, ?, ?, ?, ?, ?, + 1, + ?, ?, + ?, ?, ?, ?, + ?, ?, + ?, ?, ?, + ?, ?, ? +) +ON CONFLICT (id) DO NOTHING +` + +type InsertSessionParams struct { + ID string + ProjectID string + IssueID string + Kind string + CreatedAt time.Time + UpdatedAt time.Time + SessionState string + SessionReason string + PrState string + PrReason string + PrNumber int64 + PrUrl string + RuntimeState string + RuntimeReason string + ActivityState string + ActivityLastAt time.Time + ActivitySource string + DetectingAttempts sql.NullInt64 + DetectingStartedAt sql.NullTime + DetectingEvidenceHash sql.NullString +} + +// CAS insert: only succeeds for a brand-new id. 
Incoming revision must be 0; +// the row is persisted at revision 1. +func (q *Queries) InsertSession(ctx context.Context, arg InsertSessionParams) (int64, error) { + result, err := q.db.ExecContext(ctx, insertSession, + arg.ID, + arg.ProjectID, + arg.IssueID, + arg.Kind, + arg.CreatedAt, + arg.UpdatedAt, + arg.SessionState, + arg.SessionReason, + arg.PrState, + arg.PrReason, + arg.PrNumber, + arg.PrUrl, + arg.RuntimeState, + arg.RuntimeReason, + arg.ActivityState, + arg.ActivityLastAt, + arg.ActivitySource, + arg.DetectingAttempts, + arg.DetectingStartedAt, + arg.DetectingEvidenceHash, + ) + if err != nil { + return 0, err + } + return result.RowsAffected() +} + +const listAllSessions = `-- name: ListAllSessions :many +SELECT id, project_id, issue_id, kind, created_at, updated_at, revision, session_state, session_reason, pr_state, pr_reason, pr_number, pr_url, runtime_state, runtime_reason, activity_state, activity_last_at, activity_source, detecting_attempts, detecting_started_at, detecting_evidence_hash FROM sessions +` + +func (q *Queries) ListAllSessions(ctx context.Context) ([]Session, error) { + rows, err := q.db.QueryContext(ctx, listAllSessions) + if err != nil { + return nil, err + } + defer rows.Close() + items := []Session{} + for rows.Next() { + var i Session + if err := rows.Scan( + &i.ID, + &i.ProjectID, + &i.IssueID, + &i.Kind, + &i.CreatedAt, + &i.UpdatedAt, + &i.Revision, + &i.SessionState, + &i.SessionReason, + &i.PrState, + &i.PrReason, + &i.PrNumber, + &i.PrUrl, + &i.RuntimeState, + &i.RuntimeReason, + &i.ActivityState, + &i.ActivityLastAt, + &i.ActivitySource, + &i.DetectingAttempts, + &i.DetectingStartedAt, + &i.DetectingEvidenceHash, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const listSessionsByProject = `-- name: ListSessionsByProject :many +SELECT id, project_id, 
issue_id, kind, created_at, updated_at, revision, session_state, session_reason, pr_state, pr_reason, pr_number, pr_url, runtime_state, runtime_reason, activity_state, activity_last_at, activity_source, detecting_attempts, detecting_started_at, detecting_evidence_hash FROM sessions WHERE project_id = ? +` + +func (q *Queries) ListSessionsByProject(ctx context.Context, projectID string) ([]Session, error) { + rows, err := q.db.QueryContext(ctx, listSessionsByProject, projectID) + if err != nil { + return nil, err + } + defer rows.Close() + items := []Session{} + for rows.Next() { + var i Session + if err := rows.Scan( + &i.ID, + &i.ProjectID, + &i.IssueID, + &i.Kind, + &i.CreatedAt, + &i.UpdatedAt, + &i.Revision, + &i.SessionState, + &i.SessionReason, + &i.PrState, + &i.PrReason, + &i.PrNumber, + &i.PrUrl, + &i.RuntimeState, + &i.RuntimeReason, + &i.ActivityState, + &i.ActivityLastAt, + &i.ActivitySource, + &i.DetectingAttempts, + &i.DetectingStartedAt, + &i.DetectingEvidenceHash, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const updateSessionCAS = `-- name: UpdateSessionCAS :execrows +UPDATE sessions SET + project_id = ?, + issue_id = ?, + kind = ?, + updated_at = ?, + revision = revision + 1, + session_state = ?, + session_reason = ?, + pr_state = ?, + pr_reason = ?, + pr_number = ?, + pr_url = ?, + runtime_state = ?, + runtime_reason = ?, + activity_state = ?, + activity_last_at = ?, + activity_source = ?, + detecting_attempts = ?, + detecting_started_at = ?, + detecting_evidence_hash = ? +WHERE id = ? AND revision = ? 
+` + +type UpdateSessionCASParams struct { + ProjectID string + IssueID string + Kind string + UpdatedAt time.Time + SessionState string + SessionReason string + PrState string + PrReason string + PrNumber int64 + PrUrl string + RuntimeState string + RuntimeReason string + ActivityState string + ActivityLastAt time.Time + ActivitySource string + DetectingAttempts sql.NullInt64 + DetectingStartedAt sql.NullTime + DetectingEvidenceHash sql.NullString + ID string + Revision int64 +} + +// CAS update: succeeds only when the stored revision equals the caller's loaded +// revision (@expected_revision). 0 rows affected => revision mismatch. +func (q *Queries) UpdateSessionCAS(ctx context.Context, arg UpdateSessionCASParams) (int64, error) { + result, err := q.db.ExecContext(ctx, updateSessionCAS, + arg.ProjectID, + arg.IssueID, + arg.Kind, + arg.UpdatedAt, + arg.SessionState, + arg.SessionReason, + arg.PrState, + arg.PrReason, + arg.PrNumber, + arg.PrUrl, + arg.RuntimeState, + arg.RuntimeReason, + arg.ActivityState, + arg.ActivityLastAt, + arg.ActivitySource, + arg.DetectingAttempts, + arg.DetectingStartedAt, + arg.DetectingEvidenceHash, + arg.ID, + arg.Revision, + ) + if err != nil { + return 0, err + } + return result.RowsAffected() +} diff --git a/backend/internal/storage/sqlite/mapping.go b/backend/internal/storage/sqlite/mapping.go new file mode 100644 index 00000000..39ae2127 --- /dev/null +++ b/backend/internal/storage/sqlite/mapping.go @@ -0,0 +1,129 @@ +package sqlite + +import ( + "database/sql" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" +) + +// recordToInsert maps a domain record to the generated insert params. The +// revision column is fixed to 1 by the query itself (insert path), so it is not +// carried here. 
+func recordToInsert(rec domain.SessionRecord) gen.InsertSessionParams {
+	life := rec.Lifecycle
+	params := gen.InsertSessionParams{
+		ID:             string(rec.ID),
+		ProjectID:      string(rec.ProjectID),
+		IssueID:        string(rec.IssueID),
+		Kind:           string(rec.Kind),
+		CreatedAt:      rec.CreatedAt,
+		UpdatedAt:      rec.UpdatedAt,
+		SessionState:   string(life.Session.State),
+		SessionReason:  string(life.Session.Reason),
+		PrState:        string(life.PR.State),
+		PrReason:       string(life.PR.Reason),
+		PrNumber:       int64(life.PR.Number),
+		PrUrl:          life.PR.URL,
+		RuntimeState:   string(life.Runtime.State),
+		RuntimeReason:  string(life.Runtime.Reason),
+		ActivityState:  string(life.Activity.State),
+		ActivityLastAt: life.Activity.LastActivityAt,
+		ActivitySource: string(life.Activity.Source),
+	}
+	// The three detecting columns are NULL unless the record carries detecting state.
+	params.DetectingAttempts, params.DetectingStartedAt, params.DetectingEvidenceHash = detectingToNull(life.Detecting)
+	return params
+}
+
+// recordToUpdate builds the CAS update params for rec. expectedRevision is the
+// revision the caller loaded; it is placed in the Revision field, which the
+// update's WHERE clause compares against the stored value.
+func recordToUpdate(rec domain.SessionRecord, expectedRevision int64) gen.UpdateSessionCASParams {
+	life := rec.Lifecycle
+	params := gen.UpdateSessionCASParams{
+		ProjectID:      string(rec.ProjectID),
+		IssueID:        string(rec.IssueID),
+		Kind:           string(rec.Kind),
+		UpdatedAt:      rec.UpdatedAt,
+		SessionState:   string(life.Session.State),
+		SessionReason:  string(life.Session.Reason),
+		PrState:        string(life.PR.State),
+		PrReason:       string(life.PR.Reason),
+		PrNumber:       int64(life.PR.Number),
+		PrUrl:          life.PR.URL,
+		RuntimeState:   string(life.Runtime.State),
+		RuntimeReason:  string(life.Runtime.Reason),
+		ActivityState:  string(life.Activity.State),
+		ActivityLastAt: life.Activity.LastActivityAt,
+		ActivitySource: string(life.Activity.Source),
+		ID:             string(rec.ID),
+		Revision:       expectedRevision,
+	}
+	// The three detecting columns are NULL unless the record carries detecting state.
+	params.DetectingAttempts, params.DetectingStartedAt, params.DetectingEvidenceHash = detectingToNull(life.Detecting)
+	return params
+}
+
+// rowToRecord converts a stored session row into a domain record. Metadata is
+// intentionally not populated: it lives in the session_metadata side-channel and
+// is read only through GetMetadata, the same contract as the in-memory fakeStore.
+func rowToRecord(row gen.Session) domain.SessionRecord {
+	var rec domain.SessionRecord
+	rec.ID = domain.SessionID(row.ID)
+	rec.ProjectID = domain.ProjectID(row.ProjectID)
+	rec.IssueID = domain.IssueID(row.IssueID)
+	rec.Kind = domain.SessionKind(row.Kind)
+	rec.Lifecycle = rowToLifecycle(row)
+	rec.CreatedAt = row.CreatedAt
+	rec.UpdatedAt = row.UpdatedAt
+	return rec
+}
+
+// rowToLifecycle rebuilds the canonical lifecycle from a row's typed columns.
+// Version is always the current domain.LifecycleVersion; it is not read from
+// the row.
+func rowToLifecycle(row gen.Session) domain.CanonicalSessionLifecycle {
+	session := domain.SessionSubstate{
+		State:  domain.SessionState(row.SessionState),
+		Reason: domain.SessionReason(row.SessionReason),
+	}
+	pr := domain.PRSubstate{
+		State:  domain.PRState(row.PrState),
+		Reason: domain.PRReason(row.PrReason),
+		Number: int(row.PrNumber),
+		URL:    row.PrUrl,
+	}
+	runtime := domain.RuntimeSubstate{
+		State:  domain.RuntimeState(row.RuntimeState),
+		Reason: domain.RuntimeReason(row.RuntimeReason),
+	}
+	activity := domain.ActivitySubstate{
+		State:          domain.ActivityState(row.ActivityState),
+		LastActivityAt: row.ActivityLastAt,
+		Source:         domain.ActivitySource(row.ActivitySource),
+	}
+	return domain.CanonicalSessionLifecycle{
+		Version:   domain.LifecycleVersion,
+		Revision:  int(row.Revision),
+		Session:   session,
+		PR:        pr,
+		Runtime:   runtime,
+		Activity:  activity,
+		Detecting: nullToDetecting(row),
+	}
+}
+
+// detectingToNull splits optional detecting state into its three nullable
+// columns. A nil d yields three NULLs; otherwise all three are marked valid.
+func detectingToNull(d *domain.DetectingState) (sql.NullInt64, sql.NullTime, sql.NullString) {
+	var (
+		attempts  sql.NullInt64
+		startedAt sql.NullTime
+		hash      sql.NullString
+	)
+	if d != nil {
+		attempts = sql.NullInt64{Int64: int64(d.Attempts), Valid: true}
+		startedAt = sql.NullTime{Time: d.StartedAt, Valid: true}
+		hash = sql.NullString{String: d.EvidenceHash, Valid: true}
+	}
+	return attempts, startedAt, hash
+}
+
+// nullToDetecting is the inverse of detectingToNull. Only detecting_attempts
+// decides presence: when it is NULL the result is nil, otherwise the other two
+// columns are read as-is (their zero values if NULL).
+func nullToDetecting(row gen.Session) *domain.DetectingState {
+	if !row.DetectingAttempts.Valid {
+		return nil
+	}
+	state := domain.DetectingState{
+		Attempts:     int(row.DetectingAttempts.Int64),
+		StartedAt:    row.DetectingStartedAt.Time,
+		EvidenceHash: row.DetectingEvidenceHash.String,
+	}
+	return &state
+}
diff --git a/backend/internal/storage/sqlite/migrations/0001_init.sql b/backend/internal/storage/sqlite/migrations/0001_init.sql new file mode 100644 index 00000000..f343e16d --- /dev/null +++ b/backend/internal/storage/sqlite/migrations/0001_init.sql @@ -0,0 +1,109 @@ +-- +goose Up +-- +goose StatementBegin + +-- sessions holds identity + the canonical lifecycle as typed columns. The +-- display status is NEVER stored (it is derived on read). Metadata is NOT here — +-- it lives in session_metadata, written by a side-channel that bypasses CDC. +CREATE TABLE sessions ( + id TEXT PRIMARY KEY, + project_id TEXT NOT NULL, + issue_id TEXT NOT NULL DEFAULT '', + kind TEXT NOT NULL, + created_at TIMESTAMP NOT NULL, + updated_at TIMESTAMP NOT NULL, + + -- canonical lifecycle: revision is the optimistic-concurrency (CAS) counter, + -- bumped only by the storage layer's Upsert. + revision INTEGER NOT NULL, + + session_state TEXT NOT NULL, + session_reason TEXT NOT NULL, + + pr_state TEXT NOT NULL, + pr_reason TEXT NOT NULL, + pr_number INTEGER NOT NULL DEFAULT 0, + pr_url TEXT NOT NULL DEFAULT '', + + runtime_state TEXT NOT NULL, + runtime_reason TEXT NOT NULL, + + activity_state TEXT NOT NULL, + activity_last_at TIMESTAMP NOT NULL, + activity_source TEXT NOT NULL, + + -- detecting quarantine memory; NULL when the session is not in detecting. + detecting_attempts INTEGER, + detecting_started_at TIMESTAMP, + detecting_evidence_hash TEXT +); + +CREATE INDEX idx_sessions_project ON sessions (project_id); + +-- session_metadata is the opaque key/value side-channel (branch, workspacePath, +-- runtimeHandleId, runtimeName, agentSessionId, prompt). Written by +-- PatchMetadata; never bumps revision and never emits a CDC event. +CREATE TABLE session_metadata ( + session_id TEXT NOT NULL REFERENCES sessions (id) ON DELETE CASCADE, + key TEXT NOT NULL, + value TEXT NOT NULL, + PRIMARY KEY (session_id, key) +); + +-- change_log is the durable, ordered record of every canonical write. 
seq is the +-- monotonic CDC ordering/idempotency key. +CREATE TABLE change_log ( + seq INTEGER PRIMARY KEY AUTOINCREMENT, + session_id TEXT NOT NULL, + event_type TEXT NOT NULL, + revision INTEGER NOT NULL, + payload TEXT NOT NULL, + created_at TIMESTAMP NOT NULL +); + +-- outbox is the transactional-outbox: one unsent row per canonical write, drained +-- by the publisher into JSONL. change_log_seq links it to its change_log row. +CREATE TABLE outbox ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + change_log_seq INTEGER NOT NULL REFERENCES change_log (seq), + sent INTEGER NOT NULL DEFAULT 0, + sent_at TIMESTAMP, + attempts INTEGER NOT NULL DEFAULT 0, + last_error TEXT NOT NULL DEFAULT '', + created_at TIMESTAMP NOT NULL +); + +CREATE INDEX idx_outbox_unsent ON outbox (change_log_seq) WHERE sent = 0; + +-- consumer_offsets is the durable per-consumer cursor (at-least-once delivery). +CREATE TABLE consumer_offsets ( + consumer TEXT PRIMARY KEY, + last_seq INTEGER NOT NULL DEFAULT 0, + updated_at TIMESTAMP NOT NULL +); + +-- reaction_trackers is the durable escalation budget (persisted so a restart does +-- not re-fire human pages). Off the canonical CDC path. Mirrors the LCM's +-- in-memory reactionTracker: attempts (numeric budget), escalated (silences +-- further auto-dispatch), first_attempt_at (duration-escalation anchor), +-- project_id (captured at first attempt for the escalation event). 
+CREATE TABLE reaction_trackers ( + session_id TEXT NOT NULL, + reaction_key TEXT NOT NULL, + attempts INTEGER NOT NULL DEFAULT 0, + escalated INTEGER NOT NULL DEFAULT 0, + first_attempt_at TIMESTAMP, + project_id TEXT NOT NULL DEFAULT '', + PRIMARY KEY (session_id, reaction_key) +); + +-- +goose StatementEnd + +-- +goose Down +-- +goose StatementBegin +DROP TABLE reaction_trackers; +DROP TABLE consumer_offsets; +DROP TABLE outbox; +DROP TABLE change_log; +DROP TABLE session_metadata; +DROP TABLE sessions; +-- +goose StatementEnd diff --git a/backend/internal/storage/sqlite/queries/cdc.sql b/backend/internal/storage/sqlite/queries/cdc.sql new file mode 100644 index 00000000..b818194a --- /dev/null +++ b/backend/internal/storage/sqlite/queries/cdc.sql @@ -0,0 +1,42 @@ +-- name: InsertChangeLog :one +-- Appends a canonical-write record and returns its monotonic seq so the same +-- transaction can thread it into the outbox row. +INSERT INTO change_log (session_id, event_type, revision, payload, created_at) +VALUES (?, ?, ?, ?, ?) +RETURNING seq; + +-- name: InsertOutbox :exec +INSERT INTO outbox (change_log_seq, created_at) +VALUES (?, ?); + +-- name: ListUnsentOutbox :many +SELECT o.id, o.change_log_seq, o.attempts, + c.session_id, c.event_type, c.revision, c.payload, c.created_at +FROM outbox o +JOIN change_log c ON c.seq = o.change_log_seq +WHERE o.sent = 0 +ORDER BY o.change_log_seq +LIMIT ?; + +-- name: MarkOutboxSent :exec +UPDATE outbox SET sent = 1, sent_at = ? WHERE id = ?; + +-- name: MarkOutboxFailed :exec +UPDATE outbox SET attempts = attempts + 1, last_error = ? WHERE id = ?; + +-- name: GetConsumerOffset :one +SELECT last_seq FROM consumer_offsets WHERE consumer = ?; + +-- name: UpsertConsumerOffset :exec +INSERT INTO consumer_offsets (consumer, last_seq, updated_at) +VALUES (?, ?, ?) 
+ON CONFLICT (consumer) DO UPDATE SET last_seq = excluded.last_seq, updated_at = excluded.updated_at; + +-- name: MaxChangeLogSeq :one +SELECT CAST(COALESCE(MAX(seq), 0) AS INTEGER) FROM change_log; + +-- name: MinConsumerOffset :one +SELECT CAST(COALESCE(MIN(last_seq), 0) AS INTEGER) FROM consumer_offsets; + +-- name: DeleteSentOutboxBelow :execrows +DELETE FROM outbox WHERE sent = 1 AND change_log_seq < ?; diff --git a/backend/internal/storage/sqlite/queries/metadata.sql b/backend/internal/storage/sqlite/queries/metadata.sql new file mode 100644 index 00000000..45079bb2 --- /dev/null +++ b/backend/internal/storage/sqlite/queries/metadata.sql @@ -0,0 +1,7 @@ +-- name: GetMetadata :many +SELECT key, value FROM session_metadata WHERE session_id = ?; + +-- name: UpsertMetadata :exec +INSERT INTO session_metadata (session_id, key, value) +VALUES (?, ?, ?) +ON CONFLICT (session_id, key) DO UPDATE SET value = excluded.value; diff --git a/backend/internal/storage/sqlite/queries/reactions.sql b/backend/internal/storage/sqlite/queries/reactions.sql new file mode 100644 index 00000000..0ccd99c3 --- /dev/null +++ b/backend/internal/storage/sqlite/queries/reactions.sql @@ -0,0 +1,18 @@ +-- name: ListReactionTrackers :many +SELECT session_id, reaction_key, attempts, escalated, first_attempt_at, project_id +FROM reaction_trackers; + +-- name: UpsertReactionTracker :exec +INSERT INTO reaction_trackers (session_id, reaction_key, attempts, escalated, first_attempt_at, project_id) +VALUES (?, ?, ?, ?, ?, ?) +ON CONFLICT (session_id, reaction_key) DO UPDATE SET + attempts = excluded.attempts, + escalated = excluded.escalated, + first_attempt_at = excluded.first_attempt_at, + project_id = excluded.project_id; + +-- name: DeleteReactionTracker :exec +DELETE FROM reaction_trackers WHERE session_id = ? 
AND reaction_key = ?; + +-- name: DeleteSessionReactionTrackers :exec +DELETE FROM reaction_trackers WHERE session_id = ?; diff --git a/backend/internal/storage/sqlite/queries/sessions.sql b/backend/internal/storage/sqlite/queries/sessions.sql new file mode 100644 index 00000000..48cdcacf --- /dev/null +++ b/backend/internal/storage/sqlite/queries/sessions.sql @@ -0,0 +1,58 @@ +-- name: InsertSession :execrows +-- CAS insert: only succeeds for a brand-new id. Incoming revision must be 0; +-- the row is persisted at revision 1. +INSERT INTO sessions ( + id, project_id, issue_id, kind, created_at, updated_at, + revision, + session_state, session_reason, + pr_state, pr_reason, pr_number, pr_url, + runtime_state, runtime_reason, + activity_state, activity_last_at, activity_source, + detecting_attempts, detecting_started_at, detecting_evidence_hash +) VALUES ( + ?, ?, ?, ?, ?, ?, + 1, + ?, ?, + ?, ?, ?, ?, + ?, ?, + ?, ?, ?, + ?, ?, ? +) +ON CONFLICT (id) DO NOTHING; + +-- name: UpdateSessionCAS :execrows +-- CAS update: succeeds only when the stored revision equals the caller's loaded +-- revision (@expected_revision). 0 rows affected => revision mismatch. +UPDATE sessions SET + project_id = ?, + issue_id = ?, + kind = ?, + updated_at = ?, + revision = revision + 1, + session_state = ?, + session_reason = ?, + pr_state = ?, + pr_reason = ?, + pr_number = ?, + pr_url = ?, + runtime_state = ?, + runtime_reason = ?, + activity_state = ?, + activity_last_at = ?, + activity_source = ?, + detecting_attempts = ?, + detecting_started_at = ?, + detecting_evidence_hash = ? +WHERE id = ? 
AND revision = ?; + +-- name: GetSessionRevision :one +SELECT revision FROM sessions WHERE id = ?; + +-- name: GetSession :one +SELECT * FROM sessions WHERE id = ?; + +-- name: ListSessionsByProject :many +SELECT * FROM sessions WHERE project_id = ?; + +-- name: ListAllSessions :many +SELECT * FROM sessions; diff --git a/backend/internal/storage/sqlite/reaction_store.go b/backend/internal/storage/sqlite/reaction_store.go new file mode 100644 index 00000000..819d9716 --- /dev/null +++ b/backend/internal/storage/sqlite/reaction_store.go @@ -0,0 +1,80 @@ +package sqlite + +import ( + "context" + "database/sql" + "fmt" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" +) + +// ReactionTrackerRow is one persisted escalation budget, the durable mirror of +// the LCM's in-memory reactionTracker. It is the unit the lifecycle Manager +// hydrates on startup and writes through on each mutation. +type ReactionTrackerRow struct { + SessionID string + ReactionKey string + Attempts int + Escalated bool + FirstAttemptAt time.Time + ProjectID string +} + +// ListReactionTrackers returns every persisted escalation budget so the Manager +// can rehydrate its in-memory trackers after a restart. +func (s *Store) ListReactionTrackers(ctx context.Context) ([]ReactionTrackerRow, error) { + rows, err := s.q.ListReactionTrackers(ctx) + if err != nil { + return nil, fmt.Errorf("list reaction trackers: %w", err) + } + out := make([]ReactionTrackerRow, 0, len(rows)) + for _, r := range rows { + var first time.Time + if r.FirstAttemptAt.Valid { + first = r.FirstAttemptAt.Time + } + out = append(out, ReactionTrackerRow{ + SessionID: r.SessionID, + ReactionKey: r.ReactionKey, + Attempts: int(r.Attempts), + Escalated: r.Escalated != 0, + FirstAttemptAt: first, + ProjectID: r.ProjectID, + }) + } + return out, nil +} + +// SaveReactionTracker durably persists one escalation budget (insert or update). 
+func (s *Store) SaveReactionTracker(ctx context.Context, r ReactionTrackerRow) error { + escalated := int64(0) + if r.Escalated { + escalated = 1 + } + first := sql.NullTime{} + if !r.FirstAttemptAt.IsZero() { + first = sql.NullTime{Time: r.FirstAttemptAt, Valid: true} + } + return s.q.UpsertReactionTracker(ctx, gen.UpsertReactionTrackerParams{ + SessionID: r.SessionID, + ReactionKey: r.ReactionKey, + Attempts: int64(r.Attempts), + Escalated: escalated, + FirstAttemptAt: first, + ProjectID: r.ProjectID, + }) +} + +// DeleteReactionTracker drops one escalation budget. +func (s *Store) DeleteReactionTracker(ctx context.Context, sessionID, reactionKey string) error { + return s.q.DeleteReactionTracker(ctx, gen.DeleteReactionTrackerParams{ + SessionID: sessionID, + ReactionKey: reactionKey, + }) +} + +// DeleteSessionReactionTrackers drops every escalation budget for a session. +func (s *Store) DeleteSessionReactionTrackers(ctx context.Context, sessionID string) error { + return s.q.DeleteSessionReactionTrackers(ctx, sessionID) +} diff --git a/backend/internal/storage/sqlite/spike_test.go b/backend/internal/storage/sqlite/spike_test.go new file mode 100644 index 00000000..30b43fc7 --- /dev/null +++ b/backend/internal/storage/sqlite/spike_test.go @@ -0,0 +1,92 @@ +package sqlite + +import ( + "context" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" +) + +// TestSpikeOutboxTxn de-risks the whole adapter: it proves the sqlc-generated +// Querier composes inside one *sql.Tx and that the change_log seq returned +// mid-transaction threads into the outbox row — the transactional-outbox shape +// the publisher later drains. Step 0 of the implementation plan. 
+func TestSpikeOutboxTxn(t *testing.T) { + db, err := Open(t.TempDir()) + if err != nil { + t.Fatalf("open: %v", err) + } + defer db.Close() + + ctx := context.Background() + now := time.Now().UTC() + + tx, err := db.BeginTx(ctx, nil) + if err != nil { + t.Fatalf("begin: %v", err) + } + defer tx.Rollback() + + q := gen.New(db).WithTx(tx) + + // 1. CAS insert of a brand-new session (revision 0 -> persisted 1). + rows, err := q.InsertSession(ctx, gen.InsertSessionParams{ + ID: "s1", + ProjectID: "p1", + Kind: "worker", + CreatedAt: now, + UpdatedAt: now, + SessionState: "working", + SessionReason: "spawn_requested", + PrState: "none", + PrReason: "not_created", + RuntimeState: "unknown", + RuntimeReason: "spawn_incomplete", + ActivityState: "active", + ActivityLastAt: now, + ActivitySource: "none", + }) + if err != nil { + t.Fatalf("insert session: %v", err) + } + if rows != 1 { + t.Fatalf("insert session affected %d rows, want 1", rows) + } + + // 2. Append the change_log entry and capture its seq mid-transaction. + seq, err := q.InsertChangeLog(ctx, gen.InsertChangeLogParams{ + SessionID: "s1", + EventType: "session_created", + Revision: 1, + Payload: `{"id":"s1"}`, + CreatedAt: now, + }) + if err != nil { + t.Fatalf("insert change_log: %v", err) + } + if seq != 1 { + t.Fatalf("change_log seq = %d, want 1", seq) + } + + // 3. Thread the seq into the outbox row — the key thing the spike validates. + if err := q.InsertOutbox(ctx, gen.InsertOutboxParams{ChangeLogSeq: seq, CreatedAt: now}); err != nil { + t.Fatalf("insert outbox: %v", err) + } + + if err := tx.Commit(); err != nil { + t.Fatalf("commit: %v", err) + } + + // Verify the outbox row is visible, unsent, and linked to change_log seq 1. 
+ unsent, err := gen.New(db).ListUnsentOutbox(ctx, 10) + if err != nil { + t.Fatalf("list unsent: %v", err) + } + if len(unsent) != 1 { + t.Fatalf("unsent outbox = %d rows, want 1", len(unsent)) + } + if unsent[0].ChangeLogSeq != 1 || unsent[0].SessionID != "s1" || unsent[0].EventType != "session_created" { + t.Fatalf("unexpected outbox row: %+v", unsent[0]) + } +} diff --git a/backend/internal/storage/sqlite/store.go b/backend/internal/storage/sqlite/store.go new file mode 100644 index 00000000..bd61e73b --- /dev/null +++ b/backend/internal/storage/sqlite/store.go @@ -0,0 +1,118 @@ +package sqlite + +import ( + "context" + "database/sql" + "errors" + "fmt" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" +) + +// Store is the SQLite-backed ports.LifecycleStore. The LCM is its sole logical +// writer (via Upsert); readers (Session Manager, reaper) use Load/Get/List. +type Store struct { + db *sql.DB + q *gen.Queries +} + +var _ ports.LifecycleStore = (*Store)(nil) + +// NewStore wraps an opened *sql.DB (see Open) as a LifecycleStore. +func NewStore(db *sql.DB) *Store { + return &Store{db: db, q: gen.New(db)} +} + +// Load returns the canonical lifecycle for a session, or ok=false if absent. +func (s *Store) Load(ctx context.Context, id domain.SessionID) (domain.CanonicalSessionLifecycle, bool, error) { + row, err := s.q.GetSession(ctx, string(id)) + if errors.Is(err, sql.ErrNoRows) { + return domain.CanonicalSessionLifecycle{}, false, nil + } + if err != nil { + return domain.CanonicalSessionLifecycle{}, false, fmt.Errorf("load session %s: %w", id, err) + } + return rowToLifecycle(row), true, nil +} + +// Get returns the full record (no derived status) for a session. 
+func (s *Store) Get(ctx context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) { + row, err := s.q.GetSession(ctx, string(id)) + if errors.Is(err, sql.ErrNoRows) { + return domain.SessionRecord{}, false, nil + } + if err != nil { + return domain.SessionRecord{}, false, fmt.Errorf("get session %s: %w", id, err) + } + return rowToRecord(row), true, nil +} + +// List returns every record for a project (no archive filter — mirrors the +// in-memory store contract; terminal filtering is the caller's job). +func (s *Store) List(ctx context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) { + rows, err := s.q.ListSessionsByProject(ctx, string(project)) + if err != nil { + return nil, fmt.Errorf("list sessions for %s: %w", project, err) + } + out := make([]domain.SessionRecord, 0, len(rows)) + for _, row := range rows { + out = append(out, rowToRecord(row)) + } + return out, nil +} + +// ListAll returns every persisted session across all projects. The CDC snapshot +// source uses it to rebuild current state after a log-rotation gap. +func (s *Store) ListAll(ctx context.Context) ([]domain.SessionRecord, error) { + rows, err := s.q.ListAllSessions(ctx) + if err != nil { + return nil, fmt.Errorf("list all sessions: %w", err) + } + out := make([]domain.SessionRecord, 0, len(rows)) + for _, row := range rows { + out = append(out, rowToRecord(row)) + } + return out, nil +} + +// GetMetadata returns the opaque key/value metadata for a session. +func (s *Store) GetMetadata(ctx context.Context, id domain.SessionID) (map[string]string, error) { + rows, err := s.q.GetMetadata(ctx, string(id)) + if err != nil { + return nil, fmt.Errorf("get metadata %s: %w", id, err) + } + if len(rows) == 0 { + return nil, nil + } + m := make(map[string]string, len(rows)) + for _, r := range rows { + m[r.Key] = r.Value + } + return m, nil +} + +// PatchMetadata merges kv into the session's metadata. 
It is outside the +// canonical write path: no revision bump, no CDC event. +func (s *Store) PatchMetadata(ctx context.Context, id domain.SessionID, kv map[string]string) error { + if len(kv) == 0 { + return nil + } + tx, err := s.db.BeginTx(ctx, nil) + if err != nil { + return fmt.Errorf("begin patch metadata: %w", err) + } + defer tx.Rollback() + qtx := s.q.WithTx(tx) + for k, v := range kv { + if err := qtx.UpsertMetadata(ctx, gen.UpsertMetadataParams{ + SessionID: string(id), + Key: k, + Value: v, + }); err != nil { + return fmt.Errorf("patch metadata %s[%s]: %w", id, k, err) + } + } + return tx.Commit() +} diff --git a/backend/internal/storage/sqlite/store_test.go b/backend/internal/storage/sqlite/store_test.go new file mode 100644 index 00000000..5457855d --- /dev/null +++ b/backend/internal/storage/sqlite/store_test.go @@ -0,0 +1,256 @@ +package sqlite + +import ( + "context" + "strings" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +func newTestStore(t *testing.T) *Store { + t.Helper() + db, err := Open(t.TempDir()) + if err != nil { + t.Fatalf("open: %v", err) + } + t.Cleanup(func() { db.Close() }) + return NewStore(db) +} + +func sampleRecord(id string) domain.SessionRecord { + now := time.Now().UTC().Truncate(time.Second) + return domain.SessionRecord{ + ID: domain.SessionID(id), + ProjectID: "proj", + IssueID: "issue-1", + Kind: domain.KindWorker, + CreatedAt: now, + UpdatedAt: now, + Lifecycle: domain.CanonicalSessionLifecycle{ + Session: domain.SessionSubstate{State: domain.SessionWorking, Reason: domain.ReasonTaskInProgress}, + PR: domain.PRSubstate{State: domain.PRNone, Reason: domain.PRReasonNotCreated}, + Runtime: domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning}, + Activity: domain.ActivitySubstate{State: domain.ActivityActive, LastActivityAt: now, Source: domain.SourceNative}, + }, + } +} 
+ +func TestUpsertInsertThenUpdateBumpsRevision(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + rec := sampleRecord("s1") + + if err := s.Upsert(ctx, rec, ports.EventSessionCreated); err != nil { + t.Fatalf("insert: %v", err) + } + lc, ok, err := s.Load(ctx, "s1") + if err != nil || !ok { + t.Fatalf("load after insert: ok=%v err=%v", ok, err) + } + if lc.Revision != 1 { + t.Fatalf("revision after insert = %d, want 1", lc.Revision) + } + + // Update must carry the loaded revision (1) and persist as 2. + rec.Lifecycle.Revision = 1 + rec.Lifecycle.Session.State = domain.SessionIdle + if err := s.Upsert(ctx, rec, ports.EventSessionStateChanged); err != nil { + t.Fatalf("update: %v", err) + } + lc, _, _ = s.Load(ctx, "s1") + if lc.Revision != 2 { + t.Fatalf("revision after update = %d, want 2", lc.Revision) + } + if lc.Session.State != domain.SessionIdle { + t.Fatalf("state after update = %q, want idle", lc.Session.State) + } +} + +func TestUpsertStaleRevisionMismatch(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + rec := sampleRecord("s1") + if err := s.Upsert(ctx, rec, ports.EventSessionCreated); err != nil { + t.Fatalf("insert: %v", err) + } + + // Stored revision is 1; submitting revision 0 (stale) must mismatch and + // write nothing new (no extra outbox/change_log rows). 
+ rec.Lifecycle.Revision = 0 + err := s.Upsert(ctx, rec, ports.EventSessionStateChanged) + if err == nil || !strings.Contains(err.Error(), "revision mismatch") { + t.Fatalf("stale update err = %v, want revision mismatch", err) + } + assertOutboxCount(t, s, ctx, 1) +} + +func TestUpsertInsertNonZeroRevisionErrors(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + rec := sampleRecord("s1") + rec.Lifecycle.Revision = 5 + err := s.Upsert(ctx, rec, ports.EventSessionCreated) + if err == nil || !strings.Contains(err.Error(), "revision mismatch") { + t.Fatalf("insert with revision 5 err = %v, want revision mismatch", err) + } + // Nothing should be persisted. + if _, ok, _ := s.Get(ctx, "s1"); ok { + t.Fatal("session persisted despite revision-mismatch insert") + } + assertOutboxCount(t, s, ctx, 0) +} + +func TestUpsertOutboxAtomicityAndOrdering(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + + rec := sampleRecord("s1") + if err := s.Upsert(ctx, rec, ports.EventSessionCreated); err != nil { + t.Fatalf("insert: %v", err) + } + rec.Lifecycle.Revision = 1 + if err := s.Upsert(ctx, rec, ports.EventSessionStateChanged); err != nil { + t.Fatalf("update: %v", err) + } + + rows, err := NewStore(s.db).q.ListUnsentOutbox(ctx, 100) + if err != nil { + t.Fatalf("list outbox: %v", err) + } + if len(rows) != 2 { + t.Fatalf("outbox rows = %d, want 2", len(rows)) + } + // seq strictly monotonic, event types verbatim, revisions 1 then 2. 
+ if rows[0].ChangeLogSeq != 1 || rows[1].ChangeLogSeq != 2 { + t.Fatalf("seq not monotonic: %d, %d", rows[0].ChangeLogSeq, rows[1].ChangeLogSeq) + } + if rows[0].EventType != string(ports.EventSessionCreated) || rows[1].EventType != string(ports.EventSessionStateChanged) { + t.Fatalf("event types = %q, %q", rows[0].EventType, rows[1].EventType) + } + if rows[0].Revision != 1 || rows[1].Revision != 2 { + t.Fatalf("revisions = %d, %d, want 1, 2", rows[0].Revision, rows[1].Revision) + } +} + +func TestGetListRoundTrip(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + + a := sampleRecord("a") + b := sampleRecord("b") + b.ProjectID = "other" + if err := s.Upsert(ctx, a, ports.EventSessionCreated); err != nil { + t.Fatal(err) + } + if err := s.Upsert(ctx, b, ports.EventSessionCreated); err != nil { + t.Fatal(err) + } + + got, ok, err := s.Get(ctx, "a") + if err != nil || !ok { + t.Fatalf("get a: ok=%v err=%v", ok, err) + } + if got.ID != "a" || got.Lifecycle.Revision != 1 || got.IssueID != "issue-1" { + t.Fatalf("unexpected record: %+v", got) + } + if got.Metadata != nil { + t.Fatalf("Get must not reconstruct metadata, got %v", got.Metadata) + } + + list, err := s.List(ctx, "proj") + if err != nil { + t.Fatal(err) + } + if len(list) != 1 || list[0].ID != "a" { + t.Fatalf("List(proj) = %+v, want only a", list) + } +} + +func TestMetadataSideChannel(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + if err := s.Upsert(ctx, sampleRecord("s1"), ports.EventSessionCreated); err != nil { + t.Fatal(err) + } + + if err := s.PatchMetadata(ctx, "s1", map[string]string{"branch": "feat/x", "prompt": "do it"}); err != nil { + t.Fatalf("patch: %v", err) + } + if err := s.PatchMetadata(ctx, "s1", map[string]string{"branch": "feat/y"}); err != nil { + t.Fatalf("patch overwrite: %v", err) + } + + m, err := s.GetMetadata(ctx, "s1") + if err != nil { + t.Fatal(err) + } + if m["branch"] != "feat/y" || m["prompt"] != "do it" { + t.Fatalf("metadata = 
%v", m) + } + // Metadata writes must not bump revision (off the canonical path). + lc, _, _ := s.Load(ctx, "s1") + if lc.Revision != 1 { + t.Fatalf("revision = %d after metadata patch, want 1 (no bump)", lc.Revision) + } +} + +func TestDetectingRoundTrip(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + rec := sampleRecord("s1") + rec.Lifecycle.Session.State = domain.SessionDetecting + rec.Lifecycle.Detecting = &domain.DetectingState{ + Attempts: 2, + StartedAt: time.Now().UTC().Truncate(time.Second), + EvidenceHash: "abc123", + } + if err := s.Upsert(ctx, rec, ports.EventSessionCreated); err != nil { + t.Fatal(err) + } + lc, _, _ := s.Load(ctx, "s1") + if lc.Detecting == nil { + t.Fatal("Detecting lost on round-trip") + } + if lc.Detecting.Attempts != 2 || lc.Detecting.EvidenceHash != "abc123" { + t.Fatalf("detecting = %+v", lc.Detecting) + } + + // Clearing Detecting must null the columns back out. + rec.Lifecycle.Revision = 1 + rec.Lifecycle.Detecting = nil + if err := s.Upsert(ctx, rec, ports.EventSessionStateChanged); err != nil { + t.Fatal(err) + } + lc, _, _ = s.Load(ctx, "s1") + if lc.Detecting != nil { + t.Fatalf("Detecting not cleared: %+v", lc.Detecting) + } +} + +func TestLoadGetMissing(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + if _, ok, err := s.Load(ctx, "nope"); ok || err != nil { + t.Fatalf("Load missing: ok=%v err=%v", ok, err) + } + if _, ok, err := s.Get(ctx, "nope"); ok || err != nil { + t.Fatalf("Get missing: ok=%v err=%v", ok, err) + } + if m, err := s.GetMetadata(ctx, "nope"); err != nil || m != nil { + t.Fatalf("GetMetadata missing: m=%v err=%v", m, err) + } +} + +func assertOutboxCount(t *testing.T, s *Store, ctx context.Context, want int) { + t.Helper() + rows, err := s.q.ListUnsentOutbox(ctx, 1000) + if err != nil { + t.Fatalf("list outbox: %v", err) + } + if len(rows) != want { + t.Fatalf("outbox count = %d, want %d", len(rows), want) + } +} diff --git 
a/backend/internal/storage/sqlite/upsert.go b/backend/internal/storage/sqlite/upsert.go new file mode 100644 index 00000000..40944005 --- /dev/null +++ b/backend/internal/storage/sqlite/upsert.go @@ -0,0 +1,113 @@ +package sqlite + +import ( + "context" + "database/sql" + "encoding/json" + "errors" + "fmt" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" +) + +// Upsert performs the one atomic canonical write: it CAS-checks and persists the +// session row (bumping revision), appends a change_log entry, and enqueues an +// outbox row linked to that entry's seq — all in a single transaction. Only the +// LCM calls this. +// +// Revision CAS (mirrors the in-memory store contract exactly): +// - existing row: rec.Lifecycle.Revision must equal the stored revision, else +// a revision-mismatch error and nothing is written; on match it persists at +// stored+1. +// - insert: rec.Lifecycle.Revision must be 0, persisted as 1. +func (s *Store) Upsert(ctx context.Context, rec domain.SessionRecord, eventType ports.EventType) error { + tx, err := s.db.BeginTx(ctx, nil) + if err != nil { + return fmt.Errorf("begin upsert: %w", err) + } + defer tx.Rollback() + qtx := s.q.WithTx(tx) + + newRevision, err := casPersist(ctx, qtx, rec) + if err != nil { + return err + } + + if err := appendOutbox(ctx, qtx, rec, newRevision, eventType); err != nil { + return err + } + + return tx.Commit() +} + +// casPersist applies the revision-CAS insert-or-update and returns the new +// stored revision. +func casPersist(ctx context.Context, q *gen.Queries, rec domain.SessionRecord) (int, error) { + stored, err := q.GetSessionRevision(ctx, string(rec.ID)) + switch { + case errors.Is(err, sql.ErrNoRows): + // Insert path: incoming revision must be 0; row persists at revision 1. 
+ if rec.Lifecycle.Revision != 0 { + return 0, fmt.Errorf("revision mismatch for insert %s: have %d, want 0", rec.ID, rec.Lifecycle.Revision) + } + rows, err := q.InsertSession(ctx, recordToInsert(rec)) + if err != nil { + return 0, fmt.Errorf("insert session %s: %w", rec.ID, err) + } + if rows != 1 { + // Another writer raced us between the revision check and the insert. + // With single-writer this should not happen; treat as a CAS failure. + return 0, fmt.Errorf("revision mismatch for insert %s: row already exists", rec.ID) + } + return 1, nil + case err != nil: + return 0, fmt.Errorf("read revision %s: %w", rec.ID, err) + default: + // Update path: incoming revision must equal the stored revision. + if int64(rec.Lifecycle.Revision) != stored { + return 0, fmt.Errorf("revision mismatch for %s: have %d, want %d", rec.ID, rec.Lifecycle.Revision, stored) + } + rows, err := q.UpdateSessionCAS(ctx, recordToUpdate(rec, stored)) + if err != nil { + return 0, fmt.Errorf("update session %s: %w", rec.ID, err) + } + if rows != 1 { + return 0, fmt.Errorf("revision mismatch for %s: stale revision %d", rec.ID, rec.Lifecycle.Revision) + } + return int(stored) + 1, nil + } +} + +// appendOutbox writes the change_log entry and threads its seq into a fresh +// outbox row. The change_log payload is the persisted record at its new +// revision (metadata excluded — it is not on the canonical path). 
+func appendOutbox(ctx context.Context, q *gen.Queries, rec domain.SessionRecord, newRevision int, eventType ports.EventType) error { + now := time.Now().UTC() + payload := rec + payload.Lifecycle.Revision = newRevision + payload.Lifecycle.Version = domain.LifecycleVersion + payload.Metadata = nil + blob, err := json.Marshal(payload) + if err != nil { + return fmt.Errorf("marshal change_log payload %s: %w", rec.ID, err) + } + + seq, err := q.InsertChangeLog(ctx, gen.InsertChangeLogParams{ + SessionID: string(rec.ID), + EventType: string(eventType), + Revision: int64(newRevision), + Payload: string(blob), + CreatedAt: now, + }) + if err != nil { + return fmt.Errorf("insert change_log %s: %w", rec.ID, err) + } + + if err := q.InsertOutbox(ctx, gen.InsertOutboxParams{ChangeLogSeq: seq, CreatedAt: now}); err != nil { + return fmt.Errorf("insert outbox %s: %w", rec.ID, err) + } + return nil +} diff --git a/backend/lifecycle_wiring.go b/backend/lifecycle_wiring.go new file mode 100644 index 00000000..3836baf6 --- /dev/null +++ b/backend/lifecycle_wiring.go @@ -0,0 +1,126 @@ +package main + +import ( + "context" + "log/slog" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" + "github.com/aoagents/agent-orchestrator/backend/internal/observe/reaper" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" +) + +// lifecycleStack owns the running LCM + reaper. The LCM is the sole writer into +// the store (every Apply*/On* call ends in store.Upsert, which the CDC pipeline +// then drains); the reaper is the OBSERVE-layer timer that probes live runtimes +// and reports facts back through the LCM. Together with the CDC substrate this +// makes the write path live end-to-end: LCM -> store -> outbox -> JSONL -> +// broadcaster. 
+type lifecycleStack struct {
+	LCM *lifecycle.Manager
+
+	// cancel stops the reaper's context. It is a child of the ctx handed to
+	// startLifecycle, so the reaper also stops when that parent is cancelled.
+	cancel     context.CancelFunc
+	reaperDone <-chan struct{}
+}
+
+// startLifecycle constructs the LCM over store, makes escalation budgets durable,
+// teaches it to enumerate sessions for the reaper, and starts the reaper loop.
+// The reaper runs under a child of ctx: it stops when ctx is cancelled or when
+// Stop is called, and Stop waits for it to drain.
+//
+// TEMPORARY STUBS (replace as the daemon lane lands the real collaborators):
+//
+//   - noopNotifier — swap for the production notifier multiplexer once the
+//     notifier plugins (desktop/Slack/webhook) are ported. Wire it where
+//     noopNotifier{} is passed to lifecycle.New below.
+//   - noopMessenger — swap for the AgentMessenger backed by the runtime/agent
+//     plugins (it injects a prompt into the live agent pane). Wire it at the
+//     same lifecycle.New call site.
+//   - reaper.MapRegistry{} — empty runtime registry, so the reaper probes
+//     nothing. Register the real runtime adapters (tmux/process) keyed by
+//     runtime name once those plugins exist: reaper.MapRegistry{"tmux": rt}.
+func startLifecycle(ctx context.Context, store *sqlite.Store, logger *slog.Logger) (*lifecycleStack, error) {
+	// TODO(daemon-lane): replace noopNotifier{}/noopMessenger{} with the real
+	// notifier multiplexer and the plugin-backed AgentMessenger.
+	lcm := lifecycle.New(store, noopNotifier{}, noopMessenger{})
+
+	// Durable escalation budgets (flaw #3 fix): hydrate from the store and turn
+	// on write-through so a restart does not re-fire an already-escalated page.
+	// Must run before the reaper starts dispatching TickEscalations.
+	if err := lcm.WithReactionStore(ctx, lifecycleReactionStore{store}); err != nil {
+		return nil, err
+	}
+
+	// The reaper's RunningSessions snapshot needs to see every session; ListAll
+	// spans all projects (the per-project List would hide cross-project work).
+	lcm.WithSessionLister(store.ListAll)
+
+	// TODO(daemon-lane): pass the real runtime registry so the reaper actually
+	// probes live panes. With an empty registry it ticks escalations but probes
+	// nothing, which is correct until runtimes exist.
+	rp := reaper.New(lcm, reaper.MapRegistry{}, reaper.Config{Logger: logger})
+
+	// Derive the reaper's context only once nothing else can fail, so every
+	// return path either hands cancel to the stack or never created it.
+	reaperCtx, cancel := context.WithCancel(ctx)
+	return &lifecycleStack{LCM: lcm, cancel: cancel, reaperDone: rp.Start(reaperCtx)}, nil
+}
+
+// Stop cancels the reaper's context and waits for the reaper goroutine to exit.
+// The caller does not need to have cancelled the ctx passed to startLifecycle:
+// a deferred Stop that runs before the parent ctx is cancelled (for example
+// when the server returns a start-up error) would otherwise block forever on
+// reaperDone.
+func (l *lifecycleStack) Stop() {
+	// cancel is nil only for a stack assembled without startLifecycle.
+	if l.cancel != nil {
+		l.cancel()
+	}
+	<-l.reaperDone
+}
+
+// noopNotifier satisfies ports.Notifier by dropping every event. TEMPORARY: the
+// daemon lane replaces this with the notifier multiplexer over the real notifier
+// plugins. Until then human-facing notifications are silently discarded — the
+// write path and CDC still work, only the human push is absent.
+type noopNotifier struct{}
+
+func (noopNotifier) Notify(context.Context, ports.OrchestratorEvent) error { return nil }
+
+// noopMessenger satisfies ports.AgentMessenger by dropping every send. TEMPORARY:
+// replace with the runtime/agent-plugin-backed messenger that injects prompts
+// into the live agent pane. Until then auto-nudge reactions are no-ops.
+type noopMessenger struct{}
+
+func (noopMessenger) Send(context.Context, domain.SessionID, string) error { return nil }
+
+// lifecycleReactionStore bridges the concrete *sqlite.Store to the lifecycle
+// package's ReactionStore interface (string/row types <-> domain types). It is
+// the production twin of the reactionStoreAdapter used in the lifecycle tests.
+type lifecycleReactionStore struct{ store *sqlite.Store } + +func (a lifecycleReactionStore) LoadReactionTrackers(ctx context.Context) ([]lifecycle.PersistedTracker, error) { + rows, err := a.store.ListReactionTrackers(ctx) + if err != nil { + return nil, err + } + out := make([]lifecycle.PersistedTracker, len(rows)) + for i, r := range rows { + out[i] = lifecycle.PersistedTracker{ + SessionID: domain.SessionID(r.SessionID), + Key: r.ReactionKey, + Attempts: r.Attempts, + Escalated: r.Escalated, + FirstAttemptAt: r.FirstAttemptAt, + ProjectID: domain.ProjectID(r.ProjectID), + } + } + return out, nil +} + +func (a lifecycleReactionStore) SaveReactionTracker(ctx context.Context, t lifecycle.PersistedTracker) error { + return a.store.SaveReactionTracker(ctx, sqlite.ReactionTrackerRow{ + SessionID: string(t.SessionID), + ReactionKey: t.Key, + Attempts: t.Attempts, + Escalated: t.Escalated, + FirstAttemptAt: t.FirstAttemptAt, + ProjectID: string(t.ProjectID), + }) +} + +func (a lifecycleReactionStore) DeleteReactionTracker(ctx context.Context, id domain.SessionID, key string) error { + return a.store.DeleteReactionTracker(ctx, string(id), key) +} + +func (a lifecycleReactionStore) DeleteSessionReactionTrackers(ctx context.Context, id domain.SessionID) error { + return a.store.DeleteSessionReactionTrackers(ctx, string(id)) +} diff --git a/backend/main.go b/backend/main.go index 78a23292..8db058ea 100644 --- a/backend/main.go +++ b/backend/main.go @@ -15,6 +15,7 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/config" "github.com/aoagents/agent-orchestrator/backend/internal/httpd" "github.com/aoagents/agent-orchestrator/backend/internal/runfile" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" ) func main() { @@ -46,11 +47,54 @@ func run() error { return err } + // Open the durable store and bring up the CDC substrate (outbox publisher, + // JSONL consumer + broadcaster, outbox janitor). 
The LCM/Session Manager and + // the HTTP API routes that drive and read this store are owned by the daemon + // lane and are wired there once their collaborators (Notifier, AgentMessenger, + // and the runtime/agent/workspace plugins) have production implementations; + // here we stand up the persistence + change-delivery foundation they build on. + db, err := sqlite.Open(cfg.DataDir) + if err != nil { + return fmt.Errorf("open store: %w", err) + } + defer db.Close() + store := sqlite.NewStore(db) + // signal.NotifyContext cancels ctx on SIGINT/SIGTERM, which drives the // graceful shutdown inside Server.Run. ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) defer stop() + cdcPipe, err := startCDC(ctx, store, cfg.DataDir, log) + if err != nil { + return err + } + defer func() { + if err := cdcPipe.Stop(); err != nil { + log.Error("cdc pipeline shutdown", "err", err) + } + }() + + // Bring up the Lifecycle Manager (sole store writer) and the reaper (OBSERVE + // timer). This makes the write path live end-to-end: LCM.Upsert -> store -> + // outbox -> CDC JSONL -> broadcaster. The collaborators it needs that don't + // yet have production implementations (Notifier, AgentMessenger, runtime + // registry) are stubbed in lifecycle_wiring.go with TODO markers. + // + // NOT wired here yet — both await collaborators the daemon lane owns: + // - Session Manager: session.New needs Runtime/Agent/Workspace plugins to + // construct. Stubbing them would make Spawn a silent no-op (a footgun), + // so it's deferred rather than faked. The LCM already exposes the read + // surface (RunningSessions) the SM would wrap. + // - HTTP API routes: httpd.New takes no SM/LCM today; surfacing the store + // over HTTP needs a constructor signature change + handlers, tracked with + // the SM work since the routes call into it. 
+ lcStack, err := startLifecycle(ctx, store, log) + if err != nil { + return fmt.Errorf("start lifecycle: %w", err) + } + defer lcStack.Stop() + return srv.Run(ctx) } diff --git a/backend/main_test.go b/backend/main_test.go new file mode 100644 index 00000000..1a8d60c3 --- /dev/null +++ b/backend/main_test.go @@ -0,0 +1,134 @@ +package main + +import ( + "context" + "encoding/json" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" +) + +// These tests cover the composition-root adapters in cdc_wiring.go directly +// (package main otherwise has no test coverage): the outboxAdapter mapping the +// store's OutboxEvent to cdc.PendingEvent, and the snapshotSource rebuilding +// full-state events from the sessions table. + +func newWiringStore(t *testing.T) *sqlite.Store { + t.Helper() + db, err := sqlite.Open(t.TempDir()) + if err != nil { + t.Fatalf("open: %v", err) + } + t.Cleanup(func() { db.Close() }) + return sqlite.NewStore(db) +} + +func wiringRec(id string) domain.SessionRecord { + now := time.Now().UTC() + return domain.SessionRecord{ + ID: domain.SessionID(id), ProjectID: "proj", Kind: domain.KindWorker, CreatedAt: now, UpdatedAt: now, + Lifecycle: domain.CanonicalSessionLifecycle{ + Session: domain.SessionSubstate{State: domain.SessionWorking, Reason: domain.ReasonTaskInProgress}, + PR: domain.PRSubstate{State: domain.PRNone, Reason: domain.PRReasonNotCreated}, + Runtime: domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning}, + Activity: domain.ActivitySubstate{State: domain.ActivityActive, LastActivityAt: now, Source: domain.SourceNative}, + }, + } +} + +func TestOutboxAdapterMapsPendingEvents(t *testing.T) { + ctx := context.Background() + store := newWiringStore(t) + a := outboxAdapter{store} + + if err := store.Upsert(ctx, 
wiringRec("s1"), ports.EventSessionCreated); err != nil { + t.Fatalf("upsert: %v", err) + } + + pending, err := a.ListUnsent(ctx, 10) + if err != nil { + t.Fatalf("list unsent: %v", err) + } + if len(pending) != 1 { + t.Fatalf("want 1 pending event, got %d", len(pending)) + } + pe := pending[0] + if pe.Seq != 1 || pe.SessionID != "s1" || pe.EventType != string(ports.EventSessionCreated) || pe.Revision != 1 { + t.Fatalf("unexpected mapping: %+v", pe) + } + if pe.Payload == "" { + t.Fatal("payload should carry the marshaled record") + } + + // MarkSent must clear it from the unsent set. + if err := a.MarkSent(ctx, pe.OutboxID, time.Now().UTC()); err != nil { + t.Fatalf("mark sent: %v", err) + } + again, err := a.ListUnsent(ctx, 10) + if err != nil { + t.Fatalf("list unsent 2: %v", err) + } + if len(again) != 0 { + t.Fatalf("sent event should not reappear, got %d", len(again)) + } +} + +func TestSnapshotSourceRebuildsState(t *testing.T) { + ctx := context.Background() + store := newWiringStore(t) + s := snapshotSource{store} + + // Empty store: no events, maxSeq 0. + events, maxSeq, err := s.Snapshot(ctx) + if err != nil { + t.Fatalf("empty snapshot: %v", err) + } + if len(events) != 0 || maxSeq != 0 { + t.Fatalf("empty store should yield no events and maxSeq 0, got %d events maxSeq %d", len(events), maxSeq) + } + + // Two canonical writes (seq 1,2) across two sessions. 
+ if err := store.Upsert(ctx, wiringRec("s1"), ports.EventSessionCreated); err != nil { + t.Fatalf("upsert s1: %v", err) + } + if err := store.Upsert(ctx, wiringRec("s2"), ports.EventSessionCreated); err != nil { + t.Fatalf("upsert s2: %v", err) + } + + events, maxSeq, err = s.Snapshot(ctx) + if err != nil { + t.Fatalf("snapshot: %v", err) + } + if maxSeq != 2 { + t.Fatalf("maxSeq = %d, want 2 (change_log high-water)", maxSeq) + } + if len(events) != 2 { + t.Fatalf("want one event per session (2), got %d", len(events)) + } + for _, e := range events { + if e.Seq != maxSeq { + t.Errorf("snapshot event seq = %d, want resume watermark %d", e.Seq, maxSeq) + } + if e.EventType != "session_snapshot" { + t.Errorf("event type = %q, want session_snapshot", e.EventType) + } + // Payload must be a parseable full record at the persisted revision with + // metadata excluded and the schema version stamped. + var rec domain.SessionRecord + if err := json.Unmarshal([]byte(e.Payload), &rec); err != nil { + t.Fatalf("payload not a SessionRecord: %v", err) + } + if rec.Lifecycle.Version != domain.LifecycleVersion { + t.Errorf("payload version = %d, want %d", rec.Lifecycle.Version, domain.LifecycleVersion) + } + if rec.Lifecycle.Revision != 1 { + t.Errorf("payload revision = %d, want 1", rec.Lifecycle.Revision) + } + if rec.Metadata != nil { + t.Errorf("snapshot payload must exclude metadata, got %v", rec.Metadata) + } + } +} diff --git a/backend/sqlc.yaml b/backend/sqlc.yaml new file mode 100644 index 00000000..9659bf77 --- /dev/null +++ b/backend/sqlc.yaml @@ -0,0 +1,13 @@ +version: "2" +sql: + - engine: "sqlite" + schema: "internal/storage/sqlite/migrations" + queries: "internal/storage/sqlite/queries" + gen: + go: + package: "gen" + out: "internal/storage/sqlite/gen" + emit_json_tags: false + emit_prepared_queries: false + emit_interface: true + emit_empty_slices: true From 23b8fe43cf01fefa0262fe49a30b0776d9b4e612 Mon Sep 17 00:00:00 2001 From: Pritom14 Date: Sat, 30 May 2026 
21:53:14 +0530 Subject: [PATCH 02/10] feat(backend): add projects and pr_enrichment tables to SQLite store Migration 0002 adds two tables off the canonical CDC path: - projects: durable registry of managed repos (the twin of the old YAML config). Soft-deletable via archived_at so a session's project_id always resolves; ListProjects returns active rows only, GetProject resolves any. - pr_enrichment: per-session cache of rich SCM facts (CI summary, review decision, mergeability, pending comments, CI log tail) that do not live in the canonical lifecycle. 1:1 with a session, cascades on session delete. Both are written outside the LCM write path: no revision bump, no change_log/outbox event. Store methods mirror the reaction_trackers adapter pattern with storage-local row structs. --- backend/internal/storage/sqlite/gen/models.go | 25 +++ .../storage/sqlite/gen/pr_enrichment.sql.go | 76 +++++++++ .../storage/sqlite/gen/projects.sql.go | 154 ++++++++++++++++++ .../internal/storage/sqlite/gen/querier.go | 8 + .../sqlite/migrations/0002_pr_projects.sql | 50 ++++++ .../storage/sqlite/pr_projects_test.go | 128 +++++++++++++++ backend/internal/storage/sqlite/pr_store.go | 66 ++++++++ .../internal/storage/sqlite/project_store.go | 115 +++++++++++++ .../storage/sqlite/queries/pr_enrichment.sql | 18 ++ .../storage/sqlite/queries/projects.sql | 32 ++++ 10 files changed, 672 insertions(+) create mode 100644 backend/internal/storage/sqlite/gen/pr_enrichment.sql.go create mode 100644 backend/internal/storage/sqlite/gen/projects.sql.go create mode 100644 backend/internal/storage/sqlite/migrations/0002_pr_projects.sql create mode 100644 backend/internal/storage/sqlite/pr_projects_test.go create mode 100644 backend/internal/storage/sqlite/pr_store.go create mode 100644 backend/internal/storage/sqlite/project_store.go create mode 100644 backend/internal/storage/sqlite/queries/pr_enrichment.sql create mode 100644 backend/internal/storage/sqlite/queries/projects.sql diff --git 
a/backend/internal/storage/sqlite/gen/models.go b/backend/internal/storage/sqlite/gen/models.go index 210fe245..dccf25c4 100644 --- a/backend/internal/storage/sqlite/gen/models.go +++ b/backend/internal/storage/sqlite/gen/models.go @@ -34,6 +34,31 @@ type Outbox struct { CreatedAt time.Time } +type PrEnrichment struct { + SessionID string + CiSummary string + ReviewDecision string + Mergeability string + PendingComments string + CiLogTail string + LastFetchedAt time.Time +} + +type Project struct { + ID string + Path string + RepoOwner string + RepoName string + RepoPlatform string + RepoOriginUrl string + DefaultBranch string + DisplayName string + SessionPrefix string + Source string + RegisteredAt time.Time + ArchivedAt sql.NullTime +} + type ReactionTracker struct { SessionID string ReactionKey string diff --git a/backend/internal/storage/sqlite/gen/pr_enrichment.sql.go b/backend/internal/storage/sqlite/gen/pr_enrichment.sql.go new file mode 100644 index 00000000..c0643104 --- /dev/null +++ b/backend/internal/storage/sqlite/gen/pr_enrichment.sql.go @@ -0,0 +1,76 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.31.1 +// source: pr_enrichment.sql + +package gen + +import ( + "context" + "time" +) + +const deletePREnrichment = `-- name: DeletePREnrichment :exec +DELETE FROM pr_enrichment WHERE session_id = ? +` + +func (q *Queries) DeletePREnrichment(ctx context.Context, sessionID string) error { + _, err := q.db.ExecContext(ctx, deletePREnrichment, sessionID) + return err +} + +const getPREnrichment = `-- name: GetPREnrichment :one +SELECT session_id, ci_summary, review_decision, mergeability, pending_comments, ci_log_tail, last_fetched_at +FROM pr_enrichment +WHERE session_id = ? 
+` + +func (q *Queries) GetPREnrichment(ctx context.Context, sessionID string) (PrEnrichment, error) { + row := q.db.QueryRowContext(ctx, getPREnrichment, sessionID) + var i PrEnrichment + err := row.Scan( + &i.SessionID, + &i.CiSummary, + &i.ReviewDecision, + &i.Mergeability, + &i.PendingComments, + &i.CiLogTail, + &i.LastFetchedAt, + ) + return i, err +} + +const upsertPREnrichment = `-- name: UpsertPREnrichment :exec +INSERT INTO pr_enrichment (session_id, ci_summary, review_decision, mergeability, pending_comments, ci_log_tail, last_fetched_at) +VALUES (?, ?, ?, ?, ?, ?, ?) +ON CONFLICT (session_id) DO UPDATE SET + ci_summary = excluded.ci_summary, + review_decision = excluded.review_decision, + mergeability = excluded.mergeability, + pending_comments = excluded.pending_comments, + ci_log_tail = excluded.ci_log_tail, + last_fetched_at = excluded.last_fetched_at +` + +type UpsertPREnrichmentParams struct { + SessionID string + CiSummary string + ReviewDecision string + Mergeability string + PendingComments string + CiLogTail string + LastFetchedAt time.Time +} + +func (q *Queries) UpsertPREnrichment(ctx context.Context, arg UpsertPREnrichmentParams) error { + _, err := q.db.ExecContext(ctx, upsertPREnrichment, + arg.SessionID, + arg.CiSummary, + arg.ReviewDecision, + arg.Mergeability, + arg.PendingComments, + arg.CiLogTail, + arg.LastFetchedAt, + ) + return err +} diff --git a/backend/internal/storage/sqlite/gen/projects.sql.go b/backend/internal/storage/sqlite/gen/projects.sql.go new file mode 100644 index 00000000..33959b76 --- /dev/null +++ b/backend/internal/storage/sqlite/gen/projects.sql.go @@ -0,0 +1,154 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.31.1 +// source: projects.sql + +package gen + +import ( + "context" + "database/sql" + "time" +) + +const archiveProject = `-- name: ArchiveProject :exec +UPDATE projects SET archived_at = ? WHERE id = ? 
+` + +type ArchiveProjectParams struct { + ArchivedAt sql.NullTime + ID string +} + +func (q *Queries) ArchiveProject(ctx context.Context, arg ArchiveProjectParams) error { + _, err := q.db.ExecContext(ctx, archiveProject, arg.ArchivedAt, arg.ID) + return err +} + +const deleteProject = `-- name: DeleteProject :exec +DELETE FROM projects WHERE id = ? +` + +func (q *Queries) DeleteProject(ctx context.Context, id string) error { + _, err := q.db.ExecContext(ctx, deleteProject, id) + return err +} + +const getProject = `-- name: GetProject :one +SELECT id, path, repo_owner, repo_name, repo_platform, repo_origin_url, default_branch, display_name, session_prefix, source, registered_at, archived_at +FROM projects +WHERE id = ? +` + +func (q *Queries) GetProject(ctx context.Context, id string) (Project, error) { + row := q.db.QueryRowContext(ctx, getProject, id) + var i Project + err := row.Scan( + &i.ID, + &i.Path, + &i.RepoOwner, + &i.RepoName, + &i.RepoPlatform, + &i.RepoOriginUrl, + &i.DefaultBranch, + &i.DisplayName, + &i.SessionPrefix, + &i.Source, + &i.RegisteredAt, + &i.ArchivedAt, + ) + return i, err +} + +const listProjects = `-- name: ListProjects :many +SELECT id, path, repo_owner, repo_name, repo_platform, repo_origin_url, default_branch, display_name, session_prefix, source, registered_at, archived_at +FROM projects +WHERE archived_at IS NULL +ORDER BY id +` + +func (q *Queries) ListProjects(ctx context.Context) ([]Project, error) { + rows, err := q.db.QueryContext(ctx, listProjects) + if err != nil { + return nil, err + } + defer rows.Close() + items := []Project{} + for rows.Next() { + var i Project + if err := rows.Scan( + &i.ID, + &i.Path, + &i.RepoOwner, + &i.RepoName, + &i.RepoPlatform, + &i.RepoOriginUrl, + &i.DefaultBranch, + &i.DisplayName, + &i.SessionPrefix, + &i.Source, + &i.RegisteredAt, + &i.ArchivedAt, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := 
rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const upsertProject = `-- name: UpsertProject :exec +INSERT INTO projects (id, path, repo_owner, repo_name, repo_platform, repo_origin_url, default_branch, display_name, session_prefix, source, registered_at, archived_at) +VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) +ON CONFLICT (id) DO UPDATE SET + path = excluded.path, + repo_owner = excluded.repo_owner, + repo_name = excluded.repo_name, + repo_platform = excluded.repo_platform, + repo_origin_url = excluded.repo_origin_url, + default_branch = excluded.default_branch, + display_name = excluded.display_name, + session_prefix = excluded.session_prefix, + source = excluded.source, + registered_at = excluded.registered_at, + archived_at = excluded.archived_at +` + +type UpsertProjectParams struct { + ID string + Path string + RepoOwner string + RepoName string + RepoPlatform string + RepoOriginUrl string + DefaultBranch string + DisplayName string + SessionPrefix string + Source string + RegisteredAt time.Time + ArchivedAt sql.NullTime +} + +func (q *Queries) UpsertProject(ctx context.Context, arg UpsertProjectParams) error { + _, err := q.db.ExecContext(ctx, upsertProject, + arg.ID, + arg.Path, + arg.RepoOwner, + arg.RepoName, + arg.RepoPlatform, + arg.RepoOriginUrl, + arg.DefaultBranch, + arg.DisplayName, + arg.SessionPrefix, + arg.Source, + arg.RegisteredAt, + arg.ArchivedAt, + ) + return err +} diff --git a/backend/internal/storage/sqlite/gen/querier.go b/backend/internal/storage/sqlite/gen/querier.go index 074fe053..76dd1aab 100644 --- a/backend/internal/storage/sqlite/gen/querier.go +++ b/backend/internal/storage/sqlite/gen/querier.go @@ -9,11 +9,16 @@ import ( ) type Querier interface { + ArchiveProject(ctx context.Context, arg ArchiveProjectParams) error + DeletePREnrichment(ctx context.Context, sessionID string) error + DeleteProject(ctx context.Context, id string) error DeleteReactionTracker(ctx context.Context, arg 
DeleteReactionTrackerParams) error DeleteSentOutboxBelow(ctx context.Context, changeLogSeq int64) (int64, error) DeleteSessionReactionTrackers(ctx context.Context, sessionID string) error GetConsumerOffset(ctx context.Context, consumer string) (int64, error) GetMetadata(ctx context.Context, sessionID string) ([]GetMetadataRow, error) + GetPREnrichment(ctx context.Context, sessionID string) (PrEnrichment, error) + GetProject(ctx context.Context, id string) (Project, error) GetSession(ctx context.Context, id string) (Session, error) GetSessionRevision(ctx context.Context, id string) (int64, error) // Appends a canonical-write record and returns its monotonic seq so the same @@ -24,6 +29,7 @@ type Querier interface { // the row is persisted at revision 1. InsertSession(ctx context.Context, arg InsertSessionParams) (int64, error) ListAllSessions(ctx context.Context) ([]Session, error) + ListProjects(ctx context.Context) ([]Project, error) ListReactionTrackers(ctx context.Context) ([]ReactionTracker, error) ListSessionsByProject(ctx context.Context, projectID string) ([]Session, error) ListUnsentOutbox(ctx context.Context, limit int64) ([]ListUnsentOutboxRow, error) @@ -36,6 +42,8 @@ type Querier interface { UpdateSessionCAS(ctx context.Context, arg UpdateSessionCASParams) (int64, error) UpsertConsumerOffset(ctx context.Context, arg UpsertConsumerOffsetParams) error UpsertMetadata(ctx context.Context, arg UpsertMetadataParams) error + UpsertPREnrichment(ctx context.Context, arg UpsertPREnrichmentParams) error + UpsertProject(ctx context.Context, arg UpsertProjectParams) error UpsertReactionTracker(ctx context.Context, arg UpsertReactionTrackerParams) error } diff --git a/backend/internal/storage/sqlite/migrations/0002_pr_projects.sql b/backend/internal/storage/sqlite/migrations/0002_pr_projects.sql new file mode 100644 index 00000000..4421f0dd --- /dev/null +++ b/backend/internal/storage/sqlite/migrations/0002_pr_projects.sql @@ -0,0 +1,50 @@ +-- +goose Up +-- +goose 
StatementBegin + +-- projects is the durable registry of repos AO manages, the SQLite twin of the +-- old YAML config (global config.yaml + per-repo agent-orchestrator.yaml). id is +-- the {basename}_{sha256(path:originUrl)[:10]} key the session layer references +-- via sessions.project_id. The relationship is app-enforced, NOT a hard FK: +-- SQLite cannot ALTER ADD a FK without a table rebuild, and an existing-session +-- backfill may land sessions before their project row. +CREATE TABLE projects ( + id TEXT PRIMARY KEY, + path TEXT NOT NULL, + repo_owner TEXT NOT NULL DEFAULT '', + repo_name TEXT NOT NULL DEFAULT '', + repo_platform TEXT NOT NULL DEFAULT '', + repo_origin_url TEXT NOT NULL DEFAULT '', + default_branch TEXT NOT NULL DEFAULT '', + display_name TEXT NOT NULL DEFAULT '', + session_prefix TEXT NOT NULL DEFAULT '', + source TEXT NOT NULL DEFAULT '', + registered_at TIMESTAMP NOT NULL, + + -- soft delete: NULL = active. Archiving keeps the row so a session's + -- project_id always resolves (there is no FK to enforce it), avoiding + -- dangling references; active-only reads filter archived_at IS NULL. + archived_at TIMESTAMP +); + +-- pr_enrichment is the SCM observer's per-session cache of the rich PR facts that +-- do NOT live in the canonical lifecycle (which keeps only pr_state/reason/number/ +-- url). It is 1:1 with a session (a PR is always tied to a session by its branch), +-- written by the SCM observer OFF the canonical CDC path (no revision bump, no +-- change_log/outbox event), and cascades away with its session. 
+CREATE TABLE pr_enrichment ( + session_id TEXT PRIMARY KEY REFERENCES sessions (id) ON DELETE CASCADE, + ci_summary TEXT NOT NULL DEFAULT '', + review_decision TEXT NOT NULL DEFAULT '', + mergeability TEXT NOT NULL DEFAULT '', + pending_comments TEXT NOT NULL DEFAULT '', + ci_log_tail TEXT NOT NULL DEFAULT '', + last_fetched_at TIMESTAMP NOT NULL +); + +-- +goose StatementEnd + +-- +goose Down +-- +goose StatementBegin +DROP TABLE pr_enrichment; +DROP TABLE projects; +-- +goose StatementEnd diff --git a/backend/internal/storage/sqlite/pr_projects_test.go b/backend/internal/storage/sqlite/pr_projects_test.go new file mode 100644 index 00000000..6cdd20bc --- /dev/null +++ b/backend/internal/storage/sqlite/pr_projects_test.go @@ -0,0 +1,128 @@ +package sqlite + +import ( + "context" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +func TestProjectUpsertGetListDelete(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + now := time.Now().UTC().Truncate(time.Second) + + if _, ok, err := s.GetProject(ctx, "p1"); err != nil || ok { + t.Fatalf("get missing: ok=%v err=%v", ok, err) + } + + p := ProjectRow{ + ID: "p1", Path: "/repo", RepoOwner: "acme", RepoName: "widget", + RepoPlatform: "github", RepoOriginURL: "git@github.com:acme/widget.git", + DefaultBranch: "main", DisplayName: "Widget", SessionPrefix: "wid", + Source: "local", RegisteredAt: now, + } + if err := s.UpsertProject(ctx, p); err != nil { + t.Fatalf("upsert: %v", err) + } + + got, ok, err := s.GetProject(ctx, "p1") + if err != nil || !ok { + t.Fatalf("get: ok=%v err=%v", ok, err) + } + if got != p { + t.Fatalf("round-trip mismatch:\n got %+v\nwant %+v", got, p) + } + + // Upsert again with a changed field updates in place (no duplicate). 
+ p.DisplayName = "Widget 2" + if err := s.UpsertProject(ctx, p); err != nil { + t.Fatalf("re-upsert: %v", err) + } + list, err := s.ListProjects(ctx) + if err != nil { + t.Fatalf("list: %v", err) + } + if len(list) != 1 || list[0].DisplayName != "Widget 2" { + t.Fatalf("list after re-upsert = %+v", list) + } + + if err := s.DeleteProject(ctx, "p1"); err != nil { + t.Fatalf("delete: %v", err) + } + if _, ok, _ := s.GetProject(ctx, "p1"); ok { + t.Fatal("project should be gone after delete") + } +} + +func TestArchiveProjectHidesFromListButGetResolves(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + now := time.Now().UTC().Truncate(time.Second) + + if err := s.UpsertProject(ctx, ProjectRow{ID: "p1", Path: "/repo", RegisteredAt: now}); err != nil { + t.Fatalf("upsert: %v", err) + } + if err := s.ArchiveProject(ctx, "p1", now); err != nil { + t.Fatalf("archive: %v", err) + } + + // Active-only list hides it. + list, err := s.ListProjects(ctx) + if err != nil { + t.Fatalf("list: %v", err) + } + if len(list) != 0 { + t.Fatalf("archived project should not appear in ListProjects, got %+v", list) + } + + // Get still resolves it (a session's project_id must not dangle) and reports + // the archived marker. + got, ok, err := s.GetProject(ctx, "p1") + if err != nil || !ok { + t.Fatalf("get archived: ok=%v err=%v", ok, err) + } + if got.ArchivedAt.IsZero() { + t.Fatal("archived project should carry a non-zero ArchivedAt") + } +} + +func TestPREnrichmentUpsertGetDelete(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + now := time.Now().UTC().Truncate(time.Second) + + // pr_enrichment FKs sessions(id); seed the session first. 
+ if err := s.Upsert(ctx, sampleRecord("s1"), ports.EventSessionCreated); err != nil { + t.Fatalf("seed session: %v", err) + } + + if _, ok, err := s.GetPREnrichment(ctx, "s1"); err != nil || ok { + t.Fatalf("get missing: ok=%v err=%v", ok, err) + } + + e := PREnrichmentRow{ + SessionID: "s1", CISummary: "3 passing, 1 failing", ReviewDecision: "changes_requested", + Mergeability: "blocked", PendingComments: `[{"path":"a.go"}]`, CILogTail: "FAIL TestX", + LastFetchedAt: now, + } + if err := s.UpsertPREnrichment(ctx, e); err != nil { + t.Fatalf("upsert: %v", err) + } + + got, ok, err := s.GetPREnrichment(ctx, "s1") + if err != nil || !ok { + t.Fatalf("get: ok=%v err=%v", ok, err) + } + if got != e { + t.Fatalf("round-trip mismatch:\n got %+v\nwant %+v", got, e) + } + + if err := s.DeletePREnrichment(ctx, "s1"); err != nil { + t.Fatalf("delete: %v", err) + } + if _, ok, _ := s.GetPREnrichment(ctx, "s1"); ok { + t.Fatal("enrichment should be gone after delete") + } +} diff --git a/backend/internal/storage/sqlite/pr_store.go b/backend/internal/storage/sqlite/pr_store.go new file mode 100644 index 00000000..70efb7ce --- /dev/null +++ b/backend/internal/storage/sqlite/pr_store.go @@ -0,0 +1,66 @@ +package sqlite + +import ( + "context" + "database/sql" + "errors" + "fmt" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" +) + +// PREnrichmentRow is the SCM observer's cache of the rich PR facts that do not +// live in the canonical lifecycle (which keeps only pr_state/reason/number/url). +// It is 1:1 with a session and written OFF the canonical CDC path: upserting it +// never bumps revision and never emits a change_log/outbox event. pending_comments +// and ci_log_tail are opaque blobs the SCM observer serializes. 
+type PREnrichmentRow struct {
+	SessionID       string
+	CISummary       string
+	ReviewDecision  string
+	Mergeability    string
+	PendingComments string
+	CILogTail       string
+	LastFetchedAt   time.Time
+}
+
+// UpsertPREnrichment inserts or replaces the cached PR facts for one session.
+// The schema declares session_id as a foreign key to sessions(id), so the
+// session row is expected to exist first. A failure is returned wrapped with
+// the operation name, matching GetPREnrichment.
+func (s *Store) UpsertPREnrichment(ctx context.Context, r PREnrichmentRow) error {
+	err := s.q.UpsertPREnrichment(ctx, gen.UpsertPREnrichmentParams{
+		SessionID:       r.SessionID,
+		CiSummary:       r.CISummary,
+		ReviewDecision:  r.ReviewDecision,
+		Mergeability:    r.Mergeability,
+		PendingComments: r.PendingComments,
+		CiLogTail:       r.CILogTail,
+		LastFetchedAt:   r.LastFetchedAt,
+	})
+	if err != nil {
+		return fmt.Errorf("upsert pr enrichment: %w", err)
+	}
+	return nil
+}
+
+// GetPREnrichment returns the cached PR facts for one session. ok is false when
+// no row exists (the SCM observer has not yet fetched, or the session has no PR).
+// A missing row is not an error; any other failure is returned wrapped.
+func (s *Store) GetPREnrichment(ctx context.Context, sessionID string) (PREnrichmentRow, bool, error) {
+	e, err := s.q.GetPREnrichment(ctx, sessionID)
+	if errors.Is(err, sql.ErrNoRows) {
+		return PREnrichmentRow{}, false, nil
+	}
+	if err != nil {
+		return PREnrichmentRow{}, false, fmt.Errorf("get pr enrichment: %w", err)
+	}
+	return PREnrichmentRow{
+		SessionID:       e.SessionID,
+		CISummary:       e.CiSummary,
+		ReviewDecision:  e.ReviewDecision,
+		Mergeability:    e.Mergeability,
+		PendingComments: e.PendingComments,
+		CILogTail:       e.CiLogTail,
+		LastFetchedAt:   e.LastFetchedAt,
+	}, true, nil
+}
+
+// DeletePREnrichment drops the cached PR facts for one session. Normally
+// unnecessary (the FK cascades on session delete), exposed for explicit eviction.
+func (s *Store) DeletePREnrichment(ctx context.Context, sessionID string) error {
+	if err := s.q.DeletePREnrichment(ctx, sessionID); err != nil {
+		return fmt.Errorf("delete pr enrichment: %w", err)
+	}
+	return nil
+}
diff --git a/backend/internal/storage/sqlite/project_store.go b/backend/internal/storage/sqlite/project_store.go
new file mode 100644
index 00000000..fb75e18a
--- /dev/null
+++ b/backend/internal/storage/sqlite/project_store.go
@@ -0,0 +1,115 @@
+package sqlite
+
+import (
+	"context"
+	"database/sql"
+	"errors"
+	"fmt"
+	"time"
+
+	"github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen"
+)
+
+// ProjectRow is one registered repo, the durable twin of the old YAML config
+// entry. It is the unit the registration path upserts and cross-project readers
+// list. Off the canonical CDC path: writing a project never emits a change_log
+// or outbox event.
+type ProjectRow struct {
+	ID            string
+	Path          string
+	RepoOwner     string
+	RepoName      string
+	RepoPlatform  string
+	RepoOriginURL string
+	DefaultBranch string
+	DisplayName   string
+	SessionPrefix string
+	Source        string
+	RegisteredAt  time.Time
+	// ArchivedAt is the soft-delete marker; zero means active. GetProject returns
+	// it regardless of state (so a session can resolve its archived project);
+	// ListProjects returns only rows where it is zero.
+	ArchivedAt time.Time
+}
+
+// UpsertProject inserts or updates one registered project, keyed by ID. On
+// conflict every column is overwritten from r, including registered_at and
+// archived_at: a zero r.ArchivedAt is stored as NULL, so upserting an archived
+// project with a zero ArchivedAt makes it active again. A failure is returned
+// wrapped with the operation name.
+func (s *Store) UpsertProject(ctx context.Context, r ProjectRow) error {
+	err := s.q.UpsertProject(ctx, gen.UpsertProjectParams{
+		ID:            r.ID,
+		Path:          r.Path,
+		RepoOwner:     r.RepoOwner,
+		RepoName:      r.RepoName,
+		RepoPlatform:  r.RepoPlatform,
+		RepoOriginUrl: r.RepoOriginURL,
+		DefaultBranch: r.DefaultBranch,
+		DisplayName:   r.DisplayName,
+		SessionPrefix: r.SessionPrefix,
+		Source:        r.Source,
+		RegisteredAt:  r.RegisteredAt,
+		ArchivedAt:    nullTime(r.ArchivedAt),
+	})
+	if err != nil {
+		return fmt.Errorf("upsert project: %w", err)
+	}
+	return nil
+}
+
+// ArchiveProject soft-deletes one project, keeping the row so a session's
+// project_id still resolves. Active-only reads (ListProjects) then hide it.
+func (s *Store) ArchiveProject(ctx context.Context, id string, t time.Time) error { + return s.q.ArchiveProject(ctx, gen.ArchiveProjectParams{ + ArchivedAt: nullTime(t), + ID: id, + }) +} + +// GetProject returns one project by id. ok is false when no row exists. +func (s *Store) GetProject(ctx context.Context, id string) (ProjectRow, bool, error) { + p, err := s.q.GetProject(ctx, id) + if errors.Is(err, sql.ErrNoRows) { + return ProjectRow{}, false, nil + } + if err != nil { + return ProjectRow{}, false, fmt.Errorf("get project: %w", err) + } + return projectRowFromGen(p), true, nil +} + +// ListProjects returns every active (non-archived) project, ordered by id. +func (s *Store) ListProjects(ctx context.Context) ([]ProjectRow, error) { + rows, err := s.q.ListProjects(ctx) + if err != nil { + return nil, fmt.Errorf("list projects: %w", err) + } + out := make([]ProjectRow, 0, len(rows)) + for _, p := range rows { + out = append(out, projectRowFromGen(p)) + } + return out, nil +} + +// DeleteProject removes one project by id. +func (s *Store) DeleteProject(ctx context.Context, id string) error { + return s.q.DeleteProject(ctx, id) +} + +func projectRowFromGen(p gen.Project) ProjectRow { + return ProjectRow{ + ID: p.ID, + Path: p.Path, + RepoOwner: p.RepoOwner, + RepoName: p.RepoName, + RepoPlatform: p.RepoPlatform, + RepoOriginURL: p.RepoOriginUrl, + DefaultBranch: p.DefaultBranch, + DisplayName: p.DisplayName, + SessionPrefix: p.SessionPrefix, + Source: p.Source, + RegisteredAt: p.RegisteredAt, + ArchivedAt: p.ArchivedAt.Time, + } +} + +// nullTime maps a zero time.Time to a NULL column, else a valid timestamp.
+func nullTime(t time.Time) sql.NullTime { + if t.IsZero() { + return sql.NullTime{} + } + return sql.NullTime{Time: t, Valid: true} +} diff --git a/backend/internal/storage/sqlite/queries/pr_enrichment.sql b/backend/internal/storage/sqlite/queries/pr_enrichment.sql new file mode 100644 index 00000000..7c2ac0a0 --- /dev/null +++ b/backend/internal/storage/sqlite/queries/pr_enrichment.sql @@ -0,0 +1,18 @@ +-- name: UpsertPREnrichment :exec +INSERT INTO pr_enrichment (session_id, ci_summary, review_decision, mergeability, pending_comments, ci_log_tail, last_fetched_at) +VALUES (?, ?, ?, ?, ?, ?, ?) +ON CONFLICT (session_id) DO UPDATE SET + ci_summary = excluded.ci_summary, + review_decision = excluded.review_decision, + mergeability = excluded.mergeability, + pending_comments = excluded.pending_comments, + ci_log_tail = excluded.ci_log_tail, + last_fetched_at = excluded.last_fetched_at; + +-- name: GetPREnrichment :one +SELECT session_id, ci_summary, review_decision, mergeability, pending_comments, ci_log_tail, last_fetched_at +FROM pr_enrichment +WHERE session_id = ?; + +-- name: DeletePREnrichment :exec +DELETE FROM pr_enrichment WHERE session_id = ?; diff --git a/backend/internal/storage/sqlite/queries/projects.sql b/backend/internal/storage/sqlite/queries/projects.sql new file mode 100644 index 00000000..054b8f0e --- /dev/null +++ b/backend/internal/storage/sqlite/queries/projects.sql @@ -0,0 +1,32 @@ +-- name: UpsertProject :exec +INSERT INTO projects (id, path, repo_owner, repo_name, repo_platform, repo_origin_url, default_branch, display_name, session_prefix, source, registered_at, archived_at) +VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ON CONFLICT (id) DO UPDATE SET + path = excluded.path, + repo_owner = excluded.repo_owner, + repo_name = excluded.repo_name, + repo_platform = excluded.repo_platform, + repo_origin_url = excluded.repo_origin_url, + default_branch = excluded.default_branch, + display_name = excluded.display_name, + session_prefix = excluded.session_prefix, + source = excluded.source, + registered_at = excluded.registered_at, + archived_at = excluded.archived_at; + +-- name: GetProject :one +SELECT id, path, repo_owner, repo_name, repo_platform, repo_origin_url, default_branch, display_name, session_prefix, source, registered_at, archived_at +FROM projects +WHERE id = ?; + +-- name: ListProjects :many +SELECT id, path, repo_owner, repo_name, repo_platform, repo_origin_url, default_branch, display_name, session_prefix, source, registered_at, archived_at +FROM projects +WHERE archived_at IS NULL +ORDER BY id; + +-- name: ArchiveProject :exec +UPDATE projects SET archived_at = ? WHERE id = ?; + +-- name: DeleteProject :exec +DELETE FROM projects WHERE id = ?; From 4ce90448e28e0792a33f90d0379ce7a11d877f71 Mon Sep 17 00:00:00 2001 From: prateek Date: Sun, 31 May 2026 00:33:13 +0530 Subject: [PATCH 03/10] refactor(storage): make session metadata + PR facts typed and structured The first storage cut modelled two side tables as free-form blobs. This replaces both with opinionated, statically-typed schema so what a session can carry is fixed by the schema, not by convention. session_metadata: was a (session_id, key, value) KV bag with six convention-only keys. Now a 1:1 table of named, typed columns. The domain currency is a typed domain.SessionMetadata struct (was map[string]string), threaded through ports.LifecycleStore, the LCM, the Session Manager and the reaper, so an unknown key is a compile error rather than a silently-dropped write. PatchMetadata keeps its non-destructive merge ("empty = leave unchanged"). 
The off-canonical invariant is now enforced at the type level via json:"-" on SessionRecord.Metadata, removing the manual `Metadata = nil` scrub the change_log/snapshot paths had to remember; the Meta* string-key constants are deleted. pr_enrichment -> pr (+ pr_check, pr_comment): the scalar facts are now typed columns with CHECK-constrained enums (review_decision, mergeability, ci_state) and integer CI counts instead of opaque TEXT. The two list facts the old `pending_comments`/ci_summary strings smuggled are normalized into child tables (pr_check, pr_comment) that cascade from pr. The store exposes UpsertPR/GetPR plus atomic ReplacePRChecks/ReplacePRComments + List. Both tables remain off the canonical CDC path. sqlc regenerated; migrations 0001/0002 revised in place (nothing released). gofmt/vet clean; go test -race green; daemon smoke-boots and creates the new schema. Co-Authored-By: Claude Opus 4.8 (1M context) --- backend/cdc_wiring.go | 1 - backend/internal/domain/session.go | 27 +- backend/internal/lifecycle/fakes_test.go | 43 ++-- backend/internal/lifecycle/manager.go | 48 ++-- backend/internal/lifecycle/manager_test.go | 2 +- backend/internal/observe/reaper/reaper.go | 7 +- .../internal/observe/reaper/reaper_test.go | 15 +- backend/internal/ports/outbound.go | 4 +- backend/internal/session/fakes_test.go | 43 +++- backend/internal/session/manager.go | 21 +- backend/internal/session/manager_test.go | 42 ++-- .../storage/sqlite/gen/metadata.sql.go | 95 ++++--- backend/internal/storage/sqlite/gen/models.go | 47 +++- backend/internal/storage/sqlite/gen/pr.sql.go | 235 ++++++++++++++++++ .../storage/sqlite/gen/pr_enrichment.sql.go | 76 ------ .../internal/storage/sqlite/gen/querier.go | 19 +- .../storage/sqlite/migrations/0001_init.sql | 21 +- .../sqlite/migrations/0002_pr_projects.sql | 57 ++++- .../storage/sqlite/pr_projects_test.go | 108 +++++++- backend/internal/storage/sqlite/pr_store.go | 210 +++++++++++++--- .../storage/sqlite/queries/metadata.sql | 25 
+- .../internal/storage/sqlite/queries/pr.sql | 43 ++++ .../storage/sqlite/queries/pr_enrichment.sql | 18 -- backend/internal/storage/sqlite/store.go | 64 ++--- backend/internal/storage/sqlite/store_test.go | 13 +- backend/internal/storage/sqlite/upsert.go | 6 +- backend/main_test.go | 2 +- 27 files changed, 909 insertions(+), 383 deletions(-) create mode 100644 backend/internal/storage/sqlite/gen/pr.sql.go delete mode 100644 backend/internal/storage/sqlite/gen/pr_enrichment.sql.go create mode 100644 backend/internal/storage/sqlite/queries/pr.sql delete mode 100644 backend/internal/storage/sqlite/queries/pr_enrichment.sql diff --git a/backend/cdc_wiring.go b/backend/cdc_wiring.go index 89997e7d..cfae4fdb 100644 --- a/backend/cdc_wiring.go +++ b/backend/cdc_wiring.go @@ -125,7 +125,6 @@ func (s snapshotSource) Snapshot(ctx context.Context) ([]cdc.Event, int64, error events := make([]cdc.Event, 0, len(recs)) for _, r := range recs { r.Lifecycle.Version = domain.LifecycleVersion - r.Metadata = nil blob, err := json.Marshal(r) if err != nil { return nil, 0, fmt.Errorf("marshal snapshot %s: %w", r.ID, err) diff --git a/backend/internal/domain/session.go b/backend/internal/domain/session.go index 578cca40..c9cd8d96 100644 --- a/backend/internal/domain/session.go +++ b/backend/internal/domain/session.go @@ -17,16 +17,41 @@ const ( KindOrchestrator SessionKind = "orchestrator" ) +// SessionMetadata is the typed, off-canonical metadata for a session: the +// operational handles and seed inputs the Session Manager and reaper need but +// that are NOT part of the canonical lifecycle. The set of fields is fixed here +// (no free-form keys), so what a session can carry is a compile-time fact, and +// it is persisted 1:1 in the session_metadata table off the CDC path. 
+// +// Empty fields mean "unset": PatchMetadata never overwrites a stored value with +// an empty one, so a partial write (spawn setting only the runtime handle) does +// not clobber a value set earlier (the branch set at creation). +type SessionMetadata struct { + Branch string `json:"branch,omitempty"` + WorkspacePath string `json:"workspacePath,omitempty"` + RuntimeHandleID string `json:"runtimeHandleId,omitempty"` + RuntimeName string `json:"runtimeName,omitempty"` + AgentSessionID string `json:"agentSessionId,omitempty"` + Prompt string `json:"prompt,omitempty"` +} + +// IsZero reports whether no metadata field is set. +func (m SessionMetadata) IsZero() bool { return m == SessionMetadata{} } + // SessionRecord is the PERSISTENCE shape: identity, canonical lifecycle, and // metadata — everything the store holds, and nothing derived. The store reads // and writes records; it never produces the derived display status. +// +// Metadata is json:"-" on purpose: it lives off the canonical path, so it must +// never ride along in the change_log / snapshot payloads. Enforcing that at the +// type level means no caller has to remember to scrub it before marshalling. 
type SessionRecord struct { ID SessionID `json:"id"` ProjectID ProjectID `json:"projectId"` IssueID IssueID `json:"issueId,omitempty"` Kind SessionKind `json:"kind"` Lifecycle CanonicalSessionLifecycle `json:"lifecycle"` - Metadata map[string]string `json:"metadata,omitempty"` + Metadata SessionMetadata `json:"-"` CreatedAt time.Time `json:"createdAt"` UpdatedAt time.Time `json:"updatedAt"` } diff --git a/backend/internal/lifecycle/fakes_test.go b/backend/internal/lifecycle/fakes_test.go index 5bacb49a..45aec91b 100644 --- a/backend/internal/lifecycle/fakes_test.go +++ b/backend/internal/lifecycle/fakes_test.go @@ -14,7 +14,7 @@ import ( type fakeStore struct { mu sync.Mutex records map[domain.SessionID]*domain.SessionRecord - metadata map[domain.SessionID]map[string]string + metadata map[domain.SessionID]domain.SessionMetadata } var _ ports.LifecycleStore = (*fakeStore)(nil) @@ -22,7 +22,7 @@ var _ ports.LifecycleStore = (*fakeStore)(nil) func newFakeStore() *fakeStore { return &fakeStore{ records: map[domain.SessionID]*domain.SessionRecord{}, - metadata: map[domain.SessionID]map[string]string{}, + metadata: map[domain.SessionID]domain.SessionMetadata{}, } } @@ -90,26 +90,41 @@ func (s *fakeStore) List(_ context.Context, project domain.ProjectID) ([]domain. 
return out, nil } -func (s *fakeStore) GetMetadata(_ context.Context, id domain.SessionID) (map[string]string, error) { +func (s *fakeStore) GetMetadata(_ context.Context, id domain.SessionID) (domain.SessionMetadata, error) { s.mu.Lock() defer s.mu.Unlock() - out := map[string]string{} - for k, v := range s.metadata[id] { - out[k] = v - } - return out, nil + return s.metadata[id], nil } -func (s *fakeStore) PatchMetadata(_ context.Context, id domain.SessionID, kv map[string]string) error { +func (s *fakeStore) PatchMetadata(_ context.Context, id domain.SessionID, meta domain.SessionMetadata) error { s.mu.Lock() defer s.mu.Unlock() - if s.metadata[id] == nil { - s.metadata[id] = map[string]string{} + s.metadata[id] = mergeSessionMetadata(s.metadata[id], meta) + return nil +} + +// mergeSessionMetadata applies meta onto dst with the store's "empty = leave +// unchanged" semantics, so partial patches do not clobber earlier values. +func mergeSessionMetadata(dst, meta domain.SessionMetadata) domain.SessionMetadata { + if meta.Branch != "" { + dst.Branch = meta.Branch } - for k, v := range kv { - s.metadata[id][k] = v + if meta.WorkspacePath != "" { + dst.WorkspacePath = meta.WorkspacePath } - return nil + if meta.RuntimeHandleID != "" { + dst.RuntimeHandleID = meta.RuntimeHandleID + } + if meta.RuntimeName != "" { + dst.RuntimeName = meta.RuntimeName + } + if meta.AgentSessionID != "" { + dst.AgentSessionID = meta.AgentSessionID + } + if meta.Prompt != "" { + dst.Prompt = meta.Prompt + } + return dst } // recordingNotifier captures emitted events for assertions. diff --git a/backend/internal/lifecycle/manager.go b/backend/internal/lifecycle/manager.go index 54e6887f..63d7164a 100644 --- a/backend/internal/lifecycle/manager.go +++ b/backend/internal/lifecycle/manager.go @@ -21,19 +21,11 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) -// Metadata keys OnSpawnCompleted records for the spawned session's handles. 
-// -// MetaPrompt is the assembled launch prompt, persisted so a Restore that finds -// no captured agent session id can still fall back to a fresh launch with the -// same prompt rather than failing. -const ( - MetaBranch = "branch" - MetaWorkspacePath = "workspacePath" - MetaRuntimeHandleID = "runtimeHandleId" - MetaRuntimeName = "runtimeName" - MetaAgentSessionID = "agentSessionId" - MetaPrompt = "prompt" -) +// Session metadata is now the typed domain.SessionMetadata struct (was a +// free-form string map keyed by Meta* constants). OnSpawnCompleted records the +// spawned session's handles via spawnMetadata; Prompt is the assembled launch +// prompt, persisted so a Restore that finds no captured agent session id can +// still fall back to a fresh launch with the same prompt rather than failing. // Manager is the LCM. The Apply* pipeline persists a transition and then fires // the mapped reaction via Notifier/AgentMessenger (see reactions.go). @@ -381,7 +373,7 @@ func (m *Manager) OnSpawnCompleted(ctx context.Context, id domain.SessionID, o p return err } } - if meta := spawnMetadata(o); len(meta) > 0 { + if meta := spawnMetadata(o); !meta.IsZero() { if err := m.store.PatchMetadata(ctx, id, meta); err != nil { return err } @@ -545,25 +537,13 @@ func sameActivity(a, b domain.ActivitySubstate) bool { return a.State == b.State && a.Source == b.Source && a.LastActivityAt.Equal(b.LastActivityAt) } -func spawnMetadata(o ports.SpawnOutcome) map[string]string { - meta := map[string]string{} - if o.Branch != "" { - meta[MetaBranch] = o.Branch - } - if o.WorkspacePath != "" { - meta[MetaWorkspacePath] = o.WorkspacePath - } - if o.RuntimeHandle.ID != "" { - meta[MetaRuntimeHandleID] = o.RuntimeHandle.ID - } - if o.RuntimeHandle.RuntimeName != "" { - meta[MetaRuntimeName] = o.RuntimeHandle.RuntimeName - } - if o.AgentSessionID != "" { - meta[MetaAgentSessionID] = o.AgentSessionID - } - if o.Prompt != "" { - meta[MetaPrompt] = o.Prompt +func spawnMetadata(o 
ports.SpawnOutcome) domain.SessionMetadata { + return domain.SessionMetadata{ + Branch: o.Branch, + WorkspacePath: o.WorkspacePath, + RuntimeHandleID: o.RuntimeHandle.ID, + RuntimeName: o.RuntimeHandle.RuntimeName, + AgentSessionID: o.AgentSessionID, + Prompt: o.Prompt, } - return meta } diff --git a/backend/internal/lifecycle/manager_test.go b/backend/internal/lifecycle/manager_test.go index 6a2cc1d1..96557e8f 100644 --- a/backend/internal/lifecycle/manager_test.go +++ b/backend/internal/lifecycle/manager_test.go @@ -388,7 +388,7 @@ func TestOnSpawnCompleted(t *testing.T) { t.Errorf("display = %v, want spawning", got) } meta, _ := store.GetMetadata(context.Background(), sid) - if meta[MetaBranch] != "feat/x" || meta[MetaAgentSessionID] != "agent-1" || meta[MetaRuntimeName] != "tmux" { + if meta.Branch != "feat/x" || meta.AgentSessionID != "agent-1" || meta.RuntimeName != "tmux" { t.Errorf("metadata not recorded: %+v", meta) } } diff --git a/backend/internal/observe/reaper/reaper.go b/backend/internal/observe/reaper/reaper.go index 66456ea6..579f1d63 100644 --- a/backend/internal/observe/reaper/reaper.go +++ b/backend/internal/observe/reaper/reaper.go @@ -16,7 +16,6 @@ import ( "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) @@ -203,11 +202,11 @@ func (r *Reaper) probeOne(ctx context.Context, sess domain.SessionRecord, now ti } // handleFromRecord reconstructs the RuntimeHandle stored on the session by -// OnSpawnCompleted. Both keys are required; either being empty is the +// OnSpawnCompleted. Both fields are required; either being empty is the // "session lacks a probable handle" signal that probeOne uses to skip. 
func handleFromRecord(rec domain.SessionRecord) (ports.RuntimeHandle, bool) { - id := rec.Metadata[lifecycle.MetaRuntimeHandleID] - name := rec.Metadata[lifecycle.MetaRuntimeName] + id := rec.Metadata.RuntimeHandleID + name := rec.Metadata.RuntimeName if id == "" || name == "" { return ports.RuntimeHandle{}, false } diff --git a/backend/internal/observe/reaper/reaper_test.go b/backend/internal/observe/reaper/reaper_test.go index d6b88efd..0d3b4d47 100644 --- a/backend/internal/observe/reaper/reaper_test.go +++ b/backend/internal/observe/reaper/reaper_test.go @@ -9,7 +9,6 @@ import ( "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" "github.com/aoagents/agent-orchestrator/backend/internal/observe/reaper" "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) @@ -124,9 +123,9 @@ func aliveSessionWith(id domain.SessionID, runtimeName, handleID string) domain. Session: domain.SessionSubstate{State: domain.SessionWorking, Reason: domain.ReasonTaskInProgress}, Runtime: domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning}, }, - Metadata: map[string]string{ - lifecycle.MetaRuntimeHandleID: handleID, - lifecycle.MetaRuntimeName: runtimeName, + Metadata: domain.SessionMetadata{ + RuntimeHandleID: handleID, + RuntimeName: runtimeName, }, } } @@ -141,9 +140,9 @@ func detectingSessionWith(id domain.SessionID, runtimeName, handleID string) dom Session: domain.SessionSubstate{State: domain.SessionDetecting, Reason: domain.ReasonProbeFailure}, Runtime: domain.RuntimeSubstate{State: domain.RuntimeProbeFailed, Reason: domain.RuntimeReasonProbeError}, }, - Metadata: map[string]string{ - lifecycle.MetaRuntimeHandleID: handleID, - lifecycle.MetaRuntimeName: runtimeName, + Metadata: domain.SessionMetadata{ + RuntimeHandleID: handleID, + RuntimeName: runtimeName, }, } } @@ -367,7 +366,7 @@ func TestReaper_SkipsMissingHandle(t *testing.T) { now 
:= time.Date(2026, 5, 28, 12, 0, 0, 0, time.UTC) clock := func() time.Time { return now } sess := aliveSessionWith("s1", "tmux", "h1") - delete(sess.Metadata, lifecycle.MetaRuntimeHandleID) + sess.Metadata.RuntimeHandleID = "" lcm := &fakeLCM{sessions: []domain.SessionRecord{sess}} rt := &fakeRuntime{results: map[string]aliveResult{}} rp := reaper.New(lcm, reaper.MapRegistry{"tmux": rt}, reaper.Config{Clock: clock, Tick: time.Hour}) diff --git a/backend/internal/ports/outbound.go b/backend/internal/ports/outbound.go index ba08d9b9..c64a1e6d 100644 --- a/backend/internal/ports/outbound.go +++ b/backend/internal/ports/outbound.go @@ -23,8 +23,8 @@ type LifecycleStore interface { Upsert(ctx context.Context, rec domain.SessionRecord, eventType EventType) error Load(ctx context.Context, id domain.SessionID) (domain.CanonicalSessionLifecycle, bool, error) List(ctx context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) - GetMetadata(ctx context.Context, id domain.SessionID) (map[string]string, error) - PatchMetadata(ctx context.Context, id domain.SessionID, kv map[string]string) error + GetMetadata(ctx context.Context, id domain.SessionID) (domain.SessionMetadata, error) + PatchMetadata(ctx context.Context, id domain.SessionID, meta domain.SessionMetadata) error // Get returns a single full record (with identity) by id. 
Load is // lifecycle-only, so readers use this to build the read-model and reconstruct diff --git a/backend/internal/session/fakes_test.go b/backend/internal/session/fakes_test.go index 71eaa4af..033f6de7 100644 --- a/backend/internal/session/fakes_test.go +++ b/backend/internal/session/fakes_test.go @@ -47,7 +47,7 @@ func (c *callLog) indexOf(name string) int { type fakeStore struct { mu sync.Mutex records map[domain.SessionID]*domain.SessionRecord - metadata map[domain.SessionID]map[string]string + metadata map[domain.SessionID]domain.SessionMetadata } var _ ports.LifecycleStore = (*fakeStore)(nil) @@ -55,7 +55,7 @@ var _ ports.LifecycleStore = (*fakeStore)(nil) func newFakeStore() *fakeStore { return &fakeStore{ records: map[domain.SessionID]*domain.SessionRecord{}, - metadata: map[domain.SessionID]map[string]string{}, + metadata: map[domain.SessionID]domain.SessionMetadata{}, } } @@ -113,30 +113,47 @@ func (s *fakeStore) List(_ context.Context, project domain.ProjectID) ([]domain. return out, nil } -func (s *fakeStore) GetMetadata(_ context.Context, id domain.SessionID) (map[string]string, error) { +func (s *fakeStore) GetMetadata(_ context.Context, id domain.SessionID) (domain.SessionMetadata, error) { s.mu.Lock() defer s.mu.Unlock() - return cloneMap(s.metadata[id]), nil + return s.metadata[id], nil } -func (s *fakeStore) PatchMetadata(_ context.Context, id domain.SessionID, kv map[string]string) error { +func (s *fakeStore) PatchMetadata(_ context.Context, id domain.SessionID, meta domain.SessionMetadata) error { s.mu.Lock() defer s.mu.Unlock() - if s.metadata[id] == nil { - s.metadata[id] = map[string]string{} + s.metadata[id] = mergeSessionMetadata(s.metadata[id], meta) + return nil +} + +// mergeSessionMetadata applies meta onto dst with the store's "empty = leave +// unchanged" semantics, so partial patches do not clobber earlier values. 
+func mergeSessionMetadata(dst, meta domain.SessionMetadata) domain.SessionMetadata { + if meta.Branch != "" { + dst.Branch = meta.Branch } - for k, v := range kv { - s.metadata[id][k] = v + if meta.WorkspacePath != "" { + dst.WorkspacePath = meta.WorkspacePath } - return nil + if meta.RuntimeHandleID != "" { + dst.RuntimeHandleID = meta.RuntimeHandleID + } + if meta.RuntimeName != "" { + dst.RuntimeName = meta.RuntimeName + } + if meta.AgentSessionID != "" { + dst.AgentSessionID = meta.AgentSessionID + } + if meta.Prompt != "" { + dst.Prompt = meta.Prompt + } + return dst } // withMetadata attaches the separately-stored metadata to a record copy (a real // store would return them together). Caller holds s.mu. func (s *fakeStore) withMetadata(rec domain.SessionRecord) domain.SessionRecord { - if md := s.metadata[rec.ID]; len(md) > 0 { - rec.Metadata = cloneMap(md) - } + rec.Metadata = s.metadata[rec.ID] return rec } diff --git a/backend/internal/session/manager.go b/backend/internal/session/manager.go index e764f6a3..dce63305 100644 --- a/backend/internal/session/manager.go +++ b/backend/internal/session/manager.go @@ -19,7 +19,6 @@ import ( "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) @@ -278,8 +277,8 @@ func (m *Manager) Restore(ctx context.Context, id domain.SessionID) (domain.Sess // (the agent's id-capture path is a separate hook that may never have run, so // "no id" is the common case rather than an error). If neither is available // there is nothing to relaunch from — fail early, before any I/O. 
- agentSessionID := meta[lifecycle.MetaAgentSessionID] - seededPrompt := meta[lifecycle.MetaPrompt] + agentSessionID := meta.AgentSessionID + seededPrompt := meta.Prompt if agentSessionID == "" && seededPrompt == "" { return domain.Session{}, fmt.Errorf("restore %s: no agent session id or seeded prompt (cannot resume or relaunch)", id) } @@ -287,7 +286,7 @@ func (m *Manager) Restore(ctx context.Context, id domain.SessionID) (domain.Sess ws, err := m.workspace.Restore(ctx, ports.WorkspaceConfig{ ProjectID: rec.ProjectID, SessionID: id, - Branch: meta[lifecycle.MetaBranch], + Branch: meta.Branch, }) if err != nil { return domain.Session{}, fmt.Errorf("restore %s: workspace restore: %w", id, err) @@ -335,7 +334,7 @@ func (m *Manager) Restore(ctx context.Context, id domain.SessionID) (domain.Sess if revertErr := m.lcm.OnSpawnInitiated(ctx, rec); revertErr != nil { return domain.Session{}, fmt.Errorf("restore %s: revert after spawn completed failure: %w (original error: %v)", id, revertErr, err) } - if len(rec.Metadata) > 0 { + if !rec.Metadata.IsZero() { if revertErr := m.store.PatchMetadata(ctx, id, rec.Metadata); revertErr != nil { return domain.Session{}, fmt.Errorf("restore %s: revert metadata after spawn completed failure: %w (original error: %v)", id, revertErr, err) } @@ -440,17 +439,17 @@ func seedRecord(id domain.SessionID, cfg ports.SpawnConfig, now time.Time) domai // runtimeHandle / workspaceInfo reconstruct teardown handles from the metadata // the LCM persisted in OnSpawnCompleted (the metadata-key contract is shared // with the lifecycle package). 
-func runtimeHandle(meta map[string]string) ports.RuntimeHandle { +func runtimeHandle(meta domain.SessionMetadata) ports.RuntimeHandle { return ports.RuntimeHandle{ - ID: meta[lifecycle.MetaRuntimeHandleID], - RuntimeName: meta[lifecycle.MetaRuntimeName], + ID: meta.RuntimeHandleID, + RuntimeName: meta.RuntimeName, } } -func workspaceInfo(rec domain.SessionRecord, meta map[string]string) ports.WorkspaceInfo { +func workspaceInfo(rec domain.SessionRecord, meta domain.SessionMetadata) ports.WorkspaceInfo { return ports.WorkspaceInfo{ - Path: meta[lifecycle.MetaWorkspacePath], - Branch: meta[lifecycle.MetaBranch], + Path: meta.WorkspacePath, + Branch: meta.Branch, SessionID: rec.ID, ProjectID: rec.ProjectID, } diff --git a/backend/internal/session/manager_test.go b/backend/internal/session/manager_test.go index 5bb20d07..c0c98cf7 100644 --- a/backend/internal/session/manager_test.go +++ b/backend/internal/session/manager_test.go @@ -6,7 +6,6 @@ import ( "testing" "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) @@ -86,16 +85,15 @@ func TestSpawn_HappyPath(t *testing.T) { // persisted too so a later Restore that finds no captured agent session id // can still fall back to a fresh launch using the same prompt. 
meta, _ := h.store.GetMetadata(ctx, "sess-1") - for k, want := range map[string]string{ - lifecycle.MetaBranch: "feat/42", - lifecycle.MetaWorkspacePath: "/tmp/ws/sess-1", - lifecycle.MetaRuntimeHandleID: "rt-sess-1", - lifecycle.MetaRuntimeName: "tmux", - lifecycle.MetaPrompt: "do the thing\n\nbe careful", - } { - if meta[k] != want { - t.Errorf("meta[%q] = %q, want %q", k, meta[k], want) - } + want := domain.SessionMetadata{ + Branch: "feat/42", + WorkspacePath: "/tmp/ws/sess-1", + RuntimeHandleID: "rt-sess-1", + RuntimeName: "tmux", + Prompt: "do the thing\n\nbe careful", + } + if meta != want { + t.Errorf("metadata = %+v, want %+v", meta, want) } } @@ -300,7 +298,7 @@ func TestRestore_LiveSession_Rejected(t *testing.T) { } // The session is live (never torn down). Capture an agent id so the only thing // blocking restore is the non-terminal lifecycle, not missing metadata. - if err := h.store.PatchMetadata(ctx, "sess-1", map[string]string{lifecycle.MetaAgentSessionID: "agent-xyz"}); err != nil { + if err := h.store.PatchMetadata(ctx, "sess-1", domain.SessionMetadata{AgentSessionID: "agent-xyz"}); err != nil { t.Fatalf("patch metadata: %v", err) } createdBefore := len(h.runtime.created) @@ -398,7 +396,7 @@ func TestRestore_RelaunchesWithResumeCommand(t *testing.T) { t.Fatalf("kill: %v", err) } // The agent's resume id is captured in metadata (here set explicitly). 
- if err := h.store.PatchMetadata(ctx, "sess-1", map[string]string{lifecycle.MetaAgentSessionID: "agent-xyz"}); err != nil { + if err := h.store.PatchMetadata(ctx, "sess-1", domain.SessionMetadata{AgentSessionID: "agent-xyz"}); err != nil { t.Fatalf("patch metadata: %v", err) } @@ -505,7 +503,7 @@ func TestRestore_OnSpawnCompletedFailure_RollsBackRuntime(t *testing.T) { if _, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}); err != nil { t.Fatalf("kill: %v", err) } - if err := h.store.PatchMetadata(ctx, "sess-1", map[string]string{lifecycle.MetaAgentSessionID: "agent-xyz"}); err != nil { + if err := h.store.PatchMetadata(ctx, "sess-1", domain.SessionMetadata{AgentSessionID: "agent-xyz"}); err != nil { t.Fatalf("patch metadata: %v", err) } beforeMeta, _ := h.store.GetMetadata(ctx, "sess-1") @@ -528,7 +526,7 @@ func TestRestore_OnSpawnCompletedFailure_RollsBackRuntime(t *testing.T) { t.Fatalf("restore failure should advance revision twice, got %d want %d", rec.Lifecycle.Revision, before.Lifecycle.Revision+2) } afterMeta, _ := h.store.GetMetadata(ctx, "sess-1") - if !equalStringMap(afterMeta, beforeMeta) { + if afterMeta != beforeMeta { t.Fatalf("restore failure should restore metadata, got %+v want %+v", afterMeta, beforeMeta) } @@ -595,7 +593,7 @@ func seedTerminal(t *testing.T, h *harness, id domain.SessionID, wsPath string) }, ports.EventSessionCreated); err != nil { t.Fatalf("upsert %s: %v", id, err) } - if err := h.store.PatchMetadata(ctx, id, map[string]string{lifecycle.MetaWorkspacePath: wsPath}); err != nil { + if err := h.store.PatchMetadata(ctx, id, domain.SessionMetadata{WorkspacePath: wsPath}); err != nil { t.Fatalf("patch metadata %s: %v", id, err) } } @@ -612,18 +610,6 @@ func equalStrings(a, b []string) bool { return true } -func equalStringMap(a, b map[string]string) bool { - if len(a) != len(b) { - return false - } - for k, v := range a { - if b[k] != v { - return false - } - } - return true -} - func contains(ids 
[]domain.SessionID, id domain.SessionID) bool { for _, x := range ids { if x == id { diff --git a/backend/internal/storage/sqlite/gen/metadata.sql.go b/backend/internal/storage/sqlite/gen/metadata.sql.go index 96510eb8..2c0396f7 100644 --- a/backend/internal/storage/sqlite/gen/metadata.sql.go +++ b/backend/internal/storage/sqlite/gen/metadata.sql.go @@ -7,53 +7,76 @@ package gen import ( "context" + "time" ) -const getMetadata = `-- name: GetMetadata :many -SELECT key, value FROM session_metadata WHERE session_id = ? +const getSessionMetadata = `-- name: GetSessionMetadata :one +SELECT branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt +FROM session_metadata +WHERE session_id = ? ` -type GetMetadataRow struct { - Key string - Value string +type GetSessionMetadataRow struct { + Branch string + WorkspacePath string + RuntimeHandleID string + RuntimeName string + AgentSessionID string + Prompt string } -func (q *Queries) GetMetadata(ctx context.Context, sessionID string) ([]GetMetadataRow, error) { - rows, err := q.db.QueryContext(ctx, getMetadata, sessionID) - if err != nil { - return nil, err - } - defer rows.Close() - items := []GetMetadataRow{} - for rows.Next() { - var i GetMetadataRow - if err := rows.Scan(&i.Key, &i.Value); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil +func (q *Queries) GetSessionMetadata(ctx context.Context, sessionID string) (GetSessionMetadataRow, error) { + row := q.db.QueryRowContext(ctx, getSessionMetadata, sessionID) + var i GetSessionMetadataRow + err := row.Scan( + &i.Branch, + &i.WorkspacePath, + &i.RuntimeHandleID, + &i.RuntimeName, + &i.AgentSessionID, + &i.Prompt, + ) + return i, err } -const upsertMetadata = `-- name: UpsertMetadata :exec -INSERT INTO session_metadata (session_id, key, value) -VALUES (?, ?, ?) 
-ON CONFLICT (session_id, key) DO UPDATE SET value = excluded.value +const upsertSessionMetadata = `-- name: UpsertSessionMetadata :exec +INSERT INTO session_metadata ( + session_id, branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt, updated_at +) VALUES (?, ?, ?, ?, ?, ?, ?, ?) +ON CONFLICT (session_id) DO UPDATE SET + branch = CASE WHEN excluded.branch <> '' THEN excluded.branch ELSE session_metadata.branch END, + workspace_path = CASE WHEN excluded.workspace_path <> '' THEN excluded.workspace_path ELSE session_metadata.workspace_path END, + runtime_handle_id = CASE WHEN excluded.runtime_handle_id <> '' THEN excluded.runtime_handle_id ELSE session_metadata.runtime_handle_id END, + runtime_name = CASE WHEN excluded.runtime_name <> '' THEN excluded.runtime_name ELSE session_metadata.runtime_name END, + agent_session_id = CASE WHEN excluded.agent_session_id <> '' THEN excluded.agent_session_id ELSE session_metadata.agent_session_id END, + prompt = CASE WHEN excluded.prompt <> '' THEN excluded.prompt ELSE session_metadata.prompt END, + updated_at = excluded.updated_at ` -type UpsertMetadataParams struct { - SessionID string - Key string - Value string +type UpsertSessionMetadataParams struct { + SessionID string + Branch string + WorkspacePath string + RuntimeHandleID string + RuntimeName string + AgentSessionID string + Prompt string + UpdatedAt time.Time } -func (q *Queries) UpsertMetadata(ctx context.Context, arg UpsertMetadataParams) error { - _, err := q.db.ExecContext(ctx, upsertMetadata, arg.SessionID, arg.Key, arg.Value) +// Merge semantics: an empty incoming column is "leave unchanged", so a partial +// patch (e.g. spawn writing only the runtime handle) never clobbers a value set +// earlier (e.g. the branch set at creation). Mirrors the old per-key map merge. 
+func (q *Queries) UpsertSessionMetadata(ctx context.Context, arg UpsertSessionMetadataParams) error { + _, err := q.db.ExecContext(ctx, upsertSessionMetadata, + arg.SessionID, + arg.Branch, + arg.WorkspacePath, + arg.RuntimeHandleID, + arg.RuntimeName, + arg.AgentSessionID, + arg.Prompt, + arg.UpdatedAt, + ) return err } diff --git a/backend/internal/storage/sqlite/gen/models.go b/backend/internal/storage/sqlite/gen/models.go index dccf25c4..339062bf 100644 --- a/backend/internal/storage/sqlite/gen/models.go +++ b/backend/internal/storage/sqlite/gen/models.go @@ -34,14 +34,34 @@ type Outbox struct { CreatedAt time.Time } -type PrEnrichment struct { - SessionID string - CiSummary string - ReviewDecision string - Mergeability string - PendingComments string - CiLogTail string - LastFetchedAt time.Time +type Pr struct { + SessionID string + ReviewDecision string + Mergeability string + CiState string + CiPassed int64 + CiFailed int64 + CiPending int64 + CiLogTail string + LastFetchedAt time.Time +} + +type PrCheck struct { + SessionID string + Name string + Status string + Url string +} + +type PrComment struct { + SessionID string + CommentID string + Author string + File string + Line int64 + Body string + Resolved int64 + CreatedAt time.Time } type Project struct { @@ -93,7 +113,12 @@ type Session struct { } type SessionMetadatum struct { - SessionID string - Key string - Value string + SessionID string + Branch string + WorkspacePath string + RuntimeHandleID string + RuntimeName string + AgentSessionID string + Prompt string + UpdatedAt time.Time } diff --git a/backend/internal/storage/sqlite/gen/pr.sql.go b/backend/internal/storage/sqlite/gen/pr.sql.go new file mode 100644 index 00000000..95cbd20a --- /dev/null +++ b/backend/internal/storage/sqlite/gen/pr.sql.go @@ -0,0 +1,235 @@ +// Code generated by sqlc. DO NOT EDIT. 
+// versions: +// sqlc v1.31.1 +// source: pr.sql + +package gen + +import ( + "context" + "time" +) + +const deletePR = `-- name: DeletePR :exec +DELETE FROM pr WHERE session_id = ? +` + +func (q *Queries) DeletePR(ctx context.Context, sessionID string) error { + _, err := q.db.ExecContext(ctx, deletePR, sessionID) + return err +} + +const deletePRChecks = `-- name: DeletePRChecks :exec +DELETE FROM pr_check WHERE session_id = ? +` + +func (q *Queries) DeletePRChecks(ctx context.Context, sessionID string) error { + _, err := q.db.ExecContext(ctx, deletePRChecks, sessionID) + return err +} + +const deletePRComments = `-- name: DeletePRComments :exec +DELETE FROM pr_comment WHERE session_id = ? +` + +func (q *Queries) DeletePRComments(ctx context.Context, sessionID string) error { + _, err := q.db.ExecContext(ctx, deletePRComments, sessionID) + return err +} + +const getPR = `-- name: GetPR :one +SELECT session_id, review_decision, mergeability, ci_state, ci_passed, ci_failed, ci_pending, ci_log_tail, last_fetched_at +FROM pr +WHERE session_id = ? +` + +func (q *Queries) GetPR(ctx context.Context, sessionID string) (Pr, error) { + row := q.db.QueryRowContext(ctx, getPR, sessionID) + var i Pr + err := row.Scan( + &i.SessionID, + &i.ReviewDecision, + &i.Mergeability, + &i.CiState, + &i.CiPassed, + &i.CiFailed, + &i.CiPending, + &i.CiLogTail, + &i.LastFetchedAt, + ) + return i, err +} + +const insertPRCheck = `-- name: InsertPRCheck :exec +INSERT INTO pr_check (session_id, name, status, url) VALUES (?, ?, ?, ?) 
+` + +type InsertPRCheckParams struct { + SessionID string + Name string + Status string + Url string +} + +func (q *Queries) InsertPRCheck(ctx context.Context, arg InsertPRCheckParams) error { + _, err := q.db.ExecContext(ctx, insertPRCheck, + arg.SessionID, + arg.Name, + arg.Status, + arg.Url, + ) + return err +} + +const insertPRComment = `-- name: InsertPRComment :exec +INSERT INTO pr_comment (session_id, comment_id, author, file, line, body, resolved, created_at) +VALUES (?, ?, ?, ?, ?, ?, ?, ?) +` + +type InsertPRCommentParams struct { + SessionID string + CommentID string + Author string + File string + Line int64 + Body string + Resolved int64 + CreatedAt time.Time +} + +func (q *Queries) InsertPRComment(ctx context.Context, arg InsertPRCommentParams) error { + _, err := q.db.ExecContext(ctx, insertPRComment, + arg.SessionID, + arg.CommentID, + arg.Author, + arg.File, + arg.Line, + arg.Body, + arg.Resolved, + arg.CreatedAt, + ) + return err +} + +const listPRChecks = `-- name: ListPRChecks :many +SELECT name, status, url FROM pr_check WHERE session_id = ? ORDER BY name +` + +type ListPRChecksRow struct { + Name string + Status string + Url string +} + +func (q *Queries) ListPRChecks(ctx context.Context, sessionID string) ([]ListPRChecksRow, error) { + rows, err := q.db.QueryContext(ctx, listPRChecks, sessionID) + if err != nil { + return nil, err + } + defer rows.Close() + items := []ListPRChecksRow{} + for rows.Next() { + var i ListPRChecksRow + if err := rows.Scan(&i.Name, &i.Status, &i.Url); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const listPRComments = `-- name: ListPRComments :many +SELECT comment_id, author, file, line, body, resolved, created_at +FROM pr_comment +WHERE session_id = ? 
+ORDER BY created_at, comment_id +` + +type ListPRCommentsRow struct { + CommentID string + Author string + File string + Line int64 + Body string + Resolved int64 + CreatedAt time.Time +} + +func (q *Queries) ListPRComments(ctx context.Context, sessionID string) ([]ListPRCommentsRow, error) { + rows, err := q.db.QueryContext(ctx, listPRComments, sessionID) + if err != nil { + return nil, err + } + defer rows.Close() + items := []ListPRCommentsRow{} + for rows.Next() { + var i ListPRCommentsRow + if err := rows.Scan( + &i.CommentID, + &i.Author, + &i.File, + &i.Line, + &i.Body, + &i.Resolved, + &i.CreatedAt, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const upsertPR = `-- name: UpsertPR :exec +INSERT INTO pr ( + session_id, review_decision, mergeability, ci_state, ci_passed, ci_failed, ci_pending, ci_log_tail, last_fetched_at +) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ON CONFLICT (session_id) DO UPDATE SET + review_decision = excluded.review_decision, + mergeability = excluded.mergeability, + ci_state = excluded.ci_state, + ci_passed = excluded.ci_passed, + ci_failed = excluded.ci_failed, + ci_pending = excluded.ci_pending, + ci_log_tail = excluded.ci_log_tail, + last_fetched_at = excluded.last_fetched_at +` + +type UpsertPRParams struct { + SessionID string + ReviewDecision string + Mergeability string + CiState string + CiPassed int64 + CiFailed int64 + CiPending int64 + CiLogTail string + LastFetchedAt time.Time +} + +func (q *Queries) UpsertPR(ctx context.Context, arg UpsertPRParams) error { + _, err := q.db.ExecContext(ctx, upsertPR, + arg.SessionID, + arg.ReviewDecision, + arg.Mergeability, + arg.CiState, + arg.CiPassed, + arg.CiFailed, + arg.CiPending, + arg.CiLogTail, + arg.LastFetchedAt, + ) + return err +} diff --git a/backend/internal/storage/sqlite/gen/pr_enrichment.sql.go b/backend/internal/storage/sqlite/gen/pr_enrichment.sql.go deleted file mode 100644 index c0643104..00000000 --- a/backend/internal/storage/sqlite/gen/pr_enrichment.sql.go +++ /dev/null @@ -1,76 +0,0 @@ -// Code generated by sqlc. DO NOT EDIT. -// versions: -// sqlc v1.31.1 -// source: pr_enrichment.sql - -package gen - -import ( - "context" - "time" -) - -const deletePREnrichment = `-- name: DeletePREnrichment :exec -DELETE FROM pr_enrichment WHERE session_id = ? -` - -func (q *Queries) DeletePREnrichment(ctx context.Context, sessionID string) error { - _, err := q.db.ExecContext(ctx, deletePREnrichment, sessionID) - return err -} - -const getPREnrichment = `-- name: GetPREnrichment :one -SELECT session_id, ci_summary, review_decision, mergeability, pending_comments, ci_log_tail, last_fetched_at -FROM pr_enrichment -WHERE session_id = ? 
-` - -func (q *Queries) GetPREnrichment(ctx context.Context, sessionID string) (PrEnrichment, error) { - row := q.db.QueryRowContext(ctx, getPREnrichment, sessionID) - var i PrEnrichment - err := row.Scan( - &i.SessionID, - &i.CiSummary, - &i.ReviewDecision, - &i.Mergeability, - &i.PendingComments, - &i.CiLogTail, - &i.LastFetchedAt, - ) - return i, err -} - -const upsertPREnrichment = `-- name: UpsertPREnrichment :exec -INSERT INTO pr_enrichment (session_id, ci_summary, review_decision, mergeability, pending_comments, ci_log_tail, last_fetched_at) -VALUES (?, ?, ?, ?, ?, ?, ?) -ON CONFLICT (session_id) DO UPDATE SET - ci_summary = excluded.ci_summary, - review_decision = excluded.review_decision, - mergeability = excluded.mergeability, - pending_comments = excluded.pending_comments, - ci_log_tail = excluded.ci_log_tail, - last_fetched_at = excluded.last_fetched_at -` - -type UpsertPREnrichmentParams struct { - SessionID string - CiSummary string - ReviewDecision string - Mergeability string - PendingComments string - CiLogTail string - LastFetchedAt time.Time -} - -func (q *Queries) UpsertPREnrichment(ctx context.Context, arg UpsertPREnrichmentParams) error { - _, err := q.db.ExecContext(ctx, upsertPREnrichment, - arg.SessionID, - arg.CiSummary, - arg.ReviewDecision, - arg.Mergeability, - arg.PendingComments, - arg.CiLogTail, - arg.LastFetchedAt, - ) - return err -} diff --git a/backend/internal/storage/sqlite/gen/querier.go b/backend/internal/storage/sqlite/gen/querier.go index 76dd1aab..83aa0c7e 100644 --- a/backend/internal/storage/sqlite/gen/querier.go +++ b/backend/internal/storage/sqlite/gen/querier.go @@ -10,25 +10,31 @@ import ( type Querier interface { ArchiveProject(ctx context.Context, arg ArchiveProjectParams) error - DeletePREnrichment(ctx context.Context, sessionID string) error + DeletePR(ctx context.Context, sessionID string) error + DeletePRChecks(ctx context.Context, sessionID string) error + DeletePRComments(ctx context.Context, sessionID 
string) error DeleteProject(ctx context.Context, id string) error DeleteReactionTracker(ctx context.Context, arg DeleteReactionTrackerParams) error DeleteSentOutboxBelow(ctx context.Context, changeLogSeq int64) (int64, error) DeleteSessionReactionTrackers(ctx context.Context, sessionID string) error GetConsumerOffset(ctx context.Context, consumer string) (int64, error) - GetMetadata(ctx context.Context, sessionID string) ([]GetMetadataRow, error) - GetPREnrichment(ctx context.Context, sessionID string) (PrEnrichment, error) + GetPR(ctx context.Context, sessionID string) (Pr, error) GetProject(ctx context.Context, id string) (Project, error) GetSession(ctx context.Context, id string) (Session, error) + GetSessionMetadata(ctx context.Context, sessionID string) (GetSessionMetadataRow, error) GetSessionRevision(ctx context.Context, id string) (int64, error) // Appends a canonical-write record and returns its monotonic seq so the same // transaction can thread it into the outbox row. InsertChangeLog(ctx context.Context, arg InsertChangeLogParams) (int64, error) InsertOutbox(ctx context.Context, arg InsertOutboxParams) error + InsertPRCheck(ctx context.Context, arg InsertPRCheckParams) error + InsertPRComment(ctx context.Context, arg InsertPRCommentParams) error // CAS insert: only succeeds for a brand-new id. Incoming revision must be 0; // the row is persisted at revision 1. InsertSession(ctx context.Context, arg InsertSessionParams) (int64, error) ListAllSessions(ctx context.Context) ([]Session, error) + ListPRChecks(ctx context.Context, sessionID string) ([]ListPRChecksRow, error) + ListPRComments(ctx context.Context, sessionID string) ([]ListPRCommentsRow, error) ListProjects(ctx context.Context) ([]Project, error) ListReactionTrackers(ctx context.Context) ([]ReactionTracker, error) ListSessionsByProject(ctx context.Context, projectID string) ([]Session, error) @@ -41,10 +47,13 @@ type Querier interface { // revision (@expected_revision). 
0 rows affected => revision mismatch. UpdateSessionCAS(ctx context.Context, arg UpdateSessionCASParams) (int64, error) UpsertConsumerOffset(ctx context.Context, arg UpsertConsumerOffsetParams) error - UpsertMetadata(ctx context.Context, arg UpsertMetadataParams) error - UpsertPREnrichment(ctx context.Context, arg UpsertPREnrichmentParams) error + UpsertPR(ctx context.Context, arg UpsertPRParams) error UpsertProject(ctx context.Context, arg UpsertProjectParams) error UpsertReactionTracker(ctx context.Context, arg UpsertReactionTrackerParams) error + // Merge semantics: an empty incoming column is "leave unchanged", so a partial + // patch (e.g. spawn writing only the runtime handle) never clobbers a value set + // earlier (e.g. the branch set at creation). Mirrors the old per-key map merge. + UpsertSessionMetadata(ctx context.Context, arg UpsertSessionMetadataParams) error } var _ Querier = (*Queries)(nil) diff --git a/backend/internal/storage/sqlite/migrations/0001_init.sql b/backend/internal/storage/sqlite/migrations/0001_init.sql index f343e16d..38224125 100644 --- a/backend/internal/storage/sqlite/migrations/0001_init.sql +++ b/backend/internal/storage/sqlite/migrations/0001_init.sql @@ -39,14 +39,21 @@ CREATE TABLE sessions ( CREATE INDEX idx_sessions_project ON sessions (project_id); --- session_metadata is the opaque key/value side-channel (branch, workspacePath, --- runtimeHandleId, runtimeName, agentSessionId, prompt). Written by --- PatchMetadata; never bumps revision and never emits a CDC event. +-- session_metadata is the 1:1 typed side-channel for a session's operational +-- handles and seed inputs — the fields the Session Manager and reaper need but +-- that are NOT part of the canonical lifecycle. One row per session, named +-- columns (not a free-form key/value bag), so the set of metadata a session can +-- carry is fixed by the schema. Written by PatchMetadata; never bumps revision +-- and never emits a CDC event. 
CREATE TABLE session_metadata ( - session_id TEXT NOT NULL REFERENCES sessions (id) ON DELETE CASCADE, - key TEXT NOT NULL, - value TEXT NOT NULL, - PRIMARY KEY (session_id, key) + session_id TEXT PRIMARY KEY REFERENCES sessions (id) ON DELETE CASCADE, + branch TEXT NOT NULL DEFAULT '', + workspace_path TEXT NOT NULL DEFAULT '', + runtime_handle_id TEXT NOT NULL DEFAULT '', + runtime_name TEXT NOT NULL DEFAULT '', + agent_session_id TEXT NOT NULL DEFAULT '', + prompt TEXT NOT NULL DEFAULT '', + updated_at TIMESTAMP NOT NULL ); -- change_log is the durable, ordered record of every canonical write. seq is the diff --git a/backend/internal/storage/sqlite/migrations/0002_pr_projects.sql b/backend/internal/storage/sqlite/migrations/0002_pr_projects.sql index 4421f0dd..da987ed5 100644 --- a/backend/internal/storage/sqlite/migrations/0002_pr_projects.sql +++ b/backend/internal/storage/sqlite/migrations/0002_pr_projects.sql @@ -26,25 +26,60 @@ CREATE TABLE projects ( archived_at TIMESTAMP ); --- pr_enrichment is the SCM observer's per-session cache of the rich PR facts that --- do NOT live in the canonical lifecycle (which keeps only pr_state/reason/number/ --- url). It is 1:1 with a session (a PR is always tied to a session by its branch), --- written by the SCM observer OFF the canonical CDC path (no revision bump, no --- change_log/outbox event), and cascades away with its session. -CREATE TABLE pr_enrichment ( +-- pr is the SCM observer's per-session cache of the rich PR facts that do NOT +-- live in the canonical lifecycle (which keeps only pr_state/reason/number/url). +-- 1:1 with a session (a PR is tied to a session by its branch), written by the +-- SCM observer OFF the canonical CDC path (no revision bump, no change_log/outbox +-- event), and cascades away with its session. 
Scalar facts are typed columns — +-- review_decision/mergeability/ci_state are CHECK-constrained enums and the CI +-- counts are integers, not opaque strings; the list facts (individual checks and +-- review comments) are normalized into pr_check / pr_comment. +CREATE TABLE pr ( session_id TEXT PRIMARY KEY REFERENCES sessions (id) ON DELETE CASCADE, - ci_summary TEXT NOT NULL DEFAULT '', - review_decision TEXT NOT NULL DEFAULT '', - mergeability TEXT NOT NULL DEFAULT '', - pending_comments TEXT NOT NULL DEFAULT '', + review_decision TEXT NOT NULL DEFAULT 'none' + CHECK (review_decision IN ('none', 'approved', 'changes_requested', 'review_required')), + mergeability TEXT NOT NULL DEFAULT 'unknown' + CHECK (mergeability IN ('unknown', 'mergeable', 'conflicting', 'blocked', 'unstable')), + ci_state TEXT NOT NULL DEFAULT 'unknown' + CHECK (ci_state IN ('unknown', 'pending', 'passing', 'failing')), + ci_passed INTEGER NOT NULL DEFAULT 0, + ci_failed INTEGER NOT NULL DEFAULT 0, + ci_pending INTEGER NOT NULL DEFAULT 0, ci_log_tail TEXT NOT NULL DEFAULT '', last_fetched_at TIMESTAMP NOT NULL ); +-- pr_check is one CI check belonging to a pr (the normalized form of the old +-- ci_summary string). It cascades from pr, so it cannot outlive its PR facts. +CREATE TABLE pr_check ( + session_id TEXT NOT NULL REFERENCES pr (session_id) ON DELETE CASCADE, + name TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'unknown' + CHECK (status IN ('unknown', 'queued', 'in_progress', 'passed', 'failed', 'skipped', 'cancelled')), + url TEXT NOT NULL DEFAULT '', + PRIMARY KEY (session_id, name) +); + +-- pr_comment is one review comment belonging to a pr, resolved or not (the +-- normalized form of the old pending_comments JSON-in-a-string). Cascades from pr. 
+CREATE TABLE pr_comment ( + session_id TEXT NOT NULL REFERENCES pr (session_id) ON DELETE CASCADE, + comment_id TEXT NOT NULL, + author TEXT NOT NULL DEFAULT '', + file TEXT NOT NULL DEFAULT '', + line INTEGER NOT NULL DEFAULT 0, + body TEXT NOT NULL DEFAULT '', + resolved INTEGER NOT NULL DEFAULT 0, + created_at TIMESTAMP NOT NULL, + PRIMARY KEY (session_id, comment_id) +); + -- +goose StatementEnd -- +goose Down -- +goose StatementBegin -DROP TABLE pr_enrichment; +DROP TABLE pr_comment; +DROP TABLE pr_check; +DROP TABLE pr; DROP TABLE projects; -- +goose StatementEnd diff --git a/backend/internal/storage/sqlite/pr_projects_test.go b/backend/internal/storage/sqlite/pr_projects_test.go index 6cdd20bc..58227b1f 100644 --- a/backend/internal/storage/sqlite/pr_projects_test.go +++ b/backend/internal/storage/sqlite/pr_projects_test.go @@ -2,6 +2,7 @@ package sqlite import ( "context" + "reflect" "testing" "time" @@ -88,41 +89,122 @@ func TestArchiveProjectHidesFromListButGetResolves(t *testing.T) { } } -func TestPREnrichmentUpsertGetDelete(t *testing.T) { +func TestPRUpsertGetDelete(t *testing.T) { s := newTestStore(t) ctx := context.Background() now := time.Now().UTC().Truncate(time.Second) - // pr_enrichment FKs sessions(id); seed the session first. + // pr FKs sessions(id); seed the session first. 
if err := s.Upsert(ctx, sampleRecord("s1"), ports.EventSessionCreated); err != nil { t.Fatalf("seed session: %v", err) } - if _, ok, err := s.GetPREnrichment(ctx, "s1"); err != nil || ok { + if _, ok, err := s.GetPR(ctx, "s1"); err != nil || ok { t.Fatalf("get missing: ok=%v err=%v", ok, err) } - e := PREnrichmentRow{ - SessionID: "s1", CISummary: "3 passing, 1 failing", ReviewDecision: "changes_requested", - Mergeability: "blocked", PendingComments: `[{"path":"a.go"}]`, CILogTail: "FAIL TestX", + pr := PRRow{ + SessionID: "s1", ReviewDecision: "changes_requested", Mergeability: "blocked", + CIState: "failing", CIPassed: 3, CIFailed: 1, CIPending: 0, CILogTail: "FAIL TestX", LastFetchedAt: now, } - if err := s.UpsertPREnrichment(ctx, e); err != nil { + if err := s.UpsertPR(ctx, pr); err != nil { t.Fatalf("upsert: %v", err) } - got, ok, err := s.GetPREnrichment(ctx, "s1") + got, ok, err := s.GetPR(ctx, "s1") if err != nil || !ok { t.Fatalf("get: ok=%v err=%v", ok, err) } - if got != e { - t.Fatalf("round-trip mismatch:\n got %+v\nwant %+v", got, e) + if got != pr { + t.Fatalf("round-trip mismatch:\n got %+v\nwant %+v", got, pr) } - if err := s.DeletePREnrichment(ctx, "s1"); err != nil { + if err := s.DeletePR(ctx, "s1"); err != nil { t.Fatalf("delete: %v", err) } - if _, ok, _ := s.GetPREnrichment(ctx, "s1"); ok { - t.Fatal("enrichment should be gone after delete") + if _, ok, _ := s.GetPR(ctx, "s1"); ok { + t.Fatal("pr should be gone after delete") + } +} + +func TestPRRejectsBadEnum(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + if err := s.Upsert(ctx, sampleRecord("s1"), ports.EventSessionCreated); err != nil { + t.Fatalf("seed session: %v", err) + } + // review_decision is a CHECK-constrained enum; an off-list value must fail. 
+ err := s.UpsertPR(ctx, PRRow{ + SessionID: "s1", ReviewDecision: "definitely_not_a_decision", + Mergeability: "unknown", CIState: "unknown", LastFetchedAt: time.Now().UTC(), + }) + if err == nil { + t.Fatal("expected CHECK constraint to reject an invalid review_decision") + } +} + +func TestPRChecksAndCommentsReplaceAndList(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + now := time.Now().UTC().Truncate(time.Second) + + if err := s.Upsert(ctx, sampleRecord("s1"), ports.EventSessionCreated); err != nil { + t.Fatalf("seed session: %v", err) + } + // pr_check / pr_comment FK pr(session_id); the pr row must exist first. + if err := s.UpsertPR(ctx, PRRow{ + SessionID: "s1", ReviewDecision: "review_required", Mergeability: "unknown", + CIState: "pending", LastFetchedAt: now, + }); err != nil { + t.Fatalf("upsert pr: %v", err) + } + + checks := []PRCheck{ + {Name: "build", Status: "passed", URL: "https://ci/build"}, + {Name: "test", Status: "failed", URL: "https://ci/test"}, + } + if err := s.ReplacePRChecks(ctx, "s1", checks); err != nil { + t.Fatalf("replace checks: %v", err) + } + gotChecks, err := s.ListPRChecks(ctx, "s1") + if err != nil { + t.Fatalf("list checks: %v", err) + } + if !reflect.DeepEqual(gotChecks, checks) { + t.Fatalf("checks = %+v, want %+v", gotChecks, checks) + } + // Replace is a set-replace, not a merge: a shorter set removes the rest. 
+ if err := s.ReplacePRChecks(ctx, "s1", []PRCheck{{Name: "build", Status: "passed"}}); err != nil { + t.Fatalf("replace checks 2: %v", err) + } + if gotChecks, _ = s.ListPRChecks(ctx, "s1"); len(gotChecks) != 1 { + t.Fatalf("after replace, checks = %+v, want 1", gotChecks) + } + + comments := []PRComment{ + {CommentID: "c1", Author: "alice", File: "a.go", Line: 10, Body: "nit", Resolved: false, CreatedAt: now}, + {CommentID: "c2", Author: "bob", File: "b.go", Line: 20, Body: "bug", Resolved: true, CreatedAt: now.Add(time.Second)}, + } + if err := s.ReplacePRComments(ctx, "s1", comments); err != nil { + t.Fatalf("replace comments: %v", err) + } + gotComments, err := s.ListPRComments(ctx, "s1") + if err != nil { + t.Fatalf("list comments: %v", err) + } + if !reflect.DeepEqual(gotComments, comments) { + t.Fatalf("comments = %+v, want %+v", gotComments, comments) + } + + // Deleting the pr cascades its checks and comments. + if err := s.DeletePR(ctx, "s1"); err != nil { + t.Fatalf("delete pr: %v", err) + } + if c, _ := s.ListPRChecks(ctx, "s1"); len(c) != 0 { + t.Fatalf("checks not cascaded: %+v", c) + } + if c, _ := s.ListPRComments(ctx, "s1"); len(c) != 0 { + t.Fatalf("comments not cascaded: %+v", c) } } diff --git a/backend/internal/storage/sqlite/pr_store.go b/backend/internal/storage/sqlite/pr_store.go index 70efb7ce..c7d436bd 100644 --- a/backend/internal/storage/sqlite/pr_store.go +++ b/backend/internal/storage/sqlite/pr_store.go @@ -10,57 +10,185 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" ) -// PREnrichmentRow is the SCM observer's cache of the rich PR facts that do not -// live in the canonical lifecycle (which keeps only pr_state/reason/number/url). -// It is 1:1 with a session and written OFF the canonical CDC path: upserting it -// never bumps revision and never emits a change_log/outbox event. pending_comments -// and ci_log_tail are opaque blobs the SCM observer serializes. 
-type PREnrichmentRow struct { - SessionID string - CISummary string - ReviewDecision string - Mergeability string - PendingComments string - CILogTail string - LastFetchedAt time.Time +// PRRow is the SCM observer's cache of the scalar PR facts that do not live in +// the canonical lifecycle (which keeps only pr_state/reason/number/url). It is +// 1:1 with a session and written OFF the canonical CDC path: upserting it never +// bumps revision and never emits a change_log/outbox event. The list facts +// (checks, comments) are separate rows — see PRCheck / PRComment. +type PRRow struct { + SessionID string + ReviewDecision string // none | approved | changes_requested | review_required + Mergeability string // unknown | mergeable | conflicting | blocked | unstable + CIState string // unknown | pending | passing | failing + CIPassed int64 + CIFailed int64 + CIPending int64 + CILogTail string + LastFetchedAt time.Time } -// UpsertPREnrichment inserts or replaces the cached PR facts for one session. -func (s *Store) UpsertPREnrichment(ctx context.Context, r PREnrichmentRow) error { - return s.q.UpsertPREnrichment(ctx, gen.UpsertPREnrichmentParams{ - SessionID: r.SessionID, - CiSummary: r.CISummary, - ReviewDecision: r.ReviewDecision, - Mergeability: r.Mergeability, - PendingComments: r.PendingComments, - CiLogTail: r.CILogTail, - LastFetchedAt: r.LastFetchedAt, +// PRCheck is one CI check belonging to a session's PR. +type PRCheck struct { + Name string + Status string // unknown | queued | in_progress | passed | failed | skipped | cancelled + URL string +} + +// PRComment is one review comment belonging to a session's PR. +type PRComment struct { + CommentID string + Author string + File string + Line int64 + Body string + Resolved bool + CreatedAt time.Time +} + +// UpsertPR inserts or replaces the scalar PR facts for one session. 
+func (s *Store) UpsertPR(ctx context.Context, r PRRow) error { + return s.q.UpsertPR(ctx, gen.UpsertPRParams{ + SessionID: r.SessionID, + ReviewDecision: r.ReviewDecision, + Mergeability: r.Mergeability, + CiState: r.CIState, + CiPassed: r.CIPassed, + CiFailed: r.CIFailed, + CiPending: r.CIPending, + CiLogTail: r.CILogTail, + LastFetchedAt: r.LastFetchedAt, }) } -// GetPREnrichment returns the cached PR facts for one session. ok is false when -// no row exists (the SCM observer has not yet fetched, or the session has no PR). -func (s *Store) GetPREnrichment(ctx context.Context, sessionID string) (PREnrichmentRow, bool, error) { - e, err := s.q.GetPREnrichment(ctx, sessionID) +// GetPR returns the scalar PR facts for one session. ok is false when no row +// exists (the SCM observer has not fetched yet, or the session has no PR). +func (s *Store) GetPR(ctx context.Context, sessionID string) (PRRow, bool, error) { + p, err := s.q.GetPR(ctx, sessionID) if errors.Is(err, sql.ErrNoRows) { - return PREnrichmentRow{}, false, nil + return PRRow{}, false, nil } if err != nil { - return PREnrichmentRow{}, false, fmt.Errorf("get pr enrichment: %w", err) + return PRRow{}, false, fmt.Errorf("get pr: %w", err) } - return PREnrichmentRow{ - SessionID: e.SessionID, - CISummary: e.CiSummary, - ReviewDecision: e.ReviewDecision, - Mergeability: e.Mergeability, - PendingComments: e.PendingComments, - CILogTail: e.CiLogTail, - LastFetchedAt: e.LastFetchedAt, + return PRRow{ + SessionID: p.SessionID, + ReviewDecision: p.ReviewDecision, + Mergeability: p.Mergeability, + CIState: p.CiState, + CIPassed: p.CiPassed, + CIFailed: p.CiFailed, + CIPending: p.CiPending, + CILogTail: p.CiLogTail, + LastFetchedAt: p.LastFetchedAt, }, true, nil } -// DeletePREnrichment drops the cached PR facts for one session. Normally -// unnecessary (the FK cascades on session delete), exposed for explicit eviction. 
-func (s *Store) DeletePREnrichment(ctx context.Context, sessionID string) error { - return s.q.DeletePREnrichment(ctx, sessionID) +// DeletePR drops the scalar PR facts for one session, cascading its checks and +// comments. Normally unnecessary (the chain cascades on session delete); exposed +// for explicit eviction. +func (s *Store) DeletePR(ctx context.Context, sessionID string) error { + return s.q.DeletePR(ctx, sessionID) +} + +// ReplacePRChecks atomically replaces the full set of CI checks for a session's +// PR — each SCM fetch reports the current set, so a replace (not a merge) keeps +// the table in sync (a check that disappeared upstream is removed). The PR row +// must already exist (pr_check FKs pr). +func (s *Store) ReplacePRChecks(ctx context.Context, sessionID string, checks []PRCheck) error { + return s.inTx(ctx, "replace pr checks", func(qtx *gen.Queries) error { + if err := qtx.DeletePRChecks(ctx, sessionID); err != nil { + return err + } + for _, c := range checks { + if err := qtx.InsertPRCheck(ctx, gen.InsertPRCheckParams{ + SessionID: sessionID, + Name: c.Name, + Status: c.Status, + Url: c.URL, + }); err != nil { + return fmt.Errorf("check %q: %w", c.Name, err) + } + } + return nil + }) +} + +// ListPRChecks returns a session's CI checks, ordered by name. +func (s *Store) ListPRChecks(ctx context.Context, sessionID string) ([]PRCheck, error) { + rows, err := s.q.ListPRChecks(ctx, sessionID) + if err != nil { + return nil, fmt.Errorf("list pr checks: %w", err) + } + out := make([]PRCheck, 0, len(rows)) + for _, r := range rows { + out = append(out, PRCheck{Name: r.Name, Status: r.Status, URL: r.Url}) + } + return out, nil +} + +// ReplacePRComments atomically replaces the full set of review comments for a +// session's PR (same replace-not-merge rationale as ReplacePRChecks). 
+func (s *Store) ReplacePRComments(ctx context.Context, sessionID string, comments []PRComment) error { + return s.inTx(ctx, "replace pr comments", func(qtx *gen.Queries) error { + if err := qtx.DeletePRComments(ctx, sessionID); err != nil { + return err + } + for _, c := range comments { + if err := qtx.InsertPRComment(ctx, gen.InsertPRCommentParams{ + SessionID: sessionID, + CommentID: c.CommentID, + Author: c.Author, + File: c.File, + Line: c.Line, + Body: c.Body, + Resolved: boolToInt(c.Resolved), + CreatedAt: c.CreatedAt, + }); err != nil { + return fmt.Errorf("comment %q: %w", c.CommentID, err) + } + } + return nil + }) +} + +// ListPRComments returns a session's review comments, ordered by creation time, then comment ID. +func (s *Store) ListPRComments(ctx context.Context, sessionID string) ([]PRComment, error) { + rows, err := s.q.ListPRComments(ctx, sessionID) + if err != nil { + return nil, fmt.Errorf("list pr comments: %w", err) + } + out := make([]PRComment, 0, len(rows)) + for _, r := range rows { + out = append(out, PRComment{ + CommentID: r.CommentID, + Author: r.Author, + File: r.File, + Line: r.Line, + Body: r.Body, + Resolved: r.Resolved != 0, + CreatedAt: r.CreatedAt, + }) + } + return out, nil +} + +// inTx runs fn inside a single transaction over the store's queries, rolling +// back on error. 
+func (s *Store) inTx(ctx context.Context, what string, fn func(*gen.Queries) error) error { + tx, err := s.db.BeginTx(ctx, nil) + if err != nil { + return fmt.Errorf("begin %s: %w", what, err) + } + defer tx.Rollback() + if err := fn(s.q.WithTx(tx)); err != nil { + return fmt.Errorf("%s: %w", what, err) + } + return tx.Commit() +} + +func boolToInt(b bool) int64 { + if b { + return 1 + } + return 0 } diff --git a/backend/internal/storage/sqlite/queries/metadata.sql b/backend/internal/storage/sqlite/queries/metadata.sql index 45079bb2..158552da 100644 --- a/backend/internal/storage/sqlite/queries/metadata.sql +++ b/backend/internal/storage/sqlite/queries/metadata.sql @@ -1,7 +1,20 @@ --- name: GetMetadata :many -SELECT key, value FROM session_metadata WHERE session_id = ?; +-- name: GetSessionMetadata :one +SELECT branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt +FROM session_metadata +WHERE session_id = ?; --- name: UpsertMetadata :exec -INSERT INTO session_metadata (session_id, key, value) -VALUES (?, ?, ?) -ON CONFLICT (session_id, key) DO UPDATE SET value = excluded.value; +-- name: UpsertSessionMetadata :exec +-- Merge semantics: an empty incoming column is "leave unchanged", so a partial +-- patch (e.g. spawn writing only the runtime handle) never clobbers a value set +-- earlier (e.g. the branch set at creation). Mirrors the old per-key map merge. +INSERT INTO session_metadata ( + session_id, branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt, updated_at +) VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
+ON CONFLICT (session_id) DO UPDATE SET + branch = CASE WHEN excluded.branch <> '' THEN excluded.branch ELSE session_metadata.branch END, + workspace_path = CASE WHEN excluded.workspace_path <> '' THEN excluded.workspace_path ELSE session_metadata.workspace_path END, + runtime_handle_id = CASE WHEN excluded.runtime_handle_id <> '' THEN excluded.runtime_handle_id ELSE session_metadata.runtime_handle_id END, + runtime_name = CASE WHEN excluded.runtime_name <> '' THEN excluded.runtime_name ELSE session_metadata.runtime_name END, + agent_session_id = CASE WHEN excluded.agent_session_id <> '' THEN excluded.agent_session_id ELSE session_metadata.agent_session_id END, + prompt = CASE WHEN excluded.prompt <> '' THEN excluded.prompt ELSE session_metadata.prompt END, + updated_at = excluded.updated_at; diff --git a/backend/internal/storage/sqlite/queries/pr.sql b/backend/internal/storage/sqlite/queries/pr.sql new file mode 100644 index 00000000..13c14a78 --- /dev/null +++ b/backend/internal/storage/sqlite/queries/pr.sql @@ -0,0 +1,43 @@ +-- name: UpsertPR :exec +INSERT INTO pr ( + session_id, review_decision, mergeability, ci_state, ci_passed, ci_failed, ci_pending, ci_log_tail, last_fetched_at +) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ON CONFLICT (session_id) DO UPDATE SET + review_decision = excluded.review_decision, + mergeability = excluded.mergeability, + ci_state = excluded.ci_state, + ci_passed = excluded.ci_passed, + ci_failed = excluded.ci_failed, + ci_pending = excluded.ci_pending, + ci_log_tail = excluded.ci_log_tail, + last_fetched_at = excluded.last_fetched_at; + +-- name: GetPR :one +SELECT session_id, review_decision, mergeability, ci_state, ci_passed, ci_failed, ci_pending, ci_log_tail, last_fetched_at +FROM pr +WHERE session_id = ?; + +-- name: DeletePR :exec +DELETE FROM pr WHERE session_id = ?; + +-- name: DeletePRChecks :exec +DELETE FROM pr_check WHERE session_id = ?; + +-- name: InsertPRCheck :exec +INSERT INTO pr_check (session_id, name, status, url) VALUES (?, ?, ?, ?); + +-- name: ListPRChecks :many +SELECT name, status, url FROM pr_check WHERE session_id = ? ORDER BY name; + +-- name: DeletePRComments :exec +DELETE FROM pr_comment WHERE session_id = ?; + +-- name: InsertPRComment :exec +INSERT INTO pr_comment (session_id, comment_id, author, file, line, body, resolved, created_at) +VALUES (?, ?, ?, ?, ?, ?, ?, ?); + +-- name: ListPRComments :many +SELECT comment_id, author, file, line, body, resolved, created_at +FROM pr_comment +WHERE session_id = ? +ORDER BY created_at, comment_id; diff --git a/backend/internal/storage/sqlite/queries/pr_enrichment.sql b/backend/internal/storage/sqlite/queries/pr_enrichment.sql deleted file mode 100644 index 7c2ac0a0..00000000 --- a/backend/internal/storage/sqlite/queries/pr_enrichment.sql +++ /dev/null @@ -1,18 +0,0 @@ --- name: UpsertPREnrichment :exec -INSERT INTO pr_enrichment (session_id, ci_summary, review_decision, mergeability, pending_comments, ci_log_tail, last_fetched_at) -VALUES (?, ?, ?, ?, ?, ?, ?) 
-ON CONFLICT (session_id) DO UPDATE SET - ci_summary = excluded.ci_summary, - review_decision = excluded.review_decision, - mergeability = excluded.mergeability, - pending_comments = excluded.pending_comments, - ci_log_tail = excluded.ci_log_tail, - last_fetched_at = excluded.last_fetched_at; - --- name: GetPREnrichment :one -SELECT session_id, ci_summary, review_decision, mergeability, pending_comments, ci_log_tail, last_fetched_at -FROM pr_enrichment -WHERE session_id = ?; - --- name: DeletePREnrichment :exec -DELETE FROM pr_enrichment WHERE session_id = ?; diff --git a/backend/internal/storage/sqlite/store.go b/backend/internal/storage/sqlite/store.go index bd61e73b..75b5474a 100644 --- a/backend/internal/storage/sqlite/store.go +++ b/backend/internal/storage/sqlite/store.go @@ -5,6 +5,7 @@ import ( "database/sql" "errors" "fmt" + "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" "github.com/aoagents/agent-orchestrator/backend/internal/ports" @@ -77,42 +78,41 @@ func (s *Store) ListAll(ctx context.Context) ([]domain.SessionRecord, error) { return out, nil } -// GetMetadata returns the opaque key/value metadata for a session. -func (s *Store) GetMetadata(ctx context.Context, id domain.SessionID) (map[string]string, error) { - rows, err := s.q.GetMetadata(ctx, string(id)) - if err != nil { - return nil, fmt.Errorf("get metadata %s: %w", id, err) - } - if len(rows) == 0 { - return nil, nil +// GetMetadata returns the typed metadata for a session, or the zero value if the +// session has no metadata row yet. 
+func (s *Store) GetMetadata(ctx context.Context, id domain.SessionID) (domain.SessionMetadata, error) { + row, err := s.q.GetSessionMetadata(ctx, string(id)) + if errors.Is(err, sql.ErrNoRows) { + return domain.SessionMetadata{}, nil } - m := make(map[string]string, len(rows)) - for _, r := range rows { - m[r.Key] = r.Value + if err != nil { + return domain.SessionMetadata{}, fmt.Errorf("get metadata %s: %w", id, err) } - return m, nil + return domain.SessionMetadata{ + Branch: row.Branch, + WorkspacePath: row.WorkspacePath, + RuntimeHandleID: row.RuntimeHandleID, + RuntimeName: row.RuntimeName, + AgentSessionID: row.AgentSessionID, + Prompt: row.Prompt, + }, nil } -// PatchMetadata merges kv into the session's metadata. It is outside the -// canonical write path: no revision bump, no CDC event. -func (s *Store) PatchMetadata(ctx context.Context, id domain.SessionID, kv map[string]string) error { - if len(kv) == 0 { +// PatchMetadata merges meta into the session's metadata. It is outside the +// canonical write path: no revision bump, no CDC event. Empty fields are left +// unchanged (see UpsertSessionMetadata), so a partial patch is non-destructive. 
+func (s *Store) PatchMetadata(ctx context.Context, id domain.SessionID, meta domain.SessionMetadata) error { + if meta.IsZero() { return nil } - tx, err := s.db.BeginTx(ctx, nil) - if err != nil { - return fmt.Errorf("begin patch metadata: %w", err) - } - defer tx.Rollback() - qtx := s.q.WithTx(tx) - for k, v := range kv { - if err := qtx.UpsertMetadata(ctx, gen.UpsertMetadataParams{ - SessionID: string(id), - Key: k, - Value: v, - }); err != nil { - return fmt.Errorf("patch metadata %s[%s]: %w", id, k, err) - } - } - return tx.Commit() + return s.q.UpsertSessionMetadata(ctx, gen.UpsertSessionMetadataParams{ + SessionID: string(id), + Branch: meta.Branch, + WorkspacePath: meta.WorkspacePath, + RuntimeHandleID: meta.RuntimeHandleID, + RuntimeName: meta.RuntimeName, + AgentSessionID: meta.AgentSessionID, + Prompt: meta.Prompt, + UpdatedAt: time.Now().UTC(), + }) } diff --git a/backend/internal/storage/sqlite/store_test.go b/backend/internal/storage/sqlite/store_test.go index 5457855d..711f8cf1 100644 --- a/backend/internal/storage/sqlite/store_test.go +++ b/backend/internal/storage/sqlite/store_test.go @@ -156,7 +156,7 @@ func TestGetListRoundTrip(t *testing.T) { if got.ID != "a" || got.Lifecycle.Revision != 1 || got.IssueID != "issue-1" { t.Fatalf("unexpected record: %+v", got) } - if got.Metadata != nil { + if !got.Metadata.IsZero() { t.Fatalf("Get must not reconstruct metadata, got %v", got.Metadata) } @@ -176,10 +176,11 @@ func TestMetadataSideChannel(t *testing.T) { t.Fatal(err) } - if err := s.PatchMetadata(ctx, "s1", map[string]string{"branch": "feat/x", "prompt": "do it"}); err != nil { + if err := s.PatchMetadata(ctx, "s1", domain.SessionMetadata{Branch: "feat/x", Prompt: "do it"}); err != nil { t.Fatalf("patch: %v", err) } - if err := s.PatchMetadata(ctx, "s1", map[string]string{"branch": "feat/y"}); err != nil { + // A partial patch (only Branch) must not clobber the earlier Prompt. 
+ if err := s.PatchMetadata(ctx, "s1", domain.SessionMetadata{Branch: "feat/y"}); err != nil { t.Fatalf("patch overwrite: %v", err) } @@ -187,8 +188,8 @@ func TestMetadataSideChannel(t *testing.T) { if err != nil { t.Fatal(err) } - if m["branch"] != "feat/y" || m["prompt"] != "do it" { - t.Fatalf("metadata = %v", m) + if m.Branch != "feat/y" || m.Prompt != "do it" { + t.Fatalf("metadata = %+v", m) } // Metadata writes must not bump revision (off the canonical path). lc, _, _ := s.Load(ctx, "s1") @@ -239,7 +240,7 @@ func TestLoadGetMissing(t *testing.T) { if _, ok, err := s.Get(ctx, "nope"); ok || err != nil { t.Fatalf("Get missing: ok=%v err=%v", ok, err) } - if m, err := s.GetMetadata(ctx, "nope"); err != nil || m != nil { + if m, err := s.GetMetadata(ctx, "nope"); err != nil || !m.IsZero() { t.Fatalf("GetMetadata missing: m=%v err=%v", m, err) } } diff --git a/backend/internal/storage/sqlite/upsert.go b/backend/internal/storage/sqlite/upsert.go index 40944005..64674516 100644 --- a/backend/internal/storage/sqlite/upsert.go +++ b/backend/internal/storage/sqlite/upsert.go @@ -82,14 +82,14 @@ func casPersist(ctx context.Context, q *gen.Queries, rec domain.SessionRecord) ( } // appendOutbox writes the change_log entry and threads its seq into a fresh -// outbox row. The change_log payload is the persisted record at its new -// revision (metadata excluded — it is not on the canonical path). +// outbox row. The change_log payload is the persisted record at its new revision +// (metadata is excluded by SessionRecord's json:"-" tag — it is not on the +// canonical path). 
func appendOutbox(ctx context.Context, q *gen.Queries, rec domain.SessionRecord, newRevision int, eventType ports.EventType) error { now := time.Now().UTC() payload := rec payload.Lifecycle.Revision = newRevision payload.Lifecycle.Version = domain.LifecycleVersion - payload.Metadata = nil blob, err := json.Marshal(payload) if err != nil { return fmt.Errorf("marshal change_log payload %s: %w", rec.ID, err) diff --git a/backend/main_test.go b/backend/main_test.go index 1a8d60c3..c8f32541 100644 --- a/backend/main_test.go +++ b/backend/main_test.go @@ -127,7 +127,7 @@ func TestSnapshotSourceRebuildsState(t *testing.T) { if rec.Lifecycle.Revision != 1 { t.Errorf("payload revision = %d, want 1", rec.Lifecycle.Revision) } - if rec.Metadata != nil { + if !rec.Metadata.IsZero() { t.Errorf("snapshot payload must exclude metadata, got %v", rec.Metadata) } } From b0e4fffa62f92957bc40c330c1c0a105c7c597c4 Mon Sep 17 00:00:00 2001 From: prateek Date: Sun, 31 May 2026 00:39:19 +0530 Subject: [PATCH 04/10] test(cdc): add full-stack E2E tests through the real store + snapshot source The cdc integration test covers the synchronous Drain/Poll happy path but (1) resyncs from a fake snapshot and (2) never runs the publisher/consumer as the concurrent goroutines the daemon actually uses. Add two E2E tests in the composition-root package that wire the real sqlite.Store, outboxAdapter, Publisher, JSONL log, Consumer, Broadcaster and the REAL snapshotSource (store.ListAll): - RealSnapshotResyncThroughRotation: forces a rotation and asserts the consumer rebuilds from the sessions table, delivering the persisted record payload, with the offset landing at the change_log head. - ConcurrentPublisherConsumer: runs both as goroutines on their tickers and asserts every write is delivered exactly once, in order, offset at head (also exercises the broadcaster hand-off under -race). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- backend/cdc_e2e_test.go | 194 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 backend/cdc_e2e_test.go diff --git a/backend/cdc_e2e_test.go b/backend/cdc_e2e_test.go new file mode 100644 index 00000000..29b04534 --- /dev/null +++ b/backend/cdc_e2e_test.go @@ -0,0 +1,194 @@ +package main + +import ( + "context" + "encoding/json" + "path/filepath" + "sync" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/cdc" + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +// These are full-stack end-to-end tests of the write+delivery path wired exactly +// as main.go wires it: real sqlite.Store -> real outboxAdapter -> real +// cdc.Publisher -> real JSONL log -> real cdc.Consumer -> real cdc.Broadcaster, +// using the REAL snapshotSource (store.ListAll) rather than a fake. The cdc +// package's own integration test covers the synchronous Drain/Poll happy path +// with a fake snapshot; these cover the two gaps it leaves: a rotation that +// resyncs from the actual sessions table, and the concurrent goroutine model +// the daemon actually runs. + +// TestE2E_RealSnapshotResyncThroughRotation forces a log rotation and asserts the +// consumer rebuilds state from the REAL sessions-table snapshot (not the +// rotated-away bytes), delivering the persisted record's payload. 
+func TestE2E_RealSnapshotResyncThroughRotation(t *testing.T) { + ctx := context.Background() + store := newWiringStore(t) + dir := t.TempDir() + log, err := cdc.OpenLog(dir, 80) // tiny cap: the second write forces a rotation + if err != nil { + t.Fatal(err) + } + defer log.Close() + + var mu sync.Mutex + var got []cdc.Event + bc := cdc.NewBroadcaster() + bc.Subscribe(func(e cdc.Event) { mu.Lock(); got = append(got, e); mu.Unlock() }) + + con := cdc.NewConsumer("fe", filepath.Join(dir, cdc.LogFileName), store, bc, + cdc.ConsumerConfig{Snapshot: snapshotSource{store: store}}) + if _, err := con.Start(ctx); err != nil { + t.Fatal(err) + } + pub := cdc.NewPublisher(outboxAdapter{store: store}, log, cdc.PublisherConfig{}) + + // First canonical write: drained and consumed live from the original file. + if err := store.Upsert(ctx, wiringRec("s1"), ports.EventSessionCreated); err != nil { + t.Fatal(err) + } + if err := pub.Drain(ctx); err != nil { + t.Fatal(err) + } + if err := con.Poll(ctx); err != nil { + t.Fatal(err) + } + mu.Lock() + before := len(got) + mu.Unlock() + + // Second write pushes the log past its cap -> rotation. The consumer sees a + // fresh file and must resync from the sessions table. + r := wiringRec("s1") + r.Lifecycle.Revision = 1 + if err := store.Upsert(ctx, r, ports.EventSessionStateChanged); err != nil { + t.Fatal(err) + } + if err := pub.Drain(ctx); err != nil { + t.Fatal(err) + } + if err := con.Poll(ctx); err != nil { + t.Fatal(err) + } + + mu.Lock() + defer mu.Unlock() + if len(got) <= before { + t.Fatalf("resync delivered nothing after rotation (got %d, before %d)", len(got), before) + } + // A real session_snapshot for s1 must have been delivered, carrying the full + // record persisted in the sessions table. 
+ var snap *cdc.Event + for i := range got { + if got[i].EventType == "session_snapshot" && got[i].SessionID == "s1" { + snap = &got[i] + } + } + if snap == nil { + t.Fatalf("no real session_snapshot delivered after rotation; got %+v", got) + } + var rec domain.SessionRecord + if err := json.Unmarshal([]byte(snap.Payload), &rec); err != nil { + t.Fatalf("snapshot payload not a SessionRecord: %v", err) + } + if rec.ID != "s1" || rec.Lifecycle.Session.State != domain.SessionWorking { + t.Fatalf("snapshot payload mismatch: %+v", rec) + } + // The consumer's durable offset advanced to the change_log head. + off, err := store.GetOffset(ctx, "fe") + if err != nil { + t.Fatal(err) + } + maxSeq, err := store.MaxChangeLogSeq(ctx) + if err != nil { + t.Fatal(err) + } + if off != maxSeq { + t.Fatalf("offset = %d, want change_log head %d", off, maxSeq) + } +} + +// TestE2E_ConcurrentPublisherConsumer runs the publisher and consumer as the +// daemon runs them — independent goroutines on their own tickers — and asserts +// every canonical write is delivered exactly once, in order, with the offset +// landing at the head. Run under -race this also guards the broadcaster/consumer +// hand-off. 
+func TestE2E_ConcurrentPublisherConsumer(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + store := newWiringStore(t) + dir := t.TempDir() + log, err := cdc.OpenLog(dir, 0) + if err != nil { + t.Fatal(err) + } + defer log.Close() + + var mu sync.Mutex + var got []cdc.Event + bc := cdc.NewBroadcaster() + bc.Subscribe(func(e cdc.Event) { mu.Lock(); got = append(got, e); mu.Unlock() }) + + pub := cdc.NewPublisher(outboxAdapter{store: store}, log, cdc.PublisherConfig{}) + con := cdc.NewConsumer("fe", filepath.Join(dir, cdc.LogFileName), store, bc, cdc.ConsumerConfig{}) + + pubDone := pub.Start(ctx) + conDone, err := con.Start(ctx) + if err != nil { + t.Fatal(err) + } + + const n = 5 + for i := 0; i < n; i++ { + r := wiringRec("s1") + r.Lifecycle.Revision = i + evt := ports.EventSessionStateChanged + if i == 0 { + evt = ports.EventSessionCreated + } + if err := store.Upsert(ctx, r, evt); err != nil { + t.Fatalf("upsert %d: %v", i, err) + } + } + + // Bounded wait for the goroutine pipeline to deliver everything. 
+ deadline := time.Now().Add(5 * time.Second) + for { + mu.Lock() + count := len(got) + mu.Unlock() + if count >= n { + break + } + if time.Now().After(deadline) { + t.Fatalf("timed out: delivered %d/%d events", count, n) + } + time.Sleep(20 * time.Millisecond) + } + + cancel() + <-pubDone + <-conDone + + mu.Lock() + defer mu.Unlock() + if len(got) != n { + t.Fatalf("delivered %d events, want %d", len(got), n) + } + for i, e := range got { + if e.Seq != int64(i+1) { + t.Fatalf("event %d has seq %d, want %d (out-of-order or duplicate)", i, e.Seq, i+1) + } + } + off, err := store.GetOffset(context.Background(), "fe") + if err != nil { + t.Fatal(err) + } + if off != n { + t.Fatalf("offset = %d, want %d", off, n) + } +} From ba472128021848d3cb223d153b079ca4eed67441 Mon Sep 17 00:00:00 2001 From: prateek Date: Sun, 31 May 2026 00:51:18 +0530 Subject: [PATCH 05/10] perf(storage): allow concurrent reads; serialize writes via a mutex SetMaxOpenConns(1) forced every read (List/Get/GetPR/...) to queue behind the single connection, so the dashboard's reads contended with the LCM's writes. WAL already supports many concurrent readers, so raise the pool to 8 and instead serialize *writes* with a Store.writeMu. That keeps WAL's single-writer rule and the revision-CAS read-then-write atomic regardless of pool size, while reads now run in parallel across the pool. Every write method takes writeMu (Upsert, PatchMetadata, UpsertPR/DeletePR, the pr_check/pr_comment Replace* via inTx, the CDC outbox/offset writes, project writes, reaction-tracker writes); reads take nothing. Added TestConcurrentReadsAndWrites (16 writers + 16 readers) which passes under -race. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- backend/internal/storage/sqlite/cdc_store.go | 8 +++ backend/internal/storage/sqlite/db.go | 22 +++++--- backend/internal/storage/sqlite/pr_store.go | 11 +++- .../internal/storage/sqlite/project_store.go | 6 +++ .../internal/storage/sqlite/reaction_store.go | 6 +++ backend/internal/storage/sqlite/store.go | 17 +++++-- backend/internal/storage/sqlite/store_test.go | 51 +++++++++++++++++++ backend/internal/storage/sqlite/upsert.go | 2 + 8 files changed, 109 insertions(+), 14 deletions(-) diff --git a/backend/internal/storage/sqlite/cdc_store.go b/backend/internal/storage/sqlite/cdc_store.go index 3386f988..8f92eda7 100644 --- a/backend/internal/storage/sqlite/cdc_store.go +++ b/backend/internal/storage/sqlite/cdc_store.go @@ -45,6 +45,8 @@ func (s *Store) ListUnsent(ctx context.Context, limit int) ([]OutboxEvent, error // MarkSent flags an outbox row delivered. func (s *Store) MarkSent(ctx context.Context, outboxID int64, at time.Time) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() return s.q.MarkOutboxSent(ctx, gen.MarkOutboxSentParams{ SentAt: sql.NullTime{Time: at, Valid: true}, ID: outboxID, @@ -53,6 +55,8 @@ func (s *Store) MarkSent(ctx context.Context, outboxID int64, at time.Time) erro // MarkFailed bumps the attempt count and records the last error for an outbox row. func (s *Store) MarkFailed(ctx context.Context, outboxID int64, errMsg string) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() return s.q.MarkOutboxFailed(ctx, gen.MarkOutboxFailedParams{LastError: errMsg, ID: outboxID}) } @@ -70,6 +74,8 @@ func (s *Store) GetOffset(ctx context.Context, consumer string) (int64, error) { // SetOffset durably records a consumer's acknowledged seq. 
func (s *Store) SetOffset(ctx context.Context, consumer string, seq int64, at time.Time) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() return s.q.UpsertConsumerOffset(ctx, gen.UpsertConsumerOffsetParams{ Consumer: consumer, LastSeq: seq, @@ -100,5 +106,7 @@ func (s *Store) MinConsumerOffset(ctx context.Context) (int64, error) { // DeleteSentOutboxBelow removes delivered outbox rows whose seq is below the // watermark, returning the number removed. func (s *Store) DeleteSentOutboxBelow(ctx context.Context, seq int64) (int64, error) { + s.writeMu.Lock() + defer s.writeMu.Unlock() return s.q.DeleteSentOutboxBelow(ctx, seq) } diff --git a/backend/internal/storage/sqlite/db.go b/backend/internal/storage/sqlite/db.go index 78eb3ae9..0a2555e4 100644 --- a/backend/internal/storage/sqlite/db.go +++ b/backend/internal/storage/sqlite/db.go @@ -18,17 +18,25 @@ import ( //go:embed migrations/*.sql var migrationsFS embed.FS -// pragmas are applied on every connection open. WAL + NORMAL gives concurrent -// reads alongside the single writer; busy_timeout absorbs brief writer -// contention; foreign_keys enforces the session_metadata cascade. +// pragmas are applied on every connection open. WAL + NORMAL lets readers run +// concurrently with the writer; busy_timeout absorbs brief writer contention; +// foreign_keys enforces the cascades. const pragmas = "?_pragma=journal_mode(WAL)" + "&_pragma=busy_timeout(5000)" + "&_pragma=foreign_keys(ON)" + "&_pragma=synchronous(NORMAL)" +// maxConnections caps the pool. WAL allows many concurrent readers, so reads +// (List/Get/GetPR/...) scale across the pool instead of queuing behind a single +// connection. Writes do NOT rely on the pool for serialization — the Store funnels +// every write through its writeMu (see store.go), which keeps WAL's single-writer +// rule and the revision-CAS read-then-write atomic regardless of pool size. 
+const maxConnections = 8 + // Open opens (creating if absent) the SQLite database under dataDir, applies the // connection pragmas, and runs all goose migrations up. The returned *sql.DB is -// safe for the single-writer / many-reader workload the LCM and readers impose. +// sized for the many-reader / serialized-single-writer workload the LCM and +// readers impose. func Open(dataDir string) (*sql.DB, error) { if err := os.MkdirAll(dataDir, 0o755); err != nil { return nil, fmt.Errorf("create data dir: %w", err) @@ -38,10 +46,8 @@ func Open(dataDir string) (*sql.DB, error) { if err != nil { return nil, fmt.Errorf("open sqlite: %w", err) } - // Single writer: serialize all access through one connection so WAL's - // single-writer rule is never violated by the pool handing out a second - // writable conn mid-transaction. - db.SetMaxOpenConns(1) + db.SetMaxOpenConns(maxConnections) + db.SetMaxIdleConns(maxConnections) // keep reader conns warm; avoid open/close churn if err := migrate(db); err != nil { db.Close() diff --git a/backend/internal/storage/sqlite/pr_store.go b/backend/internal/storage/sqlite/pr_store.go index c7d436bd..1eca08f8 100644 --- a/backend/internal/storage/sqlite/pr_store.go +++ b/backend/internal/storage/sqlite/pr_store.go @@ -47,6 +47,8 @@ type PRComment struct { // UpsertPR inserts or replaces the scalar PR facts for one session. func (s *Store) UpsertPR(ctx context.Context, r PRRow) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() return s.q.UpsertPR(ctx, gen.UpsertPRParams{ SessionID: r.SessionID, ReviewDecision: r.ReviewDecision, @@ -87,6 +89,8 @@ func (s *Store) GetPR(ctx context.Context, sessionID string) (PRRow, bool, error // comments. Normally unnecessary (the chain cascades on session delete); exposed // for explicit eviction. 
func (s *Store) DeletePR(ctx context.Context, sessionID string) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() return s.q.DeletePR(ctx, sessionID) } @@ -172,9 +176,12 @@ func (s *Store) ListPRComments(ctx context.Context, sessionID string) ([]PRComme return out, nil } -// inTx runs fn inside a single transaction over the store's queries, rolling -// back on error. +// inTx runs fn inside a single write transaction over the store's queries, +// rolling back on error. It holds writeMu for the duration, so callers must not +// already hold it. func (s *Store) inTx(ctx context.Context, what string, fn func(*gen.Queries) error) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() tx, err := s.db.BeginTx(ctx, nil) if err != nil { return fmt.Errorf("begin %s: %w", what, err) diff --git a/backend/internal/storage/sqlite/project_store.go b/backend/internal/storage/sqlite/project_store.go index fb75e18a..4837cafc 100644 --- a/backend/internal/storage/sqlite/project_store.go +++ b/backend/internal/storage/sqlite/project_store.go @@ -34,6 +34,8 @@ type ProjectRow struct { // UpsertProject inserts or updates one registered project. func (s *Store) UpsertProject(ctx context.Context, r ProjectRow) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() return s.q.UpsertProject(ctx, gen.UpsertProjectParams{ ID: r.ID, Path: r.Path, @@ -53,6 +55,8 @@ func (s *Store) UpsertProject(ctx context.Context, r ProjectRow) error { // ArchiveProject soft-deletes one project, keeping the row so a session's // project_id still resolves. Active-only reads (ListProjects) then hide it. func (s *Store) ArchiveProject(ctx context.Context, id string, t time.Time) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() return s.q.ArchiveProject(ctx, gen.ArchiveProjectParams{ ArchivedAt: nullTime(t), ID: id, @@ -86,6 +90,8 @@ func (s *Store) ListProjects(ctx context.Context) ([]ProjectRow, error) { // DeleteProject removes one project by id. 
func (s *Store) DeleteProject(ctx context.Context, id string) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() return s.q.DeleteProject(ctx, id) } diff --git a/backend/internal/storage/sqlite/reaction_store.go b/backend/internal/storage/sqlite/reaction_store.go index 819d9716..c703a21b 100644 --- a/backend/internal/storage/sqlite/reaction_store.go +++ b/backend/internal/storage/sqlite/reaction_store.go @@ -48,6 +48,8 @@ func (s *Store) ListReactionTrackers(ctx context.Context) ([]ReactionTrackerRow, // SaveReactionTracker durably persists one escalation budget (insert or update). func (s *Store) SaveReactionTracker(ctx context.Context, r ReactionTrackerRow) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() escalated := int64(0) if r.Escalated { escalated = 1 @@ -68,6 +70,8 @@ func (s *Store) SaveReactionTracker(ctx context.Context, r ReactionTrackerRow) e // DeleteReactionTracker drops one escalation budget. func (s *Store) DeleteReactionTracker(ctx context.Context, sessionID, reactionKey string) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() return s.q.DeleteReactionTracker(ctx, gen.DeleteReactionTrackerParams{ SessionID: sessionID, ReactionKey: reactionKey, @@ -76,5 +80,7 @@ func (s *Store) DeleteReactionTracker(ctx context.Context, sessionID, reactionKe // DeleteSessionReactionTrackers drops every escalation budget for a session. 
func (s *Store) DeleteSessionReactionTrackers(ctx context.Context, sessionID string) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() return s.q.DeleteSessionReactionTrackers(ctx, sessionID) } diff --git a/backend/internal/storage/sqlite/store.go b/backend/internal/storage/sqlite/store.go index 75b5474a..2effeaee 100644 --- a/backend/internal/storage/sqlite/store.go +++ b/backend/internal/storage/sqlite/store.go @@ -5,6 +5,7 @@ import ( "database/sql" "errors" "fmt" + "sync" "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" @@ -12,11 +13,17 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" ) -// Store is the SQLite-backed ports.LifecycleStore. The LCM is its sole logical -// writer (via Upsert); readers (Session Manager, reaper) use Load/Get/List. +// Store is the SQLite-backed ports.LifecycleStore. Reads (Load/Get/List/...) run +// concurrently across the connection pool; every write is funnelled through +// writeMu so there is exactly one writer at a time. That single-writer guarantee +// is load-bearing: it keeps WAL's single-writer rule and makes the revision-CAS +// (read-then-write in Upsert) atomic without depending on the pool size. Hold +// writeMu only around writes — never around a read — and never call one +// write method from inside another (the mutex is not reentrant). 
type Store struct { - db *sql.DB - q *gen.Queries + db *sql.DB + q *gen.Queries + writeMu sync.Mutex } var _ ports.LifecycleStore = (*Store)(nil) @@ -105,6 +112,8 @@ func (s *Store) PatchMetadata(ctx context.Context, id domain.SessionID, meta dom if meta.IsZero() { return nil } + s.writeMu.Lock() + defer s.writeMu.Unlock() return s.q.UpsertSessionMetadata(ctx, gen.UpsertSessionMetadataParams{ SessionID: string(id), Branch: meta.Branch, diff --git a/backend/internal/storage/sqlite/store_test.go b/backend/internal/storage/sqlite/store_test.go index 711f8cf1..a197f3af 100644 --- a/backend/internal/storage/sqlite/store_test.go +++ b/backend/internal/storage/sqlite/store_test.go @@ -2,7 +2,9 @@ package sqlite import ( "context" + "fmt" "strings" + "sync" "testing" "time" @@ -255,3 +257,52 @@ func assertOutboxCount(t *testing.T, s *Store, ctx context.Context, want int) { t.Fatalf("outbox count = %d, want %d", len(rows), want) } } + +// TestConcurrentReadsAndWrites exercises the read-pool + write-mutex model: +// many writers (each its own session) run alongside many readers hammering +// ListAll. Reads must not be serialized behind writes, writes must not corrupt +// or error under the revision-CAS, and the final state must be exact. Run under +// -race this also guards the writeMu discipline. 
+func TestConcurrentReadsAndWrites(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + const n = 16 + + var wg sync.WaitGroup + errc := make(chan error, n*2) + + for i := 0; i < n; i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + if err := s.Upsert(ctx, sampleRecord(fmt.Sprintf("s%02d", i)), ports.EventSessionCreated); err != nil { + errc <- err + } + }(i) + } + for i := 0; i < n; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for j := 0; j < 25; j++ { + if _, err := s.ListAll(ctx); err != nil { + errc <- err + return + } + } + }() + } + wg.Wait() + close(errc) + for err := range errc { + t.Fatalf("concurrent op error: %v", err) + } + + got, err := s.ListAll(ctx) + if err != nil { + t.Fatal(err) + } + if len(got) != n { + t.Fatalf("after %d concurrent inserts, ListAll returned %d", n, len(got)) + } +} diff --git a/backend/internal/storage/sqlite/upsert.go b/backend/internal/storage/sqlite/upsert.go index 64674516..f8ae4093 100644 --- a/backend/internal/storage/sqlite/upsert.go +++ b/backend/internal/storage/sqlite/upsert.go @@ -24,6 +24,8 @@ import ( // stored+1. // - insert: rec.Lifecycle.Revision must be 0, persisted as 1. func (s *Store) Upsert(ctx context.Context, rec domain.SessionRecord, eventType ports.EventType) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() tx, err := s.db.BeginTx(ctx, nil) if err != nil { return fmt.Errorf("begin upsert: %w", err) From e5c4fd6ffde69cdede0a0f7395d8e94128de82d3 Mon Sep 17 00:00:00 2001 From: prateek Date: Sun, 31 May 2026 05:04:31 +0530 Subject: [PATCH 06/10] feat(storage,cdc): minimal 6-table schema + trigger-driven CDC (storage layer) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reworks the storage + CDC layer to the simplified design agreed in review: Schema (one clean migration, 0001): projects, sessions, pr, pr_checks, pr_comment, change_log. 
sessions.id is a single string key "{project}-{num}" (mer-1); operational metadata folded into sessions; is_alive replaces the runtime axis; no revision (the per-session write mutex serializes, change_log.seq orders). pr keyed by URL (1 session : many PRs). pr_checks is CI run history (one row per check per commit) — the CI-fix-loop brake is a LIMIT 3 query, no counter stored. change_log carries a required project_id FK + nullable session_id. CDC is DB-native: AFTER INSERT/UPDATE triggers on sessions/pr/pr_checks append to change_log atomically with the change (json_object payloads). The old durable outbox/JSONL/janitor pipeline is gone; the cdc package is now a Poller that reads change_log and fans events out through the in-memory Broadcaster (hardened with recover()). Clients catch up via the log from their own offset (SSE Last-Event-ID). Storage uses a single writer connection + a reader pool (read-your-writes for the triggers' subqueries; concurrent reads). sqlc-generated typed queries. Tests (-race): CRUD, per-project id assignment, the loop-brake query, concurrent creates, triggers populating change_log; CDC end-to-end through the real store, concurrent goroutine delivery, broadcaster panic-isolation. NOTE: scoped to storage + CDC. The lifecycle-engine consumers (decide, lifecycle, session, reaper, main wiring) still reference the old domain axes and need a follow-up integration pass to compile against the new model. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- backend/internal/cdc/broadcast.go | 40 +- backend/internal/cdc/cdc_integration_test.go | 256 -------- backend/internal/cdc/cdc_test.go | 192 ++++++ backend/internal/cdc/consumer.go | 221 ------- backend/internal/cdc/event.go | 60 +- backend/internal/cdc/janitor.go | 84 --- backend/internal/cdc/jsonl.go | 109 ---- backend/internal/cdc/poller.go | 123 ++++ backend/internal/cdc/publisher.go | 115 ---- backend/internal/domain/decide/decide.go | 226 ++----- backend/internal/domain/decide/decide_test.go | 602 +++--------------- backend/internal/domain/decide/types.go | 68 +- backend/internal/domain/lifecycle.go | 174 +++-- backend/internal/domain/status.go | 76 ++- backend/internal/domain/status_test.go | 145 ++--- backend/internal/storage/sqlite/cdc_store.go | 112 ---- .../storage/sqlite/changelog_store.go | 89 +++ backend/internal/storage/sqlite/db.go | 59 +- .../internal/storage/sqlite/gen/cdc.sql.go | 199 ------ .../storage/sqlite/gen/changelog.sql.go | 102 +++ .../storage/sqlite/gen/metadata.sql.go | 82 --- backend/internal/storage/sqlite/gen/models.go | 92 +-- backend/internal/storage/sqlite/gen/pr.sql.go | 209 ++---- .../storage/sqlite/gen/pr_checks.sql.go | 119 ++++ .../storage/sqlite/gen/pr_comment.sql.go | 89 +++ .../storage/sqlite/gen/projects.sql.go | 55 +- .../internal/storage/sqlite/gen/querier.go | 53 +- .../storage/sqlite/gen/reactions.sql.go | 100 --- .../storage/sqlite/gen/sessions.sql.go | 262 ++++---- backend/internal/storage/sqlite/mapping.go | 158 +++-- .../storage/sqlite/migrations/0001_init.sql | 273 +++++--- .../sqlite/migrations/0002_pr_projects.sql | 85 --- .../storage/sqlite/pr_projects_test.go | 210 ------ backend/internal/storage/sqlite/pr_store.go | 271 ++++---- .../internal/storage/sqlite/project_store.go | 72 +-- .../internal/storage/sqlite/queries/cdc.sql | 42 -- .../storage/sqlite/queries/changelog.sql | 10 + .../storage/sqlite/queries/metadata.sql | 20 - 
.../internal/storage/sqlite/queries/pr.sql | 51 +- .../storage/sqlite/queries/pr_checks.sql | 15 + .../storage/sqlite/queries/pr_comment.sql | 12 + .../storage/sqlite/queries/projects.sql | 25 +- .../storage/sqlite/queries/reactions.sql | 18 - .../storage/sqlite/queries/sessions.sql | 70 +- .../internal/storage/sqlite/reaction_store.go | 86 --- backend/internal/storage/sqlite/spike_test.go | 92 --- backend/internal/storage/sqlite/store.go | 163 ++--- backend/internal/storage/sqlite/store_test.go | 406 ++++++------ backend/internal/storage/sqlite/upsert.go | 115 ---- 49 files changed, 2169 insertions(+), 4138 deletions(-) delete mode 100644 backend/internal/cdc/cdc_integration_test.go create mode 100644 backend/internal/cdc/cdc_test.go delete mode 100644 backend/internal/cdc/consumer.go delete mode 100644 backend/internal/cdc/janitor.go delete mode 100644 backend/internal/cdc/jsonl.go create mode 100644 backend/internal/cdc/poller.go delete mode 100644 backend/internal/cdc/publisher.go delete mode 100644 backend/internal/storage/sqlite/cdc_store.go create mode 100644 backend/internal/storage/sqlite/changelog_store.go delete mode 100644 backend/internal/storage/sqlite/gen/cdc.sql.go create mode 100644 backend/internal/storage/sqlite/gen/changelog.sql.go delete mode 100644 backend/internal/storage/sqlite/gen/metadata.sql.go create mode 100644 backend/internal/storage/sqlite/gen/pr_checks.sql.go create mode 100644 backend/internal/storage/sqlite/gen/pr_comment.sql.go delete mode 100644 backend/internal/storage/sqlite/gen/reactions.sql.go delete mode 100644 backend/internal/storage/sqlite/migrations/0002_pr_projects.sql delete mode 100644 backend/internal/storage/sqlite/pr_projects_test.go delete mode 100644 backend/internal/storage/sqlite/queries/cdc.sql create mode 100644 backend/internal/storage/sqlite/queries/changelog.sql delete mode 100644 backend/internal/storage/sqlite/queries/metadata.sql create mode 100644 backend/internal/storage/sqlite/queries/pr_checks.sql 
create mode 100644 backend/internal/storage/sqlite/queries/pr_comment.sql delete mode 100644 backend/internal/storage/sqlite/queries/reactions.sql delete mode 100644 backend/internal/storage/sqlite/reaction_store.go delete mode 100644 backend/internal/storage/sqlite/spike_test.go delete mode 100644 backend/internal/storage/sqlite/upsert.go diff --git a/backend/internal/cdc/broadcast.go b/backend/internal/cdc/broadcast.go index a7458e38..b914f766 100644 --- a/backend/internal/cdc/broadcast.go +++ b/backend/internal/cdc/broadcast.go @@ -1,25 +1,29 @@ package cdc -import "sync" +import ( + "log/slog" + "sync" +) -// Broadcaster is the in-process fan-out the consumer feeds. Subscribers (the +// Broadcaster is the in-process fan-out the poller feeds. Subscribers (the // WS/SSE transport, wired in the frontend task) register a callback; every -// consumed Event is delivered to all current subscribers. It is the single -// seam between the CDC pipeline and live delivery, so the transport can be -// built and swapped without touching the pipeline. +// polled Event is delivered to all current subscribers. It is the single seam +// between the CDC poller and live delivery, so the transport can be built and +// swapped without touching the poller. type Broadcaster struct { mu sync.RWMutex nextID int subs map[int]func(Event) + logger *slog.Logger } // NewBroadcaster returns an empty Broadcaster ready for subscriptions. func NewBroadcaster() *Broadcaster { - return &Broadcaster{subs: map[int]func(Event){}} + return &Broadcaster{subs: map[int]func(Event){}, logger: slog.Default()} } // Subscribe registers fn and returns an unsubscribe function. fn is called -// synchronously from the consumer loop, so it must not block; a transport that +// synchronously from the poller loop, so it must not block; a transport that // needs buffering should push onto its own channel inside fn. 
func (b *Broadcaster) Subscribe(fn func(Event)) (unsubscribe func()) { b.mu.Lock() @@ -34,11 +38,29 @@ func (b *Broadcaster) Subscribe(fn func(Event)) (unsubscribe func()) { } } -// Publish delivers e to every current subscriber. +// SubscriberCount reports the number of current subscribers. +func (b *Broadcaster) SubscriberCount() int { + b.mu.RLock() + defer b.mu.RUnlock() + return len(b.subs) +} + +// Publish delivers e to every current subscriber. A panicking subscriber is +// recovered and logged so one bad callback can't kill the poller goroutine or +// starve the other subscribers. func (b *Broadcaster) Publish(e Event) { b.mu.RLock() defer b.mu.RUnlock() for _, fn := range b.subs { - fn(e) + b.deliver(fn, e) } } + +func (b *Broadcaster) deliver(fn func(Event), e Event) { + defer func() { + if r := recover(); r != nil { + b.logger.Error("cdc broadcaster: subscriber panicked", "seq", e.Seq, "panic", r) + } + }() + fn(e) +} diff --git a/backend/internal/cdc/cdc_integration_test.go b/backend/internal/cdc/cdc_integration_test.go deleted file mode 100644 index 9390afe0..00000000 --- a/backend/internal/cdc/cdc_integration_test.go +++ /dev/null @@ -1,256 +0,0 @@ -package cdc_test - -import ( - "context" - "testing" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/cdc" - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" - "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" -) - -// outboxAdapter bridges sqlite.Store's outbox methods to cdc.OutboxStore. This -// is the same glue the composition root (main.go) installs. 
-type outboxAdapter struct{ s *sqlite.Store } - -func (a outboxAdapter) ListUnsent(ctx context.Context, limit int) ([]cdc.PendingEvent, error) { - evs, err := a.s.ListUnsent(ctx, limit) - if err != nil { - return nil, err - } - out := make([]cdc.PendingEvent, len(evs)) - for i, e := range evs { - out[i] = cdc.PendingEvent{ - OutboxID: e.OutboxID, - Event: cdc.Event{ - Seq: e.Seq, - SessionID: e.SessionID, - EventType: e.EventType, - Revision: e.Revision, - Payload: e.Payload, - CreatedAt: e.CreatedAt, - }, - } - } - return out, nil -} - -func (a outboxAdapter) MarkSent(ctx context.Context, id int64, at time.Time) error { - return a.s.MarkSent(ctx, id, at) -} -func (a outboxAdapter) MarkFailed(ctx context.Context, id int64, msg string) error { - return a.s.MarkFailed(ctx, id, msg) -} - -func newStore(t *testing.T) *sqlite.Store { - t.Helper() - db, err := sqlite.Open(t.TempDir()) - if err != nil { - t.Fatalf("open: %v", err) - } - t.Cleanup(func() { db.Close() }) - return sqlite.NewStore(db) -} - -func rec(id string) domain.SessionRecord { - now := time.Now().UTC() - return domain.SessionRecord{ - ID: domain.SessionID(id), ProjectID: "p", Kind: domain.KindWorker, CreatedAt: now, UpdatedAt: now, - Lifecycle: domain.CanonicalSessionLifecycle{ - Session: domain.SessionSubstate{State: domain.SessionWorking, Reason: domain.ReasonTaskInProgress}, - PR: domain.PRSubstate{State: domain.PRNone, Reason: domain.PRReasonNotCreated}, - Runtime: domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning}, - Activity: domain.ActivitySubstate{State: domain.ActivityActive, LastActivityAt: now, Source: domain.SourceNative}, - }, - } -} - -func TestEndToEndPublishConsume(t *testing.T) { - ctx := context.Background() - store := newStore(t) - dir := t.TempDir() - log, err := cdc.OpenLog(dir, 0) - if err != nil { - t.Fatal(err) - } - defer log.Close() - - // Three canonical writes => three outbox rows, seq 1..3. 
- r := rec("s1") - if err := store.Upsert(ctx, r, ports.EventSessionCreated); err != nil { - t.Fatal(err) - } - r.Lifecycle.Revision = 1 - if err := store.Upsert(ctx, r, ports.EventSessionStateChanged); err != nil { - t.Fatal(err) - } - r.Lifecycle.Revision = 2 - if err := store.Upsert(ctx, r, ports.EventSessionStateChanged); err != nil { - t.Fatal(err) - } - - pub := cdc.NewPublisher(outboxAdapter{store}, log, cdc.PublisherConfig{}) - if err := pub.Drain(ctx); err != nil { - t.Fatalf("drain: %v", err) - } - - var got []cdc.Event - bc := cdc.NewBroadcaster() - bc.Subscribe(func(e cdc.Event) { got = append(got, e) }) - - con := cdc.NewConsumer("fe", dir+"/"+cdc.LogFileName, store, bc, cdc.ConsumerConfig{}) - if _, err := con.Start(ctx); err != nil { - t.Fatal(err) - } - // Drive one poll synchronously instead of waiting on the goroutine. - if err := con.Poll(ctx); err != nil { - t.Fatalf("poll: %v", err) - } - - if len(got) != 3 { - t.Fatalf("delivered %d events, want 3", len(got)) - } - for i, e := range got { - if e.Seq != int64(i+1) { - t.Fatalf("event %d has seq %d, want %d", i, e.Seq, i+1) - } - } - if got[0].EventType != string(ports.EventSessionCreated) { - t.Fatalf("first event type = %q", got[0].EventType) - } - - // Idempotency: a second poll with no new bytes delivers nothing more. - if err := con.Poll(ctx); err != nil { - t.Fatal(err) - } - if len(got) != 3 { - t.Fatalf("re-poll delivered extra events: %d", len(got)) - } - - // Offset persisted at seq 3. - off, _ := store.GetOffset(ctx, "fe") - if off != 3 { - t.Fatalf("offset = %d, want 3", off) - } - - // Janitor: consumer ACKed 3, so sent rows with seq < 3 are reclaimed. 
- jan := cdc.NewJanitor(store, cdc.JanitorConfig{}) - deleted, err := jan.Sweep(ctx) - if err != nil { - t.Fatal(err) - } - if deleted != 2 { - t.Fatalf("janitor deleted %d, want 2 (seq 1,2 < watermark 3)", deleted) - } -} - -func TestConsumerRestartSkipsDelivered(t *testing.T) { - ctx := context.Background() - store := newStore(t) - dir := t.TempDir() - log, _ := cdc.OpenLog(dir, 0) - defer log.Close() - - if err := store.Upsert(ctx, rec("s1"), ports.EventSessionCreated); err != nil { - t.Fatal(err) - } - pub := cdc.NewPublisher(outboxAdapter{store}, log, cdc.PublisherConfig{}) - if err := pub.Drain(ctx); err != nil { - t.Fatal(err) - } - - // Pre-seed the durable offset as if a prior consumer already delivered seq 1. - if err := store.SetOffset(ctx, "fe", 1, time.Now().UTC()); err != nil { - t.Fatal(err) - } - - var got []cdc.Event - bc := cdc.NewBroadcaster() - bc.Subscribe(func(e cdc.Event) { got = append(got, e) }) - con := cdc.NewConsumer("fe", dir+"/"+cdc.LogFileName, store, bc, cdc.ConsumerConfig{}) - if _, err := con.Start(ctx); err != nil { - t.Fatal(err) - } - if err := con.Poll(ctx); err != nil { - t.Fatal(err) - } - if len(got) != 0 { - t.Fatalf("restart re-delivered already-acked events: %d", len(got)) - } -} - -// fakeSnapshot stands in for the sessions-table snapshot source on resync. -type fakeSnapshot struct { - events []cdc.Event - maxSeq int64 -} - -func (f fakeSnapshot) Snapshot(context.Context) ([]cdc.Event, int64, error) { - return f.events, f.maxSeq, nil -} - -func TestRotationTriggersResync(t *testing.T) { - ctx := context.Background() - store := newStore(t) - dir := t.TempDir() - // Tiny cap so a couple of writes force a rotation. 
- log, err := cdc.OpenLog(dir, 80) - if err != nil { - t.Fatal(err) - } - defer log.Close() - - var got []cdc.Event - bc := cdc.NewBroadcaster() - bc.Subscribe(func(e cdc.Event) { got = append(got, e) }) - - snap := fakeSnapshot{events: []cdc.Event{{Seq: 5, SessionID: "s1", EventType: "session_updated"}}, maxSeq: 5} - con := cdc.NewConsumer("fe", dir+"/"+cdc.LogFileName, store, bc, cdc.ConsumerConfig{Snapshot: snap}) - if _, err := con.Start(ctx); err != nil { - t.Fatal(err) - } - - pub := cdc.NewPublisher(outboxAdapter{store}, log, cdc.PublisherConfig{}) - - // First write + drain + poll: consumer reads it and advances its cursor. - if err := store.Upsert(ctx, rec("s1"), ports.EventSessionCreated); err != nil { - t.Fatal(err) - } - if err := pub.Drain(ctx); err != nil { - t.Fatal(err) - } - if err := con.Poll(ctx); err != nil { - t.Fatal(err) - } - cursorBefore := len(got) - - // Force rotation by writing past the cap, then poll: the file shrank, so the - // consumer must resync from the snapshot source. - r := rec("s1") - r.Lifecycle.Revision = 1 - if err := store.Upsert(ctx, r, ports.EventSessionStateChanged); err != nil { - t.Fatal(err) - } - if err := pub.Drain(ctx); err != nil { - t.Fatal(err) - } - if err := con.Poll(ctx); err != nil { - t.Fatal(err) - } - - if len(got) <= cursorBefore { - t.Fatal("expected resync to deliver the snapshot event") - } - // The snapshot event (seq 5) must be among the delivered events. 
- var sawSnapshot bool - for _, e := range got { - if e.Seq == 5 { - sawSnapshot = true - } - } - if !sawSnapshot { - t.Fatalf("resync did not deliver snapshot event; got %+v", got) - } -} diff --git a/backend/internal/cdc/cdc_test.go b/backend/internal/cdc/cdc_test.go new file mode 100644 index 00000000..d72370f4 --- /dev/null +++ b/backend/internal/cdc/cdc_test.go @@ -0,0 +1,192 @@ +package cdc_test + +import ( + "context" + "encoding/json" + "sync" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/cdc" + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" +) + +// storeSource adapts sqlite.Store to cdc.Source — the same glue the daemon wires. +type storeSource struct{ s *sqlite.Store } + +func (a storeSource) EventsAfter(ctx context.Context, after int64, limit int) ([]cdc.Event, error) { + rows, err := a.s.ReadChangeLogAfter(ctx, after, limit) + if err != nil { + return nil, err + } + out := make([]cdc.Event, len(rows)) + for i, r := range rows { + out[i] = cdc.Event{ + Seq: r.Seq, + ProjectID: r.ProjectID, + SessionID: r.SessionID, + Type: cdc.EventType(r.EventType), + Payload: json.RawMessage(r.Payload), + CreatedAt: r.CreatedAt, + } + } + return out, nil +} + +func (a storeSource) LatestSeq(ctx context.Context) (int64, error) { return a.s.MaxChangeLogSeq(ctx) } + +func newStore(t *testing.T) *sqlite.Store { + t.Helper() + s, err := sqlite.Open(t.TempDir()) + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = s.Close() }) + return s +} + +func seedSession(t *testing.T, s *sqlite.Store) domain.SessionRecord { + t.Helper() + ctx := context.Background() + now := time.Now().UTC().Truncate(time.Second) + if err := s.UpsertProject(ctx, sqlite.ProjectRow{ID: "mer", Path: "/m", RegisteredAt: now}); err != nil { + t.Fatal(err) + } + r, err := s.CreateSession(ctx, domain.SessionRecord{ + ProjectID: "mer", Kind: domain.KindWorker, + Lifecycle: 
domain.CanonicalSessionLifecycle{ + Session: domain.SessionSubstate{State: domain.SessionWorking}, + Activity: domain.ActivitySubstate{State: domain.ActivityActive, LastActivityAt: now, Source: domain.SourceNative}, + }, + CreatedAt: now, UpdatedAt: now, + }) + if err != nil { + t.Fatal(err) + } + return r +} + +// TestE2E_StoreWriteToBroadcast drives the whole path: a store write fires a DB +// trigger that appends to change_log; the poller reads it and broadcasts. +func TestE2E_StoreWriteToBroadcast(t *testing.T) { + ctx := context.Background() + s := newStore(t) + r := seedSession(t, s) // -> session_created (seq 1) + + r.Lifecycle.Session.State = domain.SessionIdle + if err := s.UpdateSession(ctx, r); err != nil { // -> session_updated (seq 2) + t.Fatal(err) + } + if err := s.UpsertPR(ctx, sqlite.PRRow{URL: "pr1", SessionID: string(r.ID), State: "open", UpdatedAt: r.UpdatedAt}); err != nil { // -> pr_created (seq 3) + t.Fatal(err) + } + + var got []cdc.Event + bc := cdc.NewBroadcaster() + bc.Subscribe(func(e cdc.Event) { got = append(got, e) }) + p := cdc.NewPoller(storeSource{s}, bc, cdc.PollerConfig{}) // StartSeq 0: read from the top + if err := p.Poll(ctx); err != nil { + t.Fatal(err) + } + + if len(got) != 3 { + t.Fatalf("delivered %d events, want 3", len(got)) + } + for i, e := range got { + if e.Seq != int64(i+1) { + t.Fatalf("event %d seq=%d, want %d", i, e.Seq, i+1) + } + if e.ProjectID != "mer" { + t.Fatalf("event %d project=%q, want mer", i, e.ProjectID) + } + } + if got[0].Type != cdc.EventSessionCreated || got[1].Type != cdc.EventSessionUpdated || got[2].Type != cdc.EventPRCreated { + t.Fatalf("types = %s, %s, %s", got[0].Type, got[1].Type, got[2].Type) + } + // the trigger-built JSON payload survives as a usable RawMessage. 
+ var payload map[string]any + if err := json.Unmarshal(got[0].Payload, &payload); err != nil { + t.Fatalf("payload not JSON: %v", err) + } + if payload["id"] != string(r.ID) || payload["state"] != "working" { + t.Fatalf("payload = %v", payload) + } + + // idempotent: a second poll with no new rows delivers nothing more. + if err := p.Poll(ctx); err != nil { + t.Fatal(err) + } + if len(got) != 3 { + t.Fatalf("re-poll delivered extra events: %d", len(got)) + } +} + +// TestE2E_ConcurrentPollerLiveDelivery runs the poller as a goroutine (the daemon +// model) and asserts every store change is delivered exactly once, in order. +func TestE2E_ConcurrentPollerLiveDelivery(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + s := newStore(t) + r := seedSession(t, s) // seq 1 + + var mu sync.Mutex + var got []cdc.Event + bc := cdc.NewBroadcaster() + bc.Subscribe(func(e cdc.Event) { mu.Lock(); got = append(got, e); mu.Unlock() }) + + p := cdc.NewPoller(storeSource{s}, bc, cdc.PollerConfig{}) // from the top + done := p.Start(ctx) + + const n = 6 + for i := 0; i < n; i++ { + r.Lifecycle.IsAlive = i%2 == 0 // toggles is_alive -> sessions_cdc_update fires + if err := s.UpdateSession(ctx, r); err != nil { + t.Fatal(err) + } + } + want := 1 + n // session_created + n updates + + deadline := time.Now().Add(5 * time.Second) + for { + mu.Lock() + c := len(got) + mu.Unlock() + if c >= want { + break + } + if time.Now().After(deadline) { + t.Fatalf("timed out: delivered %d/%d", c, want) + } + time.Sleep(20 * time.Millisecond) + } + cancel() + <-done + + mu.Lock() + defer mu.Unlock() + if len(got) != want { + t.Fatalf("delivered %d events, want %d", len(got), want) + } + for i, e := range got { + if e.Seq != int64(i+1) { + t.Fatalf("event %d has seq %d, want %d (out-of-order/duplicate)", i, e.Seq, i+1) + } + } +} + +// TestBroadcasterRecoversPanickingSubscriber: one panicking subscriber must not +// kill delivery to the others (or crash the 
poller goroutine). +func TestBroadcasterRecoversPanickingSubscriber(t *testing.T) { + bc := cdc.NewBroadcaster() + good := 0 + bc.Subscribe(func(cdc.Event) { panic("boom") }) + bc.Subscribe(func(cdc.Event) { good++ }) + + bc.Publish(cdc.Event{Seq: 1}) // must not panic + bc.Publish(cdc.Event{Seq: 2}) + + if good != 2 { + t.Fatalf("good subscriber got %d, want 2 (panic was not isolated)", good) + } +} diff --git a/backend/internal/cdc/consumer.go b/backend/internal/cdc/consumer.go deleted file mode 100644 index 00edb0f1..00000000 --- a/backend/internal/cdc/consumer.go +++ /dev/null @@ -1,221 +0,0 @@ -package cdc - -import ( - "bytes" - "context" - "encoding/json" - "fmt" - "io" - "log/slog" - "os" - "time" -) - -// DefaultPollInterval is how often the consumer checks the log for new bytes. -// Polling (rather than fs-notify) keeps the consumer dependency-free; at this -// cadence live updates stay well under a human-perceptible delay. -const DefaultPollInterval = 100 * time.Millisecond - -// OffsetStore persists the consumer's durable seq cursor (at-least-once). -type OffsetStore interface { - GetOffset(ctx context.Context, consumer string) (int64, error) - SetOffset(ctx context.Context, consumer string, seq int64, at time.Time) error -} - -// SnapshotSource rebuilds current state from the source of truth (the sessions -// table) after a rotation gap, where log lines for unconsumed-but-already-sent -// events were truncated away. It returns one Event per live session plus the -// MAX(change_log seq) the snapshot corresponds to, so the consumer can resume. -type SnapshotSource interface { - Snapshot(ctx context.Context) (events []Event, maxSeq int64, err error) -} - -// Consumer tails the JSONL log, deduplicates by seq, and fans each new event -// out through the Broadcaster, persisting its durable offset as it goes. 
-type Consumer struct { - name string - path string - offsets OffsetStore - bcast *Broadcaster - snapshot SnapshotSource - interval time.Duration - clock func() time.Time - logger *slog.Logger - - cursor int64 // byte offset into the log - lastSeq int64 // highest seq delivered - prevInfo os.FileInfo // identity of the file last polled (rotation detection) -} - -// ConsumerConfig holds optional knobs and the snapshot source. -type ConsumerConfig struct { - Snapshot SnapshotSource - Interval time.Duration - Clock func() time.Time - Logger *slog.Logger -} - -// NewConsumer constructs a Consumer named name (the consumer_offsets key) over -// the log at path, fanning out through bcast and persisting offsets via offsets. -func NewConsumer(name, path string, offsets OffsetStore, bcast *Broadcaster, cfg ConsumerConfig) *Consumer { - c := &Consumer{ - name: name, - path: path, - offsets: offsets, - bcast: bcast, - snapshot: cfg.Snapshot, - interval: cfg.Interval, - clock: cfg.Clock, - logger: cfg.Logger, - } - if c.interval <= 0 { - c.interval = DefaultPollInterval - } - if c.clock == nil { - c.clock = time.Now - } - if c.logger == nil { - c.logger = slog.Default() - } - return c -} - -// Start loads the durable offset and runs the poll loop until ctx is cancelled; -// the returned channel closes when the loop has exited. -func (c *Consumer) Start(ctx context.Context) (<-chan struct{}, error) { - seq, err := c.offsets.GetOffset(ctx, c.name) - if err != nil { - return nil, fmt.Errorf("load consumer offset: %w", err) - } - c.lastSeq = seq - - done := make(chan struct{}) - go func() { - defer close(done) - t := time.NewTicker(c.interval) - defer t.Stop() - for { - select { - case <-ctx.Done(): - return - case <-t.C: - if err := c.Poll(ctx); err != nil { - c.logger.Error("cdc consumer: poll failed", "err", err) - } - } - } - }() - return done, nil -} - -// Poll reads any new bytes since the last cursor and delivers complete lines. 
It -// detects rotation (the file shrank below the cursor) and resyncs from the DB -// snapshot before resuming. -func (c *Consumer) Poll(ctx context.Context) error { - f, err := os.Open(c.path) - if err != nil { - if os.IsNotExist(err) { - return nil // publisher has not created the log yet - } - return fmt.Errorf("open cdc log: %w", err) - } - defer f.Close() - - info, err := f.Stat() - if err != nil { - return fmt.Errorf("stat cdc log: %w", err) - } - size := info.Size() - - rotated := (c.prevInfo != nil && !os.SameFile(c.prevInfo, info)) || size < c.cursor - c.prevInfo = info - if rotated { - // The previous file's bytes are void. Resync from the DB snapshot (if - // wired), then resume reading the fresh file from the top. - if err := c.resync(ctx); err != nil { - return err - } - c.cursor = 0 - } - if size == c.cursor { - return nil - } - - if _, err := f.Seek(c.cursor, io.SeekStart); err != nil { - return fmt.Errorf("seek cdc log: %w", err) - } - data, err := io.ReadAll(f) - if err != nil { - return fmt.Errorf("read cdc log: %w", err) - } - - consumed, maxSeq := c.processLines(data) - c.cursor += int64(consumed) - - if maxSeq > c.lastSeq { - c.lastSeq = maxSeq - if err := c.offsets.SetOffset(ctx, c.name, c.lastSeq, c.clock().UTC()); err != nil { - return fmt.Errorf("persist consumer offset: %w", err) - } - } - return nil -} - -// processLines delivers each complete (newline-terminated) line, skipping reset -// markers and any event whose seq was already delivered. It returns the number -// of bytes consumed (only complete lines) and the highest seq seen. 
-func (c *Consumer) processLines(data []byte) (consumed int, maxSeq int64) { - maxSeq = c.lastSeq - for { - nl := bytes.IndexByte(data[consumed:], '\n') - if nl < 0 { - return consumed, maxSeq // partial trailing line: leave for next poll - } - line := data[consumed : consumed+nl] - consumed += nl + 1 - - if isResetMarker(line) { - continue - } - var e Event - if err := json.Unmarshal(line, &e); err != nil { - c.logger.Error("cdc consumer: bad line skipped", "err", err) - continue - } - if e.Seq <= c.lastSeq { - continue // idempotent: already delivered - } - c.bcast.Publish(e) - if e.Seq > maxSeq { - maxSeq = e.Seq - } - } -} - -func (c *Consumer) resync(ctx context.Context) error { - if c.snapshot == nil { - return nil - } - events, maxSeq, err := c.snapshot.Snapshot(ctx) - if err != nil { - return fmt.Errorf("cdc consumer resync: %w", err) - } - for _, e := range events { - c.bcast.Publish(e) - } - if maxSeq > c.lastSeq { - c.lastSeq = maxSeq - if err := c.offsets.SetOffset(ctx, c.name, c.lastSeq, c.clock().UTC()); err != nil { - return fmt.Errorf("persist offset after resync: %w", err) - } - } - return nil -} - -func isResetMarker(line []byte) bool { - var m resetMarker - if err := json.Unmarshal(line, &m); err != nil { - return false - } - return m.Type == "reset" -} diff --git a/backend/internal/cdc/event.go b/backend/internal/cdc/event.go index b0eddf98..04f52648 100644 --- a/backend/internal/cdc/event.go +++ b/backend/internal/cdc/event.go @@ -1,32 +1,40 @@ -// Package cdc is the change-data-capture pipeline that turns the storage layer's -// transactional outbox into a durable, ordered event stream for the frontend. +// Package cdc is the change-data-capture delivery layer. Change events are +// captured durably by SQLite triggers into the change_log table (see the storage +// migrations); this package POLLS that log and fans new events out, in order, to +// in-process subscribers (the WS/SSE transport, wired in the frontend task). 
// -// The flow: the publisher drains the SQLite outbox (sent=0, seq order) and -// appends each change as one JSON line to a rotating log file. The consumer -// tails that file from a durable byte cursor, deduplicates by seq, and fans each -// change out through the Broadcaster to in-process subscribers (the WS/SSE -// transport, wired later). The janitor reclaims outbox rows every consumer has -// acknowledged. Delivery is at-least-once; seq is the idempotency key. +// There is no durable outbox/JSONL/janitor machinery: the change_log table IS +// the durable, ordered source of truth, and clients catch up by reading it from +// their own offset (SSE Last-Event-ID). The poller + broadcaster here are only +// the LIVE push on top of that. package cdc -import "time" +import ( + "encoding/json" + "time" +) -// Event is one change-data-capture record. It is the JSONL line shape and the -// value handed to Broadcaster subscribers. Seq is the monotonic ordering and -// idempotency key (the change_log seq). -type Event struct { - Seq int64 `json:"seq"` - SessionID string `json:"sessionId"` - EventType string `json:"eventType"` - Revision int64 `json:"revision"` - Payload string `json:"payload"` - CreatedAt time.Time `json:"createdAt"` -} +// EventType mirrors the event_type values the DB triggers write. +type EventType string -// resetMarker is written as the first line of a freshly rotated log file. A -// consumer that reads it knows the byte offsets of the previous file are void -// and must snapshot-resync, then resume from the current MAX(seq). -type resetMarker struct { - Type string `json:"type"` // always "reset" - RotatedAt time.Time `json:"rotatedAt"` +const ( + EventSessionCreated EventType = "session_created" + EventSessionUpdated EventType = "session_updated" + EventPRCreated EventType = "pr_created" + EventPRUpdated EventType = "pr_updated" + EventPRCheckRecorded EventType = "pr_check_recorded" +) + +// Event is one CDC change read from change_log. 
Seq is the monotonic ordering + +// idempotency key (consumers dedup by it). SessionID is empty for project-level +// events. Payload is the trigger-built JSON, kept raw so a typed transport can +// narrow it by Type (the discriminated-union decode lives at the transport edge, +// not here). +type Event struct { + Seq int64 `json:"seq"` + ProjectID string `json:"projectId"` + SessionID string `json:"sessionId,omitempty"` + Type EventType `json:"type"` + Payload json.RawMessage `json:"payload"` + CreatedAt time.Time `json:"createdAt"` } diff --git a/backend/internal/cdc/janitor.go b/backend/internal/cdc/janitor.go deleted file mode 100644 index 3968b2cf..00000000 --- a/backend/internal/cdc/janitor.go +++ /dev/null @@ -1,84 +0,0 @@ -package cdc - -import ( - "context" - "log/slog" - "time" -) - -// DefaultJanitorInterval is the outbox-vacuum cadence. -const DefaultJanitorInterval = 60 * time.Second - -// Vacuum is the janitor's view of storage: the safe deletion watermark and the -// delete itself. -type Vacuum interface { - MinConsumerOffset(ctx context.Context) (int64, error) - DeleteSentOutboxBelow(ctx context.Context, seq int64) (int64, error) -} - -// Janitor reclaims delivered outbox rows every consumer has acknowledged. -// -// Watermark: MIN(consumer_offsets.last_seq). Rows with seq < watermark are sent -// AND past every consumer's cursor, so they are safe to drop. When the watermark -// is 0 (a consumer exists but has acknowledged nothing, or none is registered -// yet) the janitor deletes nothing — it never races ahead of a consumer that -// has not yet read an event. change_log is never touched: it is the durable -// history and the snapshot-resync floor. -type Janitor struct { - store Vacuum - interval time.Duration - logger *slog.Logger -} - -// JanitorConfig holds optional knobs; zero values fall back to defaults. -type JanitorConfig struct { - Interval time.Duration - Logger *slog.Logger -} - -// NewJanitor constructs a Janitor over store. 
-func NewJanitor(store Vacuum, cfg JanitorConfig) *Janitor { - j := &Janitor{store: store, interval: cfg.Interval, logger: cfg.Logger} - if j.interval <= 0 { - j.interval = DefaultJanitorInterval - } - if j.logger == nil { - j.logger = slog.Default() - } - return j -} - -// Start runs the vacuum loop until ctx is cancelled; the returned channel closes -// when the loop has exited. -func (j *Janitor) Start(ctx context.Context) <-chan struct{} { - done := make(chan struct{}) - go func() { - defer close(done) - t := time.NewTicker(j.interval) - defer t.Stop() - for { - select { - case <-ctx.Done(): - return - case <-t.C: - if _, err := j.Sweep(ctx); err != nil { - j.logger.Error("cdc janitor: sweep failed", "err", err) - } - } - } - }() - return done -} - -// Sweep deletes delivered outbox rows below the safe watermark and returns the -// number removed. -func (j *Janitor) Sweep(ctx context.Context) (int64, error) { - watermark, err := j.store.MinConsumerOffset(ctx) - if err != nil { - return 0, err - } - if watermark <= 0 { - return 0, nil - } - return j.store.DeleteSentOutboxBelow(ctx, watermark) -} diff --git a/backend/internal/cdc/jsonl.go b/backend/internal/cdc/jsonl.go deleted file mode 100644 index 74dc0695..00000000 --- a/backend/internal/cdc/jsonl.go +++ /dev/null @@ -1,109 +0,0 @@ -package cdc - -import ( - "encoding/json" - "fmt" - "os" - "path/filepath" - "sync" - "time" -) - -// LogFileName is the active CDC log under the data dir. -const LogFileName = "session-events.jsonl" - -// DefaultMaxBytes is the size at which the log rotates (1 MiB). -const DefaultMaxBytes int64 = 1 << 20 - -// Log is the append-only JSONL sink the publisher writes to. When it grows past -// maxBytes it rotates by truncating in place and writing a reset marker as the -// new first line — the consumer treats a shrunken file as "resync from the DB -// snapshot", so the log itself is not the durable source of truth (SQLite is). 
-type Log struct { - mu sync.Mutex - path string - maxBytes int64 - f *os.File - size int64 -} - -// OpenLog opens (creating if absent) the JSONL log in dir. maxBytes <= 0 uses -// DefaultMaxBytes. -func OpenLog(dir string, maxBytes int64) (*Log, error) { - if maxBytes <= 0 { - maxBytes = DefaultMaxBytes - } - path := filepath.Join(dir, LogFileName) - f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o644) - if err != nil { - return nil, fmt.Errorf("open cdc log: %w", err) - } - info, err := f.Stat() - if err != nil { - f.Close() - return nil, fmt.Errorf("stat cdc log: %w", err) - } - return &Log{path: path, maxBytes: maxBytes, f: f, size: info.Size()}, nil -} - -// Append writes one event as a JSON line, flushing to disk. It rotates first if -// the file is already at/over the size cap, so a single oversized burst still -// lands in a fresh segment. -func (l *Log) Append(e Event) error { - l.mu.Lock() - defer l.mu.Unlock() - - if l.size >= l.maxBytes { - if err := l.rotateLocked(); err != nil { - return err - } - } - return l.writeLocked(e) -} - -func (l *Log) writeLocked(v any) error { - line, err := json.Marshal(v) - if err != nil { - return fmt.Errorf("marshal cdc line: %w", err) - } - line = append(line, '\n') - n, err := l.f.Write(line) - l.size += int64(n) - if err != nil { - return fmt.Errorf("write cdc line: %w", err) - } - if err := l.f.Sync(); err != nil { - return fmt.Errorf("sync cdc log: %w", err) - } - return nil -} - -// rotateLocked renames the active file aside and starts a fresh one whose first -// line is a reset marker. Renaming (not truncating in place) gives the file a -// new identity, so a polling consumer reliably detects rotation via -// os.SameFile even if the fresh file grows past its old byte cursor between -// polls. The consumer then resyncs from the DB snapshot. 
-func (l *Log) rotateLocked() error { - if err := l.f.Close(); err != nil { - return fmt.Errorf("close cdc log for rotate: %w", err) - } - archive := l.path + ".1" - _ = os.Remove(archive) // best-effort: history lives in SQLite, not the log - if err := os.Rename(l.path, archive); err != nil { - return fmt.Errorf("rotate cdc log: %w", err) - } - f, err := os.OpenFile(l.path, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o644) - if err != nil { - return fmt.Errorf("reopen cdc log after rotate: %w", err) - } - l.f = f - l.size = 0 - return l.writeLocked(resetMarker{Type: "reset", RotatedAt: time.Now().UTC()}) -} - -// Close closes the underlying file. -func (l *Log) Close() error { - l.mu.Lock() - defer l.mu.Unlock() - return l.f.Close() -} diff --git a/backend/internal/cdc/poller.go b/backend/internal/cdc/poller.go new file mode 100644 index 00000000..c824def3 --- /dev/null +++ b/backend/internal/cdc/poller.go @@ -0,0 +1,123 @@ +package cdc + +import ( + "context" + "fmt" + "log/slog" + "time" +) + +// DefaultPollInterval is how often the poller checks change_log for new rows. +// Polling (rather than fs-notify or a DB hook) keeps it dependency-free; at this +// cadence live updates stay well under a human-perceptible delay. +const DefaultPollInterval = 100 * time.Millisecond + +// DefaultBatch bounds how many events one poll drains. +const DefaultBatch = 512 + +// Source is the poller's view of the durable log: read events after a seq, and +// the current head seq. The storage layer implements it (the change_log table). +type Source interface { + EventsAfter(ctx context.Context, after int64, limit int) ([]Event, error) + LatestSeq(ctx context.Context) (int64, error) +} + +// Poller tails change_log and fans each new event out through the Broadcaster, +// in seq order. It holds only an in-memory cursor (lastSeq): it is the LIVE push +// path, while durable catch-up is the client's job (read change_log from its own +// offset). 
A restart re-seeks to head, so the poller never re-broadcasts history +// to a freshly-started broadcaster. +type Poller struct { + src Source + bcast *Broadcaster + interval time.Duration + batch int + logger *slog.Logger + lastSeq int64 +} + +// PollerConfig holds optional knobs; zero values fall back to defaults. StartSeq +// is the cursor to begin from; production wiring leaves it 0 and calls +// SeekToHead, tests set it to read from the beginning. +type PollerConfig struct { + Interval time.Duration + Batch int + Logger *slog.Logger + StartSeq int64 +} + +// NewPoller constructs a Poller over src, fanning out through bcast. +func NewPoller(src Source, bcast *Broadcaster, cfg PollerConfig) *Poller { + p := &Poller{ + src: src, + bcast: bcast, + interval: cfg.Interval, + batch: cfg.Batch, + logger: cfg.Logger, + lastSeq: cfg.StartSeq, + } + if p.interval <= 0 { + p.interval = DefaultPollInterval + } + if p.batch <= 0 { + p.batch = DefaultBatch + } + if p.logger == nil { + p.logger = slog.Default() + } + return p +} + +// SeekToHead moves the cursor to the current head, so the poller only broadcasts +// events created from now on (clients catch up on older events via the store). +func (p *Poller) SeekToHead(ctx context.Context) error { + seq, err := p.src.LatestSeq(ctx) + if err != nil { + return fmt.Errorf("cdc poller seek: %w", err) + } + p.lastSeq = seq + return nil +} + +// Start runs the poll loop until ctx is cancelled; the returned channel closes +// when the loop has exited. +func (p *Poller) Start(ctx context.Context) <-chan struct{} { + done := make(chan struct{}) + go func() { + defer close(done) + t := time.NewTicker(p.interval) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + if err := p.Poll(ctx); err != nil { + p.logger.Error("cdc poller: poll failed", "err", err) + } + } + } + }() + return done +} + +// Poll drains one batch of new events and broadcasts them in seq order, +// advancing the cursor. 
Exported so tests (and a daemon) can drive a cycle +// synchronously. +func (p *Poller) Poll(ctx context.Context) error { + evs, err := p.src.EventsAfter(ctx, p.lastSeq, p.batch) + if err != nil { + return fmt.Errorf("cdc poller: read after %d: %w", p.lastSeq, err) + } + for _, e := range evs { + if e.Seq <= p.lastSeq { + continue // idempotent guard + } + p.bcast.Publish(e) + p.lastSeq = e.Seq + } + return nil +} + +// LastSeq returns the poller's current cursor (the highest seq broadcast). +func (p *Poller) LastSeq() int64 { return p.lastSeq } diff --git a/backend/internal/cdc/publisher.go b/backend/internal/cdc/publisher.go deleted file mode 100644 index 3283a236..00000000 --- a/backend/internal/cdc/publisher.go +++ /dev/null @@ -1,115 +0,0 @@ -package cdc - -import ( - "context" - "log/slog" - "time" -) - -// DefaultPublishInterval is the outbox drain cadence. -const DefaultPublishInterval = 50 * time.Millisecond - -// DefaultBatchSize bounds how many outbox rows one drain pass handles. -const DefaultBatchSize = 256 - -// PendingEvent is an undelivered outbox row paired with its CDC event payload. -type PendingEvent struct { - OutboxID int64 - Event -} - -// OutboxStore is the publisher's view of the storage layer: read undelivered -// rows in seq order, then mark each delivered or failed. -type OutboxStore interface { - ListUnsent(ctx context.Context, limit int) ([]PendingEvent, error) - MarkSent(ctx context.Context, outboxID int64, at time.Time) error - MarkFailed(ctx context.Context, outboxID int64, errMsg string) error -} - -// Publisher drains the outbox into the JSONL log on a fixed cadence. -type Publisher struct { - src OutboxStore - log *Log - interval time.Duration - batch int - clock func() time.Time - logger *slog.Logger -} - -// PublisherConfig holds optional knobs; zero values fall back to defaults. 
-type PublisherConfig struct { - Interval time.Duration - Batch int - Clock func() time.Time - Logger *slog.Logger -} - -// NewPublisher constructs a Publisher over src and log. -func NewPublisher(src OutboxStore, log *Log, cfg PublisherConfig) *Publisher { - p := &Publisher{ - src: src, - log: log, - interval: cfg.Interval, - batch: cfg.Batch, - clock: cfg.Clock, - logger: cfg.Logger, - } - if p.interval <= 0 { - p.interval = DefaultPublishInterval - } - if p.batch <= 0 { - p.batch = DefaultBatchSize - } - if p.clock == nil { - p.clock = time.Now - } - if p.logger == nil { - p.logger = slog.Default() - } - return p -} - -// Start runs the drain loop until ctx is cancelled; the returned channel closes -// when the loop has exited. -func (p *Publisher) Start(ctx context.Context) <-chan struct{} { - done := make(chan struct{}) - go func() { - defer close(done) - t := time.NewTicker(p.interval) - defer t.Stop() - for { - select { - case <-ctx.Done(): - return - case <-t.C: - if err := p.Drain(ctx); err != nil { - p.logger.Error("cdc publisher: drain failed", "err", err) - } - } - } - }() - return done -} - -// Drain runs one pass: append each undelivered row to the log in seq order, -// marking it sent. A write failure stops the pass (the row is marked failed and -// retried next tick) so ordering is never violated by skipping ahead. 
-func (p *Publisher) Drain(ctx context.Context) error { - pending, err := p.src.ListUnsent(ctx, p.batch) - if err != nil { - return err - } - for _, pe := range pending { - if err := p.log.Append(pe.Event); err != nil { - p.logger.Error("cdc publisher: append failed", "outboxId", pe.OutboxID, "seq", pe.Seq, "err", err) - if merr := p.src.MarkFailed(ctx, pe.OutboxID, err.Error()); merr != nil { - p.logger.Error("cdc publisher: mark failed errored", "outboxId", pe.OutboxID, "err", merr) - } - return nil - } - if err := p.src.MarkSent(ctx, pe.OutboxID, p.clock().UTC()); err != nil { - return err - } - } - return nil -} diff --git a/backend/internal/domain/decide/decide.go b/backend/internal/domain/decide/decide.go index c46df18d..be195aef 100644 --- a/backend/internal/domain/decide/decide.go +++ b/backend/internal/domain/decide/decide.go @@ -1,7 +1,11 @@ // Package decide is the pure DECIDE core: total, deterministic, zero I/O. It -// collapses observed facts (plus the prior detecting/activity memory) into one -// LifecycleDecision. Every function here must remain side-effect free so the -// whole status truth-table can be tested in isolation. +// collapses observed liveness facts (plus the prior detecting memory) into one +// LifecycleDecision. Every function here is side-effect free so the whole +// liveness truth-table can be tested in isolation. +// +// PR-driven behaviour is NOT here: PR display status is derived by +// domain.DeriveStatus from the pr table, and PR-driven nudges are the reaction +// engine's job. decide is only about liveness + the anti-flap quarantine. package decide import ( @@ -30,158 +34,57 @@ const ( // terminal this decider may reach without quarantine); // - a *failed* probe (timeout/error) is never read as death — it routes to // detecting, as does any disagreement between the two probes; -// - only runtime-dead + process-dead + no-recent-activity reaches killed. 
+// - only runtime-down + process-dead + no-recent-activity reaches terminal. func ResolveProbeDecision(in ProbeInput) LifecycleDecision { if in.KillRequested { + reason := in.KillReason + if reason == "" { + reason = domain.TermManuallyKilled + } return LifecycleDecision{ - Status: domain.StatusKilled, - Evidence: "manual kill requested", - SessionState: domain.SessionTerminated, - SessionReason: domain.ReasonManuallyKilled, + Evidence: "manual kill requested", + SessionState: domain.SessionTerminated, + TerminationReason: reason, + IsAlive: false, } } - if in.RuntimeFailed || in.ProcessFailed || in.Runtime == domain.RuntimeProbeFailed { - ev := fmt.Sprintf("probe_failed runtime=%s runtimeFailed=%t process=%s processFailed=%t", - in.Runtime, in.RuntimeFailed, in.Process, in.ProcessFailed) - return detecting(in, domain.ReasonProbeFailure, ev) + if in.RuntimeFailed || in.ProcessFailed { + ev := fmt.Sprintf("probe_failed runtimeFailed=%t process=%s processFailed=%t", in.RuntimeFailed, in.Process, in.ProcessFailed) + return detecting(in, ev) } - switch in.Runtime { - case domain.RuntimeAlive: + if in.RuntimeAlive { if in.Process == ProcessDead { // Runtime up but the agent process is gone: probes disagree. ev := fmt.Sprintf("disagree runtime=alive process=%s recentActivity=%t", in.Process, in.RecentActivity) - return detecting(in, domain.ReasonAgentProcessExited, ev) - } - return LifecycleDecision{ - Status: domain.StatusWorking, - Evidence: fmt.Sprintf("alive runtime=alive process=%s", in.Process), - SessionState: domain.SessionWorking, - SessionReason: domain.ReasonTaskInProgress, - } - - case domain.RuntimeExited, domain.RuntimeMissing: - // Runtime is gone. Death is only concluded when the process is *also* - // confirmed dead AND nothing has been heard from the agent recently; - // any other shape is ambiguous and quarantines. 
- if in.Process == ProcessAlive || in.RecentActivity { - ev := fmt.Sprintf("disagree runtime=%s process=%s recentActivity=%t", in.Runtime, in.Process, in.RecentActivity) - return detecting(in, domain.ReasonRuntimeLost, ev) - } - if in.Process == ProcessDead { - return LifecycleDecision{ - Status: domain.StatusKilled, - Evidence: fmt.Sprintf("dead runtime=%s process=dead recentActivity=false", in.Runtime), - SessionState: domain.SessionTerminated, - SessionReason: domain.ReasonRuntimeLost, - } - } - // Process indeterminate: cannot confirm death, so quarantine. - ev := fmt.Sprintf("runtime_lost runtime=%s process=%s recentActivity=false", in.Runtime, in.Process) - return detecting(in, domain.ReasonRuntimeLost, ev) - - default: - // unknown (not yet probed): ambiguous, never conclude death. - ev := fmt.Sprintf("runtime_unknown runtime=%s process=%s recentActivity=%t", in.Runtime, in.Process, in.RecentActivity) - return detecting(in, domain.ReasonRuntimeLost, ev) - } -} - -// ResolveOpenPRDecision walks the PR pipeline ladder. CI failure dominates -// everything. Draft PRs then surface as draft and do not enter the review or -// merge states. Open PRs continue through requested changes, approval/merge -// states, pending review, stalled (idle-beyond-threshold), then plain open. -func ResolveOpenPRDecision(in OpenPRInput) LifecycleDecision { - // evidence is a stable, timestamp-free summary " # " - // for logs/traceability; it folds in the PR identity inputs (Number/URL). 
- evidence := func(cond string) string { - s := cond - if in.Number > 0 { - s += fmt.Sprintf(" #%d", in.Number) - } - if in.URL != "" { - s += " " + in.URL + return detecting(in, ev) } - return s - } - prState := domain.PROpen - if in.Draft { - prState = domain.PRDraft - } - base := func(status domain.SessionStatus, cond string, prReason domain.PRReason, ss domain.SessionState, sr domain.SessionReason) LifecycleDecision { return LifecycleDecision{ - Status: status, - Evidence: evidence(cond), - SessionState: ss, - SessionReason: sr, - PRState: prState, - PRReason: prReason, + Evidence: fmt.Sprintf("alive runtime=alive process=%s", in.Process), + SessionState: domain.SessionWorking, + IsAlive: true, } } - switch { - case in.CIFailing: - return base(domain.StatusCIFailed, "ci_failing", domain.PRReasonCIFailing, domain.SessionWorking, domain.ReasonFixingCI) - case in.Draft: - return base(domain.StatusDraft, "draft", domain.PRReasonInProgress, domain.SessionWorking, domain.ReasonPRCreated) - case in.ChangesRequested: - return base(domain.StatusChangesRequested, "changes_requested", domain.PRReasonChangesRequested, domain.SessionWorking, domain.ReasonResolvingReviewComments) - case in.BotComments: - return base(domain.StatusChangesRequested, "bot_comments", domain.PRReasonBotComments, domain.SessionWorking, domain.ReasonResolvingReviewComments) - case in.MergeConflicts: - return base(domain.StatusPROpen, "merge_conflicts", domain.PRReasonMergeConflicts, domain.SessionWorking, domain.ReasonPRCreated) - case in.Mergeable: - // Mergeability is the authoritative merge gate, so it already folds in - // "approved if review is required". Checking it before Approved means a - // PR on a no-required-review repo (mergeable, not formally approved) is - // still surfaced as ready-to-merge instead of falling through to PR_OPEN. 
- return base(domain.StatusMergeable, "merge_ready", domain.PRReasonMergeReady, domain.SessionIdle, domain.ReasonAwaitingExternalReview) - case in.Approved: - return base(domain.StatusApproved, "approved", domain.PRReasonApproved, domain.SessionIdle, domain.ReasonAwaitingExternalReview) - case in.ReviewPending: - return base(domain.StatusReviewPending, "review_pending", domain.PRReasonReviewPending, domain.SessionIdle, domain.ReasonAwaitingExternalReview) - case in.IdleBeyond: - // A PR open but quiet past the stuck threshold needs a human nudge. - return base(domain.StatusStuck, "idle_beyond", domain.PRReasonInProgress, domain.SessionStuck, domain.ReasonAwaitingUserInput) - default: - return base(domain.StatusPROpen, "pr_open", domain.PRReasonInProgress, domain.SessionWorking, domain.ReasonPRCreated) + // Runtime is gone. Death is only concluded when the process is *also* + // confirmed dead AND nothing has been heard from the agent recently; any + // other shape is ambiguous and quarantines. + if in.Process == ProcessAlive || in.RecentActivity { + ev := fmt.Sprintf("disagree runtime=down process=%s recentActivity=%t", in.Process, in.RecentActivity) + return detecting(in, ev) } -} - -// ResolveTerminalPRStateDecision handles merged/closed PRs. A merge parks the -// session idle awaiting a human's post-merge decision; a close drops to idle. -// none/open are not terminal — callers should route those to the open-PR or -// probe deciders — but the function stays total for safety. 
-func ResolveTerminalPRStateDecision(pr domain.PRState) LifecycleDecision { - switch pr { - case domain.PRMerged: + if in.Process == ProcessDead { return LifecycleDecision{ - Status: domain.StatusMerged, - Evidence: "pr merged", - SessionState: domain.SessionIdle, - SessionReason: domain.ReasonMergedWaitingDecision, - PRState: domain.PRMerged, - PRReason: domain.PRReasonMerged, - } - case domain.PRClosed: - return LifecycleDecision{ - Status: domain.StatusIdle, - Evidence: "pr closed unmerged", - SessionState: domain.SessionIdle, - SessionReason: domain.ReasonAwaitingUserInput, - PRState: domain.PRClosed, - PRReason: domain.PRReasonClosedUnmerged, - } - default: - return LifecycleDecision{ - Status: domain.StatusWorking, - Evidence: fmt.Sprintf("non-terminal pr state=%s", pr), - SessionState: domain.SessionWorking, - SessionReason: domain.ReasonTaskInProgress, - PRState: pr, + Evidence: "dead runtime=down process=dead recentActivity=false", + SessionState: domain.SessionTerminated, + TerminationReason: domain.TermRuntimeLost, + IsAlive: false, } } + // Process indeterminate: cannot confirm death, so quarantine. + ev := fmt.Sprintf("runtime_lost runtime=down process=%s recentActivity=false", in.Process) + return detecting(in, ev) } // CreateDetectingDecision advances or escalates the anti-flap quarantine. @@ -189,9 +92,10 @@ func ResolveTerminalPRStateDecision(pr domain.PRState) LifecycleDecision { // The attempt counter climbs only while the (timestamp-stripped) evidence hash // is unchanged and resets the moment the evidence moves; StartedAt is preserved // across the whole detecting episode so the duration cap is a real wall-clock -// safety net even when the evidence keeps flapping. Escalation to stuck fires -// at DetectingMaxAttempts consecutive unchanged ticks OR DetectingMaxDuration -// elapsed since first entering detecting. +// safety net even when the evidence keeps flapping. 
Escalation to stuck fires at +// DetectingMaxAttempts consecutive unchanged ticks OR DetectingMaxDuration +// elapsed since first entering detecting. Detecting/stuck leave IsAlive true: +// the probe was ambiguous, so the session is not confirmed dead. func CreateDetectingDecision(in DetectingInput) LifecycleDecision { hash := HashEvidence(in.Evidence) @@ -207,19 +111,17 @@ func CreateDetectingDecision(in DetectingInput) LifecycleDecision { escalate := attempts >= DetectingMaxAttempts || !in.Now.Before(startedAt.Add(DetectingMaxDuration)) if escalate { return LifecycleDecision{ - Status: domain.StatusStuck, - Evidence: in.Evidence, - SessionState: domain.SessionStuck, - SessionReason: in.ProposedReason, + Evidence: in.Evidence, + SessionState: domain.SessionStuck, + IsAlive: true, } } return LifecycleDecision{ - Status: domain.StatusDetecting, - Evidence: in.Evidence, - Detecting: &domain.DetectingState{Attempts: attempts, StartedAt: startedAt, EvidenceHash: hash}, - SessionState: domain.SessionDetecting, - SessionReason: in.ProposedReason, + Evidence: in.Evidence, + Detecting: &domain.DetectingState{Attempts: attempts, StartedAt: startedAt, EvidenceHash: hash}, + SessionState: domain.SessionDetecting, + IsAlive: true, } } @@ -237,38 +139,20 @@ func HashEvidence(evidence string) string { } // timestampPatterns is the list of regexes HashEvidence applies (in order) to -// delete the time-varying parts of an evidence string before hashing, so the -// same ambiguous signal restamped with a new clock value hashes equal and the -// detecting counter keeps climbing instead of resetting every tick. -// -// Order matters: the full datetime form is removed first so its embedded -// HH:MM:SS isn't half-eaten by the bare time-of-day pattern that follows. -// -// 1. full ISO-8601 / RFC3339 datetime — date, a T or space separator, -// HH:MM:SS, optional fractional seconds, optional Z or ±HH:MM offset. -// e.g. "2026-05-26T12:00:00Z", "2026-05-26 12:00:00.218+05:30" -// 2. 
a bare time-of-day, e.g. "12:00:00" or "12:00:00.218" -// 3. a bare unix epoch — any 10-13 digit run (seconds or millis), e.g. -// "1716724800". This is broad enough to also clobber a same-width numeric -// ID if one ever appears in evidence; evidence is decider-authored, so keep -// IDs out of evidence strings to preserve hash fidelity. +// delete the time-varying parts of an evidence string before hashing. var timestampPatterns = []*regexp.Regexp{ regexp.MustCompile(`\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?`), regexp.MustCompile(`\d{2}:\d{2}:\d{2}(?:\.\d+)?`), regexp.MustCompile(`\b\d{10,13}\b`), } -// detecting adapts a probe verdict into the shared anti-flap path. It packages -// the proposed reason + evidence (plus the prior counter from the same probe -// input) into a DetectingInput and defers to CreateDetectingDecision, so every +// detecting packages a probe verdict into the shared anti-flap path, so every // probe-driven ambiguity is counted and escalated by the identical quarantine // logic instead of each probe branch re-implementing the counter. 
-func detecting(in ProbeInput, reason domain.SessionReason, evidence string) LifecycleDecision { +func detecting(in ProbeInput, evidence string) LifecycleDecision { return CreateDetectingDecision(DetectingInput{ - Evidence: evidence, - ProposedState: domain.SessionDetecting, - ProposedReason: reason, - Prior: in.Prior, - Now: in.Now, + Evidence: evidence, + Prior: in.Prior, + Now: in.Now, }) } diff --git a/backend/internal/domain/decide/decide_test.go b/backend/internal/domain/decide/decide_test.go index 1a815959..bc25af55 100644 --- a/backend/internal/domain/decide/decide_test.go +++ b/backend/internal/domain/decide/decide_test.go @@ -7,570 +7,158 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/domain" ) -var t0 = time.Date(2026, 5, 26, 12, 0, 0, 0, time.UTC) +var t0 = time.Date(2026, 5, 31, 12, 0, 0, 0, time.UTC) func TestResolveProbeDecision(t *testing.T) { tests := []struct { - name string - in ProbeInput - wantStatus domain.SessionStatus - wantState domain.SessionState - wantReason domain.SessionReason - wantDetect bool // expect non-nil Detecting memory - wantTermNil bool // expect terminal (Detecting must be nil) - }{ - { - name: "kill requested short-circuits to terminal killed", - in: ProbeInput{KillRequested: true, Runtime: domain.RuntimeAlive, Process: ProcessAlive, Now: t0}, - wantStatus: domain.StatusKilled, - wantState: domain.SessionTerminated, - wantReason: domain.ReasonManuallyKilled, - wantTermNil: true, - }, - { - name: "kill requested wins even over a dead+dead probe", - in: ProbeInput{KillRequested: true, Runtime: domain.RuntimeMissing, Process: ProcessDead, Now: t0}, - wantStatus: domain.StatusKilled, - wantState: domain.SessionTerminated, - wantReason: domain.ReasonManuallyKilled, - wantTermNil: true, - }, - { - name: "runtime probe failed routes to detecting, never death", - in: ProbeInput{Runtime: domain.RuntimeMissing, RuntimeFailed: true, Process: ProcessDead, Now: t0}, - wantStatus: domain.StatusDetecting, - 
wantState: domain.SessionDetecting, - wantReason: domain.ReasonProbeFailure, - wantDetect: true, - }, - { - name: "process probe failed routes to detecting", - in: ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessDead, ProcessFailed: true, Now: t0}, - wantStatus: domain.StatusDetecting, - wantState: domain.SessionDetecting, - wantReason: domain.ReasonProbeFailure, - wantDetect: true, - }, - { - name: "runtime state probe_failed routes to detecting", - in: ProbeInput{Runtime: domain.RuntimeProbeFailed, Process: ProcessIndeterminate, Now: t0}, - wantStatus: domain.StatusDetecting, - wantState: domain.SessionDetecting, - wantReason: domain.ReasonProbeFailure, - wantDetect: true, - }, - { - name: "runtime alive + process alive is working", - in: ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessAlive, Now: t0}, - wantStatus: domain.StatusWorking, - wantState: domain.SessionWorking, - wantReason: domain.ReasonTaskInProgress, - }, - { - name: "runtime alive + process indeterminate leans alive", - in: ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessIndeterminate, Now: t0}, - wantStatus: domain.StatusWorking, - wantState: domain.SessionWorking, - wantReason: domain.ReasonTaskInProgress, - }, - { - name: "runtime alive + process dead disagree -> detecting (agent_process_exited)", - in: ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessDead, Now: t0}, - wantStatus: domain.StatusDetecting, - wantState: domain.SessionDetecting, - wantReason: domain.ReasonAgentProcessExited, - wantDetect: true, - }, - { - name: "runtime dead + process alive disagree -> detecting (runtime_lost)", - in: ProbeInput{Runtime: domain.RuntimeExited, Process: ProcessAlive, Now: t0}, - wantStatus: domain.StatusDetecting, - wantState: domain.SessionDetecting, - wantReason: domain.ReasonRuntimeLost, - wantDetect: true, - }, - { - name: "runtime dead + recent activity disagree -> detecting (runtime_lost)", - in: ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessDead, 
RecentActivity: true, Now: t0}, - wantStatus: domain.StatusDetecting, - wantState: domain.SessionDetecting, - wantReason: domain.ReasonRuntimeLost, - wantDetect: true, - }, - { - name: "runtime dead + process indeterminate cannot confirm -> detecting", - in: ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessIndeterminate, Now: t0}, - wantStatus: domain.StatusDetecting, - wantState: domain.SessionDetecting, - wantReason: domain.ReasonRuntimeLost, - wantDetect: true, - }, - { - name: "runtime exited + process dead + no activity -> killed terminal", - in: ProbeInput{Runtime: domain.RuntimeExited, Process: ProcessDead, Now: t0}, - wantStatus: domain.StatusKilled, - wantState: domain.SessionTerminated, - wantReason: domain.ReasonRuntimeLost, - wantTermNil: true, - }, - { - name: "runtime missing + process dead + no activity -> killed terminal", - in: ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessDead, Now: t0}, - wantStatus: domain.StatusKilled, - wantState: domain.SessionTerminated, - wantReason: domain.ReasonRuntimeLost, - wantTermNil: true, - }, - { - name: "runtime unknown is ambiguous -> detecting (runtime_lost)", - in: ProbeInput{Runtime: domain.RuntimeUnknown, Process: ProcessDead, Now: t0}, - wantStatus: domain.StatusDetecting, - wantState: domain.SessionDetecting, - wantReason: domain.ReasonRuntimeLost, - wantDetect: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := ResolveProbeDecision(tt.in) - if got.Status != tt.wantStatus { - t.Errorf("Status = %q, want %q", got.Status, tt.wantStatus) - } - if got.SessionState != tt.wantState { - t.Errorf("SessionState = %q, want %q", got.SessionState, tt.wantState) - } - if got.SessionReason != tt.wantReason { - t.Errorf("SessionReason = %q, want %q", got.SessionReason, tt.wantReason) - } - if tt.wantDetect && got.Detecting == nil { - t.Errorf("expected non-nil Detecting memory, got nil") - } - if tt.wantTermNil && got.Detecting != nil { - t.Errorf("terminal 
decision must carry nil Detecting, got %+v", got.Detecting) - } - }) - } -} - -func TestResolveOpenPRDecision(t *testing.T) { - tests := []struct { - name string - in OpenPRInput - wantStatus domain.SessionStatus - wantPR domain.PRReason - wantPRState domain.PRState - wantState domain.SessionState + name string + in ProbeInput + wantState domain.SessionState + wantReason domain.TerminationReason + wantAlive bool + wantDetect bool // expect a detecting verdict (first attempt -> SessionDetecting) }{ { - name: "ci failing dominates everything", - in: OpenPRInput{CIFailing: true, ChangesRequested: true, Approved: true, Mergeable: true}, - wantStatus: domain.StatusCIFailed, - wantPR: domain.PRReasonCIFailing, - wantState: domain.SessionWorking, - }, - { - name: "draft with failing CI maps to ci_failed", - in: OpenPRInput{Draft: true, CIFailing: true, ChangesRequested: true, Approved: true, Mergeable: true}, - wantStatus: domain.StatusCIFailed, - wantPR: domain.PRReasonCIFailing, - wantPRState: domain.PRDraft, - wantState: domain.SessionWorking, - }, - { - name: "draft ignores review and merge states", - in: OpenPRInput{Draft: true, ChangesRequested: true, Approved: true, Mergeable: true, ReviewPending: true, IdleBeyond: true}, - wantStatus: domain.StatusDraft, - wantPR: domain.PRReasonInProgress, - wantPRState: domain.PRDraft, - wantState: domain.SessionWorking, - }, - { - name: "changes requested before approval states", - in: OpenPRInput{ChangesRequested: true, Approved: true, Mergeable: true}, - wantStatus: domain.StatusChangesRequested, - wantPR: domain.PRReasonChangesRequested, - wantState: domain.SessionWorking, - }, - { - name: "bot comments get distinct PR reason", - in: OpenPRInput{BotComments: true, Approved: true, Mergeable: true}, - wantStatus: domain.StatusChangesRequested, - wantPR: domain.PRReasonBotComments, - wantState: domain.SessionWorking, - }, - { - name: "merge conflicts get distinct PR reason", - in: OpenPRInput{MergeConflicts: true, Approved: 
true}, - wantStatus: domain.StatusPROpen, - wantPR: domain.PRReasonMergeConflicts, - wantState: domain.SessionWorking, - }, - { - name: "approved + mergeable -> mergeable", - in: OpenPRInput{Approved: true, Mergeable: true}, - wantStatus: domain.StatusMergeable, - wantPR: domain.PRReasonMergeReady, - wantState: domain.SessionIdle, - }, - { - name: "mergeable without formal approval (no required review) -> mergeable", - in: OpenPRInput{Mergeable: true}, - wantStatus: domain.StatusMergeable, - wantPR: domain.PRReasonMergeReady, - wantState: domain.SessionIdle, - }, - { - name: "approved but not mergeable -> approved", - in: OpenPRInput{Approved: true}, - wantStatus: domain.StatusApproved, - wantPR: domain.PRReasonApproved, - wantState: domain.SessionIdle, - }, - { - name: "review pending", - in: OpenPRInput{ReviewPending: true}, - wantStatus: domain.StatusReviewPending, - wantPR: domain.PRReasonReviewPending, - wantState: domain.SessionIdle, + name: "kill requested -> terminated with reason", + in: ProbeInput{KillRequested: true, KillReason: domain.TermManuallyKilled, Now: t0}, + wantState: domain.SessionTerminated, wantReason: domain.TermManuallyKilled, wantAlive: false, }, { - name: "idle beyond threshold -> stuck", - in: OpenPRInput{IdleBeyond: true}, - wantStatus: domain.StatusStuck, - wantPR: domain.PRReasonInProgress, - wantState: domain.SessionStuck, + name: "kill requested without reason defaults to manually_killed", + in: ProbeInput{KillRequested: true, Now: t0}, + wantState: domain.SessionTerminated, wantReason: domain.TermManuallyKilled, wantAlive: false, }, { - name: "review pending wins over idle-beyond", - in: OpenPRInput{ReviewPending: true, IdleBeyond: true}, - wantStatus: domain.StatusReviewPending, - wantPR: domain.PRReasonReviewPending, - wantState: domain.SessionIdle, + name: "runtime probe failed -> detecting (not death)", + in: ProbeInput{RuntimeFailed: true, Now: t0}, + wantState: domain.SessionDetecting, wantAlive: true, wantDetect: true, }, { 
- name: "nothing set -> plain open", - in: OpenPRInput{}, - wantStatus: domain.StatusPROpen, - wantPR: domain.PRReasonInProgress, - wantState: domain.SessionWorking, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := ResolveOpenPRDecision(tt.in) - if got.Status != tt.wantStatus { - t.Errorf("Status = %q, want %q", got.Status, tt.wantStatus) - } - if got.PRReason != tt.wantPR { - t.Errorf("PRReason = %q, want %q", got.PRReason, tt.wantPR) - } - wantPRState := tt.wantPRState - if wantPRState == "" { - wantPRState = domain.PROpen - } - if got.PRState != wantPRState { - t.Errorf("PRState = %q, want %q", got.PRState, wantPRState) - } - if got.SessionState != tt.wantState { - t.Errorf("SessionState = %q, want %q", got.SessionState, tt.wantState) - } - }) - } -} - -func TestResolveOpenPRDecisionEvidence(t *testing.T) { - tests := []struct { - name string - in OpenPRInput - want string - }{ - { - name: "condition with PR number and URL", - in: OpenPRInput{CIFailing: true, Number: 123, URL: "https://example.com/pr/123"}, - want: "ci_failing #123 https://example.com/pr/123", + name: "process probe failed -> detecting", + in: ProbeInput{RuntimeAlive: true, ProcessFailed: true, Now: t0}, + wantState: domain.SessionDetecting, wantAlive: true, wantDetect: true, }, { - name: "condition with number only", - in: OpenPRInput{Approved: true, Mergeable: true, Number: 7}, - want: "merge_ready #7", + name: "runtime alive + process alive -> working", + in: ProbeInput{RuntimeAlive: true, Process: ProcessAlive, Now: t0}, + wantState: domain.SessionWorking, wantAlive: true, }, { - name: "no identity falls back to the bare condition", - in: OpenPRInput{}, - want: "pr_open", + name: "runtime alive + process indeterminate -> working", + in: ProbeInput{RuntimeAlive: true, Process: ProcessIndeterminate, Now: t0}, + wantState: domain.SessionWorking, wantAlive: true, }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := 
ResolveOpenPRDecision(tt.in).Evidence; got != tt.want { - t.Errorf("Evidence = %q, want %q", got, tt.want) - } - }) - } -} - -func TestDecidersDeriveConsistently(t *testing.T) { - // Every decision a decider produces must be self-consistent: the display - // Status it reports must equal what DeriveLegacyStatus produces from the - // canonical (session, pr) sub-states it emits. This locks the deciders and - // the display-derivation against drifting apart. - // - // The ResolveTerminalPRStateDecision none/open default is intentionally - // excluded — it is a documented no-op for misuse, not a real verdict. - var decisions []LifecycleDecision - - for _, in := range []OpenPRInput{ - {Draft: true, CIFailing: true}, - {Draft: true, ChangesRequested: true, Approved: true, Mergeable: true, ReviewPending: true, IdleBeyond: true}, - {CIFailing: true}, - {ChangesRequested: true}, - {BotComments: true}, - {MergeConflicts: true}, - {Approved: true, Mergeable: true}, - {Mergeable: true}, - {Approved: true}, - {ReviewPending: true}, - {IdleBeyond: true}, - {}, - } { - decisions = append(decisions, ResolveOpenPRDecision(in)) - } - - decisions = append(decisions, - ResolveTerminalPRStateDecision(domain.PRMerged), - ResolveTerminalPRStateDecision(domain.PRClosed), - ) - - for _, in := range []ProbeInput{ - {KillRequested: true, Now: t0}, - {Runtime: domain.RuntimeAlive, Process: ProcessAlive, Now: t0}, - {Runtime: domain.RuntimeMissing, Process: ProcessIndeterminate, Now: t0}, - {Runtime: domain.RuntimeExited, Process: ProcessDead, Now: t0}, - } { - decisions = append(decisions, ResolveProbeDecision(in)) - } - - for _, d := range decisions { - l := domain.CanonicalSessionLifecycle{ - Session: domain.SessionSubstate{State: d.SessionState, Reason: d.SessionReason}, - PR: domain.PRSubstate{State: d.PRState, Reason: d.PRReason}, - } - if got := domain.DeriveLegacyStatus(l); got != d.Status { - t.Errorf("decision %+v: Status=%q but DeriveLegacyStatus=%q", d, d.Status, got) - } - } -} - 
-func TestResolveTerminalPRStateDecision(t *testing.T) { - tests := []struct { - name string - pr domain.PRState - wantStatus domain.SessionStatus - wantState domain.SessionState - wantReason domain.SessionReason - wantPR domain.PRReason - }{ { - name: "merged parks idle awaiting decision", - pr: domain.PRMerged, - wantStatus: domain.StatusMerged, - wantState: domain.SessionIdle, - wantReason: domain.ReasonMergedWaitingDecision, - wantPR: domain.PRReasonMerged, + name: "runtime alive + process dead -> detecting (disagree)", + in: ProbeInput{RuntimeAlive: true, Process: ProcessDead, Now: t0}, + wantState: domain.SessionDetecting, wantAlive: true, wantDetect: true, }, { - name: "closed drops to idle", - pr: domain.PRClosed, - wantStatus: domain.StatusIdle, - wantState: domain.SessionIdle, - wantReason: domain.ReasonAwaitingUserInput, - wantPR: domain.PRReasonClosedUnmerged, + name: "runtime down + process dead + no activity -> terminated runtime_lost", + in: ProbeInput{RuntimeAlive: false, Process: ProcessDead, RecentActivity: false, Now: t0}, + wantState: domain.SessionTerminated, wantReason: domain.TermRuntimeLost, wantAlive: false, }, { - name: "non-terminal none is a working no-op", - pr: domain.PRNone, - wantStatus: domain.StatusWorking, - wantState: domain.SessionWorking, - wantReason: domain.ReasonTaskInProgress, + name: "runtime down + process alive -> detecting (disagree)", + in: ProbeInput{RuntimeAlive: false, Process: ProcessAlive, Now: t0}, + wantState: domain.SessionDetecting, wantAlive: true, wantDetect: true, }, { - name: "non-terminal open is a working no-op", - pr: domain.PROpen, - wantStatus: domain.StatusWorking, - wantState: domain.SessionWorking, - wantReason: domain.ReasonTaskInProgress, + name: "runtime down + process dead + recent activity -> detecting", + in: ProbeInput{RuntimeAlive: false, Process: ProcessDead, RecentActivity: true, Now: t0}, + wantState: domain.SessionDetecting, wantAlive: true, wantDetect: true, }, { - name: "non-terminal 
draft is a working no-op", - pr: domain.PRDraft, - wantStatus: domain.StatusWorking, - wantState: domain.SessionWorking, - wantReason: domain.ReasonTaskInProgress, + name: "runtime down + process indeterminate -> detecting", + in: ProbeInput{RuntimeAlive: false, Process: ProcessIndeterminate, Now: t0}, + wantState: domain.SessionDetecting, wantAlive: true, wantDetect: true, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got := ResolveTerminalPRStateDecision(tt.pr) - if got.Status != tt.wantStatus { - t.Errorf("Status = %q, want %q", got.Status, tt.wantStatus) + d := ResolveProbeDecision(tt.in) + if d.SessionState != tt.wantState { + t.Errorf("state = %q, want %q", d.SessionState, tt.wantState) } - if got.SessionState != tt.wantState { - t.Errorf("SessionState = %q, want %q", got.SessionState, tt.wantState) + if d.TerminationReason != tt.wantReason { + t.Errorf("reason = %q, want %q", d.TerminationReason, tt.wantReason) } - if got.SessionReason != tt.wantReason { - t.Errorf("SessionReason = %q, want %q", got.SessionReason, tt.wantReason) + if d.IsAlive != tt.wantAlive { + t.Errorf("isAlive = %v, want %v", d.IsAlive, tt.wantAlive) } - if tt.wantPR != "" && got.PRReason != tt.wantPR { - t.Errorf("PRReason = %q, want %q", got.PRReason, tt.wantPR) + if tt.wantDetect && d.Detecting == nil { + t.Errorf("expected detecting memory, got nil") } }) } } func TestCreateDetectingDecision(t *testing.T) { - const ev = "runtime_lost runtime=missing process=indeterminate" - hash := HashEvidence(ev) - - t.Run("first entry records attempt 1 and stays detecting", func(t *testing.T) { - got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Now: t0}) - if got.Status != domain.StatusDetecting || got.SessionState != domain.SessionDetecting { - t.Fatalf("want detecting, got Status=%q State=%q", got.Status, got.SessionState) - } - if got.Detecting == nil || got.Detecting.Attempts != 1 { - t.Fatalf("want attempts=1, got 
%+v", got.Detecting) - } - if !got.Detecting.StartedAt.Equal(t0) { - t.Errorf("StartedAt = %v, want %v", got.Detecting.StartedAt, t0) - } - if got.Detecting.EvidenceHash != hash { - t.Errorf("EvidenceHash = %q, want %q", got.Detecting.EvidenceHash, hash) - } - if got.SessionReason != domain.ReasonRuntimeLost { - t.Errorf("SessionReason = %q, want %q", got.SessionReason, domain.ReasonRuntimeLost) + t.Run("first entry sets attempts 1", func(t *testing.T) { + d := CreateDetectingDecision(DetectingInput{Evidence: "runtime down", Now: t0}) + if d.SessionState != domain.SessionDetecting || d.Detecting == nil || d.Detecting.Attempts != 1 { + t.Fatalf("got %+v", d) } }) - - t.Run("unchanged evidence climbs the counter", func(t *testing.T) { - prior := &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: hash} - got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(time.Minute)}) - if got.Detecting == nil || got.Detecting.Attempts != 2 { - t.Fatalf("want attempts=2, got %+v", got.Detecting) - } - if !got.Detecting.StartedAt.Equal(t0) { - t.Errorf("StartedAt must be preserved, got %v", got.Detecting.StartedAt) + t.Run("same evidence climbs the counter", func(t *testing.T) { + prior := &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: HashEvidence("runtime down")} + d := CreateDetectingDecision(DetectingInput{Evidence: "runtime down", Prior: prior, Now: t0.Add(time.Second)}) + if d.Detecting == nil || d.Detecting.Attempts != 2 { + t.Fatalf("attempts = %+v, want 2", d.Detecting) } }) - - t.Run("escalates to stuck on the third unchanged tick", func(t *testing.T) { - prior := &domain.DetectingState{Attempts: DetectingMaxAttempts - 1, StartedAt: t0, EvidenceHash: hash} - got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(time.Minute)}) - if got.Status != domain.StatusStuck || got.SessionState != 
domain.SessionStuck { - t.Fatalf("want stuck, got Status=%q State=%q", got.Status, got.SessionState) - } - if got.Detecting != nil { - t.Errorf("stuck decision must drop detecting memory, got %+v", got.Detecting) - } - if got.SessionReason != domain.ReasonRuntimeLost { - t.Errorf("escalation should carry the why, got %q", got.SessionReason) - } - }) - - t.Run("changing evidence resets the counter but preserves StartedAt", func(t *testing.T) { - prior := &domain.DetectingState{Attempts: DetectingMaxAttempts - 1, StartedAt: t0, EvidenceHash: hash} - got := CreateDetectingDecision(DetectingInput{Evidence: "different evidence", ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(time.Minute)}) - if got.Status != domain.StatusDetecting { - t.Fatalf("changed evidence should stay detecting, got %q", got.Status) - } - if got.Detecting == nil || got.Detecting.Attempts != 1 { - t.Fatalf("counter should reset to 1, got %+v", got.Detecting) - } - if !got.Detecting.StartedAt.Equal(t0) { - t.Errorf("StartedAt must survive an evidence change, got %v", got.Detecting.StartedAt) + t.Run("changed evidence resets the counter", func(t *testing.T) { + prior := &domain.DetectingState{Attempts: 2, StartedAt: t0, EvidenceHash: HashEvidence("runtime down")} + d := CreateDetectingDecision(DetectingInput{Evidence: "process dead", Prior: prior, Now: t0.Add(time.Second)}) + if d.Detecting == nil || d.Detecting.Attempts != 1 { + t.Fatalf("attempts = %+v, want 1 (evidence changed)", d.Detecting) } }) - - t.Run("duration cap escalates even below the attempt count", func(t *testing.T) { - prior := &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: hash} - got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(DetectingMaxDuration)}) - if got.Status != domain.StatusStuck { - t.Fatalf("want stuck from duration cap, got %q", got.Status) + t.Run("escalates to stuck at the attempt cap", func(t 
*testing.T) { + prior := &domain.DetectingState{Attempts: DetectingMaxAttempts - 1, StartedAt: t0, EvidenceHash: HashEvidence("runtime down")} + d := CreateDetectingDecision(DetectingInput{Evidence: "runtime down", Prior: prior, Now: t0.Add(time.Second)}) + if d.SessionState != domain.SessionStuck { + t.Fatalf("state = %q, want stuck", d.SessionState) } }) - - t.Run("duration cap fires even when evidence keeps flapping", func(t *testing.T) { - prior := &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: hash} - got := CreateDetectingDecision(DetectingInput{Evidence: "ever-changing", ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(DetectingMaxDuration + time.Minute)}) - if got.Status != domain.StatusStuck { - t.Fatalf("duration cap must override a reset counter, got %q", got.Status) + t.Run("escalates to stuck past the duration cap", func(t *testing.T) { + prior := &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: HashEvidence("runtime down")} + d := CreateDetectingDecision(DetectingInput{Evidence: "runtime down", Prior: prior, Now: t0.Add(DetectingMaxDuration + time.Second)}) + if d.SessionState != domain.SessionStuck { + t.Fatalf("state = %q, want stuck (duration cap)", d.SessionState) } }) } func TestProbeDetectingEscalationFlow(t *testing.T) { - // An unchanging ambiguous probe should escalate to stuck after exactly - // DetectingMaxAttempts ticks. 
- in := ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessIndeterminate, Now: t0} - d := ResolveProbeDecision(in) + in := ProbeInput{RuntimeAlive: false, Process: ProcessIndeterminate, Now: t0} + var prior *domain.DetectingState for i := 1; i < DetectingMaxAttempts; i++ { - if d.Status != domain.StatusDetecting { - t.Fatalf("tick %d: expected detecting, got %q", i, d.Status) - } - in.Prior = d.Detecting + in.Prior = prior in.Now = t0.Add(time.Duration(i) * time.Second) - d = ResolveProbeDecision(in) + d := ResolveProbeDecision(in) + if d.SessionState != domain.SessionDetecting { + t.Fatalf("attempt %d: state = %q, want detecting", i, d.SessionState) + } + prior = d.Detecting } - if d.Status != domain.StatusStuck { - t.Fatalf("expected escalation to stuck after %d ticks, got %q", DetectingMaxAttempts, d.Status) + in.Prior = prior + in.Now = t0.Add(time.Hour) + if d := ResolveProbeDecision(in); d.SessionState != domain.SessionStuck { + t.Fatalf("final attempt: state = %q, want stuck", d.SessionState) } } func TestHashEvidence(t *testing.T) { - t.Run("identical strings hash identically", func(t *testing.T) { - if HashEvidence("same input") != HashEvidence("same input") { - t.Error("identical evidence must hash equal") - } - }) - - t.Run("different evidence hashes differently", func(t *testing.T) { - if HashEvidence("runtime_lost") == HashEvidence("agent_process_exited") { - t.Error("distinct evidence must hash differently") - } - }) - - t.Run("only the timestamp differs -> equal hash", func(t *testing.T) { - a := "probe failed at 2026-05-26T12:00:00Z runtime=missing" - b := "probe failed at 2026-05-26T12:05:43.218Z runtime=missing" - if HashEvidence(a) != HashEvidence(b) { - t.Errorf("restamped evidence should hash equal:\n a=%q\n b=%q", a, b) - } - }) - - t.Run("bare time-of-day stripped", func(t *testing.T) { - if HashEvidence("idle since 12:00:00") != HashEvidence("idle since 13:30:59") { - t.Error("time-of-day differences should be stripped") - } - }) - - 
t.Run("unix epoch stripped", func(t *testing.T) { - if HashEvidence("last seen 1716724800") != HashEvidence("last seen 1716728400") { - t.Error("epoch differences should be stripped") - } - }) - - t.Run("a real content change still changes the hash", func(t *testing.T) { - a := "probe at 2026-05-26T12:00:00Z runtime=missing" - b := "probe at 2026-05-26T12:00:00Z runtime=alive" - if HashEvidence(a) == HashEvidence(b) { - t.Error("non-timestamp content change must change the hash") - } - }) - - t.Run("whitespace differences are normalised", func(t *testing.T) { - if HashEvidence("runtime=missing process=dead") != HashEvidence("runtime=missing process=dead") { - t.Error("collapsed whitespace should hash equal") - } - }) + // timestamp-only differences hash equal; a real change differs. + a := HashEvidence("runtime down at 2026-05-31T12:00:00Z") + b := HashEvidence("runtime down at 2026-05-31T13:30:45Z") + if a != b { + t.Errorf("restamped evidence should hash equal") + } + c := HashEvidence("process dead at 2026-05-31T12:00:00Z") + if a == c { + t.Errorf("different evidence should hash differently") + } } diff --git a/backend/internal/domain/decide/types.go b/backend/internal/domain/decide/types.go index 1666fae7..832fab6f 100644 --- a/backend/internal/domain/decide/types.go +++ b/backend/internal/domain/decide/types.go @@ -6,39 +6,34 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/domain" ) -// LifecycleDecision is the output of every decider: the derived display status -// plus the canonical sub-state values to persist, the human-readable evidence, -// and the (possibly updated) detecting memory. +// LifecycleDecision is the output of a decider: the canonical session sub-state +// to persist (state, the liveness bool, and — only for a terminal state — the +// termination reason), the human-readable evidence, and the (possibly updated) +// detecting memory. 
The display status is NOT here — it is derived on read by +// domain.DeriveStatus from the persisted lifecycle plus the pr table. // -// Zero-value sub-state fields mean "this decider does not address that -// sub-state — leave it unchanged", NOT "set it to the empty value". SessionState -// is always populated, but the probe/detecting/kill paths legitimately leave -// PRState/PRReason empty: a liveness verdict knows nothing about the PR. When -// the LCM folds a decision into the next full canonical row it must therefore -// leave empty PRState/PRReason unchanged rather than writing them through — -// writing PRNone on a routine probe tick would clobber a live PR. Detecting is -// nil-by-default; the LCM explicitly clears stale detecting memory when a probe -// verdict leaves detecting. +// PR facts are likewise not here: a liveness verdict knows nothing about the PR, +// and PR-driven display/reactions are handled off the pr table, not the session +// state machine. type LifecycleDecision struct { - Status domain.SessionStatus - Evidence string - Detecting *domain.DetectingState - SessionState domain.SessionState - SessionReason domain.SessionReason - PRState domain.PRState - PRReason domain.PRReason + Evidence string + Detecting *domain.DetectingState + SessionState domain.SessionState + TerminationReason domain.TerminationReason // set only when SessionState is terminated + IsAlive bool } -// ProbeInput reconciles runtime + process liveness. A *failed* probe (timeout -// or error) is distinct from a "dead" verdict and must route to detecting, -// never to a death conclusion. KillRequested short-circuits to terminal. +// ProbeInput reconciles runtime + process liveness. A *failed* probe (timeout or +// error) is distinct from a "dead" verdict and must route to detecting, never to +// a death conclusion. KillRequested short-circuits to terminal with KillReason. 
type ProbeInput struct { - Runtime domain.RuntimeState - RuntimeFailed bool + RuntimeAlive bool // the runtime probe reports the backing runtime is up + RuntimeFailed bool // the runtime probe itself failed (timeout/error) — not "dead" Process ProcessLiveness ProcessFailed bool RecentActivity bool KillRequested bool + KillReason domain.TerminationReason // the terminal reason when KillRequested Prior *domain.DetectingState Now time.Time } @@ -52,28 +47,11 @@ const ( ProcessIndeterminate ProcessLiveness = "indeterminate" ) -// OpenPRInput drives the PR pipeline ladder for an open or draft PR. -type OpenPRInput struct { - Draft bool - CIFailing bool - ChangesRequested bool - BotComments bool - MergeConflicts bool - Approved bool - Mergeable bool - ReviewPending bool - IdleBeyond bool // idle past the stuck threshold - Number int - URL string -} - -// DetectingInput feeds the quarantine counter. Evidence is hashed with +// DetectingInput feeds the anti-flap quarantine counter. Evidence is hashed with // timestamps stripped, so "same ambiguous signal" keeps the counter climbing // while any real change resets it. type DetectingInput struct { - Evidence string - ProposedState domain.SessionState - ProposedReason domain.SessionReason - Prior *domain.DetectingState - Now time.Time + Evidence string + Prior *domain.DetectingState + Now time.Time } diff --git a/backend/internal/domain/lifecycle.go b/backend/internal/domain/lifecycle.go index fca87b6b..b5636761 100644 --- a/backend/internal/domain/lifecycle.go +++ b/backend/internal/domain/lifecycle.go @@ -11,30 +11,35 @@ import "time" // Greenfield: we start at 1 and carry no migration/synthesis code. const LifecycleVersion = 1 -// CanonicalSessionLifecycle is the ONLY thing persisted for a session's state. -// The display status is derived from it on read (see DeriveLegacyStatus) and is -// never stored — this prevents canonical truth and display from drifting. 
+// CanonicalSessionLifecycle is the ONLY lifecycle state persisted for a session. +// The display status is derived from it (plus the session's PR facts, which live +// in the separate pr table) on read — see DeriveStatus — and is never stored, so +// canonical truth and display cannot drift. // -// Three orthogonal (state, reason) sub-states describe the session, its PR, and -// its runtime. Activity and Detecting are decider *inputs* that must survive -// between observations (they are read back by the pure decide core), so they -// live in the persisted record too. +// PR facts are deliberately NOT here: a session can own several PRs over its +// life, and PR state is owned by the pr table. The runtime axis is collapsed to +// a single IsAlive boolean. Activity and Detecting are decider *inputs* that +// must survive between observations, so they live in the persisted record. type CanonicalSessionLifecycle struct { // Version is the Go-only schema-shape constant for this record. It is not // persisted and is not part of the CDC payload. Version int - // Revision is the per-write monotonic counter. The storage layer's Upsert - // bumps it when the full row is persisted; the LCM does not. - Revision int `json:"revision"` - Session SessionSubstate `json:"session"` - PR PRSubstate `json:"pr"` - Runtime RuntimeSubstate `json:"runtime"` - - // Activity is the last-known agent activity. It arrives on a different - // cadence (ApplyActivitySignal) than runtime probes (the reaper), so the - // probe decider reads it from here to answer "was there recent activity?". + + Session SessionSubstate `json:"session"` Activity ActivitySubstate `json:"activity"` + // TerminationReason is set only when Session.State is terminated; '' otherwise. + TerminationReason TerminationReason `json:"terminationReason,omitempty"` + + // IsAlive is the single liveness fact: is the runtime/process backing this + // session still up? 
It replaces the old runtime (state, reason) axis — the + // nuance the probe decider needs (failed-probe != dead, anti-flap) lives in + // the decide core's inputs, not in a persisted enum. + IsAlive bool `json:"isAlive"` + + // Harness is the agent harness the session runs (claude-code, codex, ...). + Harness AgentHarness `json:"harness,omitempty"` + // Detecting is the anti-flap quarantine memory. It is non-nil only while // the session is in the detecting state; it carries the attempt counter, // the first-entry time, and a hash of the (timestamp-stripped) evidence so @@ -42,6 +47,18 @@ type CanonicalSessionLifecycle struct { Detecting *DetectingState `json:"detecting,omitempty"` } +// ---- agent harness ---- + +// AgentHarness identifies which agent CLI/runtime a session drives. +type AgentHarness string + +const ( + HarnessClaudeCode AgentHarness = "claude-code" + HarnessCodex AgentHarness = "codex" + HarnessAider AgentHarness = "aider" + HarnessOpenCode AgentHarness = "opencode" +) + // ---- session sub-state ---- type SessionState string @@ -57,99 +74,76 @@ const ( SessionTerminated SessionState = "terminated" ) -type SessionReason string +// TerminationReason is the typed "why" for a terminated session — the only +// state that carries a reason. Empty for every non-terminal state. It decides +// the terminal display status (killed / cleanup / errored). The PR-pipeline +// "why" (fixing CI, awaiting review, …) is NOT here; it is derived on read from +// the pr table, not persisted on the session. 
+type TerminationReason string const ( - ReasonSpawnRequested SessionReason = "spawn_requested" - ReasonAgentAcknowledged SessionReason = "agent_acknowledged" - ReasonTaskInProgress SessionReason = "task_in_progress" - ReasonPRCreated SessionReason = "pr_created" - ReasonFixingCI SessionReason = "fixing_ci" - ReasonResolvingReviewComments SessionReason = "resolving_review_comments" - ReasonAwaitingUserInput SessionReason = "awaiting_user_input" - ReasonAwaitingExternalReview SessionReason = "awaiting_external_review" - ReasonResearchComplete SessionReason = "research_complete" - ReasonMergedWaitingDecision SessionReason = "merged_waiting_decision" - ReasonManuallyKilled SessionReason = "manually_killed" - ReasonPRMerged SessionReason = "pr_merged" - ReasonAutoCleanup SessionReason = "auto_cleanup" - ReasonRuntimeLost SessionReason = "runtime_lost" - ReasonAgentProcessExited SessionReason = "agent_process_exited" - ReasonProbeFailure SessionReason = "probe_failure" - ReasonErrorInProcess SessionReason = "error_in_process" + TermNone TerminationReason = "" + TermManuallyKilled TerminationReason = "manually_killed" + TermRuntimeLost TerminationReason = "runtime_lost" + TermAgentProcessExited TerminationReason = "agent_process_exited" + TermProbeFailure TerminationReason = "probe_failure" + TermErrorInProcess TerminationReason = "error_in_process" + TermAutoCleanup TerminationReason = "auto_cleanup" + TermPRMerged TerminationReason = "pr_merged" ) type SessionSubstate struct { - State SessionState `json:"state"` - Reason SessionReason `json:"reason"` + State SessionState `json:"state"` } -// ---- PR sub-state ---- - -type PRState string - -const ( - PRNone PRState = "none" - PRDraft PRState = "draft" - PROpen PRState = "open" - PRMerged PRState = "merged" - PRClosed PRState = "closed" -) +// ---- PR facts (NOT persisted on the session; sourced from the pr table) ---- + +// PRFacts is the per-session PR snapshot the status/reaction derivation reads +// from the pr 
table. It is the decider input that replaces the old persisted PR +// axis. The zero value (Exists=false) means "no PR", which derivation treats as +// "session has no PR". +type PRFacts struct { + URL string + Number int + Exists bool + Draft bool + Merged bool + Closed bool + CI CIState + Review ReviewDecision + Mergeability Mergeability + BotComments bool + IdleBeyond bool // idle past the stuck threshold +} -type PRReason string +type CIState string const ( - PRReasonNotCreated PRReason = "not_created" - PRReasonInProgress PRReason = "in_progress" - PRReasonCIFailing PRReason = "ci_failing" - PRReasonReviewPending PRReason = "review_pending" - PRReasonChangesRequested PRReason = "changes_requested" - PRReasonBotComments PRReason = "bot_comments" - PRReasonMergeConflicts PRReason = "merge_conflicts" - PRReasonApproved PRReason = "approved" - PRReasonMergeReady PRReason = "merge_ready" - PRReasonMerged PRReason = "merged" - PRReasonClosedUnmerged PRReason = "closed_unmerged" - PRReasonClearedOnRestore PRReason = "cleared_on_restore" + CIUnknown CIState = "unknown" + CIPending CIState = "pending" + CIPassing CIState = "passing" + CIFailing CIState = "failing" ) -type PRSubstate struct { - State PRState `json:"state"` - Reason PRReason `json:"reason"` - Number int `json:"number,omitempty"` - URL string `json:"url,omitempty"` -} - -// ---- runtime sub-state ---- - -type RuntimeState string +type ReviewDecision string const ( - RuntimeUnknown RuntimeState = "unknown" - RuntimeAlive RuntimeState = "alive" - RuntimeExited RuntimeState = "exited" - RuntimeMissing RuntimeState = "missing" - RuntimeProbeFailed RuntimeState = "probe_failed" + ReviewNone ReviewDecision = "none" + ReviewApproved ReviewDecision = "approved" + ReviewChangesRequest ReviewDecision = "changes_requested" + ReviewRequired ReviewDecision = "review_required" ) -type RuntimeReason string +type Mergeability string const ( - RuntimeReasonSpawnIncomplete RuntimeReason = "spawn_incomplete" - 
RuntimeReasonProcessRunning RuntimeReason = "process_running" - RuntimeReasonProcessMissing RuntimeReason = "process_missing" - RuntimeReasonTmuxMissing RuntimeReason = "tmux_missing" - RuntimeReasonManualKillRequested RuntimeReason = "manual_kill_requested" - RuntimeReasonPRMergedCleanup RuntimeReason = "pr_merged_cleanup" - RuntimeReasonAutoCleanup RuntimeReason = "auto_cleanup" - RuntimeReasonProbeError RuntimeReason = "probe_error" + MergeUnknown Mergeability = "unknown" + MergeMergeable Mergeability = "mergeable" + MergeConflicting Mergeability = "conflicting" + MergeBlocked Mergeability = "blocked" + MergeUnstable Mergeability = "unstable" ) -type RuntimeSubstate struct { - State RuntimeState `json:"state"` - Reason RuntimeReason `json:"reason"` -} - // ---- activity sub-state (decider input) ---- type ActivityState string diff --git a/backend/internal/domain/status.go b/backend/internal/domain/status.go index 1cc4404d..0ff5c0fd 100644 --- a/backend/internal/domain/status.go +++ b/backend/internal/domain/status.go @@ -1,7 +1,8 @@ package domain // SessionStatus is the single-word DISPLAY status the dashboard renders. It is -// derived from the canonical lifecycle on read and never persisted. +// derived from the canonical lifecycle (plus the session's PR facts) on read and +// never persisted. type SessionStatus string const ( @@ -26,27 +27,27 @@ const ( StatusTerminated SessionStatus = "terminated" ) -// DeriveLegacyStatus is the ONLY producer of the display status. It must stay a -// pure, total function of the canonical record. +// DeriveStatus is the ONLY producer of the display status. It is a pure, total +// function of the canonical record plus the session's PR facts (read from the pr +// table by the caller, since PR state is no longer persisted on the session). // // Order matters: // 1. Terminal / hard session states (done, terminated, needs_input, stuck, // detecting, not_started) map directly — these OUTRANK PR facts. -// 2. 
Otherwise a merged PR wins. -// 3. Otherwise a draft PR maps to draft, except CI failure still dominates. -// 4. Otherwise an open PR maps by its reason. -// 5. Otherwise fall through to the SOFT session state (idle/working). +// 2. Otherwise, if the session has a PR: a merged PR wins, else the PR pipeline +// ladder (CI failure dominates, then draft/review/merge states). +// 3. Otherwise fall through to the SOFT session state (idle/working). // // So "PR facts dominate session facts" applies only to the soft states: an idle // or working session with an open, CI-failing PR displays as ci_failed — but a -// session that is stuck or needs_input shows that regardless of PR state, since -// it needs a human either way. -func DeriveLegacyStatus(l CanonicalSessionLifecycle) SessionStatus { +// session that is stuck or needs_input shows that regardless, since it needs a +// human either way. +func DeriveStatus(l CanonicalSessionLifecycle, pr PRFacts) SessionStatus { switch l.Session.State { case SessionDone: return StatusDone case SessionTerminated: - return terminatedStatus(l.Session.Reason) + return terminatedStatus(l.TerminationReason) case SessionNeedsInput: return StatusNeedsInput case SessionStuck: @@ -57,16 +58,13 @@ func DeriveLegacyStatus(l CanonicalSessionLifecycle) SessionStatus { return StatusSpawning } - if l.PR.State == PRMerged { - return StatusMerged - } - - if l.PR.State == PRDraft { - return draftPRStatus(l.PR.Reason) - } - - if l.PR.State == PROpen { - return openPRStatus(l.PR.Reason) + if pr.Exists { + if pr.Merged { + return StatusMerged + } + if !pr.Closed { + return prPipelineStatus(pr) + } } if l.Session.State == SessionIdle { @@ -75,37 +73,35 @@ func DeriveLegacyStatus(l CanonicalSessionLifecycle) SessionStatus { return StatusWorking } -func terminatedStatus(r SessionReason) SessionStatus { +func terminatedStatus(r TerminationReason) SessionStatus { switch r { - case ReasonManuallyKilled, ReasonRuntimeLost, ReasonAgentProcessExited: + case 
TermManuallyKilled, TermRuntimeLost, TermAgentProcessExited: return StatusKilled - case ReasonAutoCleanup, ReasonPRMerged: + case TermAutoCleanup, TermPRMerged: return StatusCleanup - case ReasonErrorInProcess, ReasonProbeFailure: + case TermErrorInProcess, TermProbeFailure: return StatusErrored default: return StatusTerminated } } -func draftPRStatus(r PRReason) SessionStatus { - if r == PRReasonCIFailing { +// prPipelineStatus maps an open/draft PR's facts to a display status, preserving +// the old ladder: CI failure dominates everything, then draft, then the review / +// merge states. +func prPipelineStatus(pr PRFacts) SessionStatus { + switch { + case pr.CI == CIFailing: return StatusCIFailed - } - return StatusDraft -} - -func openPRStatus(r PRReason) SessionStatus { - switch r { - case PRReasonCIFailing: - return StatusCIFailed - case PRReasonChangesRequested, PRReasonBotComments: + case pr.Draft: + return StatusDraft + case pr.Review == ReviewChangesRequest || pr.BotComments: return StatusChangesRequested - case PRReasonApproved: - return StatusApproved - case PRReasonMergeReady: + case pr.Mergeability == MergeMergeable: return StatusMergeable - case PRReasonReviewPending: + case pr.Review == ReviewApproved: + return StatusApproved + case pr.Review == ReviewRequired: return StatusReviewPending default: return StatusPROpen diff --git a/backend/internal/domain/status_test.go b/backend/internal/domain/status_test.go index 09854998..ae63271c 100644 --- a/backend/internal/domain/status_test.go +++ b/backend/internal/domain/status_test.go @@ -2,117 +2,58 @@ package domain import "testing" -func TestDeriveLegacyStatus(t *testing.T) { +func TestDeriveStatus(t *testing.T) { + // sess builds a non-terminal lifecycle (no reason). + sess := func(s SessionState) CanonicalSessionLifecycle { + return CanonicalSessionLifecycle{Session: SessionSubstate{State: s}} + } + // term builds a terminated lifecycle carrying a TerminationReason. 
+ term := func(r TerminationReason) CanonicalSessionLifecycle { + return CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionTerminated}, TerminationReason: r} + } + openPR := func(mut func(*PRFacts)) PRFacts { + f := PRFacts{Exists: true, CI: CIUnknown, Review: ReviewNone, Mergeability: MergeUnknown} + if mut != nil { + mut(&f) + } + return f + } + tests := []struct { name string in CanonicalSessionLifecycle + pr PRFacts want SessionStatus }{ - { - name: "not_started maps to spawning", - in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionNotStarted, Reason: ReasonSpawnRequested}}, - want: StatusSpawning, - }, - { - name: "terminated+manually_killed maps to killed", - in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionTerminated, Reason: ReasonManuallyKilled}}, - want: StatusKilled, - }, - { - name: "terminated+auto_cleanup maps to cleanup", - in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionTerminated, Reason: ReasonAutoCleanup}}, - want: StatusCleanup, - }, - { - name: "terminated+error maps to errored", - in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionTerminated, Reason: ReasonErrorInProcess}}, - want: StatusErrored, - }, - { - name: "hard state needs_input maps directly", - in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionNeedsInput}}, - want: StatusNeedsInput, - }, - { - name: "merged PR dominates an idle session", - in: CanonicalSessionLifecycle{ - Session: SessionSubstate{State: SessionIdle}, - PR: PRSubstate{State: PRMerged}, - }, - want: StatusMerged, - }, - { - name: "open PR with failing CI dominates idle session", - in: CanonicalSessionLifecycle{ - Session: SessionSubstate{State: SessionIdle}, - PR: PRSubstate{State: PROpen, Reason: PRReasonCIFailing}, - }, - want: StatusCIFailed, - }, - { - name: "draft PR with failing CI maps to ci_failed", - in: CanonicalSessionLifecycle{ - Session: SessionSubstate{State: SessionWorking}, - PR: 
PRSubstate{State: PRDraft, Reason: PRReasonCIFailing}, - }, - want: StatusCIFailed, - }, - { - name: "draft PR ignores review and merge reasons", - in: CanonicalSessionLifecycle{ - Session: SessionSubstate{State: SessionWorking}, - PR: PRSubstate{State: PRDraft, Reason: PRReasonMergeReady}, - }, - want: StatusDraft, - }, - { - name: "open PR bot comments display as changes_requested", - in: CanonicalSessionLifecycle{ - Session: SessionSubstate{State: SessionWorking}, - PR: PRSubstate{State: PROpen, Reason: PRReasonBotComments}, - }, - want: StatusChangesRequested, - }, - { - name: "open PR merge conflicts display as plain open", - in: CanonicalSessionLifecycle{ - Session: SessionSubstate{State: SessionWorking}, - PR: PRSubstate{State: PROpen, Reason: PRReasonMergeConflicts}, - }, - want: StatusPROpen, - }, - { - name: "open PR approved", - in: CanonicalSessionLifecycle{ - Session: SessionSubstate{State: SessionWorking}, - PR: PRSubstate{State: PROpen, Reason: PRReasonApproved}, - }, - want: StatusApproved, - }, - { - name: "open PR merge_ready maps to mergeable", - in: CanonicalSessionLifecycle{ - Session: SessionSubstate{State: SessionWorking}, - PR: PRSubstate{State: PROpen, Reason: PRReasonMergeReady}, - }, - want: StatusMergeable, - }, - { - name: "no PR falls through to idle", - in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionIdle}}, - want: StatusIdle, - }, - { - name: "no PR falls through to working", - in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionWorking}}, - want: StatusWorking, - }, + {"not_started maps to spawning", sess(SessionNotStarted), PRFacts{}, StatusSpawning}, + {"terminated+manually_killed -> killed", term(TermManuallyKilled), PRFacts{}, StatusKilled}, + {"terminated+runtime_lost -> killed", term(TermRuntimeLost), PRFacts{}, StatusKilled}, + {"terminated+auto_cleanup -> cleanup", term(TermAutoCleanup), PRFacts{}, StatusCleanup}, + {"terminated+pr_merged -> cleanup", term(TermPRMerged), PRFacts{}, 
StatusCleanup}, + {"terminated+error -> errored", term(TermErrorInProcess), PRFacts{}, StatusErrored}, + {"needs_input maps directly", sess(SessionNeedsInput), PRFacts{}, StatusNeedsInput}, + {"stuck dominates any PR", sess(SessionStuck), openPR(func(f *PRFacts) { f.CI = CIFailing }), StatusStuck}, + + {"no PR + idle -> idle", sess(SessionIdle), PRFacts{}, StatusIdle}, + {"no PR + working -> working", sess(SessionWorking), PRFacts{}, StatusWorking}, + + {"merged PR dominates idle session", sess(SessionIdle), PRFacts{Exists: true, Merged: true}, StatusMerged}, + {"open PR failing CI -> ci_failed", sess(SessionIdle), openPR(func(f *PRFacts) { f.CI = CIFailing }), StatusCIFailed}, + {"draft PR failing CI -> ci_failed (CI dominates)", sess(SessionWorking), openPR(func(f *PRFacts) { f.Draft = true; f.CI = CIFailing }), StatusCIFailed}, + {"draft PR ignores review state -> draft", sess(SessionWorking), openPR(func(f *PRFacts) { f.Draft = true; f.Review = ReviewApproved }), StatusDraft}, + {"open PR changes_requested", sess(SessionWorking), openPR(func(f *PRFacts) { f.Review = ReviewChangesRequest }), StatusChangesRequested}, + {"open PR bot comments -> changes_requested", sess(SessionWorking), openPR(func(f *PRFacts) { f.BotComments = true }), StatusChangesRequested}, + {"open PR mergeable", sess(SessionWorking), openPR(func(f *PRFacts) { f.Mergeability = MergeMergeable }), StatusMergeable}, + {"open PR approved", sess(SessionWorking), openPR(func(f *PRFacts) { f.Review = ReviewApproved }), StatusApproved}, + {"open PR review required -> review_pending", sess(SessionWorking), openPR(func(f *PRFacts) { f.Review = ReviewRequired }), StatusReviewPending}, + {"open PR no signal -> pr_open", sess(SessionWorking), openPR(nil), StatusPROpen}, + {"closed PR falls through to soft state", sess(SessionIdle), PRFacts{Exists: true, Closed: true}, StatusIdle}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := DeriveLegacyStatus(tt.in); got != tt.want { - 
t.Errorf("DeriveLegacyStatus() = %q, want %q", got, tt.want) + if got := DeriveStatus(tt.in, tt.pr); got != tt.want { + t.Errorf("DeriveStatus() = %q, want %q", got, tt.want) } }) } diff --git a/backend/internal/storage/sqlite/cdc_store.go b/backend/internal/storage/sqlite/cdc_store.go deleted file mode 100644 index 8f92eda7..00000000 --- a/backend/internal/storage/sqlite/cdc_store.go +++ /dev/null @@ -1,112 +0,0 @@ -package sqlite - -import ( - "context" - "database/sql" - "errors" - "fmt" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" -) - -// OutboxEvent is a single undelivered change, joined from outbox + change_log. -// It is the unit the CDC publisher drains to JSONL. -type OutboxEvent struct { - OutboxID int64 - Seq int64 - SessionID string - EventType string - Revision int64 - Payload string - CreatedAt time.Time -} - -// ListUnsent returns up to limit undelivered events in seq order. -func (s *Store) ListUnsent(ctx context.Context, limit int) ([]OutboxEvent, error) { - rows, err := s.q.ListUnsentOutbox(ctx, int64(limit)) - if err != nil { - return nil, fmt.Errorf("list unsent outbox: %w", err) - } - out := make([]OutboxEvent, 0, len(rows)) - for _, r := range rows { - out = append(out, OutboxEvent{ - OutboxID: r.ID, - Seq: r.ChangeLogSeq, - SessionID: r.SessionID, - EventType: r.EventType, - Revision: r.Revision, - Payload: r.Payload, - CreatedAt: r.CreatedAt, - }) - } - return out, nil -} - -// MarkSent flags an outbox row delivered. -func (s *Store) MarkSent(ctx context.Context, outboxID int64, at time.Time) error { - s.writeMu.Lock() - defer s.writeMu.Unlock() - return s.q.MarkOutboxSent(ctx, gen.MarkOutboxSentParams{ - SentAt: sql.NullTime{Time: at, Valid: true}, - ID: outboxID, - }) -} - -// MarkFailed bumps the attempt count and records the last error for an outbox row. 
-func (s *Store) MarkFailed(ctx context.Context, outboxID int64, errMsg string) error { - s.writeMu.Lock() - defer s.writeMu.Unlock() - return s.q.MarkOutboxFailed(ctx, gen.MarkOutboxFailedParams{LastError: errMsg, ID: outboxID}) -} - -// GetOffset returns a consumer's last acknowledged seq (0 if it has none). -func (s *Store) GetOffset(ctx context.Context, consumer string) (int64, error) { - seq, err := s.q.GetConsumerOffset(ctx, consumer) - if errors.Is(err, sql.ErrNoRows) { - return 0, nil - } - if err != nil { - return 0, fmt.Errorf("get consumer offset %s: %w", consumer, err) - } - return seq, nil -} - -// SetOffset durably records a consumer's acknowledged seq. -func (s *Store) SetOffset(ctx context.Context, consumer string, seq int64, at time.Time) error { - s.writeMu.Lock() - defer s.writeMu.Unlock() - return s.q.UpsertConsumerOffset(ctx, gen.UpsertConsumerOffsetParams{ - Consumer: consumer, - LastSeq: seq, - UpdatedAt: at, - }) -} - -// MaxChangeLogSeq returns the highest change_log seq (0 if empty). Used by the -// consumer to resume after a snapshot resync. -func (s *Store) MaxChangeLogSeq(ctx context.Context) (int64, error) { - v, err := s.q.MaxChangeLogSeq(ctx) - if err != nil { - return 0, fmt.Errorf("max change_log seq: %w", err) - } - return v, nil -} - -// MinConsumerOffset returns the lowest acknowledged seq across all consumers -// (0 if none). The janitor uses it as the safe outbox-deletion watermark. -func (s *Store) MinConsumerOffset(ctx context.Context) (int64, error) { - v, err := s.q.MinConsumerOffset(ctx) - if err != nil { - return 0, fmt.Errorf("min consumer offset: %w", err) - } - return v, nil -} - -// DeleteSentOutboxBelow removes delivered outbox rows whose seq is below the -// watermark, returning the number removed. 
-func (s *Store) DeleteSentOutboxBelow(ctx context.Context, seq int64) (int64, error) { - s.writeMu.Lock() - defer s.writeMu.Unlock() - return s.q.DeleteSentOutboxBelow(ctx, seq) -} diff --git a/backend/internal/storage/sqlite/changelog_store.go b/backend/internal/storage/sqlite/changelog_store.go new file mode 100644 index 00000000..927d7968 --- /dev/null +++ b/backend/internal/storage/sqlite/changelog_store.go @@ -0,0 +1,89 @@ +package sqlite + +import ( + "context" + "fmt" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" +) + +// ChangeLogRow is one durable CDC event. These rows are written by the DB +// triggers (migration 0001), never by application code; the store only reads +// them, for the CDC poller. +type ChangeLogRow struct { + Seq int64 + ProjectID string + SessionID string // empty when the event is project-level (NULL in the DB) + EventType string + Payload string + CreatedAt time.Time +} + +// ReadChangeLogAfter returns up to limit events with seq > after, in seq order +// — the CDC poller's read. The frontend's offset is `after`. +func (s *Store) ReadChangeLogAfter(ctx context.Context, after int64, limit int) ([]ChangeLogRow, error) { + rows, err := s.qr.ReadChangeLogAfter(ctx, gen.ReadChangeLogAfterParams{Seq: after, Limit: int64(limit)}) + if err != nil { + return nil, fmt.Errorf("read change_log after %d: %w", after, err) + } + out := make([]ChangeLogRow, 0, len(rows)) + for _, r := range rows { + out = append(out, changeLogRowFromGen(r)) + } + return out, nil +} + +// ReadChangeLogAfterForProject is the project-scoped variant — a client +// subscribed to one project reads only its events. 
+func (s *Store) ReadChangeLogAfterForProject(ctx context.Context, project string, after int64, limit int) ([]ChangeLogRow, error) { + rows, err := s.qr.ReadChangeLogAfterForProject(ctx, gen.ReadChangeLogAfterForProjectParams{ + ProjectID: project, Seq: after, Limit: int64(limit), + }) + if err != nil { + return nil, fmt.Errorf("read change_log for %s after %d: %w", project, after, err) + } + out := make([]ChangeLogRow, 0, len(rows)) + for _, r := range rows { + out = append(out, changeLogRowFromGen(r)) + } + return out, nil +} + +// MaxChangeLogSeq returns the highest seq (0 if empty) — a fresh consumer's +// starting offset. +func (s *Store) MaxChangeLogSeq(ctx context.Context) (int64, error) { + v, err := s.qr.MaxChangeLogSeq(ctx) + if err != nil { + return 0, fmt.Errorf("max change_log seq: %w", err) + } + return asInt64(v), nil +} + +func changeLogRowFromGen(r gen.ChangeLog) ChangeLogRow { + row := ChangeLogRow{ + Seq: r.Seq, + ProjectID: r.ProjectID, + EventType: r.EventType, + Payload: r.Payload, + CreatedAt: r.CreatedAt, + } + if r.SessionID.Valid { + row.SessionID = r.SessionID.String + } + return row +} + +// asInt64 coerces sqlc's interface{} result for COALESCE(MAX(...)) — sqlc's +// SQLite type inference can't narrow the aggregate, so the generated signature +// is interface{}. modernc returns int64 for an integer aggregate. +func asInt64(v interface{}) int64 { + switch n := v.(type) { + case int64: + return n + case int: + return int64(n) + default: + return 0 + } +} diff --git a/backend/internal/storage/sqlite/db.go b/backend/internal/storage/sqlite/db.go index 0a2555e4..63f6b7dd 100644 --- a/backend/internal/storage/sqlite/db.go +++ b/backend/internal/storage/sqlite/db.go @@ -1,7 +1,6 @@ -// Package sqlite is the durable persistence adapter behind ports.LifecycleStore. 
-// It owns the SQLite schema (goose migrations), the revision-CAS upsert, and the -// transactional outbox (one txn writes the session row, a change_log entry, and -// the outbox row that the CDC publisher later drains to JSONL). +// Package sqlite is the durable persistence adapter: the 6-table schema (goose +// migrations), typed CRUD over sqlc-generated queries, and the read side of the +// trigger-driven CDC (it reads change_log; the DB triggers write it). package sqlite import ( @@ -20,40 +19,52 @@ var migrationsFS embed.FS // pragmas are applied on every connection open. WAL + NORMAL lets readers run // concurrently with the writer; busy_timeout absorbs brief writer contention; -// foreign_keys enforces the cascades. +// foreign_keys enforces the cascades and the CDC triggers' lookups. const pragmas = "?_pragma=journal_mode(WAL)" + "&_pragma=busy_timeout(5000)" + "&_pragma=foreign_keys(ON)" + "&_pragma=synchronous(NORMAL)" -// maxConnections caps the pool. WAL allows many concurrent readers, so reads -// (List/Get/GetPR/...) scale across the pool instead of queuing behind a single -// connection. Writes do NOT rely on the pool for serialization — the Store funnels -// every write through its writeMu (see store.go), which keeps WAL's single-writer -// rule and the revision-CAS read-then-write atomic regardless of pool size. -const maxConnections = 8 +// maxReaders caps the reader pool. WAL allows many concurrent readers. +const maxReaders = 8 -// Open opens (creating if absent) the SQLite database under dataDir, applies the -// connection pragmas, and runs all goose migrations up. The returned *sql.DB is -// sized for the many-reader / serialized-single-writer workload the LCM and -// readers impose. -func Open(dataDir string) (*sql.DB, error) { +// Open opens (creating if absent) the SQLite database under dataDir and returns +// a Store. 
It uses TWO pools against the same file: +// +// - a single WRITER connection (writeDB, MaxOpenConns=1): every write goes +// here, so a write and the CDC triggers' subqueries it fires always see the +// prior writes on the same connection (read-your-writes). This is required +// because the pr/pr_checks triggers SELECT from sessions/pr to fill in the +// event's project_id; a pooled writer could land that read on a connection +// that hasn't caught up to the commit and read NULL. +// - a READER pool (readDB, MaxOpenConns=maxReaders): all reads scale across +// it; WAL readers see the latest committed snapshot. +func Open(dataDir string) (*Store, error) { if err := os.MkdirAll(dataDir, 0o755); err != nil { return nil, fmt.Errorf("create data dir: %w", err) } dsn := "file:" + filepath.Join(dataDir, "ao.db") + pragmas - db, err := sql.Open("sqlite", dsn) + + writeDB, err := sql.Open("sqlite", dsn) if err != nil { - return nil, fmt.Errorf("open sqlite: %w", err) + return nil, fmt.Errorf("open sqlite writer: %w", err) } - db.SetMaxOpenConns(maxConnections) - db.SetMaxIdleConns(maxConnections) // keep reader conns warm; avoid open/close churn - - if err := migrate(db); err != nil { - db.Close() + writeDB.SetMaxOpenConns(1) + writeDB.SetMaxIdleConns(1) + if err := migrate(writeDB); err != nil { + writeDB.Close() return nil, err } - return db, nil + + readDB, err := sql.Open("sqlite", dsn) + if err != nil { + writeDB.Close() + return nil, fmt.Errorf("open sqlite reader: %w", err) + } + readDB.SetMaxOpenConns(maxReaders) + readDB.SetMaxIdleConns(maxReaders) + + return NewStore(writeDB, readDB), nil } func migrate(db *sql.DB) error { diff --git a/backend/internal/storage/sqlite/gen/cdc.sql.go b/backend/internal/storage/sqlite/gen/cdc.sql.go deleted file mode 100644 index c2eedc8c..00000000 --- a/backend/internal/storage/sqlite/gen/cdc.sql.go +++ /dev/null @@ -1,199 +0,0 @@ -// Code generated by sqlc. DO NOT EDIT. 
-// versions: -// sqlc v1.31.1 -// source: cdc.sql - -package gen - -import ( - "context" - "database/sql" - "time" -) - -const deleteSentOutboxBelow = `-- name: DeleteSentOutboxBelow :execrows -DELETE FROM outbox WHERE sent = 1 AND change_log_seq < ? -` - -func (q *Queries) DeleteSentOutboxBelow(ctx context.Context, changeLogSeq int64) (int64, error) { - result, err := q.db.ExecContext(ctx, deleteSentOutboxBelow, changeLogSeq) - if err != nil { - return 0, err - } - return result.RowsAffected() -} - -const getConsumerOffset = `-- name: GetConsumerOffset :one -SELECT last_seq FROM consumer_offsets WHERE consumer = ? -` - -func (q *Queries) GetConsumerOffset(ctx context.Context, consumer string) (int64, error) { - row := q.db.QueryRowContext(ctx, getConsumerOffset, consumer) - var last_seq int64 - err := row.Scan(&last_seq) - return last_seq, err -} - -const insertChangeLog = `-- name: InsertChangeLog :one -INSERT INTO change_log (session_id, event_type, revision, payload, created_at) -VALUES (?, ?, ?, ?, ?) -RETURNING seq -` - -type InsertChangeLogParams struct { - SessionID string - EventType string - Revision int64 - Payload string - CreatedAt time.Time -} - -// Appends a canonical-write record and returns its monotonic seq so the same -// transaction can thread it into the outbox row. -func (q *Queries) InsertChangeLog(ctx context.Context, arg InsertChangeLogParams) (int64, error) { - row := q.db.QueryRowContext(ctx, insertChangeLog, - arg.SessionID, - arg.EventType, - arg.Revision, - arg.Payload, - arg.CreatedAt, - ) - var seq int64 - err := row.Scan(&seq) - return seq, err -} - -const insertOutbox = `-- name: InsertOutbox :exec -INSERT INTO outbox (change_log_seq, created_at) -VALUES (?, ?) 
-` - -type InsertOutboxParams struct { - ChangeLogSeq int64 - CreatedAt time.Time -} - -func (q *Queries) InsertOutbox(ctx context.Context, arg InsertOutboxParams) error { - _, err := q.db.ExecContext(ctx, insertOutbox, arg.ChangeLogSeq, arg.CreatedAt) - return err -} - -const listUnsentOutbox = `-- name: ListUnsentOutbox :many -SELECT o.id, o.change_log_seq, o.attempts, - c.session_id, c.event_type, c.revision, c.payload, c.created_at -FROM outbox o -JOIN change_log c ON c.seq = o.change_log_seq -WHERE o.sent = 0 -ORDER BY o.change_log_seq -LIMIT ? -` - -type ListUnsentOutboxRow struct { - ID int64 - ChangeLogSeq int64 - Attempts int64 - SessionID string - EventType string - Revision int64 - Payload string - CreatedAt time.Time -} - -func (q *Queries) ListUnsentOutbox(ctx context.Context, limit int64) ([]ListUnsentOutboxRow, error) { - rows, err := q.db.QueryContext(ctx, listUnsentOutbox, limit) - if err != nil { - return nil, err - } - defer rows.Close() - items := []ListUnsentOutboxRow{} - for rows.Next() { - var i ListUnsentOutboxRow - if err := rows.Scan( - &i.ID, - &i.ChangeLogSeq, - &i.Attempts, - &i.SessionID, - &i.EventType, - &i.Revision, - &i.Payload, - &i.CreatedAt, - ); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} - -const markOutboxFailed = `-- name: MarkOutboxFailed :exec -UPDATE outbox SET attempts = attempts + 1, last_error = ? WHERE id = ? -` - -type MarkOutboxFailedParams struct { - LastError string - ID int64 -} - -func (q *Queries) MarkOutboxFailed(ctx context.Context, arg MarkOutboxFailedParams) error { - _, err := q.db.ExecContext(ctx, markOutboxFailed, arg.LastError, arg.ID) - return err -} - -const markOutboxSent = `-- name: MarkOutboxSent :exec -UPDATE outbox SET sent = 1, sent_at = ? WHERE id = ? 
-` - -type MarkOutboxSentParams struct { - SentAt sql.NullTime - ID int64 -} - -func (q *Queries) MarkOutboxSent(ctx context.Context, arg MarkOutboxSentParams) error { - _, err := q.db.ExecContext(ctx, markOutboxSent, arg.SentAt, arg.ID) - return err -} - -const maxChangeLogSeq = `-- name: MaxChangeLogSeq :one -SELECT CAST(COALESCE(MAX(seq), 0) AS INTEGER) FROM change_log -` - -func (q *Queries) MaxChangeLogSeq(ctx context.Context) (int64, error) { - row := q.db.QueryRowContext(ctx, maxChangeLogSeq) - var column_1 int64 - err := row.Scan(&column_1) - return column_1, err -} - -const minConsumerOffset = `-- name: MinConsumerOffset :one -SELECT CAST(COALESCE(MIN(last_seq), 0) AS INTEGER) FROM consumer_offsets -` - -func (q *Queries) MinConsumerOffset(ctx context.Context) (int64, error) { - row := q.db.QueryRowContext(ctx, minConsumerOffset) - var column_1 int64 - err := row.Scan(&column_1) - return column_1, err -} - -const upsertConsumerOffset = `-- name: UpsertConsumerOffset :exec -INSERT INTO consumer_offsets (consumer, last_seq, updated_at) -VALUES (?, ?, ?) -ON CONFLICT (consumer) DO UPDATE SET last_seq = excluded.last_seq, updated_at = excluded.updated_at -` - -type UpsertConsumerOffsetParams struct { - Consumer string - LastSeq int64 - UpdatedAt time.Time -} - -func (q *Queries) UpsertConsumerOffset(ctx context.Context, arg UpsertConsumerOffsetParams) error { - _, err := q.db.ExecContext(ctx, upsertConsumerOffset, arg.Consumer, arg.LastSeq, arg.UpdatedAt) - return err -} diff --git a/backend/internal/storage/sqlite/gen/changelog.sql.go b/backend/internal/storage/sqlite/gen/changelog.sql.go new file mode 100644 index 00000000..6568fdcc --- /dev/null +++ b/backend/internal/storage/sqlite/gen/changelog.sql.go @@ -0,0 +1,102 @@ +// Code generated by sqlc. DO NOT EDIT. 
+// versions: +// sqlc v1.31.1 +// source: changelog.sql + +package gen + +import ( + "context" +) + +const maxChangeLogSeq = `-- name: MaxChangeLogSeq :one +SELECT COALESCE(MAX(seq), 0) AS seq FROM change_log +` + +func (q *Queries) MaxChangeLogSeq(ctx context.Context) (interface{}, error) { + row := q.db.QueryRowContext(ctx, maxChangeLogSeq) + var seq interface{} + err := row.Scan(&seq) + return seq, err +} + +const readChangeLogAfter = `-- name: ReadChangeLogAfter :many +SELECT seq, project_id, session_id, event_type, payload, created_at +FROM change_log WHERE seq > ? ORDER BY seq LIMIT ? +` + +type ReadChangeLogAfterParams struct { + Seq int64 + Limit int64 +} + +func (q *Queries) ReadChangeLogAfter(ctx context.Context, arg ReadChangeLogAfterParams) ([]ChangeLog, error) { + rows, err := q.db.QueryContext(ctx, readChangeLogAfter, arg.Seq, arg.Limit) + if err != nil { + return nil, err + } + defer rows.Close() + items := []ChangeLog{} + for rows.Next() { + var i ChangeLog + if err := rows.Scan( + &i.Seq, + &i.ProjectID, + &i.SessionID, + &i.EventType, + &i.Payload, + &i.CreatedAt, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const readChangeLogAfterForProject = `-- name: ReadChangeLogAfterForProject :many +SELECT seq, project_id, session_id, event_type, payload, created_at +FROM change_log WHERE project_id = ? AND seq > ? ORDER BY seq LIMIT ? 
+` + +type ReadChangeLogAfterForProjectParams struct { + ProjectID string + Seq int64 + Limit int64 +} + +func (q *Queries) ReadChangeLogAfterForProject(ctx context.Context, arg ReadChangeLogAfterForProjectParams) ([]ChangeLog, error) { + rows, err := q.db.QueryContext(ctx, readChangeLogAfterForProject, arg.ProjectID, arg.Seq, arg.Limit) + if err != nil { + return nil, err + } + defer rows.Close() + items := []ChangeLog{} + for rows.Next() { + var i ChangeLog + if err := rows.Scan( + &i.Seq, + &i.ProjectID, + &i.SessionID, + &i.EventType, + &i.Payload, + &i.CreatedAt, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} diff --git a/backend/internal/storage/sqlite/gen/metadata.sql.go b/backend/internal/storage/sqlite/gen/metadata.sql.go deleted file mode 100644 index 2c0396f7..00000000 --- a/backend/internal/storage/sqlite/gen/metadata.sql.go +++ /dev/null @@ -1,82 +0,0 @@ -// Code generated by sqlc. DO NOT EDIT. -// versions: -// sqlc v1.31.1 -// source: metadata.sql - -package gen - -import ( - "context" - "time" -) - -const getSessionMetadata = `-- name: GetSessionMetadata :one -SELECT branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt -FROM session_metadata -WHERE session_id = ? 
-` - -type GetSessionMetadataRow struct { - Branch string - WorkspacePath string - RuntimeHandleID string - RuntimeName string - AgentSessionID string - Prompt string -} - -func (q *Queries) GetSessionMetadata(ctx context.Context, sessionID string) (GetSessionMetadataRow, error) { - row := q.db.QueryRowContext(ctx, getSessionMetadata, sessionID) - var i GetSessionMetadataRow - err := row.Scan( - &i.Branch, - &i.WorkspacePath, - &i.RuntimeHandleID, - &i.RuntimeName, - &i.AgentSessionID, - &i.Prompt, - ) - return i, err -} - -const upsertSessionMetadata = `-- name: UpsertSessionMetadata :exec -INSERT INTO session_metadata ( - session_id, branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt, updated_at -) VALUES (?, ?, ?, ?, ?, ?, ?, ?) -ON CONFLICT (session_id) DO UPDATE SET - branch = CASE WHEN excluded.branch <> '' THEN excluded.branch ELSE session_metadata.branch END, - workspace_path = CASE WHEN excluded.workspace_path <> '' THEN excluded.workspace_path ELSE session_metadata.workspace_path END, - runtime_handle_id = CASE WHEN excluded.runtime_handle_id <> '' THEN excluded.runtime_handle_id ELSE session_metadata.runtime_handle_id END, - runtime_name = CASE WHEN excluded.runtime_name <> '' THEN excluded.runtime_name ELSE session_metadata.runtime_name END, - agent_session_id = CASE WHEN excluded.agent_session_id <> '' THEN excluded.agent_session_id ELSE session_metadata.agent_session_id END, - prompt = CASE WHEN excluded.prompt <> '' THEN excluded.prompt ELSE session_metadata.prompt END, - updated_at = excluded.updated_at -` - -type UpsertSessionMetadataParams struct { - SessionID string - Branch string - WorkspacePath string - RuntimeHandleID string - RuntimeName string - AgentSessionID string - Prompt string - UpdatedAt time.Time -} - -// Merge semantics: an empty incoming column is "leave unchanged", so a partial -// patch (e.g. spawn writing only the runtime handle) never clobbers a value set -// earlier (e.g. 
the branch set at creation). Mirrors the old per-key map merge. -func (q *Queries) UpsertSessionMetadata(ctx context.Context, arg UpsertSessionMetadataParams) error { - _, err := q.db.ExecContext(ctx, upsertSessionMetadata, - arg.SessionID, - arg.Branch, - arg.WorkspacePath, - arg.RuntimeHandleID, - arg.RuntimeName, - arg.AgentSessionID, - arg.Prompt, - arg.UpdatedAt, - ) - return err -} diff --git a/backend/internal/storage/sqlite/gen/models.go b/backend/internal/storage/sqlite/gen/models.go index 339062bf..0c5b5c91 100644 --- a/backend/internal/storage/sqlite/gen/models.go +++ b/backend/internal/storage/sqlite/gen/models.go @@ -11,50 +11,36 @@ import ( type ChangeLog struct { Seq int64 - SessionID string + ProjectID string + SessionID sql.NullString EventType string - Revision int64 Payload string CreatedAt time.Time } -type ConsumerOffset struct { - Consumer string - LastSeq int64 - UpdatedAt time.Time -} - -type Outbox struct { - ID int64 - ChangeLogSeq int64 - Sent int64 - SentAt sql.NullTime - Attempts int64 - LastError string - CreatedAt time.Time -} - type Pr struct { + Url string SessionID string + Number int64 + PrState string ReviewDecision string - Mergeability string CiState string - CiPassed int64 - CiFailed int64 - CiPending int64 - CiLogTail string - LastFetchedAt time.Time + Mergeability string + UpdatedAt time.Time } type PrCheck struct { - SessionID string - Name string - Status string - Url string + PrUrl string + Name string + CommitHash string + Status string + Url string + LogTail string + CreatedAt time.Time } type PrComment struct { - SessionID string + PrUrl string CommentID string Author string File string @@ -67,58 +53,34 @@ type PrComment struct { type Project struct { ID string Path string - RepoOwner string - RepoName string - RepoPlatform string RepoOriginUrl string - DefaultBranch string DisplayName string - SessionPrefix string - Source string RegisteredAt time.Time ArchivedAt sql.NullTime } -type ReactionTracker struct { - 
SessionID string - ReactionKey string - Attempts int64 - Escalated int64 - FirstAttemptAt sql.NullTime - ProjectID string -} - type Session struct { ID string ProjectID string + Num int64 IssueID string Kind string - CreatedAt time.Time - UpdatedAt time.Time - Revision int64 + Harness string SessionState string - SessionReason string - PrState string - PrReason string - PrNumber int64 - PrUrl string - RuntimeState string - RuntimeReason string + TerminationReason string + IsAlive int64 ActivityState string ActivityLastAt time.Time ActivitySource string DetectingAttempts sql.NullInt64 DetectingStartedAt sql.NullTime DetectingEvidenceHash sql.NullString -} - -type SessionMetadatum struct { - SessionID string - Branch string - WorkspacePath string - RuntimeHandleID string - RuntimeName string - AgentSessionID string - Prompt string - UpdatedAt time.Time + Branch string + WorkspacePath string + RuntimeHandleID string + RuntimeName string + AgentSessionID string + Prompt string + CreatedAt time.Time + UpdatedAt time.Time } diff --git a/backend/internal/storage/sqlite/gen/pr.sql.go b/backend/internal/storage/sqlite/gen/pr.sql.go index 95cbd20a..f9fa3620 100644 --- a/backend/internal/storage/sqlite/gen/pr.sql.go +++ b/backend/internal/storage/sqlite/gen/pr.sql.go @@ -11,173 +11,56 @@ import ( ) const deletePR = `-- name: DeletePR :exec -DELETE FROM pr WHERE session_id = ? +DELETE FROM pr WHERE url = ? ` -func (q *Queries) DeletePR(ctx context.Context, sessionID string) error { - _, err := q.db.ExecContext(ctx, deletePR, sessionID) - return err -} - -const deletePRChecks = `-- name: DeletePRChecks :exec -DELETE FROM pr_check WHERE session_id = ? -` - -func (q *Queries) DeletePRChecks(ctx context.Context, sessionID string) error { - _, err := q.db.ExecContext(ctx, deletePRChecks, sessionID) - return err -} - -const deletePRComments = `-- name: DeletePRComments :exec -DELETE FROM pr_comment WHERE session_id = ? 
-` - -func (q *Queries) DeletePRComments(ctx context.Context, sessionID string) error { - _, err := q.db.ExecContext(ctx, deletePRComments, sessionID) +func (q *Queries) DeletePR(ctx context.Context, url string) error { + _, err := q.db.ExecContext(ctx, deletePR, url) return err } const getPR = `-- name: GetPR :one -SELECT session_id, review_decision, mergeability, ci_state, ci_passed, ci_failed, ci_pending, ci_log_tail, last_fetched_at -FROM pr -WHERE session_id = ? +SELECT url, session_id, number, pr_state, review_decision, ci_state, mergeability, updated_at FROM pr WHERE url = ? ` -func (q *Queries) GetPR(ctx context.Context, sessionID string) (Pr, error) { - row := q.db.QueryRowContext(ctx, getPR, sessionID) +func (q *Queries) GetPR(ctx context.Context, url string) (Pr, error) { + row := q.db.QueryRowContext(ctx, getPR, url) var i Pr err := row.Scan( + &i.Url, &i.SessionID, + &i.Number, + &i.PrState, &i.ReviewDecision, - &i.Mergeability, &i.CiState, - &i.CiPassed, - &i.CiFailed, - &i.CiPending, - &i.CiLogTail, - &i.LastFetchedAt, + &i.Mergeability, + &i.UpdatedAt, ) return i, err } -const insertPRCheck = `-- name: InsertPRCheck :exec -INSERT INTO pr_check (session_id, name, status, url) VALUES (?, ?, ?, ?) -` - -type InsertPRCheckParams struct { - SessionID string - Name string - Status string - Url string -} - -func (q *Queries) InsertPRCheck(ctx context.Context, arg InsertPRCheckParams) error { - _, err := q.db.ExecContext(ctx, insertPRCheck, - arg.SessionID, - arg.Name, - arg.Status, - arg.Url, - ) - return err -} - -const insertPRComment = `-- name: InsertPRComment :exec -INSERT INTO pr_comment (session_id, comment_id, author, file, line, body, resolved, created_at) -VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
-` - -type InsertPRCommentParams struct { - SessionID string - CommentID string - Author string - File string - Line int64 - Body string - Resolved int64 - CreatedAt time.Time -} - -func (q *Queries) InsertPRComment(ctx context.Context, arg InsertPRCommentParams) error { - _, err := q.db.ExecContext(ctx, insertPRComment, - arg.SessionID, - arg.CommentID, - arg.Author, - arg.File, - arg.Line, - arg.Body, - arg.Resolved, - arg.CreatedAt, - ) - return err -} - -const listPRChecks = `-- name: ListPRChecks :many -SELECT name, status, url FROM pr_check WHERE session_id = ? ORDER BY name +const listPRsBySession = `-- name: ListPRsBySession :many +SELECT url, session_id, number, pr_state, review_decision, ci_state, mergeability, updated_at FROM pr WHERE session_id = ? ORDER BY updated_at DESC ` -type ListPRChecksRow struct { - Name string - Status string - Url string -} - -func (q *Queries) ListPRChecks(ctx context.Context, sessionID string) ([]ListPRChecksRow, error) { - rows, err := q.db.QueryContext(ctx, listPRChecks, sessionID) +func (q *Queries) ListPRsBySession(ctx context.Context, sessionID string) ([]Pr, error) { + rows, err := q.db.QueryContext(ctx, listPRsBySession, sessionID) if err != nil { return nil, err } defer rows.Close() - items := []ListPRChecksRow{} + items := []Pr{} for rows.Next() { - var i ListPRChecksRow - if err := rows.Scan(&i.Name, &i.Status, &i.Url); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} - -const listPRComments = `-- name: ListPRComments :many -SELECT comment_id, author, file, line, body, resolved, created_at -FROM pr_comment -WHERE session_id = ? 
-ORDER BY created_at, comment_id -` - -type ListPRCommentsRow struct { - CommentID string - Author string - File string - Line int64 - Body string - Resolved int64 - CreatedAt time.Time -} - -func (q *Queries) ListPRComments(ctx context.Context, sessionID string) ([]ListPRCommentsRow, error) { - rows, err := q.db.QueryContext(ctx, listPRComments, sessionID) - if err != nil { - return nil, err - } - defer rows.Close() - items := []ListPRCommentsRow{} - for rows.Next() { - var i ListPRCommentsRow + var i Pr if err := rows.Scan( - &i.CommentID, - &i.Author, - &i.File, - &i.Line, - &i.Body, - &i.Resolved, - &i.CreatedAt, + &i.Url, + &i.SessionID, + &i.Number, + &i.PrState, + &i.ReviewDecision, + &i.CiState, + &i.Mergeability, + &i.UpdatedAt, ); err != nil { return nil, err } @@ -193,43 +76,39 @@ func (q *Queries) ListPRComments(ctx context.Context, sessionID string) ([]ListP } const upsertPR = `-- name: UpsertPR :exec -INSERT INTO pr ( - session_id, review_decision, mergeability, ci_state, ci_passed, ci_failed, ci_pending, ci_log_tail, last_fetched_at -) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) -ON CONFLICT (session_id) DO UPDATE SET +INSERT INTO pr (url, session_id, number, pr_state, review_decision, ci_state, mergeability, updated_at) +VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
+ON CONFLICT (url) DO UPDATE SET + session_id = excluded.session_id, + number = excluded.number, + pr_state = excluded.pr_state, review_decision = excluded.review_decision, - mergeability = excluded.mergeability, - ci_state = excluded.ci_state, - ci_passed = excluded.ci_passed, - ci_failed = excluded.ci_failed, - ci_pending = excluded.ci_pending, - ci_log_tail = excluded.ci_log_tail, - last_fetched_at = excluded.last_fetched_at + ci_state = excluded.ci_state, + mergeability = excluded.mergeability, + updated_at = excluded.updated_at ` type UpsertPRParams struct { + Url string SessionID string + Number int64 + PrState string ReviewDecision string - Mergeability string CiState string - CiPassed int64 - CiFailed int64 - CiPending int64 - CiLogTail string - LastFetchedAt time.Time + Mergeability string + UpdatedAt time.Time } func (q *Queries) UpsertPR(ctx context.Context, arg UpsertPRParams) error { _, err := q.db.ExecContext(ctx, upsertPR, + arg.Url, arg.SessionID, + arg.Number, + arg.PrState, arg.ReviewDecision, - arg.Mergeability, arg.CiState, - arg.CiPassed, - arg.CiFailed, - arg.CiPending, - arg.CiLogTail, - arg.LastFetchedAt, + arg.Mergeability, + arg.UpdatedAt, ) return err } diff --git a/backend/internal/storage/sqlite/gen/pr_checks.sql.go b/backend/internal/storage/sqlite/gen/pr_checks.sql.go new file mode 100644 index 00000000..58668ab1 --- /dev/null +++ b/backend/internal/storage/sqlite/gen/pr_checks.sql.go @@ -0,0 +1,119 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.31.1 +// source: pr_checks.sql + +package gen + +import ( + "context" + "time" +) + +const listChecksByPR = `-- name: ListChecksByPR :many +SELECT pr_url, name, commit_hash, status, url, log_tail, created_at FROM pr_checks WHERE pr_url = ? 
ORDER BY name, created_at +` + +func (q *Queries) ListChecksByPR(ctx context.Context, prUrl string) ([]PrCheck, error) { + rows, err := q.db.QueryContext(ctx, listChecksByPR, prUrl) + if err != nil { + return nil, err + } + defer rows.Close() + items := []PrCheck{} + for rows.Next() { + var i PrCheck + if err := rows.Scan( + &i.PrUrl, + &i.Name, + &i.CommitHash, + &i.Status, + &i.Url, + &i.LogTail, + &i.CreatedAt, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const listRecentChecks = `-- name: ListRecentChecks :many +SELECT status, commit_hash, created_at FROM pr_checks +WHERE pr_url = ? AND name = ? +ORDER BY created_at DESC LIMIT ? +` + +type ListRecentChecksParams struct { + PrUrl string + Name string + Limit int64 +} + +type ListRecentChecksRow struct { + Status string + CommitHash string + CreatedAt time.Time +} + +func (q *Queries) ListRecentChecks(ctx context.Context, arg ListRecentChecksParams) ([]ListRecentChecksRow, error) { + rows, err := q.db.QueryContext(ctx, listRecentChecks, arg.PrUrl, arg.Name, arg.Limit) + if err != nil { + return nil, err + } + defer rows.Close() + items := []ListRecentChecksRow{} + for rows.Next() { + var i ListRecentChecksRow + if err := rows.Scan(&i.Status, &i.CommitHash, &i.CreatedAt); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const upsertPRCheck = `-- name: UpsertPRCheck :exec +INSERT INTO pr_checks (pr_url, name, commit_hash, status, url, log_tail, created_at) +VALUES (?, ?, ?, ?, ?, ?, ?) 
+ON CONFLICT (pr_url, name, commit_hash) DO UPDATE SET + status = excluded.status, + url = excluded.url, + log_tail = excluded.log_tail +` + +type UpsertPRCheckParams struct { + PrUrl string + Name string + CommitHash string + Status string + Url string + LogTail string + CreatedAt time.Time +} + +func (q *Queries) UpsertPRCheck(ctx context.Context, arg UpsertPRCheckParams) error { + _, err := q.db.ExecContext(ctx, upsertPRCheck, + arg.PrUrl, + arg.Name, + arg.CommitHash, + arg.Status, + arg.Url, + arg.LogTail, + arg.CreatedAt, + ) + return err +} diff --git a/backend/internal/storage/sqlite/gen/pr_comment.sql.go b/backend/internal/storage/sqlite/gen/pr_comment.sql.go new file mode 100644 index 00000000..a2f09f34 --- /dev/null +++ b/backend/internal/storage/sqlite/gen/pr_comment.sql.go @@ -0,0 +1,89 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.31.1 +// source: pr_comment.sql + +package gen + +import ( + "context" + "time" +) + +const deletePRComments = `-- name: DeletePRComments :exec +DELETE FROM pr_comment WHERE pr_url = ? +` + +func (q *Queries) DeletePRComments(ctx context.Context, prUrl string) error { + _, err := q.db.ExecContext(ctx, deletePRComments, prUrl) + return err +} + +const listPRComments = `-- name: ListPRComments :many +SELECT pr_url, comment_id, author, file, line, body, resolved, created_at FROM pr_comment WHERE pr_url = ? 
ORDER BY created_at, comment_id +` + +func (q *Queries) ListPRComments(ctx context.Context, prUrl string) ([]PrComment, error) { + rows, err := q.db.QueryContext(ctx, listPRComments, prUrl) + if err != nil { + return nil, err + } + defer rows.Close() + items := []PrComment{} + for rows.Next() { + var i PrComment + if err := rows.Scan( + &i.PrUrl, + &i.CommentID, + &i.Author, + &i.File, + &i.Line, + &i.Body, + &i.Resolved, + &i.CreatedAt, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const upsertPRComment = `-- name: UpsertPRComment :exec +INSERT INTO pr_comment (pr_url, comment_id, author, file, line, body, resolved, created_at) +VALUES (?, ?, ?, ?, ?, ?, ?, ?) +ON CONFLICT (pr_url, comment_id) DO UPDATE SET + author = excluded.author, file = excluded.file, line = excluded.line, + body = excluded.body, resolved = excluded.resolved +` + +type UpsertPRCommentParams struct { + PrUrl string + CommentID string + Author string + File string + Line int64 + Body string + Resolved int64 + CreatedAt time.Time +} + +func (q *Queries) UpsertPRComment(ctx context.Context, arg UpsertPRCommentParams) error { + _, err := q.db.ExecContext(ctx, upsertPRComment, + arg.PrUrl, + arg.CommentID, + arg.Author, + arg.File, + arg.Line, + arg.Body, + arg.Resolved, + arg.CreatedAt, + ) + return err +} diff --git a/backend/internal/storage/sqlite/gen/projects.sql.go b/backend/internal/storage/sqlite/gen/projects.sql.go index 33959b76..a7c953cd 100644 --- a/backend/internal/storage/sqlite/gen/projects.sql.go +++ b/backend/internal/storage/sqlite/gen/projects.sql.go @@ -25,19 +25,9 @@ func (q *Queries) ArchiveProject(ctx context.Context, arg ArchiveProjectParams) return err } -const deleteProject = `-- name: DeleteProject :exec -DELETE FROM projects WHERE id = ? 
-` - -func (q *Queries) DeleteProject(ctx context.Context, id string) error { - _, err := q.db.ExecContext(ctx, deleteProject, id) - return err -} - const getProject = `-- name: GetProject :one -SELECT id, path, repo_owner, repo_name, repo_platform, repo_origin_url, default_branch, display_name, session_prefix, source, registered_at, archived_at -FROM projects -WHERE id = ? +SELECT id, path, repo_origin_url, display_name, registered_at, archived_at +FROM projects WHERE id = ? ` func (q *Queries) GetProject(ctx context.Context, id string) (Project, error) { @@ -46,14 +36,8 @@ func (q *Queries) GetProject(ctx context.Context, id string) (Project, error) { err := row.Scan( &i.ID, &i.Path, - &i.RepoOwner, - &i.RepoName, - &i.RepoPlatform, &i.RepoOriginUrl, - &i.DefaultBranch, &i.DisplayName, - &i.SessionPrefix, - &i.Source, &i.RegisteredAt, &i.ArchivedAt, ) @@ -61,10 +45,8 @@ func (q *Queries) GetProject(ctx context.Context, id string) (Project, error) { } const listProjects = `-- name: ListProjects :many -SELECT id, path, repo_owner, repo_name, repo_platform, repo_origin_url, default_branch, display_name, session_prefix, source, registered_at, archived_at -FROM projects -WHERE archived_at IS NULL -ORDER BY id +SELECT id, path, repo_origin_url, display_name, registered_at, archived_at +FROM projects WHERE archived_at IS NULL ORDER BY id ` func (q *Queries) ListProjects(ctx context.Context) ([]Project, error) { @@ -79,14 +61,8 @@ func (q *Queries) ListProjects(ctx context.Context) ([]Project, error) { if err := rows.Scan( &i.ID, &i.Path, - &i.RepoOwner, - &i.RepoName, - &i.RepoPlatform, &i.RepoOriginUrl, - &i.DefaultBranch, &i.DisplayName, - &i.SessionPrefix, - &i.Source, &i.RegisteredAt, &i.ArchivedAt, ); err != nil { @@ -104,33 +80,20 @@ func (q *Queries) ListProjects(ctx context.Context) ([]Project, error) { } const upsertProject = `-- name: UpsertProject :exec -INSERT INTO projects (id, path, repo_owner, repo_name, repo_platform, repo_origin_url, default_branch, 
display_name, session_prefix, source, registered_at, archived_at) -VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) +INSERT INTO projects (id, path, repo_origin_url, display_name, registered_at, archived_at) +VALUES (?, ?, ?, ?, ?, ?) ON CONFLICT (id) DO UPDATE SET path = excluded.path, - repo_owner = excluded.repo_owner, - repo_name = excluded.repo_name, - repo_platform = excluded.repo_platform, repo_origin_url = excluded.repo_origin_url, - default_branch = excluded.default_branch, display_name = excluded.display_name, - session_prefix = excluded.session_prefix, - source = excluded.source, - registered_at = excluded.registered_at, archived_at = excluded.archived_at ` type UpsertProjectParams struct { ID string Path string - RepoOwner string - RepoName string - RepoPlatform string RepoOriginUrl string - DefaultBranch string DisplayName string - SessionPrefix string - Source string RegisteredAt time.Time ArchivedAt sql.NullTime } @@ -139,14 +102,8 @@ func (q *Queries) UpsertProject(ctx context.Context, arg UpsertProjectParams) er _, err := q.db.ExecContext(ctx, upsertProject, arg.ID, arg.Path, - arg.RepoOwner, - arg.RepoName, - arg.RepoPlatform, arg.RepoOriginUrl, - arg.DefaultBranch, arg.DisplayName, - arg.SessionPrefix, - arg.Source, arg.RegisteredAt, arg.ArchivedAt, ) diff --git a/backend/internal/storage/sqlite/gen/querier.go b/backend/internal/storage/sqlite/gen/querier.go index 83aa0c7e..365113b1 100644 --- a/backend/internal/storage/sqlite/gen/querier.go +++ b/backend/internal/storage/sqlite/gen/querier.go @@ -10,50 +10,29 @@ import ( type Querier interface { ArchiveProject(ctx context.Context, arg ArchiveProjectParams) error - DeletePR(ctx context.Context, sessionID string) error - DeletePRChecks(ctx context.Context, sessionID string) error - DeletePRComments(ctx context.Context, sessionID string) error - DeleteProject(ctx context.Context, id string) error - DeleteReactionTracker(ctx context.Context, arg DeleteReactionTrackerParams) error - 
DeleteSentOutboxBelow(ctx context.Context, changeLogSeq int64) (int64, error) - DeleteSessionReactionTrackers(ctx context.Context, sessionID string) error - GetConsumerOffset(ctx context.Context, consumer string) (int64, error) - GetPR(ctx context.Context, sessionID string) (Pr, error) + DeletePR(ctx context.Context, url string) error + DeletePRComments(ctx context.Context, prUrl string) error + DeleteSession(ctx context.Context, id string) error + GetPR(ctx context.Context, url string) (Pr, error) GetProject(ctx context.Context, id string) (Project, error) GetSession(ctx context.Context, id string) (Session, error) - GetSessionMetadata(ctx context.Context, sessionID string) (GetSessionMetadataRow, error) - GetSessionRevision(ctx context.Context, id string) (int64, error) - // Appends a canonical-write record and returns its monotonic seq so the same - // transaction can thread it into the outbox row. - InsertChangeLog(ctx context.Context, arg InsertChangeLogParams) (int64, error) - InsertOutbox(ctx context.Context, arg InsertOutboxParams) error - InsertPRCheck(ctx context.Context, arg InsertPRCheckParams) error - InsertPRComment(ctx context.Context, arg InsertPRCommentParams) error - // CAS insert: only succeeds for a brand-new id. Incoming revision must be 0; - // the row is persisted at revision 1. 
- InsertSession(ctx context.Context, arg InsertSessionParams) (int64, error) + InsertSession(ctx context.Context, arg InsertSessionParams) error ListAllSessions(ctx context.Context) ([]Session, error) - ListPRChecks(ctx context.Context, sessionID string) ([]ListPRChecksRow, error) - ListPRComments(ctx context.Context, sessionID string) ([]ListPRCommentsRow, error) + ListChecksByPR(ctx context.Context, prUrl string) ([]PrCheck, error) + ListPRComments(ctx context.Context, prUrl string) ([]PrComment, error) + ListPRsBySession(ctx context.Context, sessionID string) ([]Pr, error) ListProjects(ctx context.Context) ([]Project, error) - ListReactionTrackers(ctx context.Context) ([]ReactionTracker, error) + ListRecentChecks(ctx context.Context, arg ListRecentChecksParams) ([]ListRecentChecksRow, error) ListSessionsByProject(ctx context.Context, projectID string) ([]Session, error) - ListUnsentOutbox(ctx context.Context, limit int64) ([]ListUnsentOutboxRow, error) - MarkOutboxFailed(ctx context.Context, arg MarkOutboxFailedParams) error - MarkOutboxSent(ctx context.Context, arg MarkOutboxSentParams) error - MaxChangeLogSeq(ctx context.Context) (int64, error) - MinConsumerOffset(ctx context.Context) (int64, error) - // CAS update: succeeds only when the stored revision equals the caller's loaded - // revision (@expected_revision). 0 rows affected => revision mismatch. 
- UpdateSessionCAS(ctx context.Context, arg UpdateSessionCASParams) (int64, error) - UpsertConsumerOffset(ctx context.Context, arg UpsertConsumerOffsetParams) error + MaxChangeLogSeq(ctx context.Context) (interface{}, error) + NextSessionNum(ctx context.Context, projectID string) (int64, error) + ReadChangeLogAfter(ctx context.Context, arg ReadChangeLogAfterParams) ([]ChangeLog, error) + ReadChangeLogAfterForProject(ctx context.Context, arg ReadChangeLogAfterForProjectParams) ([]ChangeLog, error) + UpdateSession(ctx context.Context, arg UpdateSessionParams) error UpsertPR(ctx context.Context, arg UpsertPRParams) error + UpsertPRCheck(ctx context.Context, arg UpsertPRCheckParams) error + UpsertPRComment(ctx context.Context, arg UpsertPRCommentParams) error UpsertProject(ctx context.Context, arg UpsertProjectParams) error - UpsertReactionTracker(ctx context.Context, arg UpsertReactionTrackerParams) error - // Merge semantics: an empty incoming column is "leave unchanged", so a partial - // patch (e.g. spawn writing only the runtime handle) never clobbers a value set - // earlier (e.g. the branch set at creation). Mirrors the old per-key map merge. - UpsertSessionMetadata(ctx context.Context, arg UpsertSessionMetadataParams) error } var _ Querier = (*Queries)(nil) diff --git a/backend/internal/storage/sqlite/gen/reactions.sql.go b/backend/internal/storage/sqlite/gen/reactions.sql.go deleted file mode 100644 index dc7b01c2..00000000 --- a/backend/internal/storage/sqlite/gen/reactions.sql.go +++ /dev/null @@ -1,100 +0,0 @@ -// Code generated by sqlc. DO NOT EDIT. -// versions: -// sqlc v1.31.1 -// source: reactions.sql - -package gen - -import ( - "context" - "database/sql" -) - -const deleteReactionTracker = `-- name: DeleteReactionTracker :exec -DELETE FROM reaction_trackers WHERE session_id = ? AND reaction_key = ? 
-` - -type DeleteReactionTrackerParams struct { - SessionID string - ReactionKey string -} - -func (q *Queries) DeleteReactionTracker(ctx context.Context, arg DeleteReactionTrackerParams) error { - _, err := q.db.ExecContext(ctx, deleteReactionTracker, arg.SessionID, arg.ReactionKey) - return err -} - -const deleteSessionReactionTrackers = `-- name: DeleteSessionReactionTrackers :exec -DELETE FROM reaction_trackers WHERE session_id = ? -` - -func (q *Queries) DeleteSessionReactionTrackers(ctx context.Context, sessionID string) error { - _, err := q.db.ExecContext(ctx, deleteSessionReactionTrackers, sessionID) - return err -} - -const listReactionTrackers = `-- name: ListReactionTrackers :many -SELECT session_id, reaction_key, attempts, escalated, first_attempt_at, project_id -FROM reaction_trackers -` - -func (q *Queries) ListReactionTrackers(ctx context.Context) ([]ReactionTracker, error) { - rows, err := q.db.QueryContext(ctx, listReactionTrackers) - if err != nil { - return nil, err - } - defer rows.Close() - items := []ReactionTracker{} - for rows.Next() { - var i ReactionTracker - if err := rows.Scan( - &i.SessionID, - &i.ReactionKey, - &i.Attempts, - &i.Escalated, - &i.FirstAttemptAt, - &i.ProjectID, - ); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} - -const upsertReactionTracker = `-- name: UpsertReactionTracker :exec -INSERT INTO reaction_trackers (session_id, reaction_key, attempts, escalated, first_attempt_at, project_id) -VALUES (?, ?, ?, ?, ?, ?) 
-ON CONFLICT (session_id, reaction_key) DO UPDATE SET - attempts = excluded.attempts, - escalated = excluded.escalated, - first_attempt_at = excluded.first_attempt_at, - project_id = excluded.project_id -` - -type UpsertReactionTrackerParams struct { - SessionID string - ReactionKey string - Attempts int64 - Escalated int64 - FirstAttemptAt sql.NullTime - ProjectID string -} - -func (q *Queries) UpsertReactionTracker(ctx context.Context, arg UpsertReactionTrackerParams) error { - _, err := q.db.ExecContext(ctx, upsertReactionTracker, - arg.SessionID, - arg.ReactionKey, - arg.Attempts, - arg.Escalated, - arg.FirstAttemptAt, - arg.ProjectID, - ) - return err -} diff --git a/backend/internal/storage/sqlite/gen/sessions.sql.go b/backend/internal/storage/sqlite/gen/sessions.sql.go index 00d97ad6..5365a22c 100644 --- a/backend/internal/storage/sqlite/gen/sessions.sql.go +++ b/backend/internal/storage/sqlite/gen/sessions.sql.go @@ -11,8 +11,17 @@ import ( "time" ) +const deleteSession = `-- name: DeleteSession :exec +DELETE FROM sessions WHERE id = ? +` + +func (q *Queries) DeleteSession(ctx context.Context, id string) error { + _, err := q.db.ExecContext(ctx, deleteSession, id) + return err +} + const getSession = `-- name: GetSession :one -SELECT id, project_id, issue_id, kind, created_at, updated_at, revision, session_state, session_reason, pr_state, pr_reason, pr_number, pr_url, runtime_state, runtime_reason, activity_state, activity_last_at, activity_source, detecting_attempts, detecting_started_at, detecting_evidence_hash FROM sessions WHERE id = ? +SELECT id, project_id, num, issue_id, kind, harness, session_state, termination_reason, is_alive, activity_state, activity_last_at, activity_source, detecting_attempts, detecting_started_at, detecting_evidence_hash, branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt, created_at, updated_at FROM sessions WHERE id = ? 
` func (q *Queries) GetSession(ctx context.Context, id string) (Session, error) { @@ -21,117 +30,99 @@ func (q *Queries) GetSession(ctx context.Context, id string) (Session, error) { err := row.Scan( &i.ID, &i.ProjectID, + &i.Num, &i.IssueID, &i.Kind, - &i.CreatedAt, - &i.UpdatedAt, - &i.Revision, + &i.Harness, &i.SessionState, - &i.SessionReason, - &i.PrState, - &i.PrReason, - &i.PrNumber, - &i.PrUrl, - &i.RuntimeState, - &i.RuntimeReason, + &i.TerminationReason, + &i.IsAlive, &i.ActivityState, &i.ActivityLastAt, &i.ActivitySource, &i.DetectingAttempts, &i.DetectingStartedAt, &i.DetectingEvidenceHash, + &i.Branch, + &i.WorkspacePath, + &i.RuntimeHandleID, + &i.RuntimeName, + &i.AgentSessionID, + &i.Prompt, + &i.CreatedAt, + &i.UpdatedAt, ) return i, err } -const getSessionRevision = `-- name: GetSessionRevision :one -SELECT revision FROM sessions WHERE id = ? -` - -func (q *Queries) GetSessionRevision(ctx context.Context, id string) (int64, error) { - row := q.db.QueryRowContext(ctx, getSessionRevision, id) - var revision int64 - err := row.Scan(&revision) - return revision, err -} - -const insertSession = `-- name: InsertSession :execrows +const insertSession = `-- name: InsertSession :exec INSERT INTO sessions ( - id, project_id, issue_id, kind, created_at, updated_at, - revision, - session_state, session_reason, - pr_state, pr_reason, pr_number, pr_url, - runtime_state, runtime_reason, + id, project_id, num, issue_id, kind, harness, + session_state, termination_reason, is_alive, activity_state, activity_last_at, activity_source, - detecting_attempts, detecting_started_at, detecting_evidence_hash -) VALUES ( - ?, ?, ?, ?, ?, ?, - 1, - ?, ?, - ?, ?, ?, ?, - ?, ?, - ?, ?, ?, - ?, ?, ? 
-) -ON CONFLICT (id) DO NOTHING + detecting_attempts, detecting_started_at, detecting_evidence_hash, + branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt, + created_at, updated_at +) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ` type InsertSessionParams struct { ID string ProjectID string + Num int64 IssueID string Kind string - CreatedAt time.Time - UpdatedAt time.Time + Harness string SessionState string - SessionReason string - PrState string - PrReason string - PrNumber int64 - PrUrl string - RuntimeState string - RuntimeReason string + TerminationReason string + IsAlive int64 ActivityState string ActivityLastAt time.Time ActivitySource string DetectingAttempts sql.NullInt64 DetectingStartedAt sql.NullTime DetectingEvidenceHash sql.NullString + Branch string + WorkspacePath string + RuntimeHandleID string + RuntimeName string + AgentSessionID string + Prompt string + CreatedAt time.Time + UpdatedAt time.Time } -// CAS insert: only succeeds for a brand-new id. Incoming revision must be 0; -// the row is persisted at revision 1. 
-func (q *Queries) InsertSession(ctx context.Context, arg InsertSessionParams) (int64, error) { - result, err := q.db.ExecContext(ctx, insertSession, +func (q *Queries) InsertSession(ctx context.Context, arg InsertSessionParams) error { + _, err := q.db.ExecContext(ctx, insertSession, arg.ID, arg.ProjectID, + arg.Num, arg.IssueID, arg.Kind, - arg.CreatedAt, - arg.UpdatedAt, + arg.Harness, arg.SessionState, - arg.SessionReason, - arg.PrState, - arg.PrReason, - arg.PrNumber, - arg.PrUrl, - arg.RuntimeState, - arg.RuntimeReason, + arg.TerminationReason, + arg.IsAlive, arg.ActivityState, arg.ActivityLastAt, arg.ActivitySource, arg.DetectingAttempts, arg.DetectingStartedAt, arg.DetectingEvidenceHash, + arg.Branch, + arg.WorkspacePath, + arg.RuntimeHandleID, + arg.RuntimeName, + arg.AgentSessionID, + arg.Prompt, + arg.CreatedAt, + arg.UpdatedAt, ) - if err != nil { - return 0, err - } - return result.RowsAffected() + return err } const listAllSessions = `-- name: ListAllSessions :many -SELECT id, project_id, issue_id, kind, created_at, updated_at, revision, session_state, session_reason, pr_state, pr_reason, pr_number, pr_url, runtime_state, runtime_reason, activity_state, activity_last_at, activity_source, detecting_attempts, detecting_started_at, detecting_evidence_hash FROM sessions +SELECT id, project_id, num, issue_id, kind, harness, session_state, termination_reason, is_alive, activity_state, activity_last_at, activity_source, detecting_attempts, detecting_started_at, detecting_evidence_hash, branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt, created_at, updated_at FROM sessions ORDER BY project_id, num ` func (q *Queries) ListAllSessions(ctx context.Context) ([]Session, error) { @@ -146,25 +137,27 @@ func (q *Queries) ListAllSessions(ctx context.Context) ([]Session, error) { if err := rows.Scan( &i.ID, &i.ProjectID, + &i.Num, &i.IssueID, &i.Kind, - &i.CreatedAt, - &i.UpdatedAt, - &i.Revision, + &i.Harness, &i.SessionState, - 
&i.SessionReason, - &i.PrState, - &i.PrReason, - &i.PrNumber, - &i.PrUrl, - &i.RuntimeState, - &i.RuntimeReason, + &i.TerminationReason, + &i.IsAlive, &i.ActivityState, &i.ActivityLastAt, &i.ActivitySource, &i.DetectingAttempts, &i.DetectingStartedAt, &i.DetectingEvidenceHash, + &i.Branch, + &i.WorkspacePath, + &i.RuntimeHandleID, + &i.RuntimeName, + &i.AgentSessionID, + &i.Prompt, + &i.CreatedAt, + &i.UpdatedAt, ); err != nil { return nil, err } @@ -180,7 +173,7 @@ func (q *Queries) ListAllSessions(ctx context.Context) ([]Session, error) { } const listSessionsByProject = `-- name: ListSessionsByProject :many -SELECT id, project_id, issue_id, kind, created_at, updated_at, revision, session_state, session_reason, pr_state, pr_reason, pr_number, pr_url, runtime_state, runtime_reason, activity_state, activity_last_at, activity_source, detecting_attempts, detecting_started_at, detecting_evidence_hash FROM sessions WHERE project_id = ? +SELECT id, project_id, num, issue_id, kind, harness, session_state, termination_reason, is_alive, activity_state, activity_last_at, activity_source, detecting_attempts, detecting_started_at, detecting_evidence_hash, branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt, created_at, updated_at FROM sessions WHERE project_id = ? 
ORDER BY num ` func (q *Queries) ListSessionsByProject(ctx context.Context, projectID string) ([]Session, error) { @@ -195,25 +188,27 @@ func (q *Queries) ListSessionsByProject(ctx context.Context, projectID string) ( if err := rows.Scan( &i.ID, &i.ProjectID, + &i.Num, &i.IssueID, &i.Kind, - &i.CreatedAt, - &i.UpdatedAt, - &i.Revision, + &i.Harness, &i.SessionState, - &i.SessionReason, - &i.PrState, - &i.PrReason, - &i.PrNumber, - &i.PrUrl, - &i.RuntimeState, - &i.RuntimeReason, + &i.TerminationReason, + &i.IsAlive, &i.ActivityState, &i.ActivityLastAt, &i.ActivitySource, &i.DetectingAttempts, &i.DetectingStartedAt, &i.DetectingEvidenceHash, + &i.Branch, + &i.WorkspacePath, + &i.RuntimeHandleID, + &i.RuntimeName, + &i.AgentSessionID, + &i.Prompt, + &i.CreatedAt, + &i.UpdatedAt, ); err != nil { return nil, err } @@ -228,80 +223,73 @@ func (q *Queries) ListSessionsByProject(ctx context.Context, projectID string) ( return items, nil } -const updateSessionCAS = `-- name: UpdateSessionCAS :execrows +const nextSessionNum = `-- name: NextSessionNum :one +SELECT COALESCE(MAX(num), 0) + 1 AS next FROM sessions WHERE project_id = ? +` + +func (q *Queries) NextSessionNum(ctx context.Context, projectID string) (int64, error) { + row := q.db.QueryRowContext(ctx, nextSessionNum, projectID) + var next int64 + err := row.Scan(&next) + return next, err +} + +const updateSession = `-- name: UpdateSession :exec UPDATE sessions SET - project_id = ?, - issue_id = ?, - kind = ?, - updated_at = ?, - revision = revision + 1, - session_state = ?, - session_reason = ?, - pr_state = ?, - pr_reason = ?, - pr_number = ?, - pr_url = ?, - runtime_state = ?, - runtime_reason = ?, - activity_state = ?, - activity_last_at = ?, - activity_source = ?, - detecting_attempts = ?, - detecting_started_at = ?, - detecting_evidence_hash = ? -WHERE id = ? AND revision = ? 
+ issue_id = ?, kind = ?, harness = ?, + session_state = ?, termination_reason = ?, is_alive = ?, + activity_state = ?, activity_last_at = ?, activity_source = ?, + detecting_attempts = ?, detecting_started_at = ?, detecting_evidence_hash = ?, + branch = ?, workspace_path = ?, runtime_handle_id = ?, runtime_name = ?, agent_session_id = ?, prompt = ?, + updated_at = ? +WHERE id = ? ` -type UpdateSessionCASParams struct { - ProjectID string +type UpdateSessionParams struct { IssueID string Kind string - UpdatedAt time.Time + Harness string SessionState string - SessionReason string - PrState string - PrReason string - PrNumber int64 - PrUrl string - RuntimeState string - RuntimeReason string + TerminationReason string + IsAlive int64 ActivityState string ActivityLastAt time.Time ActivitySource string DetectingAttempts sql.NullInt64 DetectingStartedAt sql.NullTime DetectingEvidenceHash sql.NullString + Branch string + WorkspacePath string + RuntimeHandleID string + RuntimeName string + AgentSessionID string + Prompt string + UpdatedAt time.Time ID string - Revision int64 } -// CAS update: succeeds only when the stored revision equals the caller's loaded -// revision (@expected_revision). 0 rows affected => revision mismatch. 
-func (q *Queries) UpdateSessionCAS(ctx context.Context, arg UpdateSessionCASParams) (int64, error) { - result, err := q.db.ExecContext(ctx, updateSessionCAS, - arg.ProjectID, +func (q *Queries) UpdateSession(ctx context.Context, arg UpdateSessionParams) error { + _, err := q.db.ExecContext(ctx, updateSession, arg.IssueID, arg.Kind, - arg.UpdatedAt, + arg.Harness, arg.SessionState, - arg.SessionReason, - arg.PrState, - arg.PrReason, - arg.PrNumber, - arg.PrUrl, - arg.RuntimeState, - arg.RuntimeReason, + arg.TerminationReason, + arg.IsAlive, arg.ActivityState, arg.ActivityLastAt, arg.ActivitySource, arg.DetectingAttempts, arg.DetectingStartedAt, arg.DetectingEvidenceHash, + arg.Branch, + arg.WorkspacePath, + arg.RuntimeHandleID, + arg.RuntimeName, + arg.AgentSessionID, + arg.Prompt, + arg.UpdatedAt, arg.ID, - arg.Revision, ) - if err != nil { - return 0, err - } - return result.RowsAffected() + return err } diff --git a/backend/internal/storage/sqlite/mapping.go b/backend/internal/storage/sqlite/mapping.go index 39ae2127..792854cf 100644 --- a/backend/internal/storage/sqlite/mapping.go +++ b/backend/internal/storage/sqlite/mapping.go @@ -7,104 +7,100 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" ) -// recordToInsert maps a domain record to the generated insert params. The -// revision column is fixed to 1 by the query itself (insert path), so it is not -// carried here. -func recordToInsert(rec domain.SessionRecord) gen.InsertSessionParams { - lc := rec.Lifecycle - da, ds, dh := detectingToNull(lc.Detecting) +func boolToInt(b bool) int64 { + if b { + return 1 + } + return 0 +} + +// rowToRecord maps a stored session row to a domain record. The folded-in +// operational columns become Metadata; the canonical lifecycle is reassembled +// from the typed columns. Display status is never reconstructed here. 
+func rowToRecord(row gen.Session) domain.SessionRecord { + return domain.SessionRecord{ + ID: domain.SessionID(row.ID), + ProjectID: domain.ProjectID(row.ProjectID), + IssueID: domain.IssueID(row.IssueID), + Kind: domain.SessionKind(row.Kind), + Lifecycle: domain.CanonicalSessionLifecycle{ + Version: domain.LifecycleVersion, + Harness: domain.AgentHarness(row.Harness), + IsAlive: row.IsAlive != 0, + Session: domain.SessionSubstate{State: domain.SessionState(row.SessionState)}, + TerminationReason: domain.TerminationReason(row.TerminationReason), + Activity: domain.ActivitySubstate{ + State: domain.ActivityState(row.ActivityState), + LastActivityAt: row.ActivityLastAt, + Source: domain.ActivitySource(row.ActivitySource), + }, + Detecting: nullToDetecting(row), + }, + Metadata: domain.SessionMetadata{ + Branch: row.Branch, + WorkspacePath: row.WorkspacePath, + RuntimeHandleID: row.RuntimeHandleID, + RuntimeName: row.RuntimeName, + AgentSessionID: row.AgentSessionID, + Prompt: row.Prompt, + }, + CreatedAt: row.CreatedAt, + UpdatedAt: row.UpdatedAt, + } +} + +func recordToInsert(rec domain.SessionRecord, num int64) gen.InsertSessionParams { + da, ds, dh := detectingToNull(rec.Lifecycle.Detecting) return gen.InsertSessionParams{ ID: string(rec.ID), ProjectID: string(rec.ProjectID), + Num: num, IssueID: string(rec.IssueID), Kind: string(rec.Kind), - CreatedAt: rec.CreatedAt, - UpdatedAt: rec.UpdatedAt, - SessionState: string(lc.Session.State), - SessionReason: string(lc.Session.Reason), - PrState: string(lc.PR.State), - PrReason: string(lc.PR.Reason), - PrNumber: int64(lc.PR.Number), - PrUrl: lc.PR.URL, - RuntimeState: string(lc.Runtime.State), - RuntimeReason: string(lc.Runtime.Reason), - ActivityState: string(lc.Activity.State), - ActivityLastAt: lc.Activity.LastActivityAt, - ActivitySource: string(lc.Activity.Source), + Harness: string(rec.Lifecycle.Harness), + SessionState: string(rec.Lifecycle.Session.State), + TerminationReason: 
string(rec.Lifecycle.TerminationReason), + IsAlive: boolToInt(rec.Lifecycle.IsAlive), + ActivityState: string(rec.Lifecycle.Activity.State), + ActivityLastAt: rec.Lifecycle.Activity.LastActivityAt, + ActivitySource: string(rec.Lifecycle.Activity.Source), DetectingAttempts: da, DetectingStartedAt: ds, DetectingEvidenceHash: dh, + Branch: rec.Metadata.Branch, + WorkspacePath: rec.Metadata.WorkspacePath, + RuntimeHandleID: rec.Metadata.RuntimeHandleID, + RuntimeName: rec.Metadata.RuntimeName, + AgentSessionID: rec.Metadata.AgentSessionID, + Prompt: rec.Metadata.Prompt, + CreatedAt: rec.CreatedAt, + UpdatedAt: rec.UpdatedAt, } } -// recordToUpdate maps a domain record to the CAS update params. expectedRevision -// is the caller's loaded revision, used in the WHERE clause for the CAS check. -func recordToUpdate(rec domain.SessionRecord, expectedRevision int64) gen.UpdateSessionCASParams { - lc := rec.Lifecycle - da, ds, dh := detectingToNull(lc.Detecting) - return gen.UpdateSessionCASParams{ - ProjectID: string(rec.ProjectID), +func recordToUpdate(rec domain.SessionRecord) gen.UpdateSessionParams { + da, ds, dh := detectingToNull(rec.Lifecycle.Detecting) + return gen.UpdateSessionParams{ IssueID: string(rec.IssueID), Kind: string(rec.Kind), - UpdatedAt: rec.UpdatedAt, - SessionState: string(lc.Session.State), - SessionReason: string(lc.Session.Reason), - PrState: string(lc.PR.State), - PrReason: string(lc.PR.Reason), - PrNumber: int64(lc.PR.Number), - PrUrl: lc.PR.URL, - RuntimeState: string(lc.Runtime.State), - RuntimeReason: string(lc.Runtime.Reason), - ActivityState: string(lc.Activity.State), - ActivityLastAt: lc.Activity.LastActivityAt, - ActivitySource: string(lc.Activity.Source), + Harness: string(rec.Lifecycle.Harness), + SessionState: string(rec.Lifecycle.Session.State), + TerminationReason: string(rec.Lifecycle.TerminationReason), + IsAlive: boolToInt(rec.Lifecycle.IsAlive), + ActivityState: string(rec.Lifecycle.Activity.State), + ActivityLastAt: 
rec.Lifecycle.Activity.LastActivityAt, + ActivitySource: string(rec.Lifecycle.Activity.Source), DetectingAttempts: da, DetectingStartedAt: ds, DetectingEvidenceHash: dh, + Branch: rec.Metadata.Branch, + WorkspacePath: rec.Metadata.WorkspacePath, + RuntimeHandleID: rec.Metadata.RuntimeHandleID, + RuntimeName: rec.Metadata.RuntimeName, + AgentSessionID: rec.Metadata.AgentSessionID, + Prompt: rec.Metadata.Prompt, + UpdatedAt: rec.UpdatedAt, ID: string(rec.ID), - Revision: expectedRevision, - } -} - -// rowToRecord maps a stored session row back to a domain record. Metadata is -// deliberately left nil: it is a side-channel (session_metadata) read only by -// GetMetadata, never reconstructed here — mirroring the in-memory fakeStore. -func rowToRecord(row gen.Session) domain.SessionRecord { - return domain.SessionRecord{ - ID: domain.SessionID(row.ID), - ProjectID: domain.ProjectID(row.ProjectID), - IssueID: domain.IssueID(row.IssueID), - Kind: domain.SessionKind(row.Kind), - Lifecycle: rowToLifecycle(row), - CreatedAt: row.CreatedAt, - UpdatedAt: row.UpdatedAt, - } -} - -func rowToLifecycle(row gen.Session) domain.CanonicalSessionLifecycle { - return domain.CanonicalSessionLifecycle{ - Version: domain.LifecycleVersion, - Revision: int(row.Revision), - Session: domain.SessionSubstate{ - State: domain.SessionState(row.SessionState), - Reason: domain.SessionReason(row.SessionReason), - }, - PR: domain.PRSubstate{ - State: domain.PRState(row.PrState), - Reason: domain.PRReason(row.PrReason), - Number: int(row.PrNumber), - URL: row.PrUrl, - }, - Runtime: domain.RuntimeSubstate{ - State: domain.RuntimeState(row.RuntimeState), - Reason: domain.RuntimeReason(row.RuntimeReason), - }, - Activity: domain.ActivitySubstate{ - State: domain.ActivityState(row.ActivityState), - LastActivityAt: row.ActivityLastAt, - Source: domain.ActivitySource(row.ActivitySource), - }, - Detecting: nullToDetecting(row), } } diff --git a/backend/internal/storage/sqlite/migrations/0001_init.sql 
b/backend/internal/storage/sqlite/migrations/0001_init.sql index 38224125..6534816d 100644 --- a/backend/internal/storage/sqlite/migrations/0001_init.sql +++ b/backend/internal/storage/sqlite/migrations/0001_init.sql @@ -1,116 +1,213 @@ -- +goose Up -- +goose StatementBegin --- sessions holds identity + the canonical lifecycle as typed columns. The --- display status is NEVER stored (it is derived on read). Metadata is NOT here — --- it lives in session_metadata, written by a side-channel that bypasses CDC. -CREATE TABLE sessions ( - id TEXT PRIMARY KEY, - project_id TEXT NOT NULL, - issue_id TEXT NOT NULL DEFAULT '', - kind TEXT NOT NULL, - created_at TIMESTAMP NOT NULL, - updated_at TIMESTAMP NOT NULL, - - -- canonical lifecycle: revision is the optimistic-concurrency (CAS) counter, - -- bumped only by the storage layer's Upsert. - revision INTEGER NOT NULL, - - session_state TEXT NOT NULL, - session_reason TEXT NOT NULL, - - pr_state TEXT NOT NULL, - pr_reason TEXT NOT NULL, - pr_number INTEGER NOT NULL DEFAULT 0, - pr_url TEXT NOT NULL DEFAULT '', - - runtime_state TEXT NOT NULL, - runtime_reason TEXT NOT NULL, +-- projects is the durable registry of repos AO manages (the SQLite twin of the +-- YAML config). id is a short human/LLM-friendly slug (mer, ao) with a numeric +-- suffix on collision (ao, ao1, ao2). Soft-delete via archived_at keeps the row +-- so a session's project_id always resolves. +CREATE TABLE projects ( + id TEXT PRIMARY KEY, + path TEXT NOT NULL, + repo_origin_url TEXT NOT NULL DEFAULT '', + display_name TEXT NOT NULL DEFAULT '', + registered_at TIMESTAMP NOT NULL, + archived_at TIMESTAMP +); - activity_state TEXT NOT NULL, - activity_last_at TIMESTAMP NOT NULL, - activity_source TEXT NOT NULL, +-- sessions is the canonical record. id is "{project_id}-{num}" (e.g. mer-1) — a +-- single string key, so every inbound FK is single-column. num is the per-project +-- counter (computed at insert under the write mutex). 
Operational metadata is +-- folded in (no separate table). is_alive replaces the old runtime axis; there is +-- no revision column — the per-session write mutex serializes and change_log.seq +-- orders. The display status is derived on read (from this + the pr row), never +-- stored. +CREATE TABLE sessions ( + id TEXT PRIMARY KEY, + project_id TEXT NOT NULL REFERENCES projects (id), + num INTEGER NOT NULL, + issue_id TEXT NOT NULL DEFAULT '', + kind TEXT NOT NULL DEFAULT 'worker', + harness TEXT NOT NULL DEFAULT '' + CHECK (harness IN ('', 'claude-code', 'codex', 'aider', 'opencode')), + + session_state TEXT NOT NULL + CHECK (session_state IN ('not_started', 'working', 'idle', 'needs_input', 'stuck', 'detecting', 'done', 'terminated')), + -- only terminal sessions carry a reason; '' otherwise. + termination_reason TEXT NOT NULL DEFAULT '' + CHECK (termination_reason IN ('', 'manually_killed', 'runtime_lost', 'agent_process_exited', 'probe_failure', 'error_in_process', 'auto_cleanup', 'pr_merged')), + is_alive INTEGER NOT NULL DEFAULT 0, + + activity_state TEXT NOT NULL DEFAULT 'idle', + activity_last_at TIMESTAMP NOT NULL, + activity_source TEXT NOT NULL DEFAULT 'none', -- detecting quarantine memory; NULL when the session is not in detecting. 
- detecting_attempts INTEGER, - detecting_started_at TIMESTAMP, - detecting_evidence_hash TEXT + detecting_attempts INTEGER, + detecting_started_at TIMESTAMP, + detecting_evidence_hash TEXT, + + -- folded-in operational handles (was the session_metadata table) + branch TEXT NOT NULL DEFAULT '', + workspace_path TEXT NOT NULL DEFAULT '', + runtime_handle_id TEXT NOT NULL DEFAULT '', + runtime_name TEXT NOT NULL DEFAULT '', + agent_session_id TEXT NOT NULL DEFAULT '', + prompt TEXT NOT NULL DEFAULT '', + + created_at TIMESTAMP NOT NULL, + updated_at TIMESTAMP NOT NULL, + + UNIQUE (project_id, num) ); - CREATE INDEX idx_sessions_project ON sessions (project_id); --- session_metadata is the 1:1 typed side-channel for a session's operational --- handles and seed inputs — the fields the Session Manager and reaper need but --- that are NOT part of the canonical lifecycle. One row per session, named --- columns (not a free-form key/value bag), so the set of metadata a session can --- carry is fixed by the schema. Written by PatchMetadata; never bumps revision --- and never emits a CDC event. -CREATE TABLE session_metadata ( - session_id TEXT PRIMARY KEY REFERENCES sessions (id) ON DELETE CASCADE, - branch TEXT NOT NULL DEFAULT '', - workspace_path TEXT NOT NULL DEFAULT '', - runtime_handle_id TEXT NOT NULL DEFAULT '', - runtime_name TEXT NOT NULL DEFAULT '', - agent_session_id TEXT NOT NULL DEFAULT '', - prompt TEXT NOT NULL DEFAULT '', - updated_at TIMESTAMP NOT NULL +-- pr holds PR facts keyed by the normalized PR URL. One session can own many PRs +-- (session_id FK), but a PR belongs to one session (enforced at runtime). ci_state +-- is the rolled-up status; the per-check history lives in pr_checks. 
+CREATE TABLE pr ( + url TEXT PRIMARY KEY, + session_id TEXT NOT NULL REFERENCES sessions (id) ON DELETE CASCADE, + number INTEGER NOT NULL DEFAULT 0, + pr_state TEXT NOT NULL DEFAULT 'open' + CHECK (pr_state IN ('draft', 'open', 'merged', 'closed')), + review_decision TEXT NOT NULL DEFAULT 'none' + CHECK (review_decision IN ('none', 'approved', 'changes_requested', 'review_required')), + ci_state TEXT NOT NULL DEFAULT 'unknown' + CHECK (ci_state IN ('unknown', 'pending', 'passing', 'failing')), + mergeability TEXT NOT NULL DEFAULT 'unknown' + CHECK (mergeability IN ('unknown', 'mergeable', 'conflicting', 'blocked', 'unstable')), + updated_at TIMESTAMP NOT NULL +); +CREATE INDEX idx_pr_session ON pr (session_id); + +-- pr_checks is CI run history: one row per (PR, check, commit). The CI-fix-loop +-- brake is a LIMIT 3 query over it ("last 3 runs of this check all failed?") — no +-- counter is stored. Re-polling the same commit upserts the same row. +CREATE TABLE pr_checks ( + pr_url TEXT NOT NULL REFERENCES pr (url) ON DELETE CASCADE, + name TEXT NOT NULL, + commit_hash TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'unknown' + CHECK (status IN ('unknown', 'queued', 'in_progress', 'passed', 'failed', 'skipped', 'cancelled')), + url TEXT NOT NULL DEFAULT '', + log_tail TEXT NOT NULL DEFAULT '', + created_at TIMESTAMP NOT NULL, + PRIMARY KEY (pr_url, name, commit_hash) +); +CREATE INDEX idx_pr_checks_lookup ON pr_checks (pr_url, name, created_at); + +-- pr_comment holds review comments, persisted so a session page does not wait on +-- GitHub. Cascades from pr. 
+CREATE TABLE pr_comment ( + pr_url TEXT NOT NULL REFERENCES pr (url) ON DELETE CASCADE, + comment_id TEXT NOT NULL, + author TEXT NOT NULL DEFAULT '', + file TEXT NOT NULL DEFAULT '', + line INTEGER NOT NULL DEFAULT 0, + body TEXT NOT NULL DEFAULT '', + resolved INTEGER NOT NULL DEFAULT 0, + created_at TIMESTAMP NOT NULL, + PRIMARY KEY (pr_url, comment_id) ); --- change_log is the durable, ordered record of every canonical write. seq is the --- monotonic CDC ordering/idempotency key. +-- change_log is the durable, append-only CDC event log. seq is the monotonic +-- ordering + idempotency key. Rows are written by TRIGGERS on the user-visible +-- tables (DB-native capture, atomic with the change) — never by application +-- emit-code. project_id is required, session_id is nullable (project-level events +-- have no session). The log is immutable (no published flag); consumers track +-- their own offset (SSE Last-Event-ID). CREATE TABLE change_log ( seq INTEGER PRIMARY KEY AUTOINCREMENT, - session_id TEXT NOT NULL, + project_id TEXT NOT NULL REFERENCES projects (id), + session_id TEXT REFERENCES sessions (id), event_type TEXT NOT NULL, - revision INTEGER NOT NULL, payload TEXT NOT NULL, - created_at TIMESTAMP NOT NULL + created_at TIMESTAMP NOT NULL DEFAULT (datetime('now')) ); +CREATE INDEX idx_change_log_project ON change_log (project_id, seq); --- outbox is the transactional-outbox: one unsent row per canonical write, drained --- by the publisher into JSONL. change_log_seq links it to its change_log row. -CREATE TABLE outbox ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - change_log_seq INTEGER NOT NULL REFERENCES change_log (seq), - sent INTEGER NOT NULL DEFAULT 0, - sent_at TIMESTAMP, - attempts INTEGER NOT NULL DEFAULT 0, - last_error TEXT NOT NULL DEFAULT '', - created_at TIMESTAMP NOT NULL -); +-- +goose StatementEnd -CREATE INDEX idx_outbox_unsent ON outbox (change_log_seq) WHERE sent = 0; +-- CDC capture triggers. 
Each is its own goose statement (the trigger body holds +-- semicolons). They write change_log atomically with the originating change, so +-- the application never emits events — it just writes sessions/pr/pr_checks. --- consumer_offsets is the durable per-consumer cursor (at-least-once delivery). -CREATE TABLE consumer_offsets ( - consumer TEXT PRIMARY KEY, - last_seq INTEGER NOT NULL DEFAULT 0, - updated_at TIMESTAMP NOT NULL -); +-- +goose StatementBegin +CREATE TRIGGER sessions_cdc_insert +AFTER INSERT ON sessions +BEGIN + INSERT INTO change_log (project_id, session_id, event_type, payload, created_at) + VALUES (NEW.project_id, NEW.id, 'session_created', + json_object('id', NEW.id, 'state', NEW.session_state, 'terminationReason', NEW.termination_reason, + 'isAlive', NEW.is_alive, 'activity', NEW.activity_state), + NEW.updated_at); +END; +-- +goose StatementEnd --- reaction_trackers is the durable escalation budget (persisted so a restart does --- not re-fire human pages). Off the canonical CDC path. Mirrors the LCM's --- in-memory reactionTracker: attempts (numeric budget), escalated (silences --- further auto-dispatch), first_attempt_at (duration-escalation anchor), --- project_id (captured at first attempt for the escalation event). 
-CREATE TABLE reaction_trackers ( - session_id TEXT NOT NULL, - reaction_key TEXT NOT NULL, - attempts INTEGER NOT NULL DEFAULT 0, - escalated INTEGER NOT NULL DEFAULT 0, - first_attempt_at TIMESTAMP, - project_id TEXT NOT NULL DEFAULT '', - PRIMARY KEY (session_id, reaction_key) -); +-- +goose StatementBegin +CREATE TRIGGER sessions_cdc_update +AFTER UPDATE ON sessions +WHEN OLD.session_state <> NEW.session_state + OR OLD.termination_reason <> NEW.termination_reason + OR OLD.is_alive <> NEW.is_alive + OR OLD.activity_state <> NEW.activity_state +BEGIN + INSERT INTO change_log (project_id, session_id, event_type, payload, created_at) + VALUES (NEW.project_id, NEW.id, 'session_updated', + json_object('id', NEW.id, 'state', NEW.session_state, 'terminationReason', NEW.termination_reason, + 'isAlive', NEW.is_alive, 'activity', NEW.activity_state), + NEW.updated_at); +END; +-- +goose StatementEnd + +-- +goose StatementBegin +CREATE TRIGGER pr_cdc_insert +AFTER INSERT ON pr +BEGIN + INSERT INTO change_log (project_id, session_id, event_type, payload, created_at) + VALUES ((SELECT project_id FROM sessions WHERE id = NEW.session_id), NEW.session_id, 'pr_created', + json_object('url', NEW.url, 'session', NEW.session_id, 'state', NEW.pr_state, + 'ci', NEW.ci_state, 'review', NEW.review_decision, 'mergeability', NEW.mergeability), + NEW.updated_at); +END; +-- +goose StatementEnd + +-- +goose StatementBegin +CREATE TRIGGER pr_cdc_update +AFTER UPDATE ON pr +WHEN OLD.pr_state <> NEW.pr_state + OR OLD.ci_state <> NEW.ci_state + OR OLD.review_decision <> NEW.review_decision + OR OLD.mergeability <> NEW.mergeability +BEGIN + INSERT INTO change_log (project_id, session_id, event_type, payload, created_at) + VALUES ((SELECT project_id FROM sessions WHERE id = NEW.session_id), NEW.session_id, 'pr_updated', + json_object('url', NEW.url, 'session', NEW.session_id, 'state', NEW.pr_state, + 'ci', NEW.ci_state, 'review', NEW.review_decision, 'mergeability', NEW.mergeability), + 
NEW.updated_at); +END; +-- +goose StatementEnd +-- +goose StatementBegin +CREATE TRIGGER pr_checks_cdc_insert +AFTER INSERT ON pr_checks +BEGIN + INSERT INTO change_log (project_id, session_id, event_type, payload, created_at) + VALUES ( + (SELECT s.project_id FROM pr p JOIN sessions s ON s.id = p.session_id WHERE p.url = NEW.pr_url), + (SELECT session_id FROM pr WHERE url = NEW.pr_url), + 'pr_check_recorded', + json_object('pr', NEW.pr_url, 'name', NEW.name, 'commit', NEW.commit_hash, 'status', NEW.status), + NEW.created_at); +END; -- +goose StatementEnd -- +goose Down -- +goose StatementBegin -DROP TABLE reaction_trackers; -DROP TABLE consumer_offsets; -DROP TABLE outbox; DROP TABLE change_log; -DROP TABLE session_metadata; +DROP TABLE pr_comment; +DROP TABLE pr_checks; +DROP TABLE pr; DROP TABLE sessions; +DROP TABLE projects; -- +goose StatementEnd diff --git a/backend/internal/storage/sqlite/migrations/0002_pr_projects.sql b/backend/internal/storage/sqlite/migrations/0002_pr_projects.sql deleted file mode 100644 index da987ed5..00000000 --- a/backend/internal/storage/sqlite/migrations/0002_pr_projects.sql +++ /dev/null @@ -1,85 +0,0 @@ --- +goose Up --- +goose StatementBegin - --- projects is the durable registry of repos AO manages, the SQLite twin of the --- old YAML config (global config.yaml + per-repo agent-orchestrator.yaml). id is --- the {basename}_{sha256(path:originUrl)[:10]} key the session layer references --- via sessions.project_id. The relationship is app-enforced, NOT a hard FK: --- SQLite cannot ALTER ADD a FK without a table rebuild, and an existing-session --- backfill may land sessions before their project row. 
-CREATE TABLE projects ( - id TEXT PRIMARY KEY, - path TEXT NOT NULL, - repo_owner TEXT NOT NULL DEFAULT '', - repo_name TEXT NOT NULL DEFAULT '', - repo_platform TEXT NOT NULL DEFAULT '', - repo_origin_url TEXT NOT NULL DEFAULT '', - default_branch TEXT NOT NULL DEFAULT '', - display_name TEXT NOT NULL DEFAULT '', - session_prefix TEXT NOT NULL DEFAULT '', - source TEXT NOT NULL DEFAULT '', - registered_at TIMESTAMP NOT NULL, - - -- soft delete: NULL = active. Archiving keeps the row so a session's - -- project_id always resolves (there is no FK to enforce it), avoiding - -- dangling references; active-only reads filter archived_at IS NULL. - archived_at TIMESTAMP -); - --- pr is the SCM observer's per-session cache of the rich PR facts that do NOT --- live in the canonical lifecycle (which keeps only pr_state/reason/number/url). --- 1:1 with a session (a PR is tied to a session by its branch), written by the --- SCM observer OFF the canonical CDC path (no revision bump, no change_log/outbox --- event), and cascades away with its session. Scalar facts are typed columns — --- review_decision/mergeability/ci_state are CHECK-constrained enums and the CI --- counts are integers, not opaque strings; the list facts (individual checks and --- review comments) are normalized into pr_check / pr_comment. 
-CREATE TABLE pr ( - session_id TEXT PRIMARY KEY REFERENCES sessions (id) ON DELETE CASCADE, - review_decision TEXT NOT NULL DEFAULT 'none' - CHECK (review_decision IN ('none', 'approved', 'changes_requested', 'review_required')), - mergeability TEXT NOT NULL DEFAULT 'unknown' - CHECK (mergeability IN ('unknown', 'mergeable', 'conflicting', 'blocked', 'unstable')), - ci_state TEXT NOT NULL DEFAULT 'unknown' - CHECK (ci_state IN ('unknown', 'pending', 'passing', 'failing')), - ci_passed INTEGER NOT NULL DEFAULT 0, - ci_failed INTEGER NOT NULL DEFAULT 0, - ci_pending INTEGER NOT NULL DEFAULT 0, - ci_log_tail TEXT NOT NULL DEFAULT '', - last_fetched_at TIMESTAMP NOT NULL -); - --- pr_check is one CI check belonging to a pr (the normalized form of the old --- ci_summary string). It cascades from pr, so it cannot outlive its PR facts. -CREATE TABLE pr_check ( - session_id TEXT NOT NULL REFERENCES pr (session_id) ON DELETE CASCADE, - name TEXT NOT NULL, - status TEXT NOT NULL DEFAULT 'unknown' - CHECK (status IN ('unknown', 'queued', 'in_progress', 'passed', 'failed', 'skipped', 'cancelled')), - url TEXT NOT NULL DEFAULT '', - PRIMARY KEY (session_id, name) -); - --- pr_comment is one unresolved review comment belonging to a pr (the normalized --- form of the old pending_comments JSON-in-a-string). Cascades from pr. 
-CREATE TABLE pr_comment ( - session_id TEXT NOT NULL REFERENCES pr (session_id) ON DELETE CASCADE, - comment_id TEXT NOT NULL, - author TEXT NOT NULL DEFAULT '', - file TEXT NOT NULL DEFAULT '', - line INTEGER NOT NULL DEFAULT 0, - body TEXT NOT NULL DEFAULT '', - resolved INTEGER NOT NULL DEFAULT 0, - created_at TIMESTAMP NOT NULL, - PRIMARY KEY (session_id, comment_id) -); - --- +goose StatementEnd - --- +goose Down --- +goose StatementBegin -DROP TABLE pr_comment; -DROP TABLE pr_check; -DROP TABLE pr; -DROP TABLE projects; --- +goose StatementEnd diff --git a/backend/internal/storage/sqlite/pr_projects_test.go b/backend/internal/storage/sqlite/pr_projects_test.go deleted file mode 100644 index 58227b1f..00000000 --- a/backend/internal/storage/sqlite/pr_projects_test.go +++ /dev/null @@ -1,210 +0,0 @@ -package sqlite - -import ( - "context" - "reflect" - "testing" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/ports" -) - -func TestProjectUpsertGetListDelete(t *testing.T) { - s := newTestStore(t) - ctx := context.Background() - now := time.Now().UTC().Truncate(time.Second) - - if _, ok, err := s.GetProject(ctx, "p1"); err != nil || ok { - t.Fatalf("get missing: ok=%v err=%v", ok, err) - } - - p := ProjectRow{ - ID: "p1", Path: "/repo", RepoOwner: "acme", RepoName: "widget", - RepoPlatform: "github", RepoOriginURL: "git@github.com:acme/widget.git", - DefaultBranch: "main", DisplayName: "Widget", SessionPrefix: "wid", - Source: "local", RegisteredAt: now, - } - if err := s.UpsertProject(ctx, p); err != nil { - t.Fatalf("upsert: %v", err) - } - - got, ok, err := s.GetProject(ctx, "p1") - if err != nil || !ok { - t.Fatalf("get: ok=%v err=%v", ok, err) - } - if got != p { - t.Fatalf("round-trip mismatch:\n got %+v\nwant %+v", got, p) - } - - // Upsert again with a changed field updates in place (no duplicate). 
- p.DisplayName = "Widget 2" - if err := s.UpsertProject(ctx, p); err != nil { - t.Fatalf("re-upsert: %v", err) - } - list, err := s.ListProjects(ctx) - if err != nil { - t.Fatalf("list: %v", err) - } - if len(list) != 1 || list[0].DisplayName != "Widget 2" { - t.Fatalf("list after re-upsert = %+v", list) - } - - if err := s.DeleteProject(ctx, "p1"); err != nil { - t.Fatalf("delete: %v", err) - } - if _, ok, _ := s.GetProject(ctx, "p1"); ok { - t.Fatal("project should be gone after delete") - } -} - -func TestArchiveProjectHidesFromListButGetResolves(t *testing.T) { - s := newTestStore(t) - ctx := context.Background() - now := time.Now().UTC().Truncate(time.Second) - - if err := s.UpsertProject(ctx, ProjectRow{ID: "p1", Path: "/repo", RegisteredAt: now}); err != nil { - t.Fatalf("upsert: %v", err) - } - if err := s.ArchiveProject(ctx, "p1", now); err != nil { - t.Fatalf("archive: %v", err) - } - - // Active-only list hides it. - list, err := s.ListProjects(ctx) - if err != nil { - t.Fatalf("list: %v", err) - } - if len(list) != 0 { - t.Fatalf("archived project should not appear in ListProjects, got %+v", list) - } - - // Get still resolves it (a session's project_id must not dangle) and reports - // the archived marker. - got, ok, err := s.GetProject(ctx, "p1") - if err != nil || !ok { - t.Fatalf("get archived: ok=%v err=%v", ok, err) - } - if got.ArchivedAt.IsZero() { - t.Fatal("archived project should carry a non-zero ArchivedAt") - } -} - -func TestPRUpsertGetDelete(t *testing.T) { - s := newTestStore(t) - ctx := context.Background() - now := time.Now().UTC().Truncate(time.Second) - - // pr FKs sessions(id); seed the session first. 
- if err := s.Upsert(ctx, sampleRecord("s1"), ports.EventSessionCreated); err != nil { - t.Fatalf("seed session: %v", err) - } - - if _, ok, err := s.GetPR(ctx, "s1"); err != nil || ok { - t.Fatalf("get missing: ok=%v err=%v", ok, err) - } - - pr := PRRow{ - SessionID: "s1", ReviewDecision: "changes_requested", Mergeability: "blocked", - CIState: "failing", CIPassed: 3, CIFailed: 1, CIPending: 0, CILogTail: "FAIL TestX", - LastFetchedAt: now, - } - if err := s.UpsertPR(ctx, pr); err != nil { - t.Fatalf("upsert: %v", err) - } - - got, ok, err := s.GetPR(ctx, "s1") - if err != nil || !ok { - t.Fatalf("get: ok=%v err=%v", ok, err) - } - if got != pr { - t.Fatalf("round-trip mismatch:\n got %+v\nwant %+v", got, pr) - } - - if err := s.DeletePR(ctx, "s1"); err != nil { - t.Fatalf("delete: %v", err) - } - if _, ok, _ := s.GetPR(ctx, "s1"); ok { - t.Fatal("pr should be gone after delete") - } -} - -func TestPRRejectsBadEnum(t *testing.T) { - s := newTestStore(t) - ctx := context.Background() - if err := s.Upsert(ctx, sampleRecord("s1"), ports.EventSessionCreated); err != nil { - t.Fatalf("seed session: %v", err) - } - // review_decision is a CHECK-constrained enum; an off-list value must fail. - err := s.UpsertPR(ctx, PRRow{ - SessionID: "s1", ReviewDecision: "definitely_not_a_decision", - Mergeability: "unknown", CIState: "unknown", LastFetchedAt: time.Now().UTC(), - }) - if err == nil { - t.Fatal("expected CHECK constraint to reject an invalid review_decision") - } -} - -func TestPRChecksAndCommentsReplaceAndList(t *testing.T) { - s := newTestStore(t) - ctx := context.Background() - now := time.Now().UTC().Truncate(time.Second) - - if err := s.Upsert(ctx, sampleRecord("s1"), ports.EventSessionCreated); err != nil { - t.Fatalf("seed session: %v", err) - } - // pr_check / pr_comment FK pr(session_id); the pr row must exist first. 
- if err := s.UpsertPR(ctx, PRRow{ - SessionID: "s1", ReviewDecision: "review_required", Mergeability: "unknown", - CIState: "pending", LastFetchedAt: now, - }); err != nil { - t.Fatalf("upsert pr: %v", err) - } - - checks := []PRCheck{ - {Name: "build", Status: "passed", URL: "https://ci/build"}, - {Name: "test", Status: "failed", URL: "https://ci/test"}, - } - if err := s.ReplacePRChecks(ctx, "s1", checks); err != nil { - t.Fatalf("replace checks: %v", err) - } - gotChecks, err := s.ListPRChecks(ctx, "s1") - if err != nil { - t.Fatalf("list checks: %v", err) - } - if !reflect.DeepEqual(gotChecks, checks) { - t.Fatalf("checks = %+v, want %+v", gotChecks, checks) - } - // Replace is a set-replace, not a merge: a shorter set removes the rest. - if err := s.ReplacePRChecks(ctx, "s1", []PRCheck{{Name: "build", Status: "passed"}}); err != nil { - t.Fatalf("replace checks 2: %v", err) - } - if gotChecks, _ = s.ListPRChecks(ctx, "s1"); len(gotChecks) != 1 { - t.Fatalf("after replace, checks = %+v, want 1", gotChecks) - } - - comments := []PRComment{ - {CommentID: "c1", Author: "alice", File: "a.go", Line: 10, Body: "nit", Resolved: false, CreatedAt: now}, - {CommentID: "c2", Author: "bob", File: "b.go", Line: 20, Body: "bug", Resolved: true, CreatedAt: now.Add(time.Second)}, - } - if err := s.ReplacePRComments(ctx, "s1", comments); err != nil { - t.Fatalf("replace comments: %v", err) - } - gotComments, err := s.ListPRComments(ctx, "s1") - if err != nil { - t.Fatalf("list comments: %v", err) - } - if !reflect.DeepEqual(gotComments, comments) { - t.Fatalf("comments = %+v, want %+v", gotComments, comments) - } - - // Deleting the pr cascades its checks and comments. 
- if err := s.DeletePR(ctx, "s1"); err != nil { - t.Fatalf("delete pr: %v", err) - } - if c, _ := s.ListPRChecks(ctx, "s1"); len(c) != 0 { - t.Fatalf("checks not cascaded: %+v", c) - } - if c, _ := s.ListPRComments(ctx, "s1"); len(c) != 0 { - t.Fatalf("comments not cascaded: %+v", c) - } -} diff --git a/backend/internal/storage/sqlite/pr_store.go b/backend/internal/storage/sqlite/pr_store.go index 1eca08f8..4170da4d 100644 --- a/backend/internal/storage/sqlite/pr_store.go +++ b/backend/internal/storage/sqlite/pr_store.go @@ -10,136 +10,184 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" ) -// PRRow is the SCM observer's cache of the scalar PR facts that do not live in -// the canonical lifecycle (which keeps only pr_state/reason/number/url). It is -// 1:1 with a session and written OFF the canonical CDC path: upserting it never -// bumps revision and never emits a change_log/outbox event. The list facts -// (checks, comments) are separate rows — see PRCheck / PRComment. +// PRRow is the scalar PR facts row (the pr table), keyed by normalized URL. One +// session can own many PRs; a PR belongs to one session (session_id FK). type PRRow struct { + URL string SessionID string + Number int64 + State string // draft | open | merged | closed ReviewDecision string // none | approved | changes_requested | review_required - Mergeability string // unknown | mergeable | conflicting | blocked | unstable CIState string // unknown | pending | passing | failing - CIPassed int64 - CIFailed int64 - CIPending int64 - CILogTail string - LastFetchedAt time.Time -} - -// PRCheck is one CI check belonging to a session's PR. -type PRCheck struct { - Name string - Status string // unknown | queued | in_progress | passed | failed | skipped | cancelled - URL string -} - -// PRComment is one review comment belonging to a session's PR. 
-type PRComment struct { - CommentID string - Author string - File string - Line int64 - Body string - Resolved bool - CreatedAt time.Time + Mergeability string // unknown | mergeable | conflicting | blocked | unstable + UpdatedAt time.Time } -// UpsertPR inserts or replaces the scalar PR facts for one session. +// UpsertPR inserts or replaces the scalar PR facts for a PR URL. Empty enum +// fields default to their "nothing known yet" value so a partial row is valid +// against the CHECK constraints (matches the domain zero values none/unknown). func (s *Store) UpsertPR(ctx context.Context, r PRRow) error { + if r.State == "" { + r.State = "open" + } + if r.ReviewDecision == "" { + r.ReviewDecision = "none" + } + if r.CIState == "" { + r.CIState = "unknown" + } + if r.Mergeability == "" { + r.Mergeability = "unknown" + } s.writeMu.Lock() defer s.writeMu.Unlock() - return s.q.UpsertPR(ctx, gen.UpsertPRParams{ + return s.qw.UpsertPR(ctx, gen.UpsertPRParams{ + Url: r.URL, SessionID: r.SessionID, + Number: r.Number, + PrState: r.State, ReviewDecision: r.ReviewDecision, - Mergeability: r.Mergeability, CiState: r.CIState, - CiPassed: r.CIPassed, - CiFailed: r.CIFailed, - CiPending: r.CIPending, - CiLogTail: r.CILogTail, - LastFetchedAt: r.LastFetchedAt, + Mergeability: r.Mergeability, + UpdatedAt: r.UpdatedAt, }) } -// GetPR returns the scalar PR facts for one session. ok is false when no row -// exists (the SCM observer has not fetched yet, or the session has no PR). -func (s *Store) GetPR(ctx context.Context, sessionID string) (PRRow, bool, error) { - p, err := s.q.GetPR(ctx, sessionID) +// GetPR returns the PR facts for a URL, or ok=false if absent. 
+func (s *Store) GetPR(ctx context.Context, url string) (PRRow, bool, error) { + p, err := s.qr.GetPR(ctx, url) if errors.Is(err, sql.ErrNoRows) { return PRRow{}, false, nil } if err != nil { - return PRRow{}, false, fmt.Errorf("get pr: %w", err) + return PRRow{}, false, fmt.Errorf("get pr %s: %w", url, err) } + return prRowFromGen(p), true, nil +} + +// ListPRsBySession returns every PR owned by a session, newest first. +func (s *Store) ListPRsBySession(ctx context.Context, sessionID string) ([]PRRow, error) { + rows, err := s.qr.ListPRsBySession(ctx, sessionID) + if err != nil { + return nil, fmt.Errorf("list prs for %s: %w", sessionID, err) + } + out := make([]PRRow, 0, len(rows)) + for _, p := range rows { + out = append(out, prRowFromGen(p)) + } + return out, nil +} + +// DeletePR removes a PR (cascades to its checks + comments). +func (s *Store) DeletePR(ctx context.Context, url string) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + return s.qw.DeletePR(ctx, url) +} + +func prRowFromGen(p gen.Pr) PRRow { return PRRow{ + URL: p.Url, SessionID: p.SessionID, + Number: p.Number, + State: p.PrState, ReviewDecision: p.ReviewDecision, - Mergeability: p.Mergeability, CIState: p.CiState, - CIPassed: p.CiPassed, - CIFailed: p.CiFailed, - CIPending: p.CiPending, - CILogTail: p.CiLogTail, - LastFetchedAt: p.LastFetchedAt, - }, true, nil + Mergeability: p.Mergeability, + UpdatedAt: p.UpdatedAt, + } } -// DeletePR drops the scalar PR facts for one session, cascading its checks and -// comments. Normally unnecessary (the chain cascades on session delete); exposed -// for explicit eviction. -func (s *Store) DeletePR(ctx context.Context, sessionID string) error { +// ---- pr_checks: CI run history ---- + +// PRCheckRow is one CI check run for a PR (one row per check name per commit). 
+type PRCheckRow struct { + PRURL string + Name string + CommitHash string + Status string // unknown | queued | in_progress | passed | failed | skipped | cancelled + URL string + LogTail string + CreatedAt time.Time +} + +// RecordCheck upserts a CI check run. Re-polling the same (pr, name, commit) +// updates the same row; a new commit creates a new row (a fresh agent attempt). +func (s *Store) RecordCheck(ctx context.Context, r PRCheckRow) error { + if r.Status == "" { + r.Status = "unknown" + } s.writeMu.Lock() defer s.writeMu.Unlock() - return s.q.DeletePR(ctx, sessionID) + return s.qw.UpsertPRCheck(ctx, gen.UpsertPRCheckParams{ + PrUrl: r.PRURL, + Name: r.Name, + CommitHash: r.CommitHash, + Status: r.Status, + Url: r.URL, + LogTail: r.LogTail, + CreatedAt: r.CreatedAt, + }) } -// ReplacePRChecks atomically replaces the full set of CI checks for a session's -// PR — each SCM fetch reports the current set, so a replace (not a merge) keeps -// the table in sync (a check that disappeared upstream is removed). The PR row -// must already exist (pr_check FKs pr). -func (s *Store) ReplacePRChecks(ctx context.Context, sessionID string, checks []PRCheck) error { - return s.inTx(ctx, "replace pr checks", func(qtx *gen.Queries) error { - if err := qtx.DeletePRChecks(ctx, sessionID); err != nil { - return err - } - for _, c := range checks { - if err := qtx.InsertPRCheck(ctx, gen.InsertPRCheckParams{ - SessionID: sessionID, - Name: c.Name, - Status: c.Status, - Url: c.URL, - }); err != nil { - return fmt.Errorf("check %q: %w", c.Name, err) - } - } - return nil +// RecentCheckStatuses returns the statuses of the last `limit` runs of a check, +// most-recent first. The CI-fix-loop brake reads this: "last 3 all failed?". 
+func (s *Store) RecentCheckStatuses(ctx context.Context, prURL, name string, limit int) ([]string, error) { + rows, err := s.qr.ListRecentChecks(ctx, gen.ListRecentChecksParams{ + PrUrl: prURL, Name: name, Limit: int64(limit), }) + if err != nil { + return nil, fmt.Errorf("recent checks %s/%s: %w", prURL, name, err) + } + out := make([]string, 0, len(rows)) + for _, r := range rows { + out = append(out, r.Status) + } + return out, nil } -// ListPRChecks returns a session's CI checks, ordered by name. -func (s *Store) ListPRChecks(ctx context.Context, sessionID string) ([]PRCheck, error) { - rows, err := s.q.ListPRChecks(ctx, sessionID) +// ListChecks returns every recorded check run for a PR. +func (s *Store) ListChecks(ctx context.Context, prURL string) ([]PRCheckRow, error) { + rows, err := s.qr.ListChecksByPR(ctx, prURL) if err != nil { - return nil, fmt.Errorf("list pr checks: %w", err) + return nil, fmt.Errorf("list checks %s: %w", prURL, err) } - out := make([]PRCheck, 0, len(rows)) - for _, r := range rows { - out = append(out, PRCheck{Name: r.Name, Status: r.Status, URL: r.Url}) + out := make([]PRCheckRow, 0, len(rows)) + for _, c := range rows { + out = append(out, PRCheckRow{ + PRURL: c.PrUrl, Name: c.Name, CommitHash: c.CommitHash, + Status: c.Status, URL: c.Url, LogTail: c.LogTail, CreatedAt: c.CreatedAt, + }) } return out, nil } -// ReplacePRComments atomically replaces the full set of review comments for a -// session's PR (same replace-not-merge rationale as ReplacePRChecks). -func (s *Store) ReplacePRComments(ctx context.Context, sessionID string, comments []PRComment) error { - return s.inTx(ctx, "replace pr comments", func(qtx *gen.Queries) error { - if err := qtx.DeletePRComments(ctx, sessionID); err != nil { +// ---- pr_comment ---- + +// PRCommentRow is one review comment on a PR. 
+type PRCommentRow struct { + PRURL string + CommentID string + Author string + File string + Line int64 + Body string + Resolved bool + CreatedAt time.Time +} + +// ReplacePRComments atomically replaces the full comment set for a PR (each SCM +// fetch reports the current set, so a replace keeps it in sync). +func (s *Store) ReplacePRComments(ctx context.Context, prURL string, comments []PRCommentRow) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + return s.inTx(ctx, "replace pr comments", func(q *gen.Queries) error { + if err := q.DeletePRComments(ctx, prURL); err != nil { return err } for _, c := range comments { - if err := qtx.InsertPRComment(ctx, gen.InsertPRCommentParams{ - SessionID: sessionID, + if err := q.UpsertPRComment(ctx, gen.UpsertPRCommentParams{ + PrUrl: prURL, CommentID: c.CommentID, Author: c.Author, File: c.File, @@ -155,47 +203,18 @@ func (s *Store) ReplacePRComments(ctx context.Context, sessionID string, comment }) } -// ListPRComments returns a session's review comments, ordered by creation time. -func (s *Store) ListPRComments(ctx context.Context, sessionID string) ([]PRComment, error) { - rows, err := s.q.ListPRComments(ctx, sessionID) +// ListPRComments returns a PR's review comments, oldest first. 
+func (s *Store) ListPRComments(ctx context.Context, prURL string) ([]PRCommentRow, error) { + rows, err := s.qr.ListPRComments(ctx, prURL) if err != nil { - return nil, fmt.Errorf("list pr comments: %w", err) + return nil, fmt.Errorf("list pr comments %s: %w", prURL, err) } - out := make([]PRComment, 0, len(rows)) - for _, r := range rows { - out = append(out, PRComment{ - CommentID: r.CommentID, - Author: r.Author, - File: r.File, - Line: r.Line, - Body: r.Body, - Resolved: r.Resolved != 0, - CreatedAt: r.CreatedAt, + out := make([]PRCommentRow, 0, len(rows)) + for _, c := range rows { + out = append(out, PRCommentRow{ + PRURL: c.PrUrl, CommentID: c.CommentID, Author: c.Author, File: c.File, + Line: c.Line, Body: c.Body, Resolved: c.Resolved != 0, CreatedAt: c.CreatedAt, }) } return out, nil } - -// inTx runs fn inside a single write transaction over the store's queries, -// rolling back on error. It holds writeMu for the duration, so callers must not -// already hold it. -func (s *Store) inTx(ctx context.Context, what string, fn func(*gen.Queries) error) error { - s.writeMu.Lock() - defer s.writeMu.Unlock() - tx, err := s.db.BeginTx(ctx, nil) - if err != nil { - return fmt.Errorf("begin %s: %w", what, err) - } - defer tx.Rollback() - if err := fn(s.q.WithTx(tx)); err != nil { - return fmt.Errorf("%s: %w", what, err) - } - return tx.Commit() -} - -func boolToInt(b bool) int64 { - if b { - return 1 - } - return 0 -} diff --git a/backend/internal/storage/sqlite/project_store.go b/backend/internal/storage/sqlite/project_store.go index 4837cafc..d81943c3 100644 --- a/backend/internal/storage/sqlite/project_store.go +++ b/backend/internal/storage/sqlite/project_store.go @@ -10,74 +10,46 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" ) -// ProjectRow is one registered repo, the durable twin of the old YAML config -// entry. It is the unit the registration path upserts and cross-project readers -// list. 
Off the canonical CDC path: writing a project never emits a change_log -// or outbox event. +// ProjectRow is one registered repo (the projects table). id is a short slug +// (mer, ao). ArchivedAt zero means active. type ProjectRow struct { ID string Path string - RepoOwner string - RepoName string - RepoPlatform string RepoOriginURL string - DefaultBranch string DisplayName string - SessionPrefix string - Source string RegisteredAt time.Time - // ArchivedAt is the soft-delete marker; zero means active. GetProject returns - // it regardless of state (so a session can resolve its archived project); - // ListProjects returns only rows where it is zero. - ArchivedAt time.Time + ArchivedAt time.Time } -// UpsertProject inserts or updates one registered project. +// UpsertProject inserts or updates a registered project. func (s *Store) UpsertProject(ctx context.Context, r ProjectRow) error { s.writeMu.Lock() defer s.writeMu.Unlock() - return s.q.UpsertProject(ctx, gen.UpsertProjectParams{ + return s.qw.UpsertProject(ctx, gen.UpsertProjectParams{ ID: r.ID, Path: r.Path, - RepoOwner: r.RepoOwner, - RepoName: r.RepoName, - RepoPlatform: r.RepoPlatform, RepoOriginUrl: r.RepoOriginURL, - DefaultBranch: r.DefaultBranch, DisplayName: r.DisplayName, - SessionPrefix: r.SessionPrefix, - Source: r.Source, RegisteredAt: r.RegisteredAt, ArchivedAt: nullTime(r.ArchivedAt), }) } -// ArchiveProject soft-deletes one project, keeping the row so a session's -// project_id still resolves. Active-only reads (ListProjects) then hide it. -func (s *Store) ArchiveProject(ctx context.Context, id string, t time.Time) error { - s.writeMu.Lock() - defer s.writeMu.Unlock() - return s.q.ArchiveProject(ctx, gen.ArchiveProjectParams{ - ArchivedAt: nullTime(t), - ID: id, - }) -} - -// GetProject returns one project by id. ok is false when no row exists. +// GetProject returns a project by id (active or archived), or ok=false. 
func (s *Store) GetProject(ctx context.Context, id string) (ProjectRow, bool, error) { - p, err := s.q.GetProject(ctx, id) + p, err := s.qr.GetProject(ctx, id) if errors.Is(err, sql.ErrNoRows) { return ProjectRow{}, false, nil } if err != nil { - return ProjectRow{}, false, fmt.Errorf("get project: %w", err) + return ProjectRow{}, false, fmt.Errorf("get project %s: %w", id, err) } return projectRowFromGen(p), true, nil } -// ListProjects returns every registered project, ordered by id. +// ListProjects returns active (non-archived) projects, ordered by id. func (s *Store) ListProjects(ctx context.Context) ([]ProjectRow, error) { - rows, err := s.q.ListProjects(ctx) + rows, err := s.qr.ListProjects(ctx) if err != nil { return nil, fmt.Errorf("list projects: %w", err) } @@ -88,31 +60,31 @@ func (s *Store) ListProjects(ctx context.Context) ([]ProjectRow, error) { return out, nil } -// DeleteProject removes one project by id. -func (s *Store) DeleteProject(ctx context.Context, id string) error { +// ArchiveProject soft-deletes a project (the row stays so session.project_id +// still resolves). +func (s *Store) ArchiveProject(ctx context.Context, id string, at time.Time) error { s.writeMu.Lock() defer s.writeMu.Unlock() - return s.q.DeleteProject(ctx, id) + return s.qw.ArchiveProject(ctx, gen.ArchiveProjectParams{ + ArchivedAt: nullTime(at), + ID: id, + }) } func projectRowFromGen(p gen.Project) ProjectRow { - return ProjectRow{ + r := ProjectRow{ ID: p.ID, Path: p.Path, - RepoOwner: p.RepoOwner, - RepoName: p.RepoName, - RepoPlatform: p.RepoPlatform, RepoOriginURL: p.RepoOriginUrl, - DefaultBranch: p.DefaultBranch, DisplayName: p.DisplayName, - SessionPrefix: p.SessionPrefix, - Source: p.Source, RegisteredAt: p.RegisteredAt, - ArchivedAt: p.ArchivedAt.Time, } + if p.ArchivedAt.Valid { + r.ArchivedAt = p.ArchivedAt.Time + } + return r } -// nullTime maps a zero time.Time to a NULL column, else a valid timestamp. 
func nullTime(t time.Time) sql.NullTime { if t.IsZero() { return sql.NullTime{} diff --git a/backend/internal/storage/sqlite/queries/cdc.sql b/backend/internal/storage/sqlite/queries/cdc.sql deleted file mode 100644 index b818194a..00000000 --- a/backend/internal/storage/sqlite/queries/cdc.sql +++ /dev/null @@ -1,42 +0,0 @@ --- name: InsertChangeLog :one --- Appends a canonical-write record and returns its monotonic seq so the same --- transaction can thread it into the outbox row. -INSERT INTO change_log (session_id, event_type, revision, payload, created_at) -VALUES (?, ?, ?, ?, ?) -RETURNING seq; - --- name: InsertOutbox :exec -INSERT INTO outbox (change_log_seq, created_at) -VALUES (?, ?); - --- name: ListUnsentOutbox :many -SELECT o.id, o.change_log_seq, o.attempts, - c.session_id, c.event_type, c.revision, c.payload, c.created_at -FROM outbox o -JOIN change_log c ON c.seq = o.change_log_seq -WHERE o.sent = 0 -ORDER BY o.change_log_seq -LIMIT ?; - --- name: MarkOutboxSent :exec -UPDATE outbox SET sent = 1, sent_at = ? WHERE id = ?; - --- name: MarkOutboxFailed :exec -UPDATE outbox SET attempts = attempts + 1, last_error = ? WHERE id = ?; - --- name: GetConsumerOffset :one -SELECT last_seq FROM consumer_offsets WHERE consumer = ?; - --- name: UpsertConsumerOffset :exec -INSERT INTO consumer_offsets (consumer, last_seq, updated_at) -VALUES (?, ?, ?) 
-ON CONFLICT (consumer) DO UPDATE SET last_seq = excluded.last_seq, updated_at = excluded.updated_at; - --- name: MaxChangeLogSeq :one -SELECT CAST(COALESCE(MAX(seq), 0) AS INTEGER) FROM change_log; - --- name: MinConsumerOffset :one -SELECT CAST(COALESCE(MIN(last_seq), 0) AS INTEGER) FROM consumer_offsets; - --- name: DeleteSentOutboxBelow :execrows -DELETE FROM outbox WHERE sent = 1 AND change_log_seq < ?; diff --git a/backend/internal/storage/sqlite/queries/changelog.sql b/backend/internal/storage/sqlite/queries/changelog.sql new file mode 100644 index 00000000..0e11899c --- /dev/null +++ b/backend/internal/storage/sqlite/queries/changelog.sql @@ -0,0 +1,10 @@ +-- name: ReadChangeLogAfter :many +SELECT seq, project_id, session_id, event_type, payload, created_at +FROM change_log WHERE seq > ? ORDER BY seq LIMIT ?; + +-- name: ReadChangeLogAfterForProject :many +SELECT seq, project_id, session_id, event_type, payload, created_at +FROM change_log WHERE project_id = ? AND seq > ? ORDER BY seq LIMIT ?; + +-- name: MaxChangeLogSeq :one +SELECT COALESCE(MAX(seq), 0) AS seq FROM change_log; diff --git a/backend/internal/storage/sqlite/queries/metadata.sql b/backend/internal/storage/sqlite/queries/metadata.sql deleted file mode 100644 index 158552da..00000000 --- a/backend/internal/storage/sqlite/queries/metadata.sql +++ /dev/null @@ -1,20 +0,0 @@ --- name: GetSessionMetadata :one -SELECT branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt -FROM session_metadata -WHERE session_id = ?; - --- name: UpsertSessionMetadata :exec --- Merge semantics: an empty incoming column is "leave unchanged", so a partial --- patch (e.g. spawn writing only the runtime handle) never clobbers a value set --- earlier (e.g. the branch set at creation). Mirrors the old per-key map merge. -INSERT INTO session_metadata ( - session_id, branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt, updated_at -) VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
-ON CONFLICT (session_id) DO UPDATE SET - branch = CASE WHEN excluded.branch <> '' THEN excluded.branch ELSE session_metadata.branch END, - workspace_path = CASE WHEN excluded.workspace_path <> '' THEN excluded.workspace_path ELSE session_metadata.workspace_path END, - runtime_handle_id = CASE WHEN excluded.runtime_handle_id <> '' THEN excluded.runtime_handle_id ELSE session_metadata.runtime_handle_id END, - runtime_name = CASE WHEN excluded.runtime_name <> '' THEN excluded.runtime_name ELSE session_metadata.runtime_name END, - agent_session_id = CASE WHEN excluded.agent_session_id <> '' THEN excluded.agent_session_id ELSE session_metadata.agent_session_id END, - prompt = CASE WHEN excluded.prompt <> '' THEN excluded.prompt ELSE session_metadata.prompt END, - updated_at = excluded.updated_at; diff --git a/backend/internal/storage/sqlite/queries/pr.sql b/backend/internal/storage/sqlite/queries/pr.sql index 13c14a78..e6b41cf1 100644 --- a/backend/internal/storage/sqlite/queries/pr.sql +++ b/backend/internal/storage/sqlite/queries/pr.sql @@ -1,43 +1,20 @@ -- name: UpsertPR :exec -INSERT INTO pr ( - session_id, review_decision, mergeability, ci_state, ci_passed, ci_failed, ci_pending, ci_log_tail, last_fetched_at -) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) -ON CONFLICT (session_id) DO UPDATE SET +INSERT INTO pr (url, session_id, number, pr_state, review_decision, ci_state, mergeability, updated_at) +VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
+ON CONFLICT (url) DO UPDATE SET + session_id = excluded.session_id, + number = excluded.number, + pr_state = excluded.pr_state, review_decision = excluded.review_decision, - mergeability = excluded.mergeability, - ci_state = excluded.ci_state, - ci_passed = excluded.ci_passed, - ci_failed = excluded.ci_failed, - ci_pending = excluded.ci_pending, - ci_log_tail = excluded.ci_log_tail, - last_fetched_at = excluded.last_fetched_at; + ci_state = excluded.ci_state, + mergeability = excluded.mergeability, + updated_at = excluded.updated_at; -- name: GetPR :one -SELECT session_id, review_decision, mergeability, ci_state, ci_passed, ci_failed, ci_pending, ci_log_tail, last_fetched_at -FROM pr -WHERE session_id = ?; +SELECT * FROM pr WHERE url = ?; --- name: DeletePR :exec -DELETE FROM pr WHERE session_id = ?; - --- name: DeletePRChecks :exec -DELETE FROM pr_check WHERE session_id = ?; - --- name: InsertPRCheck :exec -INSERT INTO pr_check (session_id, name, status, url) VALUES (?, ?, ?, ?); - --- name: ListPRChecks :many -SELECT name, status, url FROM pr_check WHERE session_id = ? ORDER BY name; +-- name: ListPRsBySession :many +SELECT * FROM pr WHERE session_id = ? ORDER BY updated_at DESC; --- name: DeletePRComments :exec -DELETE FROM pr_comment WHERE session_id = ?; - --- name: InsertPRComment :exec -INSERT INTO pr_comment (session_id, comment_id, author, file, line, body, resolved, created_at) -VALUES (?, ?, ?, ?, ?, ?, ?, ?); - --- name: ListPRComments :many -SELECT comment_id, author, file, line, body, resolved, created_at -FROM pr_comment -WHERE session_id = ? 
-ORDER BY created_at, comment_id; +-- name: DeletePR :exec +DELETE FROM pr WHERE url = ?; diff --git a/backend/internal/storage/sqlite/queries/pr_checks.sql b/backend/internal/storage/sqlite/queries/pr_checks.sql new file mode 100644 index 00000000..2e3e3c15 --- /dev/null +++ b/backend/internal/storage/sqlite/queries/pr_checks.sql @@ -0,0 +1,15 @@ +-- name: UpsertPRCheck :exec +INSERT INTO pr_checks (pr_url, name, commit_hash, status, url, log_tail, created_at) +VALUES (?, ?, ?, ?, ?, ?, ?) +ON CONFLICT (pr_url, name, commit_hash) DO UPDATE SET + status = excluded.status, + url = excluded.url, + log_tail = excluded.log_tail; + +-- name: ListRecentChecks :many +SELECT status, commit_hash, created_at FROM pr_checks +WHERE pr_url = ? AND name = ? +ORDER BY created_at DESC LIMIT ?; + +-- name: ListChecksByPR :many +SELECT * FROM pr_checks WHERE pr_url = ? ORDER BY name, created_at; diff --git a/backend/internal/storage/sqlite/queries/pr_comment.sql b/backend/internal/storage/sqlite/queries/pr_comment.sql new file mode 100644 index 00000000..df4f99d0 --- /dev/null +++ b/backend/internal/storage/sqlite/queries/pr_comment.sql @@ -0,0 +1,12 @@ +-- name: UpsertPRComment :exec +INSERT INTO pr_comment (pr_url, comment_id, author, file, line, body, resolved, created_at) +VALUES (?, ?, ?, ?, ?, ?, ?, ?) +ON CONFLICT (pr_url, comment_id) DO UPDATE SET + author = excluded.author, file = excluded.file, line = excluded.line, + body = excluded.body, resolved = excluded.resolved; + +-- name: DeletePRComments :exec +DELETE FROM pr_comment WHERE pr_url = ?; + +-- name: ListPRComments :many +SELECT * FROM pr_comment WHERE pr_url = ? 
ORDER BY created_at, comment_id; diff --git a/backend/internal/storage/sqlite/queries/projects.sql b/backend/internal/storage/sqlite/queries/projects.sql index 054b8f0e..3dc28950 100644 --- a/backend/internal/storage/sqlite/queries/projects.sql +++ b/backend/internal/storage/sqlite/queries/projects.sql @@ -1,32 +1,19 @@ -- name: UpsertProject :exec -INSERT INTO projects (id, path, repo_owner, repo_name, repo_platform, repo_origin_url, default_branch, display_name, session_prefix, source, registered_at, archived_at) -VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) +INSERT INTO projects (id, path, repo_origin_url, display_name, registered_at, archived_at) +VALUES (?, ?, ?, ?, ?, ?) ON CONFLICT (id) DO UPDATE SET path = excluded.path, - repo_owner = excluded.repo_owner, - repo_name = excluded.repo_name, - repo_platform = excluded.repo_platform, repo_origin_url = excluded.repo_origin_url, - default_branch = excluded.default_branch, display_name = excluded.display_name, - session_prefix = excluded.session_prefix, - source = excluded.source, - registered_at = excluded.registered_at, archived_at = excluded.archived_at; -- name: GetProject :one -SELECT id, path, repo_owner, repo_name, repo_platform, repo_origin_url, default_branch, display_name, session_prefix, source, registered_at, archived_at -FROM projects -WHERE id = ?; +SELECT id, path, repo_origin_url, display_name, registered_at, archived_at +FROM projects WHERE id = ?; -- name: ListProjects :many -SELECT id, path, repo_owner, repo_name, repo_platform, repo_origin_url, default_branch, display_name, session_prefix, source, registered_at, archived_at -FROM projects -WHERE archived_at IS NULL -ORDER BY id; +SELECT id, path, repo_origin_url, display_name, registered_at, archived_at +FROM projects WHERE archived_at IS NULL ORDER BY id; -- name: ArchiveProject :exec UPDATE projects SET archived_at = ? 
WHERE id = ?; - --- name: DeleteProject :exec -DELETE FROM projects WHERE id = ?; diff --git a/backend/internal/storage/sqlite/queries/reactions.sql b/backend/internal/storage/sqlite/queries/reactions.sql deleted file mode 100644 index 0ccd99c3..00000000 --- a/backend/internal/storage/sqlite/queries/reactions.sql +++ /dev/null @@ -1,18 +0,0 @@ --- name: ListReactionTrackers :many -SELECT session_id, reaction_key, attempts, escalated, first_attempt_at, project_id -FROM reaction_trackers; - --- name: UpsertReactionTracker :exec -INSERT INTO reaction_trackers (session_id, reaction_key, attempts, escalated, first_attempt_at, project_id) -VALUES (?, ?, ?, ?, ?, ?) -ON CONFLICT (session_id, reaction_key) DO UPDATE SET - attempts = excluded.attempts, - escalated = excluded.escalated, - first_attempt_at = excluded.first_attempt_at, - project_id = excluded.project_id; - --- name: DeleteReactionTracker :exec -DELETE FROM reaction_trackers WHERE session_id = ? AND reaction_key = ?; - --- name: DeleteSessionReactionTrackers :exec -DELETE FROM reaction_trackers WHERE session_id = ?; diff --git a/backend/internal/storage/sqlite/queries/sessions.sql b/backend/internal/storage/sqlite/queries/sessions.sql index 48cdcacf..9b294de3 100644 --- a/backend/internal/storage/sqlite/queries/sessions.sql +++ b/backend/internal/storage/sqlite/queries/sessions.sql @@ -1,58 +1,34 @@ --- name: InsertSession :execrows --- CAS insert: only succeeds for a brand-new id. Incoming revision must be 0; --- the row is persisted at revision 1. 
+-- name: NextSessionNum :one +SELECT COALESCE(MAX(num), 0) + 1 AS next FROM sessions WHERE project_id = ?; + +-- name: InsertSession :exec INSERT INTO sessions ( - id, project_id, issue_id, kind, created_at, updated_at, - revision, - session_state, session_reason, - pr_state, pr_reason, pr_number, pr_url, - runtime_state, runtime_reason, + id, project_id, num, issue_id, kind, harness, + session_state, termination_reason, is_alive, activity_state, activity_last_at, activity_source, - detecting_attempts, detecting_started_at, detecting_evidence_hash -) VALUES ( - ?, ?, ?, ?, ?, ?, - 1, - ?, ?, - ?, ?, ?, ?, - ?, ?, - ?, ?, ?, - ?, ?, ? -) -ON CONFLICT (id) DO NOTHING; + detecting_attempts, detecting_started_at, detecting_evidence_hash, + branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt, + created_at, updated_at +) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?); --- name: UpdateSessionCAS :execrows --- CAS update: succeeds only when the stored revision equals the caller's loaded --- revision (@expected_revision). 0 rows affected => revision mismatch. +-- name: UpdateSession :exec UPDATE sessions SET - project_id = ?, - issue_id = ?, - kind = ?, - updated_at = ?, - revision = revision + 1, - session_state = ?, - session_reason = ?, - pr_state = ?, - pr_reason = ?, - pr_number = ?, - pr_url = ?, - runtime_state = ?, - runtime_reason = ?, - activity_state = ?, - activity_last_at = ?, - activity_source = ?, - detecting_attempts = ?, - detecting_started_at = ?, - detecting_evidence_hash = ? -WHERE id = ? 
AND revision = ?; - --- name: GetSessionRevision :one -SELECT revision FROM sessions WHERE id = ?; + issue_id = ?, kind = ?, harness = ?, + session_state = ?, termination_reason = ?, is_alive = ?, + activity_state = ?, activity_last_at = ?, activity_source = ?, + detecting_attempts = ?, detecting_started_at = ?, detecting_evidence_hash = ?, + branch = ?, workspace_path = ?, runtime_handle_id = ?, runtime_name = ?, agent_session_id = ?, prompt = ?, + updated_at = ? +WHERE id = ?; -- name: GetSession :one SELECT * FROM sessions WHERE id = ?; -- name: ListSessionsByProject :many -SELECT * FROM sessions WHERE project_id = ?; +SELECT * FROM sessions WHERE project_id = ? ORDER BY num; -- name: ListAllSessions :many -SELECT * FROM sessions; +SELECT * FROM sessions ORDER BY project_id, num; + +-- name: DeleteSession :exec +DELETE FROM sessions WHERE id = ?; diff --git a/backend/internal/storage/sqlite/reaction_store.go b/backend/internal/storage/sqlite/reaction_store.go deleted file mode 100644 index c703a21b..00000000 --- a/backend/internal/storage/sqlite/reaction_store.go +++ /dev/null @@ -1,86 +0,0 @@ -package sqlite - -import ( - "context" - "database/sql" - "fmt" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" -) - -// ReactionTrackerRow is one persisted escalation budget, the durable mirror of -// the LCM's in-memory reactionTracker. It is the unit the lifecycle Manager -// hydrates on startup and writes through on each mutation. -type ReactionTrackerRow struct { - SessionID string - ReactionKey string - Attempts int - Escalated bool - FirstAttemptAt time.Time - ProjectID string -} - -// ListReactionTrackers returns every persisted escalation budget so the Manager -// can rehydrate its in-memory trackers after a restart. 
-func (s *Store) ListReactionTrackers(ctx context.Context) ([]ReactionTrackerRow, error) { - rows, err := s.q.ListReactionTrackers(ctx) - if err != nil { - return nil, fmt.Errorf("list reaction trackers: %w", err) - } - out := make([]ReactionTrackerRow, 0, len(rows)) - for _, r := range rows { - var first time.Time - if r.FirstAttemptAt.Valid { - first = r.FirstAttemptAt.Time - } - out = append(out, ReactionTrackerRow{ - SessionID: r.SessionID, - ReactionKey: r.ReactionKey, - Attempts: int(r.Attempts), - Escalated: r.Escalated != 0, - FirstAttemptAt: first, - ProjectID: r.ProjectID, - }) - } - return out, nil -} - -// SaveReactionTracker durably persists one escalation budget (insert or update). -func (s *Store) SaveReactionTracker(ctx context.Context, r ReactionTrackerRow) error { - s.writeMu.Lock() - defer s.writeMu.Unlock() - escalated := int64(0) - if r.Escalated { - escalated = 1 - } - first := sql.NullTime{} - if !r.FirstAttemptAt.IsZero() { - first = sql.NullTime{Time: r.FirstAttemptAt, Valid: true} - } - return s.q.UpsertReactionTracker(ctx, gen.UpsertReactionTrackerParams{ - SessionID: r.SessionID, - ReactionKey: r.ReactionKey, - Attempts: int64(r.Attempts), - Escalated: escalated, - FirstAttemptAt: first, - ProjectID: r.ProjectID, - }) -} - -// DeleteReactionTracker drops one escalation budget. -func (s *Store) DeleteReactionTracker(ctx context.Context, sessionID, reactionKey string) error { - s.writeMu.Lock() - defer s.writeMu.Unlock() - return s.q.DeleteReactionTracker(ctx, gen.DeleteReactionTrackerParams{ - SessionID: sessionID, - ReactionKey: reactionKey, - }) -} - -// DeleteSessionReactionTrackers drops every escalation budget for a session. 
-func (s *Store) DeleteSessionReactionTrackers(ctx context.Context, sessionID string) error { - s.writeMu.Lock() - defer s.writeMu.Unlock() - return s.q.DeleteSessionReactionTrackers(ctx, sessionID) -} diff --git a/backend/internal/storage/sqlite/spike_test.go b/backend/internal/storage/sqlite/spike_test.go deleted file mode 100644 index 30b43fc7..00000000 --- a/backend/internal/storage/sqlite/spike_test.go +++ /dev/null @@ -1,92 +0,0 @@ -package sqlite - -import ( - "context" - "testing" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" -) - -// TestSpikeOutboxTxn de-risks the whole adapter: it proves the sqlc-generated -// Querier composes inside one *sql.Tx and that the change_log seq returned -// mid-transaction threads into the outbox row — the transactional-outbox shape -// the publisher later drains. Step 0 of the implementation plan. -func TestSpikeOutboxTxn(t *testing.T) { - db, err := Open(t.TempDir()) - if err != nil { - t.Fatalf("open: %v", err) - } - defer db.Close() - - ctx := context.Background() - now := time.Now().UTC() - - tx, err := db.BeginTx(ctx, nil) - if err != nil { - t.Fatalf("begin: %v", err) - } - defer tx.Rollback() - - q := gen.New(db).WithTx(tx) - - // 1. CAS insert of a brand-new session (revision 0 -> persisted 1). - rows, err := q.InsertSession(ctx, gen.InsertSessionParams{ - ID: "s1", - ProjectID: "p1", - Kind: "worker", - CreatedAt: now, - UpdatedAt: now, - SessionState: "working", - SessionReason: "spawn_requested", - PrState: "none", - PrReason: "not_created", - RuntimeState: "unknown", - RuntimeReason: "spawn_incomplete", - ActivityState: "active", - ActivityLastAt: now, - ActivitySource: "none", - }) - if err != nil { - t.Fatalf("insert session: %v", err) - } - if rows != 1 { - t.Fatalf("insert session affected %d rows, want 1", rows) - } - - // 2. Append the change_log entry and capture its seq mid-transaction. 
- seq, err := q.InsertChangeLog(ctx, gen.InsertChangeLogParams{ - SessionID: "s1", - EventType: "session_created", - Revision: 1, - Payload: `{"id":"s1"}`, - CreatedAt: now, - }) - if err != nil { - t.Fatalf("insert change_log: %v", err) - } - if seq != 1 { - t.Fatalf("change_log seq = %d, want 1", seq) - } - - // 3. Thread the seq into the outbox row — the key thing the spike validates. - if err := q.InsertOutbox(ctx, gen.InsertOutboxParams{ChangeLogSeq: seq, CreatedAt: now}); err != nil { - t.Fatalf("insert outbox: %v", err) - } - - if err := tx.Commit(); err != nil { - t.Fatalf("commit: %v", err) - } - - // Verify the outbox row is visible, unsent, and linked to change_log seq 1. - unsent, err := gen.New(db).ListUnsentOutbox(ctx, 10) - if err != nil { - t.Fatalf("list unsent: %v", err) - } - if len(unsent) != 1 { - t.Fatalf("unsent outbox = %d rows, want 1", len(unsent)) - } - if unsent[0].ChangeLogSeq != 1 || unsent[0].SessionID != "s1" || unsent[0].EventType != "session_created" { - t.Fatalf("unexpected outbox row: %+v", unsent[0]) - } -} diff --git a/backend/internal/storage/sqlite/store.go b/backend/internal/storage/sqlite/store.go index 2effeaee..800c1824 100644 --- a/backend/internal/storage/sqlite/store.go +++ b/backend/internal/storage/sqlite/store.go @@ -6,48 +6,77 @@ import ( "errors" "fmt" "sync" - "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" ) -// Store is the SQLite-backed ports.LifecycleStore. Reads (Load/Get/List/...) run -// concurrently across the connection pool; every write is funnelled through -// writeMu so there is exactly one writer at a time. That single-writer guarantee -// is load-bearing: it keeps WAL's single-writer rule and makes the revision-CAS -// (read-then-write in Upsert) atomic without depending on the pool size. 
Hold -// writeMu only around writes — never around a read — and never call one -// write method from inside another (the mutex is not reentrant). +// Store is the SQLite-backed persistence layer. It routes writes to a single +// writer connection (qw) and reads to a reader pool (qr) — see Open. writeMu +// guards the read-modify-write write methods (e.g. CreateSession's +// next-num-then-insert) so concurrent writes can't interleave them. +// +// CDC is captured by DB triggers (migration 0001), NOT by this layer: the store +// never writes change_log, it only reads it for the CDC poller. type Store struct { - db *sql.DB - q *gen.Queries + writeDB *sql.DB + readDB *sql.DB + qw *gen.Queries // bound to the single writer connection + qr *gen.Queries // bound to the reader pool writeMu sync.Mutex } -var _ ports.LifecycleStore = (*Store)(nil) - -// NewStore wraps an opened *sql.DB (see Open) as a LifecycleStore. -func NewStore(db *sql.DB) *Store { - return &Store{db: db, q: gen.New(db)} +// NewStore wraps an opened writer + reader *sql.DB (see Open) as a Store. +func NewStore(writeDB, readDB *sql.DB) *Store { + return &Store{ + writeDB: writeDB, + readDB: readDB, + qw: gen.New(writeDB), + qr: gen.New(readDB), + } } -// Load returns the canonical lifecycle for a session, or ok=false if absent. -func (s *Store) Load(ctx context.Context, id domain.SessionID) (domain.CanonicalSessionLifecycle, bool, error) { - row, err := s.q.GetSession(ctx, string(id)) - if errors.Is(err, sql.ErrNoRows) { - return domain.CanonicalSessionLifecycle{}, false, nil +// Close closes both pools. +func (s *Store) Close() error { + err := s.writeDB.Close() + if e := s.readDB.Close(); e != nil && err == nil { + err = e } + return err +} + +// ---- sessions ---- + +// CreateSession assigns the per-project identity ("{project}-{num}") and inserts +// the record, returning it with ID populated. 
The next-num read and the insert +// run on the writer connection under writeMu, so two concurrent creates in the +// same project can't collide on num. +func (s *Store) CreateSession(ctx context.Context, rec domain.SessionRecord) (domain.SessionRecord, error) { + s.writeMu.Lock() + defer s.writeMu.Unlock() + + num, err := s.qw.NextSessionNum(ctx, string(rec.ProjectID)) if err != nil { - return domain.CanonicalSessionLifecycle{}, false, fmt.Errorf("load session %s: %w", id, err) + return domain.SessionRecord{}, fmt.Errorf("next session num for %s: %w", rec.ProjectID, err) } - return rowToLifecycle(row), true, nil + rec.ID = domain.SessionID(fmt.Sprintf("%s-%d", rec.ProjectID, num)) + if err := s.qw.InsertSession(ctx, recordToInsert(rec, num)); err != nil { + return domain.SessionRecord{}, fmt.Errorf("insert session %s: %w", rec.ID, err) + } + return rec, nil +} + +// UpdateSession writes the full mutable state of an existing session. The +// id/project/num/created_at are immutable and not touched here. +func (s *Store) UpdateSession(ctx context.Context, rec domain.SessionRecord) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + return s.qw.UpdateSession(ctx, recordToUpdate(rec)) } -// Get returns the full record (no derived status) for a session. -func (s *Store) Get(ctx context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) { - row, err := s.q.GetSession(ctx, string(id)) +// GetSession returns the full record for a session, or ok=false if absent. 
+func (s *Store) GetSession(ctx context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) { + row, err := s.qr.GetSession(ctx, string(id)) if errors.Is(err, sql.ErrNoRows) { return domain.SessionRecord{}, false, nil } @@ -57,71 +86,49 @@ func (s *Store) Get(ctx context.Context, id domain.SessionID) (domain.SessionRec return rowToRecord(row), true, nil } -// List returns every record for a project (no archive filter — mirrors the -// in-memory store contract; terminal filtering is the caller's job). -func (s *Store) List(ctx context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) { - rows, err := s.q.ListSessionsByProject(ctx, string(project)) +// ListSessions returns every session in a project, ordered by num. +func (s *Store) ListSessions(ctx context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) { + rows, err := s.qr.ListSessionsByProject(ctx, string(project)) if err != nil { return nil, fmt.Errorf("list sessions for %s: %w", project, err) } - out := make([]domain.SessionRecord, 0, len(rows)) - for _, row := range rows { - out = append(out, rowToRecord(row)) - } - return out, nil + return mapSessionRows(rows), nil } -// ListAll returns every persisted session across all projects. The CDC snapshot -// source uses it to rebuild current state after a log-rotation gap. -func (s *Store) ListAll(ctx context.Context) ([]domain.SessionRecord, error) { - rows, err := s.q.ListAllSessions(ctx) +// ListAllSessions returns every session across all projects. +func (s *Store) ListAllSessions(ctx context.Context) ([]domain.SessionRecord, error) { + rows, err := s.qr.ListAllSessions(ctx) if err != nil { return nil, fmt.Errorf("list all sessions: %w", err) } + return mapSessionRows(rows), nil +} + +// DeleteSession removes a session (cascades to its pr/checks/comments). 
+func (s *Store) DeleteSession(ctx context.Context, id domain.SessionID) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + return s.qw.DeleteSession(ctx, string(id)) +} + +func mapSessionRows(rows []gen.Session) []domain.SessionRecord { out := make([]domain.SessionRecord, 0, len(rows)) - for _, row := range rows { - out = append(out, rowToRecord(row)) + for _, r := range rows { + out = append(out, rowToRecord(r)) } - return out, nil + return out } -// GetMetadata returns the typed metadata for a session, or the zero value if the -// session has no metadata row yet. -func (s *Store) GetMetadata(ctx context.Context, id domain.SessionID) (domain.SessionMetadata, error) { - row, err := s.q.GetSessionMetadata(ctx, string(id)) - if errors.Is(err, sql.ErrNoRows) { - return domain.SessionMetadata{}, nil - } +// inTx runs fn inside a single write transaction on the writer connection, +// rolling back on error. The caller must already hold writeMu. +func (s *Store) inTx(ctx context.Context, what string, fn func(*gen.Queries) error) error { + tx, err := s.writeDB.BeginTx(ctx, nil) if err != nil { - return domain.SessionMetadata{}, fmt.Errorf("get metadata %s: %w", id, err) + return fmt.Errorf("begin %s: %w", what, err) } - return domain.SessionMetadata{ - Branch: row.Branch, - WorkspacePath: row.WorkspacePath, - RuntimeHandleID: row.RuntimeHandleID, - RuntimeName: row.RuntimeName, - AgentSessionID: row.AgentSessionID, - Prompt: row.Prompt, - }, nil -} - -// PatchMetadata merges meta into the session's metadata. It is outside the -// canonical write path: no revision bump, no CDC event. Empty fields are left -// unchanged (see UpsertSessionMetadata), so a partial patch is non-destructive. 
-func (s *Store) PatchMetadata(ctx context.Context, id domain.SessionID, meta domain.SessionMetadata) error { - if meta.IsZero() { - return nil + defer tx.Rollback() + if err := fn(s.qw.WithTx(tx)); err != nil { + return fmt.Errorf("%s: %w", what, err) } - s.writeMu.Lock() - defer s.writeMu.Unlock() - return s.q.UpsertSessionMetadata(ctx, gen.UpsertSessionMetadataParams{ - SessionID: string(id), - Branch: meta.Branch, - WorkspacePath: meta.WorkspacePath, - RuntimeHandleID: meta.RuntimeHandleID, - RuntimeName: meta.RuntimeName, - AgentSessionID: meta.AgentSessionID, - Prompt: meta.Prompt, - UpdatedAt: time.Now().UTC(), - }) + return tx.Commit() } diff --git a/backend/internal/storage/sqlite/store_test.go b/backend/internal/storage/sqlite/store_test.go index a197f3af..55165c41 100644 --- a/backend/internal/storage/sqlite/store_test.go +++ b/backend/internal/storage/sqlite/store_test.go @@ -3,306 +3,314 @@ package sqlite import ( "context" "fmt" - "strings" "sync" "testing" "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) func newTestStore(t *testing.T) *Store { t.Helper() - db, err := Open(t.TempDir()) + s, err := Open(t.TempDir()) if err != nil { t.Fatalf("open: %v", err) } - t.Cleanup(func() { db.Close() }) - return NewStore(db) + t.Cleanup(func() { _ = s.Close() }) + return s } -func sampleRecord(id string) domain.SessionRecord { +func seedProject(t *testing.T, s *Store, id string) { + t.Helper() + if err := s.UpsertProject(context.Background(), ProjectRow{ + ID: id, Path: "/tmp/" + id, RegisteredAt: time.Now().UTC().Truncate(time.Second), + }); err != nil { + t.Fatalf("seed project %s: %v", id, err) + } +} + +func sampleRecord(project string) domain.SessionRecord { now := time.Now().UTC().Truncate(time.Second) return domain.SessionRecord{ - ID: domain.SessionID(id), - ProjectID: "proj", - IssueID: "issue-1", + ProjectID: domain.ProjectID(project), Kind: domain.KindWorker, - 
CreatedAt: now, - UpdatedAt: now, Lifecycle: domain.CanonicalSessionLifecycle{ - Session: domain.SessionSubstate{State: domain.SessionWorking, Reason: domain.ReasonTaskInProgress}, - PR: domain.PRSubstate{State: domain.PRNone, Reason: domain.PRReasonNotCreated}, - Runtime: domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning}, - Activity: domain.ActivitySubstate{State: domain.ActivityActive, LastActivityAt: now, Source: domain.SourceNative}, + Version: domain.LifecycleVersion, + Harness: domain.HarnessClaudeCode, + IsAlive: true, + Session: domain.SessionSubstate{State: domain.SessionWorking}, + Activity: domain.ActivitySubstate{ + State: domain.ActivityActive, LastActivityAt: now, Source: domain.SourceNative, + }, }, + Metadata: domain.SessionMetadata{Branch: "feat/x", WorkspacePath: "/ws"}, + CreatedAt: now, + UpdatedAt: now, } } -func TestUpsertInsertThenUpdateBumpsRevision(t *testing.T) { +func TestProjectCRUDAndArchive(t *testing.T) { s := newTestStore(t) ctx := context.Background() - rec := sampleRecord("s1") + seedProject(t, s, "mer") - if err := s.Upsert(ctx, rec, ports.EventSessionCreated); err != nil { - t.Fatalf("insert: %v", err) - } - lc, ok, err := s.Load(ctx, "s1") + got, ok, err := s.GetProject(ctx, "mer") if err != nil || !ok { - t.Fatalf("load after insert: ok=%v err=%v", ok, err) + t.Fatalf("get: ok=%v err=%v", ok, err) } - if lc.Revision != 1 { - t.Fatalf("revision after insert = %d, want 1", lc.Revision) + if got.ID != "mer" || got.Path != "/tmp/mer" { + t.Fatalf("project = %+v", got) } - - // Update must carry the loaded revision (1) and persist as 2. 
- rec.Lifecycle.Revision = 1 - rec.Lifecycle.Session.State = domain.SessionIdle - if err := s.Upsert(ctx, rec, ports.EventSessionStateChanged); err != nil { - t.Fatalf("update: %v", err) + if list, _ := s.ListProjects(ctx); len(list) != 1 { + t.Fatalf("active list = %d, want 1", len(list)) } - lc, _, _ = s.Load(ctx, "s1") - if lc.Revision != 2 { - t.Fatalf("revision after update = %d, want 2", lc.Revision) + // archive hides from the active list but still resolves by id. + if err := s.ArchiveProject(ctx, "mer", time.Now().UTC()); err != nil { + t.Fatal(err) } - if lc.Session.State != domain.SessionIdle { - t.Fatalf("state after update = %q, want idle", lc.Session.State) + if list, _ := s.ListProjects(ctx); len(list) != 0 { + t.Fatalf("after archive, active list = %d, want 0", len(list)) } -} - -func TestUpsertStaleRevisionMismatch(t *testing.T) { - s := newTestStore(t) - ctx := context.Background() - rec := sampleRecord("s1") - if err := s.Upsert(ctx, rec, ports.EventSessionCreated); err != nil { - t.Fatalf("insert: %v", err) + if _, ok, _ := s.GetProject(ctx, "mer"); !ok { + t.Fatal("archived project must still resolve by id") } - - // Stored revision is 1; submitting revision 0 (stale) must mismatch and - // write nothing new (no extra outbox/change_log rows). 
- rec.Lifecycle.Revision = 0 - err := s.Upsert(ctx, rec, ports.EventSessionStateChanged) - if err == nil || !strings.Contains(err.Error(), "revision mismatch") { - t.Fatalf("stale update err = %v, want revision mismatch", err) - } - assertOutboxCount(t, s, ctx, 1) } -func TestUpsertInsertNonZeroRevisionErrors(t *testing.T) { +func TestSessionCreateAssignsPerProjectID(t *testing.T) { s := newTestStore(t) ctx := context.Background() - rec := sampleRecord("s1") - rec.Lifecycle.Revision = 5 - err := s.Upsert(ctx, rec, ports.EventSessionCreated) - if err == nil || !strings.Contains(err.Error(), "revision mismatch") { - t.Fatalf("insert with revision 5 err = %v, want revision mismatch", err) + seedProject(t, s, "mer") + seedProject(t, s, "ao") + + r1, err := s.CreateSession(ctx, sampleRecord("mer")) + if err != nil { + t.Fatal(err) } - // Nothing should be persisted. - if _, ok, _ := s.Get(ctx, "s1"); ok { - t.Fatal("session persisted despite revision-mismatch insert") + r2, _ := s.CreateSession(ctx, sampleRecord("mer")) + r3, _ := s.CreateSession(ctx, sampleRecord("ao")) + if r1.ID != "mer-1" || r2.ID != "mer-2" || r3.ID != "ao-1" { + t.Fatalf("ids = %s, %s, %s; want mer-1, mer-2, ao-1", r1.ID, r2.ID, r3.ID) + } + got, ok, err := s.GetSession(ctx, "mer-1") + if err != nil || !ok { + t.Fatalf("get: ok=%v err=%v", ok, err) + } + if got.Lifecycle.Session.State != domain.SessionWorking || !got.Lifecycle.IsAlive || + got.Lifecycle.Harness != domain.HarnessClaudeCode || got.Metadata.Branch != "feat/x" { + t.Fatalf("round-trip mismatch: %+v", got) + } + if list, _ := s.ListSessions(ctx, "mer"); len(list) != 2 { + t.Fatalf("list mer = %d, want 2", len(list)) + } + if all, _ := s.ListAllSessions(ctx); len(all) != 3 { + t.Fatalf("list all = %d, want 3", len(all)) } - assertOutboxCount(t, s, ctx, 0) } -func TestUpsertOutboxAtomicityAndOrdering(t *testing.T) { +func TestSessionUpdateAndDetecting(t *testing.T) { s := newTestStore(t) ctx := context.Background() + seedProject(t, s, 
"mer") + r, _ := s.CreateSession(ctx, sampleRecord("mer")) - rec := sampleRecord("s1") - if err := s.Upsert(ctx, rec, ports.EventSessionCreated); err != nil { - t.Fatalf("insert: %v", err) - } - rec.Lifecycle.Revision = 1 - if err := s.Upsert(ctx, rec, ports.EventSessionStateChanged); err != nil { - t.Fatalf("update: %v", err) - } - - rows, err := NewStore(s.db).q.ListUnsentOutbox(ctx, 100) - if err != nil { - t.Fatalf("list outbox: %v", err) - } - if len(rows) != 2 { - t.Fatalf("outbox rows = %d, want 2", len(rows)) + r.Lifecycle.Session = domain.SessionSubstate{State: domain.SessionDetecting} + r.Lifecycle.IsAlive = false + r.Lifecycle.Detecting = &domain.DetectingState{Attempts: 2, StartedAt: r.CreatedAt, EvidenceHash: "abc"} + if err := s.UpdateSession(ctx, r); err != nil { + t.Fatal(err) } - // seq strictly monotonic, event types verbatim, revisions 1 then 2. - if rows[0].ChangeLogSeq != 1 || rows[1].ChangeLogSeq != 2 { - t.Fatalf("seq not monotonic: %d, %d", rows[0].ChangeLogSeq, rows[1].ChangeLogSeq) + got, _, _ := s.GetSession(ctx, r.ID) + if got.Lifecycle.Session.State != domain.SessionDetecting || got.Lifecycle.IsAlive { + t.Fatalf("update not persisted: %+v", got.Lifecycle.Session) } - if rows[0].EventType != string(ports.EventSessionCreated) || rows[1].EventType != string(ports.EventSessionStateChanged) { - t.Fatalf("event types = %q, %q", rows[0].EventType, rows[1].EventType) + if got.Lifecycle.Detecting == nil || got.Lifecycle.Detecting.Attempts != 2 || got.Lifecycle.Detecting.EvidenceHash != "abc" { + t.Fatalf("detecting not round-tripped: %+v", got.Lifecycle.Detecting) } - if rows[0].Revision != 1 || rows[1].Revision != 2 { - t.Fatalf("revisions = %d, %d, want 1, 2", rows[0].Revision, rows[1].Revision) + // clearing detecting persists as nil. 
+ got.Lifecycle.Detecting = nil + got.Lifecycle.Session = domain.SessionSubstate{State: domain.SessionWorking} + _ = s.UpdateSession(ctx, got) + again, _, _ := s.GetSession(ctx, r.ID) + if again.Lifecycle.Detecting != nil { + t.Fatalf("detecting should clear to nil, got %+v", again.Lifecycle.Detecting) } } -func TestGetListRoundTrip(t *testing.T) { +func TestPRCRUD(t *testing.T) { s := newTestStore(t) ctx := context.Background() + seedProject(t, s, "mer") + r, _ := s.CreateSession(ctx, sampleRecord("mer")) + now := time.Now().UTC().Truncate(time.Second) - a := sampleRecord("a") - b := sampleRecord("b") - b.ProjectID = "other" - if err := s.Upsert(ctx, a, ports.EventSessionCreated); err != nil { - t.Fatal(err) + pr := PRRow{ + URL: "https://gh/pr/1", SessionID: string(r.ID), Number: 1, State: "open", + ReviewDecision: "review_required", CIState: "failing", Mergeability: "blocked", UpdatedAt: now, } - if err := s.Upsert(ctx, b, ports.EventSessionCreated); err != nil { + if err := s.UpsertPR(ctx, pr); err != nil { t.Fatal(err) } - - got, ok, err := s.Get(ctx, "a") - if err != nil || !ok { - t.Fatalf("get a: ok=%v err=%v", ok, err) + got, ok, err := s.GetPR(ctx, pr.URL) + if err != nil || !ok || got != pr { + t.Fatalf("get pr: ok=%v err=%v got=%+v", ok, err, got) } - if got.ID != "a" || got.Lifecycle.Revision != 1 || got.IssueID != "issue-1" { - t.Fatalf("unexpected record: %+v", got) + if list, _ := s.ListPRsBySession(ctx, string(r.ID)); len(list) != 1 { + t.Fatalf("list prs = %d, want 1", len(list)) } - if !got.Metadata.IsZero() { - t.Fatalf("Get must not reconstruct metadata, got %v", got.Metadata) - } - - list, err := s.List(ctx, "proj") - if err != nil { + if err := s.DeletePR(ctx, pr.URL); err != nil { t.Fatal(err) } - if len(list) != 1 || list[0].ID != "a" { - t.Fatalf("List(proj) = %+v, want only a", list) + if _, ok, _ := s.GetPR(ctx, pr.URL); ok { + t.Fatal("pr should be gone") } } -func TestMetadataSideChannel(t *testing.T) { +func 
TestPRChecksLoopBrakeQuery(t *testing.T) { s := newTestStore(t) ctx := context.Background() - if err := s.Upsert(ctx, sampleRecord("s1"), ports.EventSessionCreated); err != nil { - t.Fatal(err) - } - - if err := s.PatchMetadata(ctx, "s1", domain.SessionMetadata{Branch: "feat/x", Prompt: "do it"}); err != nil { - t.Fatalf("patch: %v", err) - } - // A partial patch (only Branch) must not clobber the earlier Prompt. - if err := s.PatchMetadata(ctx, "s1", domain.SessionMetadata{Branch: "feat/y"}); err != nil { - t.Fatalf("patch overwrite: %v", err) - } + seedProject(t, s, "mer") + r, _ := s.CreateSession(ctx, sampleRecord("mer")) + now := time.Now().UTC().Truncate(time.Second) + _ = s.UpsertPR(ctx, PRRow{URL: "pr1", SessionID: string(r.ID), State: "open", UpdatedAt: now}) - m, err := s.GetMetadata(ctx, "s1") + // three consecutive failing runs of "build" (one per commit). + for i := 1; i <= 3; i++ { + if err := s.RecordCheck(ctx, PRCheckRow{ + PRURL: "pr1", Name: "build", CommitHash: fmt.Sprintf("c%d", i), + Status: "failed", CreatedAt: now.Add(time.Duration(i) * time.Second), + }); err != nil { + t.Fatal(err) + } + } + last3, err := s.RecentCheckStatuses(ctx, "pr1", "build", 3) if err != nil { t.Fatal(err) } - if m.Branch != "feat/y" || m.Prompt != "do it" { - t.Fatalf("metadata = %+v", m) + if len(last3) != 3 || last3[0] != "failed" || last3[1] != "failed" || last3[2] != "failed" { + t.Fatalf("recent statuses = %v, want 3x failed (loop brake would trip)", last3) } - // Metadata writes must not bump revision (off the canonical path). - lc, _, _ := s.Load(ctx, "s1") - if lc.Revision != 1 { - t.Fatalf("revision = %d after metadata patch, want 1 (no bump)", lc.Revision) + // a pass on a newer commit breaks the streak. 
+ _ = s.RecordCheck(ctx, PRCheckRow{PRURL: "pr1", Name: "build", CommitHash: "c4", Status: "passed", CreatedAt: now.Add(4 * time.Second)}) + last3, _ = s.RecentCheckStatuses(ctx, "pr1", "build", 3) + if last3[0] != "passed" { + t.Fatalf("most recent should be passed, got %v", last3) } } -func TestDetectingRoundTrip(t *testing.T) { +func TestPRCommentsReplace(t *testing.T) { s := newTestStore(t) ctx := context.Background() - rec := sampleRecord("s1") - rec.Lifecycle.Session.State = domain.SessionDetecting - rec.Lifecycle.Detecting = &domain.DetectingState{ - Attempts: 2, - StartedAt: time.Now().UTC().Truncate(time.Second), - EvidenceHash: "abc123", - } - if err := s.Upsert(ctx, rec, ports.EventSessionCreated); err != nil { - t.Fatal(err) - } - lc, _, _ := s.Load(ctx, "s1") - if lc.Detecting == nil { - t.Fatal("Detecting lost on round-trip") - } - if lc.Detecting.Attempts != 2 || lc.Detecting.EvidenceHash != "abc123" { - t.Fatalf("detecting = %+v", lc.Detecting) - } + seedProject(t, s, "mer") + r, _ := s.CreateSession(ctx, sampleRecord("mer")) + now := time.Now().UTC().Truncate(time.Second) + _ = s.UpsertPR(ctx, PRRow{URL: "pr1", SessionID: string(r.ID), State: "open", UpdatedAt: now}) - // Clearing Detecting must null the columns back out. - rec.Lifecycle.Revision = 1 - rec.Lifecycle.Detecting = nil - if err := s.Upsert(ctx, rec, ports.EventSessionStateChanged); err != nil { - t.Fatal(err) - } - lc, _, _ = s.Load(ctx, "s1") - if lc.Detecting != nil { - t.Fatalf("Detecting not cleared: %+v", lc.Detecting) + _ = s.ReplacePRComments(ctx, "pr1", []PRCommentRow{ + {PRURL: "pr1", CommentID: "c1", Author: "a", File: "a.go", Line: 1, Body: "nit", CreatedAt: now}, + {PRURL: "pr1", CommentID: "c2", Author: "b", File: "b.go", Line: 2, Body: "bug", Resolved: true, CreatedAt: now.Add(time.Second)}, + }) + if list, _ := s.ListPRComments(ctx, "pr1"); len(list) != 2 { + t.Fatalf("comments = %d, want 2", len(list)) + } + // replace with a smaller set drops the rest. 
+ _ = s.ReplacePRComments(ctx, "pr1", []PRCommentRow{{PRURL: "pr1", CommentID: "c1", Body: "x", CreatedAt: now}}) + if list, _ := s.ListPRComments(ctx, "pr1"); len(list) != 1 { + t.Fatalf("after replace, comments = %d, want 1", len(list)) } } -func TestLoadGetMissing(t *testing.T) { +func TestCDCTriggersPopulateChangeLog(t *testing.T) { s := newTestStore(t) ctx := context.Background() - if _, ok, err := s.Load(ctx, "nope"); ok || err != nil { - t.Fatalf("Load missing: ok=%v err=%v", ok, err) - } - if _, ok, err := s.Get(ctx, "nope"); ok || err != nil { - t.Fatalf("Get missing: ok=%v err=%v", ok, err) - } - if m, err := s.GetMetadata(ctx, "nope"); err != nil || !m.IsZero() { - t.Fatalf("GetMetadata missing: m=%v err=%v", m, err) - } -} + seedProject(t, s, "mer") -func assertOutboxCount(t *testing.T, s *Store, ctx context.Context, want int) { - t.Helper() - rows, err := s.q.ListUnsentOutbox(ctx, 1000) + r, _ := s.CreateSession(ctx, sampleRecord("mer")) + // a real state change logs; a metadata-only change does not (WHEN guard). + r.Lifecycle.Session = domain.SessionSubstate{State: domain.SessionIdle} + _ = s.UpdateSession(ctx, r) + r.Metadata.Prompt = "only metadata changed" + _ = s.UpdateSession(ctx, r) + // a PR insert logs too. 
+ _ = s.UpsertPR(ctx, PRRow{URL: "pr1", SessionID: string(r.ID), State: "open", UpdatedAt: r.UpdatedAt}) + + evs, err := s.ReadChangeLogAfter(ctx, 0, 100) if err != nil { - t.Fatalf("list outbox: %v", err) + t.Fatal(err) } - if len(rows) != want { - t.Fatalf("outbox count = %d, want %d", len(rows), want) + var types []string + for _, e := range evs { + if e.ProjectID != "mer" { + t.Fatalf("event project = %s, want mer", e.ProjectID) + } + types = append(types, e.EventType) + } + want := []string{"session_created", "session_updated", "pr_created"} + if len(types) != 3 || types[0] != want[0] || types[1] != want[1] || types[2] != want[2] { + t.Fatalf("change_log event types = %v, want %v (metadata-only update suppressed)", types, want) + } + max, _ := s.MaxChangeLogSeq(ctx) + if max != int64(len(evs)) { + t.Fatalf("max seq = %d, want %d", max, len(evs)) } } -// TestConcurrentReadsAndWrites exercises the read-pool + write-mutex model: -// many writers (each its own session) run alongside many readers hammering -// ListAll. Reads must not be serialized behind writes, writes must not corrupt -// or error under the revision-CAS, and the final state must be exact. Run under -// -race this also guards the writeMu discipline. 
-func TestConcurrentReadsAndWrites(t *testing.T) { +func TestConcurrentSessionCreateAssignsUniqueNums(t *testing.T) { s := newTestStore(t) ctx := context.Background() - const n = 16 + seedProject(t, s, "mer") + const n = 20 var wg sync.WaitGroup - errc := make(chan error, n*2) - + ids := make([]string, n) for i := 0; i < n; i++ { wg.Add(1) go func(i int) { defer wg.Done() - if err := s.Upsert(ctx, sampleRecord(fmt.Sprintf("s%02d", i)), ports.EventSessionCreated); err != nil { - errc <- err + r, err := s.CreateSession(ctx, sampleRecord("mer")) + if err != nil { + t.Errorf("create: %v", err) + return } + ids[i] = string(r.ID) }(i) } - for i := 0; i < n; i++ { - wg.Add(1) - go func() { - defer wg.Done() - for j := 0; j < 25; j++ { - if _, err := s.ListAll(ctx); err != nil { - errc <- err - return - } - } - }() - } wg.Wait() - close(errc) - for err := range errc { - t.Fatalf("concurrent op error: %v", err) + + seen := map[string]bool{} + for _, id := range ids { + if id == "" || seen[id] { + t.Fatalf("duplicate or empty id: %q in %v", id, ids) + } + seen[id] = true } + if all, _ := s.ListAllSessions(ctx); len(all) != n { + t.Fatalf("created %d sessions, want %d", len(all), n) + } +} - got, err := s.ListAll(ctx) - if err != nil { +func TestTerminationReasonRoundTripAndCheck(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + seedProject(t, s, "mer") + r, _ := s.CreateSession(ctx, sampleRecord("mer")) + + // terminate with a valid reason -> round-trips. 
+ r.Lifecycle.Session = domain.SessionSubstate{State: domain.SessionTerminated} + r.Lifecycle.TerminationReason = domain.TermManuallyKilled + if err := s.UpdateSession(ctx, r); err != nil { t.Fatal(err) } - if len(got) != n { - t.Fatalf("after %d concurrent inserts, ListAll returned %d", n, len(got)) + got, _, _ := s.GetSession(ctx, r.ID) + if got.Lifecycle.TerminationReason != domain.TermManuallyKilled { + t.Fatalf("termination_reason = %q, want manually_killed", got.Lifecycle.TerminationReason) + } + if domain.DeriveStatus(got.Lifecycle, domain.PRFacts{}) != domain.StatusKilled { + t.Fatal("terminated+manually_killed should derive to killed") + } + + // an off-enum reason is rejected by the CHECK constraint. + r.Lifecycle.TerminationReason = domain.TerminationReason("definitely_not_a_reason") + if err := s.UpdateSession(ctx, r); err == nil { + t.Fatal("expected CHECK constraint to reject an invalid termination_reason") } } diff --git a/backend/internal/storage/sqlite/upsert.go b/backend/internal/storage/sqlite/upsert.go deleted file mode 100644 index f8ae4093..00000000 --- a/backend/internal/storage/sqlite/upsert.go +++ /dev/null @@ -1,115 +0,0 @@ -package sqlite - -import ( - "context" - "database/sql" - "encoding/json" - "errors" - "fmt" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" - "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" -) - -// Upsert performs the one atomic canonical write: it CAS-checks and persists the -// session row (bumping revision), appends a change_log entry, and enqueues an -// outbox row linked to that entry's seq — all in a single transaction. Only the -// LCM calls this. -// -// Revision CAS (mirrors the in-memory store contract exactly): -// - existing row: rec.Lifecycle.Revision must equal the stored revision, else -// a revision-mismatch error and nothing is written; on match it persists at -// stored+1. 
-// - insert: rec.Lifecycle.Revision must be 0, persisted as 1. -func (s *Store) Upsert(ctx context.Context, rec domain.SessionRecord, eventType ports.EventType) error { - s.writeMu.Lock() - defer s.writeMu.Unlock() - tx, err := s.db.BeginTx(ctx, nil) - if err != nil { - return fmt.Errorf("begin upsert: %w", err) - } - defer tx.Rollback() - qtx := s.q.WithTx(tx) - - newRevision, err := casPersist(ctx, qtx, rec) - if err != nil { - return err - } - - if err := appendOutbox(ctx, qtx, rec, newRevision, eventType); err != nil { - return err - } - - return tx.Commit() -} - -// casPersist applies the revision-CAS insert-or-update and returns the new -// stored revision. -func casPersist(ctx context.Context, q *gen.Queries, rec domain.SessionRecord) (int, error) { - stored, err := q.GetSessionRevision(ctx, string(rec.ID)) - switch { - case errors.Is(err, sql.ErrNoRows): - // Insert path: incoming revision must be 0; row persists at revision 1. - if rec.Lifecycle.Revision != 0 { - return 0, fmt.Errorf("revision mismatch for insert %s: have %d, want 0", rec.ID, rec.Lifecycle.Revision) - } - rows, err := q.InsertSession(ctx, recordToInsert(rec)) - if err != nil { - return 0, fmt.Errorf("insert session %s: %w", rec.ID, err) - } - if rows != 1 { - // Another writer raced us between the revision check and the insert. - // With single-writer this should not happen; treat as a CAS failure. - return 0, fmt.Errorf("revision mismatch for insert %s: row already exists", rec.ID) - } - return 1, nil - case err != nil: - return 0, fmt.Errorf("read revision %s: %w", rec.ID, err) - default: - // Update path: incoming revision must equal the stored revision. 
- if int64(rec.Lifecycle.Revision) != stored { - return 0, fmt.Errorf("revision mismatch for %s: have %d, want %d", rec.ID, rec.Lifecycle.Revision, stored) - } - rows, err := q.UpdateSessionCAS(ctx, recordToUpdate(rec, stored)) - if err != nil { - return 0, fmt.Errorf("update session %s: %w", rec.ID, err) - } - if rows != 1 { - return 0, fmt.Errorf("revision mismatch for %s: stale revision %d", rec.ID, rec.Lifecycle.Revision) - } - return int(stored) + 1, nil - } -} - -// appendOutbox writes the change_log entry and threads its seq into a fresh -// outbox row. The change_log payload is the persisted record at its new revision -// (metadata is excluded by SessionRecord's json:"-" tag — it is not on the -// canonical path). -func appendOutbox(ctx context.Context, q *gen.Queries, rec domain.SessionRecord, newRevision int, eventType ports.EventType) error { - now := time.Now().UTC() - payload := rec - payload.Lifecycle.Revision = newRevision - payload.Lifecycle.Version = domain.LifecycleVersion - blob, err := json.Marshal(payload) - if err != nil { - return fmt.Errorf("marshal change_log payload %s: %w", rec.ID, err) - } - - seq, err := q.InsertChangeLog(ctx, gen.InsertChangeLogParams{ - SessionID: string(rec.ID), - EventType: string(eventType), - Revision: int64(newRevision), - Payload: string(blob), - CreatedAt: now, - }) - if err != nil { - return fmt.Errorf("insert change_log %s: %w", rec.ID, err) - } - - if err := q.InsertOutbox(ctx, gen.InsertOutboxParams{ChangeLogSeq: seq, CreatedAt: now}); err != nil { - return fmt.Errorf("insert outbox %s: %w", rec.ID, err) - } - return nil -} From cdf55ebef506b4f16c8919c180d09c5ffd4f28a0 Mon Sep 17 00:00:00 2001 From: prateek Date: Sun, 31 May 2026 07:16:06 +0530 Subject: [PATCH 07/10] feat(backend): port lifecycle lane onto the new storage+CDC model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reworks the LCM, reactions, session manager, reaper, and boot wiring onto the redesigned 
domain model — collapsing the runtime axis to is_alive, moving PR facts to the pr table (read back as PRFacts), replacing the free-form SessionReason with a typed terminal-only TerminationReason, and dropping Revision/EventType/durable reaction-trackers (CDC is trigger-driven, escalation budgets are in-memory). - ports: SessionStore + PRWriter interfaces; PRObservation/RuntimeFacts/ ActivitySignal DTOs; drop LifecycleStore/EventType/ReactionStore. - lifecycle: single-writer reducer over is_alive; ApplyPRObservation writes the pr tables and reacts; CI-fix-loop brake derived from pr_checks history; review comments injected into the agent regardless of author (no bot detection); merge auto-terminates with pr_merged. - session: store-assigned "{project}-{n}" ids; folded metadata; status derived from PRFacts on read. - reaper: reports the four-valued probe vocabulary unchanged. - boot: trigger -> poller -> broadcaster; storeAdapter bridges *sqlite.Store. Lane shrinks 6218 -> 2803 LOC. go build/vet/test -race green. 
Co-Authored-By: Claude Opus 4.8 --- backend/cdc_e2e_test.go | 194 ----- backend/cdc_wiring.go | 142 +--- backend/internal/domain/lifecycle.go | 21 +- backend/internal/domain/session.go | 8 +- backend/internal/domain/status.go | 2 +- backend/internal/domain/status_test.go | 2 +- backend/internal/lifecycle/decide_bridge.go | 252 ++---- backend/internal/lifecycle/fakes_test.go | 164 ---- backend/internal/lifecycle/manager.go | 632 +++++--------- .../internal/lifecycle/manager_parity_test.go | 144 ---- backend/internal/lifecycle/manager_test.go | 802 ++++++------------ .../lifecycle/reaction_durability_test.go | 140 --- backend/internal/lifecycle/reaction_store.go | 94 -- backend/internal/lifecycle/reactions.go | 663 +++++++-------- backend/internal/lifecycle/reactions_test.go | 616 -------------- backend/internal/observe/reaper/reaper.go | 12 +- .../internal/observe/reaper/reaper_test.go | 392 ++------- backend/internal/ports/facts.go | 194 ++--- backend/internal/ports/inbound.go | 62 +- backend/internal/ports/outbound.go | 102 +-- backend/internal/session/fakes_test.go | 400 --------- backend/internal/session/manager.go | 466 ++++------ backend/internal/session/manager_test.go | 744 +++++----------- backend/lifecycle_wiring.go | 184 ++-- backend/main.go | 7 +- backend/main_test.go | 134 --- backend/wiring_test.go | 71 ++ 27 files changed, 1614 insertions(+), 5030 deletions(-) delete mode 100644 backend/cdc_e2e_test.go delete mode 100644 backend/internal/lifecycle/fakes_test.go delete mode 100644 backend/internal/lifecycle/manager_parity_test.go delete mode 100644 backend/internal/lifecycle/reaction_durability_test.go delete mode 100644 backend/internal/lifecycle/reaction_store.go delete mode 100644 backend/internal/lifecycle/reactions_test.go delete mode 100644 backend/internal/session/fakes_test.go delete mode 100644 backend/main_test.go create mode 100644 backend/wiring_test.go diff --git a/backend/cdc_e2e_test.go b/backend/cdc_e2e_test.go deleted file mode 100644 
index 29b04534..00000000 --- a/backend/cdc_e2e_test.go +++ /dev/null @@ -1,194 +0,0 @@ -package main - -import ( - "context" - "encoding/json" - "path/filepath" - "sync" - "testing" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/cdc" - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" -) - -// These are full-stack end-to-end tests of the write+delivery path wired exactly -// as main.go wires it: real sqlite.Store -> real outboxAdapter -> real -// cdc.Publisher -> real JSONL log -> real cdc.Consumer -> real cdc.Broadcaster, -// using the REAL snapshotSource (store.ListAll) rather than a fake. The cdc -// package's own integration test covers the synchronous Drain/Poll happy path -// with a fake snapshot; these cover the two gaps it leaves: a rotation that -// resyncs from the actual sessions table, and the concurrent goroutine model -// the daemon actually runs. - -// TestE2E_RealSnapshotResyncThroughRotation forces a log rotation and asserts the -// consumer rebuilds state from the REAL sessions-table snapshot (not the -// rotated-away bytes), delivering the persisted record's payload. 
-func TestE2E_RealSnapshotResyncThroughRotation(t *testing.T) { - ctx := context.Background() - store := newWiringStore(t) - dir := t.TempDir() - log, err := cdc.OpenLog(dir, 80) // tiny cap: the second write forces a rotation - if err != nil { - t.Fatal(err) - } - defer log.Close() - - var mu sync.Mutex - var got []cdc.Event - bc := cdc.NewBroadcaster() - bc.Subscribe(func(e cdc.Event) { mu.Lock(); got = append(got, e); mu.Unlock() }) - - con := cdc.NewConsumer("fe", filepath.Join(dir, cdc.LogFileName), store, bc, - cdc.ConsumerConfig{Snapshot: snapshotSource{store: store}}) - if _, err := con.Start(ctx); err != nil { - t.Fatal(err) - } - pub := cdc.NewPublisher(outboxAdapter{store: store}, log, cdc.PublisherConfig{}) - - // First canonical write: drained and consumed live from the original file. - if err := store.Upsert(ctx, wiringRec("s1"), ports.EventSessionCreated); err != nil { - t.Fatal(err) - } - if err := pub.Drain(ctx); err != nil { - t.Fatal(err) - } - if err := con.Poll(ctx); err != nil { - t.Fatal(err) - } - mu.Lock() - before := len(got) - mu.Unlock() - - // Second write pushes the log past its cap -> rotation. The consumer sees a - // fresh file and must resync from the sessions table. - r := wiringRec("s1") - r.Lifecycle.Revision = 1 - if err := store.Upsert(ctx, r, ports.EventSessionStateChanged); err != nil { - t.Fatal(err) - } - if err := pub.Drain(ctx); err != nil { - t.Fatal(err) - } - if err := con.Poll(ctx); err != nil { - t.Fatal(err) - } - - mu.Lock() - defer mu.Unlock() - if len(got) <= before { - t.Fatalf("resync delivered nothing after rotation (got %d, before %d)", len(got), before) - } - // A real session_snapshot for s1 must have been delivered, carrying the full - // record persisted in the sessions table. 
- var snap *cdc.Event - for i := range got { - if got[i].EventType == "session_snapshot" && got[i].SessionID == "s1" { - snap = &got[i] - } - } - if snap == nil { - t.Fatalf("no real session_snapshot delivered after rotation; got %+v", got) - } - var rec domain.SessionRecord - if err := json.Unmarshal([]byte(snap.Payload), &rec); err != nil { - t.Fatalf("snapshot payload not a SessionRecord: %v", err) - } - if rec.ID != "s1" || rec.Lifecycle.Session.State != domain.SessionWorking { - t.Fatalf("snapshot payload mismatch: %+v", rec) - } - // The consumer's durable offset advanced to the change_log head. - off, err := store.GetOffset(ctx, "fe") - if err != nil { - t.Fatal(err) - } - maxSeq, err := store.MaxChangeLogSeq(ctx) - if err != nil { - t.Fatal(err) - } - if off != maxSeq { - t.Fatalf("offset = %d, want change_log head %d", off, maxSeq) - } -} - -// TestE2E_ConcurrentPublisherConsumer runs the publisher and consumer as the -// daemon runs them — independent goroutines on their own tickers — and asserts -// every canonical write is delivered exactly once, in order, with the offset -// landing at the head. Run under -race this also guards the broadcaster/consumer -// hand-off. 
-func TestE2E_ConcurrentPublisherConsumer(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - store := newWiringStore(t) - dir := t.TempDir() - log, err := cdc.OpenLog(dir, 0) - if err != nil { - t.Fatal(err) - } - defer log.Close() - - var mu sync.Mutex - var got []cdc.Event - bc := cdc.NewBroadcaster() - bc.Subscribe(func(e cdc.Event) { mu.Lock(); got = append(got, e); mu.Unlock() }) - - pub := cdc.NewPublisher(outboxAdapter{store: store}, log, cdc.PublisherConfig{}) - con := cdc.NewConsumer("fe", filepath.Join(dir, cdc.LogFileName), store, bc, cdc.ConsumerConfig{}) - - pubDone := pub.Start(ctx) - conDone, err := con.Start(ctx) - if err != nil { - t.Fatal(err) - } - - const n = 5 - for i := 0; i < n; i++ { - r := wiringRec("s1") - r.Lifecycle.Revision = i - evt := ports.EventSessionStateChanged - if i == 0 { - evt = ports.EventSessionCreated - } - if err := store.Upsert(ctx, r, evt); err != nil { - t.Fatalf("upsert %d: %v", i, err) - } - } - - // Bounded wait for the goroutine pipeline to deliver everything. 
- deadline := time.Now().Add(5 * time.Second) - for { - mu.Lock() - count := len(got) - mu.Unlock() - if count >= n { - break - } - if time.Now().After(deadline) { - t.Fatalf("timed out: delivered %d/%d events", count, n) - } - time.Sleep(20 * time.Millisecond) - } - - cancel() - <-pubDone - <-conDone - - mu.Lock() - defer mu.Unlock() - if len(got) != n { - t.Fatalf("delivered %d events, want %d", len(got), n) - } - for i, e := range got { - if e.Seq != int64(i+1) { - t.Fatalf("event %d has seq %d, want %d (out-of-order or duplicate)", i, e.Seq, i+1) - } - } - off, err := store.GetOffset(context.Background(), "fe") - if err != nil { - t.Fatal(err) - } - if off != n { - t.Fatalf("offset = %d, want %d", off, n) - } -} diff --git a/backend/cdc_wiring.go b/backend/cdc_wiring.go index cfae4fdb..d824cbab 100644 --- a/backend/cdc_wiring.go +++ b/backend/cdc_wiring.go @@ -3,140 +3,62 @@ package main import ( "context" "encoding/json" - "fmt" "log/slog" - "path/filepath" - "time" "github.com/aoagents/agent-orchestrator/backend/internal/cdc" - "github.com/aoagents/agent-orchestrator/backend/internal/domain" "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" ) -// cdcConsumerName is the durable consumer_offsets key for the in-process FE -// broadcast consumer. A second transport (e.g. a cloud relay) would use its own -// key so each tracks an independent cursor. -const cdcConsumerName = "fe-broadcast" - -// cdcPipeline owns the running CDC goroutines and the broadcaster the FE -// transport subscribes to. It is the durable change-delivery substrate: the -// publisher drains the outbox to JSONL, the consumer tails the log and fans out -// through the broadcaster, and the janitor reclaims acknowledged outbox rows. +// cdcPipeline owns the running CDC poller and the broadcaster the SSE transport +// subscribes to. The DB triggers write change_log; the poller tails it and fans +// each new event out through the broadcaster. 
Durable catch-up is the client's +// job (it reads change_log from its own Last-Event-ID), so the poller only +// pushes live events and re-seeks to head on restart. type cdcPipeline struct { Broadcaster *cdc.Broadcaster - log *cdc.Log - dones []<-chan struct{} + done <-chan struct{} } -// startCDC opens the JSONL log and starts the publisher, consumer, and janitor -// against store, returning a handle whose Stop waits for the goroutines to -// drain after ctx is cancelled. The goroutines stop when ctx is cancelled. -func startCDC(ctx context.Context, store *sqlite.Store, dataDir string, logger *slog.Logger) (*cdcPipeline, error) { - log, err := cdc.OpenLog(dataDir, 0) - if err != nil { - return nil, fmt.Errorf("open cdc log: %w", err) - } - +// startCDC seeks the poller to the current head and starts its loop. It stops +// when ctx is cancelled; Stop waits for it to drain. +func startCDC(ctx context.Context, store *sqlite.Store, logger *slog.Logger) (*cdcPipeline, error) { bcast := cdc.NewBroadcaster() - logPath := filepath.Join(dataDir, cdc.LogFileName) - - pub := cdc.NewPublisher(outboxAdapter{store}, log, cdc.PublisherConfig{Logger: logger}) - con := cdc.NewConsumer(cdcConsumerName, logPath, store, bcast, cdc.ConsumerConfig{ - Snapshot: snapshotSource{store}, - Logger: logger, - }) - jan := cdc.NewJanitor(store, cdc.JanitorConfig{Logger: logger}) - - conDone, err := con.Start(ctx) - if err != nil { - log.Close() - return nil, fmt.Errorf("start cdc consumer: %w", err) + poller := cdc.NewPoller(cdcSource{store}, bcast, cdc.PollerConfig{Logger: logger}) + if err := poller.SeekToHead(ctx); err != nil { + return nil, err } - - return &cdcPipeline{ - Broadcaster: bcast, - log: log, - dones: []<-chan struct{}{pub.Start(ctx), conDone, jan.Start(ctx)}, - }, nil + return &cdcPipeline{Broadcaster: bcast, done: poller.Start(ctx)}, nil } -// Stop waits for every CDC goroutine to exit (the caller must have cancelled the -// ctx passed to startCDC) and closes the log file. 
+// Stop waits for the poller goroutine to exit (the caller must have cancelled the +// ctx passed to startCDC). func (p *cdcPipeline) Stop() error { - for _, d := range p.dones { - <-d - } - return p.log.Close() + <-p.done + return nil } -// outboxAdapter bridges *sqlite.Store's outbox methods to cdc.OutboxStore, -// mapping the storage-native OutboxEvent to the transport's PendingEvent. (The -// offset and vacuum contracts need no adapter — *sqlite.Store satisfies -// cdc.OffsetStore and cdc.Vacuum directly.) -type outboxAdapter struct{ store *sqlite.Store } +// cdcSource adapts *sqlite.Store's change_log reads to cdc.Source. +type cdcSource struct{ store *sqlite.Store } -func (a outboxAdapter) ListUnsent(ctx context.Context, limit int) ([]cdc.PendingEvent, error) { - evs, err := a.store.ListUnsent(ctx, limit) +func (s cdcSource) EventsAfter(ctx context.Context, after int64, limit int) ([]cdc.Event, error) { + rows, err := s.store.ReadChangeLogAfter(ctx, after, limit) if err != nil { return nil, err } - out := make([]cdc.PendingEvent, len(evs)) - for i, e := range evs { - out[i] = cdc.PendingEvent{ - OutboxID: e.OutboxID, - Event: cdc.Event{ - Seq: e.Seq, - SessionID: e.SessionID, - EventType: e.EventType, - Revision: e.Revision, - Payload: e.Payload, - CreatedAt: e.CreatedAt, - }, + out := make([]cdc.Event, len(rows)) + for i, r := range rows { + out[i] = cdc.Event{ + Seq: r.Seq, + ProjectID: r.ProjectID, + SessionID: r.SessionID, + Type: cdc.EventType(r.EventType), + Payload: json.RawMessage(r.Payload), + CreatedAt: r.CreatedAt, } } return out, nil } -func (a outboxAdapter) MarkSent(ctx context.Context, id int64, at time.Time) error { - return a.store.MarkSent(ctx, id, at) -} - -func (a outboxAdapter) MarkFailed(ctx context.Context, id int64, msg string) error { - return a.store.MarkFailed(ctx, id, msg) -} - -// snapshotSource rebuilds current state from the sessions table after a -// log-rotation gap, emitting one full-state event per session. 
Each event -// carries the change_log high-water seq so the consumer resumes its cursor -// there; the payload mirrors the canonical change_log payload (metadata -// excluded, version stamped) so subscribers parse snapshot and live events the -// same way. -type snapshotSource struct{ store *sqlite.Store } - -func (s snapshotSource) Snapshot(ctx context.Context) ([]cdc.Event, int64, error) { - recs, err := s.store.ListAll(ctx) - if err != nil { - return nil, 0, err - } - maxSeq, err := s.store.MaxChangeLogSeq(ctx) - if err != nil { - return nil, 0, err - } - events := make([]cdc.Event, 0, len(recs)) - for _, r := range recs { - r.Lifecycle.Version = domain.LifecycleVersion - blob, err := json.Marshal(r) - if err != nil { - return nil, 0, fmt.Errorf("marshal snapshot %s: %w", r.ID, err) - } - events = append(events, cdc.Event{ - Seq: maxSeq, - SessionID: string(r.ID), - EventType: "session_snapshot", - Revision: int64(r.Lifecycle.Revision), - Payload: string(blob), - CreatedAt: r.UpdatedAt, - }) - } - return events, maxSeq, nil +func (s cdcSource) LatestSeq(ctx context.Context) (int64, error) { + return s.store.MaxChangeLogSeq(ctx) } diff --git a/backend/internal/domain/lifecycle.go b/backend/internal/domain/lifecycle.go index b5636761..a82ea85a 100644 --- a/backend/internal/domain/lifecycle.go +++ b/backend/internal/domain/lifecycle.go @@ -103,17 +103,16 @@ type SessionSubstate struct { // axis. The zero value (Exists=false) means "no PR", which derivation treats as // "session has no PR". 
type PRFacts struct { - URL string - Number int - Exists bool - Draft bool - Merged bool - Closed bool - CI CIState - Review ReviewDecision - Mergeability Mergeability - BotComments bool - IdleBeyond bool // idle past the stuck threshold + URL string + Number int + Exists bool + Draft bool + Merged bool + Closed bool + CI CIState + Review ReviewDecision + Mergeability Mergeability + ReviewComments bool // has unresolved review comments (any author) to address } type CIState string diff --git a/backend/internal/domain/session.go b/backend/internal/domain/session.go index c9cd8d96..2b81088a 100644 --- a/backend/internal/domain/session.go +++ b/backend/internal/domain/session.go @@ -21,11 +21,11 @@ const ( // operational handles and seed inputs the Session Manager and reaper need but // that are NOT part of the canonical lifecycle. The set of fields is fixed here // (no free-form keys), so what a session can carry is a compile-time fact, and -// it is persisted 1:1 in the session_metadata table off the CDC path. +// it is folded into the sessions row off the CDC path. // -// Empty fields mean "unset": PatchMetadata never overwrites a stored value with -// an empty one, so a partial write (spawn setting only the runtime handle) does -// not clobber a value set earlier (the branch set at creation). +// Empty fields mean "unset": the LCM merges metadata without overwriting a +// stored value with an empty one, so a partial write (spawn setting only the +// runtime handle) does not clobber a value set earlier (the branch at creation). 
type SessionMetadata struct { Branch string `json:"branch,omitempty"` WorkspacePath string `json:"workspacePath,omitempty"` diff --git a/backend/internal/domain/status.go b/backend/internal/domain/status.go index 0ff5c0fd..3ae1e00c 100644 --- a/backend/internal/domain/status.go +++ b/backend/internal/domain/status.go @@ -95,7 +95,7 @@ func prPipelineStatus(pr PRFacts) SessionStatus { return StatusCIFailed case pr.Draft: return StatusDraft - case pr.Review == ReviewChangesRequest || pr.BotComments: + case pr.Review == ReviewChangesRequest || pr.ReviewComments: return StatusChangesRequested case pr.Mergeability == MergeMergeable: return StatusMergeable diff --git a/backend/internal/domain/status_test.go b/backend/internal/domain/status_test.go index ae63271c..57512577 100644 --- a/backend/internal/domain/status_test.go +++ b/backend/internal/domain/status_test.go @@ -42,7 +42,7 @@ func TestDeriveStatus(t *testing.T) { {"draft PR failing CI -> ci_failed (CI dominates)", sess(SessionWorking), openPR(func(f *PRFacts) { f.Draft = true; f.CI = CIFailing }), StatusCIFailed}, {"draft PR ignores review state -> draft", sess(SessionWorking), openPR(func(f *PRFacts) { f.Draft = true; f.Review = ReviewApproved }), StatusDraft}, {"open PR changes_requested", sess(SessionWorking), openPR(func(f *PRFacts) { f.Review = ReviewChangesRequest }), StatusChangesRequested}, - {"open PR bot comments -> changes_requested", sess(SessionWorking), openPR(func(f *PRFacts) { f.BotComments = true }), StatusChangesRequested}, + {"open PR review comments -> changes_requested", sess(SessionWorking), openPR(func(f *PRFacts) { f.ReviewComments = true }), StatusChangesRequested}, {"open PR mergeable", sess(SessionWorking), openPR(func(f *PRFacts) { f.Mergeability = MergeMergeable }), StatusMergeable}, {"open PR approved", sess(SessionWorking), openPR(func(f *PRFacts) { f.Review = ReviewApproved }), StatusApproved}, {"open PR review required -> review_pending", sess(SessionWorking), openPR(func(f 
*PRFacts) { f.Review = ReviewRequired }), StatusReviewPending}, diff --git a/backend/internal/lifecycle/decide_bridge.go b/backend/internal/lifecycle/decide_bridge.go index 501d12ac..4f88cbe5 100644 --- a/backend/internal/lifecycle/decide_bridge.go +++ b/backend/internal/lifecycle/decide_bridge.go @@ -8,236 +8,102 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) -// defaultRecentActivityWindow is how fresh the last activity signal must be for -// the probe decider to treat the agent as "recently active" (which keeps an -// ambiguous dead-runtime probe in detecting instead of concluding death). +// defaultRecentActivityWindow is how fresh the last activity must be for the +// probe decider to treat the agent as "recently active" — which keeps an +// ambiguous dead-runtime probe in detecting instead of concluding death. const defaultRecentActivityWindow = 60 * time.Second -// ---- fact translation: ports DTOs -> pure decide inputs ---- - -// runtimeFactsToProbeInput maps a raw RuntimeFacts (plus the prior detecting -// memory and last-known activity read back from canonical) into the probe -// decider's input. KillRequested is always false here: the inferred-death path -// never carries an explicit kill — that arrives via OnKillRequested. -func runtimeFactsToProbeInput(f ports.RuntimeFacts, cur domain.CanonicalSessionLifecycle, window time.Duration) decide.ProbeInput { - rt, rtFailed := runtimeProbeToState(f.RuntimeState) - proc, procFailed := processProbeToLiveness(f.ProcessState) +// probeInput maps a raw RuntimeFacts (plus the prior detecting memory and last +// activity) into the pure decider's input. A failed/unknown probe is reported as +// such, never as a death — that routes to the detecting quarantine. 
+func probeInput(f ports.RuntimeFacts, cur domain.CanonicalSessionLifecycle, window time.Duration) decide.ProbeInput { now := nowOr(f.ObservedAt) - return decide.ProbeInput{ - Runtime: rt, - RuntimeFailed: rtFailed, - Process: proc, - ProcessFailed: procFailed, - RecentActivity: hasRecentActivity(cur.Activity, now, window), - Prior: cur.Detecting, - Now: now, - } -} -func runtimeProbeToState(p ports.RuntimeProbe) (domain.RuntimeState, bool) { - switch p { - case ports.RuntimeProbeAlive: - return domain.RuntimeAlive, false - case ports.RuntimeProbeDead: - return domain.RuntimeExited, false - case ports.RuntimeProbeFailed: - return domain.RuntimeProbeFailed, true - default: // indeterminate / unset: ambiguous, never a death conclusion - return domain.RuntimeUnknown, false + var runtimeAlive, runtimeFailed bool + switch f.Runtime { + case ports.ProbeAlive: + runtimeAlive = true + case ports.ProbeFailed, ports.ProbeUnknown: + runtimeFailed = true // ambiguous: quarantine, never conclude death } -} -func processProbeToLiveness(p ports.ProcessProbe) (decide.ProcessLiveness, bool) { - switch p { - case ports.ProcessProbeAlive: - return decide.ProcessAlive, false - case ports.ProcessProbeDead: - return decide.ProcessDead, false - case ports.ProcessProbeFailed: - return decide.ProcessIndeterminate, true - default: // indeterminate / unset - return decide.ProcessIndeterminate, false + var process decide.ProcessLiveness + var processFailed bool + switch f.Process { + case ports.ProbeAlive: + process = decide.ProcessAlive + case ports.ProbeDead: + process = decide.ProcessDead + case ports.ProbeFailed: + process, processFailed = decide.ProcessIndeterminate, true + default: + process = decide.ProcessIndeterminate } -} -// runtimeSubstateFromFacts derives the runtime sub-state to persist. Liveness -// always owns this axis, so it is written on every runtime observation -// regardless of what the session axis does. 
-func runtimeSubstateFromFacts(f ports.RuntimeFacts) domain.RuntimeSubstate { - switch f.RuntimeState { - case ports.RuntimeProbeAlive: - return domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning} - case ports.RuntimeProbeDead: - return domain.RuntimeSubstate{State: domain.RuntimeExited, Reason: domain.RuntimeReasonTmuxMissing} - case ports.RuntimeProbeFailed: - return domain.RuntimeSubstate{State: domain.RuntimeProbeFailed, Reason: domain.RuntimeReasonProbeError} - case ports.RuntimeProbeIndeterminate: - // Probe ran but couldn't tell — distinct from a probe error, so no - // probe_error reason; the ambiguity is carried by RuntimeUnknown alone. - return domain.RuntimeSubstate{State: domain.RuntimeUnknown} - default: // unset - return domain.RuntimeSubstate{State: domain.RuntimeUnknown} + return decide.ProbeInput{ + RuntimeAlive: runtimeAlive, + RuntimeFailed: runtimeFailed, + Process: process, + ProcessFailed: processFailed, + RecentActivity: hasRecentActivity(cur.Activity, now, window), + Prior: cur.Detecting, + Now: now, } } -// hasRecentActivity answers the probe decider's "was the agent heard from -// recently?" question. Sticky states (waiting_input/blocked) count as recent -// because they mean a live-but-paused agent; an explicit exited signal never -// counts; otherwise we age the last-activity timestamp against the window. +// hasRecentActivity answers the decider's "heard from the agent recently?" +// question. Sticky states (waiting_input/blocked) count as recent (a live-but- +// paused agent); an explicit exited never counts; else age the timestamp. 
func hasRecentActivity(a domain.ActivitySubstate, now time.Time, window time.Duration) bool { - if a.State == domain.ActivityExited { + switch { + case a.State == domain.ActivityExited: return false - } - if a.State.IsSticky() { + case a.State.IsSticky(): return true - } - if a.LastActivityAt.IsZero() { + case a.LastActivityAt.IsZero(): return false + default: + return now.Sub(a.LastActivityAt) <= window } - return now.Sub(a.LastActivityAt) <= window -} - -// openPRInput maps SCM facts onto the open-PR ladder. IdleBeyond is always false -// in split A — the idle-duration signal is owned by the escalation engine -// (split B); the synchronous LCM has no clock of its own here. -func openPRInput(f ports.SCMFacts) decide.OpenPRInput { - hasBotComments, hasHumanComments := classifyPendingComments(f.PendingComments) - return decide.OpenPRInput{ - Draft: f.PRState == domain.PRDraft || f.Draft, - CIFailing: f.CISummary == ports.CIFailing, - ChangesRequested: f.ReviewDecision == ports.ReviewChangesRequested || hasHumanComments, - BotComments: hasBotComments, - MergeConflicts: hasMergeConflicts(f.Mergeability), - Approved: f.ReviewDecision == ports.ReviewApproved, - Mergeable: f.Mergeability.Mergeable, - ReviewPending: f.ReviewDecision == ports.ReviewPending, - Number: f.PRNumber, - URL: f.PRURL, - } -} - -func classifyPendingComments(comments []ports.ReviewComment) (hasBot, hasHuman bool) { - for _, c := range comments { - if c.IsBot { - hasBot = true - } else { - hasHuman = true - } - } - return hasBot, hasHuman -} - -func hasMergeConflicts(m ports.Mergeability) bool { - return !m.Mergeable && !m.NoConflicts && (m.CIPassing || m.Approved || len(m.Blockers) > 0) } -// ---- activity -> session axis mapping (activity owns working/idle/waiting) ---- - -// activityToSession maps an activity classification onto the session sub-state. 
-// exited returns ok=false: an exit signal must NOT write a terminal session -// state — only the probe pipeline (via detecting) may conclude inferred death. -func activityToSession(a domain.ActivityState) (domain.SessionState, domain.SessionReason, bool) { +// activityToSession maps an activity classification onto the session state. +// exited returns ok=false: only the probe pipeline may conclude death. +func activityToSession(a domain.ActivityState) (domain.SessionState, bool) { switch a { case domain.ActivityActive: - return domain.SessionWorking, domain.ReasonTaskInProgress, true - case domain.ActivityReady: - // ready = the agent finished a unit and is waiting for more work. - return domain.SessionIdle, domain.ReasonResearchComplete, true - case domain.ActivityIdle: - // plain inactivity carries no completion claim, so no specific reason - // (research_complete here would read misleadingly in diagnostics). - return domain.SessionIdle, "", true + return domain.SessionWorking, true + case domain.ActivityReady, domain.ActivityIdle: + return domain.SessionIdle, true case domain.ActivityWaitingInput: - return domain.SessionNeedsInput, domain.ReasonAwaitingUserInput, true + return domain.SessionNeedsInput, true case domain.ActivityBlocked: - return domain.SessionStuck, domain.ReasonAwaitingUserInput, true - default: // exited / unset - return "", "", false + return domain.SessionStuck, true + default: + return "", false } } -// ---- composition predicates: who may write the session axis ---- - -// isTerminal reports a final session state that must not be resurrected by an -// observation (only an explicit Restore reopens a terminal session). +// isTerminal reports a final session state — reopened only by an explicit +// Restore, never by an observation. 
func isTerminal(s domain.SessionState) bool { return s == domain.SessionDone || s == domain.SessionTerminated } -// isLivenessOwned reports whether the current session sub-state was set by the -// liveness/death axis (the probe pipeline) and may therefore be recovered by a -// later healthy probe. detecting is always liveness-owned; a stuck/terminated -// state is liveness-owned only when its reason came from a death inference. -func isLivenessOwned(s domain.SessionSubstate) bool { - if s.State == domain.SessionDetecting { - return true - } - switch s.Reason { - case domain.ReasonRuntimeLost, domain.ReasonAgentProcessExited, domain.ReasonProbeFailure: - return true - } - return false -} - -// shouldWriteSessionRuntime is the #1 composition rule for ApplyRuntimeObservation. -// A death-axis verdict (detecting/stuck/terminal) always writes — it overrides -// activity because a (maybe) dead agent can't be working/waiting. A healthy -// "working" verdict only writes when it is recovering a liveness-owned state -// (e.g. detecting -> working); it must NOT clobber an activity-owned -// needs_input/blocked/idle the activity axis is responsible for. -func shouldWriteSessionRuntime(d decide.LifecycleDecision, cur domain.CanonicalSessionLifecycle) bool { +// writeRuntimeSession reports whether a probe verdict may write the session +// state. A death-axis verdict (detecting/stuck/terminated) always writes; a +// healthy "working" verdict only recovers a detecting session — it must not +// clobber an activity-owned idle/needs_input. +func writeRuntimeSession(d decide.LifecycleDecision, cur domain.CanonicalSessionLifecycle) bool { if isTerminal(cur.Session.State) { - // A terminal session is only reopened by an explicit Restore — never by - // an observation. Even a death-axis verdict (e.g. detecting) must not - // resurrect it; the runtime axis is still patched separately. 
return false } if d.SessionState == domain.SessionWorking { - return isLivenessOwned(cur.Session) + return cur.Session.State == domain.SessionDetecting } return true } -// shouldWriteSessionActivity is the mirror rule for ApplyActivitySignal: the -// activity axis owns working/idle/waiting. A valid activity signal is direct -// proof of life, so it is allowed to RESOLVE a detecting session (pull it out of -// the liveness quarantine) — but it must not resurrect a terminal session, and -// it leaves a liveness-escalated stuck state to the probe pipeline (stuck is a -// deliberate human-facing escalation, not a transient quarantine). -func shouldWriteSessionActivity(cur domain.CanonicalSessionLifecycle) bool { - if isTerminal(cur.Session.State) { - return false - } - if cur.Session.State == domain.SessionDetecting { - return true - } - return !isLivenessOwned(cur.Session) -} - -// ---- explicit-kill mapping (SM's terminal-write authority) ---- - -func killSession(k ports.LifecycleKillReason) domain.SessionSubstate { - switch k { - case ports.KillManual: - return domain.SessionSubstate{State: domain.SessionTerminated, Reason: domain.ReasonManuallyKilled} - case ports.KillCleanup: - return domain.SessionSubstate{State: domain.SessionTerminated, Reason: domain.ReasonAutoCleanup} - default: // error - return domain.SessionSubstate{State: domain.SessionTerminated, Reason: domain.ReasonErrorInProcess} - } -} - -func killRuntime(k ports.LifecycleKillReason) domain.RuntimeSubstate { - switch k { - case ports.KillManual: - return domain.RuntimeSubstate{State: domain.RuntimeExited, Reason: domain.RuntimeReasonManualKillRequested} - case ports.KillCleanup: - return domain.RuntimeSubstate{State: domain.RuntimeExited, Reason: domain.RuntimeReasonAutoCleanup} - default: // error - return domain.RuntimeSubstate{State: domain.RuntimeExited, Reason: domain.RuntimeReasonProbeError} - } -} - func nowOr(t time.Time) time.Time { if t.IsZero() { return time.Now() diff --git 
a/backend/internal/lifecycle/fakes_test.go b/backend/internal/lifecycle/fakes_test.go deleted file mode 100644 index 45aec91b..00000000 --- a/backend/internal/lifecycle/fakes_test.go +++ /dev/null @@ -1,164 +0,0 @@ -package lifecycle - -import ( - "context" - "fmt" - "sync" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" -) - -// fakeStore is an in-memory LifecycleStore that faithfully applies full-row -// Upsert semantics so tests assert against the real persisted canonical. -type fakeStore struct { - mu sync.Mutex - records map[domain.SessionID]*domain.SessionRecord - metadata map[domain.SessionID]domain.SessionMetadata -} - -var _ ports.LifecycleStore = (*fakeStore)(nil) - -func newFakeStore() *fakeStore { - return &fakeStore{ - records: map[domain.SessionID]*domain.SessionRecord{}, - metadata: map[domain.SessionID]domain.SessionMetadata{}, - } -} - -// seed installs a starting lifecycle for a session id (bypassing the patch path). 
-func (s *fakeStore) seed(id domain.SessionID, l domain.CanonicalSessionLifecycle) { - s.mu.Lock() - defer s.mu.Unlock() - if l.Version == 0 { - l.Version = domain.LifecycleVersion - } - s.records[id] = &domain.SessionRecord{ID: id, Lifecycle: l} -} - -func (s *fakeStore) Load(_ context.Context, id domain.SessionID) (domain.CanonicalSessionLifecycle, bool, error) { - s.mu.Lock() - defer s.mu.Unlock() - rec, ok := s.records[id] - if !ok { - return domain.CanonicalSessionLifecycle{}, false, nil - } - return rec.Lifecycle, true, nil -} - -func (s *fakeStore) Upsert(_ context.Context, rec domain.SessionRecord, _ ports.EventType) error { - s.mu.Lock() - defer s.mu.Unlock() - if existing, ok := s.records[rec.ID]; ok { - if rec.Lifecycle.Revision != existing.Lifecycle.Revision { - return fmt.Errorf("revision mismatch for %s: have %d, want %d", rec.ID, rec.Lifecycle.Revision, existing.Lifecycle.Revision) - } - rec.Lifecycle.Revision = existing.Lifecycle.Revision + 1 - } else { - if rec.Lifecycle.Revision != 0 { - return fmt.Errorf("revision mismatch for insert %s: have %d, want 0", rec.ID, rec.Lifecycle.Revision) - } - rec.Lifecycle.Revision = 1 - } - if rec.Lifecycle.Version == 0 { - rec.Lifecycle.Version = domain.LifecycleVersion - } - r := rec - s.records[rec.ID] = &r - return nil -} - -func (s *fakeStore) Get(_ context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) { - s.mu.Lock() - defer s.mu.Unlock() - rec, ok := s.records[id] - if !ok { - return domain.SessionRecord{}, false, nil - } - return *rec, true, nil -} - -func (s *fakeStore) List(_ context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) { - s.mu.Lock() - defer s.mu.Unlock() - var out []domain.SessionRecord - for _, rec := range s.records { - if rec.ProjectID == project { - out = append(out, *rec) - } - } - return out, nil -} - -func (s *fakeStore) GetMetadata(_ context.Context, id domain.SessionID) (domain.SessionMetadata, error) { - s.mu.Lock() - defer s.mu.Unlock() - 
return s.metadata[id], nil -} - -func (s *fakeStore) PatchMetadata(_ context.Context, id domain.SessionID, meta domain.SessionMetadata) error { - s.mu.Lock() - defer s.mu.Unlock() - s.metadata[id] = mergeSessionMetadata(s.metadata[id], meta) - return nil -} - -// mergeSessionMetadata applies meta onto dst with the store's "empty = leave -// unchanged" semantics, so partial patches do not clobber earlier values. -func mergeSessionMetadata(dst, meta domain.SessionMetadata) domain.SessionMetadata { - if meta.Branch != "" { - dst.Branch = meta.Branch - } - if meta.WorkspacePath != "" { - dst.WorkspacePath = meta.WorkspacePath - } - if meta.RuntimeHandleID != "" { - dst.RuntimeHandleID = meta.RuntimeHandleID - } - if meta.RuntimeName != "" { - dst.RuntimeName = meta.RuntimeName - } - if meta.AgentSessionID != "" { - dst.AgentSessionID = meta.AgentSessionID - } - if meta.Prompt != "" { - dst.Prompt = meta.Prompt - } - return dst -} - -// recordingNotifier captures emitted events for assertions. -type recordingNotifier struct { - mu sync.Mutex - events []ports.OrchestratorEvent -} - -var _ ports.Notifier = (*recordingNotifier)(nil) - -func (n *recordingNotifier) Notify(_ context.Context, e ports.OrchestratorEvent) error { - n.mu.Lock() - defer n.mu.Unlock() - n.events = append(n.events, e) - return nil -} - -// recordingMessenger captures messages injected into agents. 
-type recordingMessenger struct { - mu sync.Mutex - sent []struct { - ID domain.SessionID - Message string - } -} - -var _ ports.AgentMessenger = (*recordingMessenger)(nil) - -func (a *recordingMessenger) Send(_ context.Context, id domain.SessionID, message string) error { - a.mu.Lock() - defer a.mu.Unlock() - a.sent = append(a.sent, struct { - ID domain.SessionID - Message string - }{id, message}) - return nil -} diff --git a/backend/internal/lifecycle/manager.go b/backend/internal/lifecycle/manager.go index 63d7164a..5c58f0a2 100644 --- a/backend/internal/lifecycle/manager.go +++ b/backend/internal/lifecycle/manager.go @@ -1,13 +1,8 @@ // Package lifecycle implements ports.LifecycleManager: the synchronous -// observe->decide->persist reducer. Every Apply*/On* entrypoint runs the same -// pipeline under a per-session lock — load the full canonical record, run the -// matching pure decider, classify the resulting change, and persist the full -// row through the store. The store owns Revision++; the LCM never polls and -// never writes the display status (that is derived on read). -// -// After a transition is persisted, the Apply* paths fire the mapped reaction -// (the ACT layer: reaction table + escalation engine) via the react() chokepoint -// in reactions.go. +// observe -> decide -> persist reducer. Every Apply*/On* entrypoint loads the +// session, runs the pure decider, and persists the full row under a single write +// lock. The DB triggers emit the CDC; the engine never writes the change log. +// After a transition it fires the mapped reaction (see reactions.go). package lifecycle import ( @@ -21,438 +16,241 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) -// Session metadata is now the typed domain.SessionMetadata struct (was a -// free-form string map keyed by Meta* constants). 
OnSpawnCompleted records the -// spawned session's handles via spawnMetadata; Prompt is the assembled launch -// prompt, persisted so a Restore that finds no captured agent session id can -// still fall back to a fresh launch with the same prompt rather than failing. - -// Manager is the LCM. The Apply* pipeline persists a transition and then fires -// the mapped reaction via Notifier/AgentMessenger (see reactions.go). +// Manager is the lifecycle engine. mu serialises the load->decide->persist +// read-modify-write across sessions; reactions dispatch after the lock releases +// so a slow agent send never blocks the write path. type Manager struct { - store ports.LifecycleStore + store ports.SessionStore + pr ports.PRWriter notifier ports.Notifier messenger ports.AgentMessenger - recentActivityWindow time.Duration - locks keyedMutex - - // trackers hold per-(session,reaction) escalation budgets (ACT policy, not - // canonical state). trackerMu guards them: react() touches them from the - // caller's goroutine, TickEscalations from the reaper's. clock is the time - // source for escalation stamping (overridable in tests). - trackers map[trackerKey]*reactionTracker - trackerMu sync.Mutex - clock func() time.Time + mu sync.Mutex + window time.Duration + clock func() time.Time - // reactionStore, when wired via WithReactionStore, makes the trackers map a - // write-through cache over durable rows so a restart does not re-fire an - // already-escalated human page. nil keeps the in-memory-only default. - reactionStore ReactionStore - - // sessionLister returns every session known to persistence so RunningSessions - // can filter by runtime axis without coupling the LCM to a cross-project - // store API the Tom-store does not yet expose. The daemon (lane #10) injects - // the production lister via WithSessionLister; until then, the call returns - // no sessions so a reaper attached to an unwired Manager is a clean no-op - // rather than a panic. 
- sessionLister func(ctx context.Context) ([]domain.SessionRecord, error) + // in-memory ACT state (policy, not canonical truth — reset on restart). + react reactionState } var _ ports.LifecycleManager = (*Manager)(nil) -func New(store ports.LifecycleStore, notifier ports.Notifier, messenger ports.AgentMessenger) *Manager { +func New(store ports.SessionStore, pr ports.PRWriter, notifier ports.Notifier, messenger ports.AgentMessenger) *Manager { return &Manager{ - store: store, - notifier: notifier, - messenger: messenger, - recentActivityWindow: defaultRecentActivityWindow, - trackers: map[trackerKey]*reactionTracker{}, - clock: time.Now, + store: store, + pr: pr, + notifier: notifier, + messenger: messenger, + window: defaultRecentActivityWindow, + clock: time.Now, + react: newReactionState(), } } -// WithSessionLister injects the function the LCM uses to enumerate all -// persisted sessions for RunningSessions. The daemon wires this against the -// store at startup; it must be called BEFORE any reaper attached to this -// Manager starts running, since concurrent calls would race the bare-field -// read in RunningSessions. Calling it more than once replaces the previous -// lister. -func (m *Manager) WithSessionLister(fn func(ctx context.Context) ([]domain.SessionRecord, error)) { - m.sessionLister = fn -} - -// ---- per-session serialisation ---- - -// keyedMutex hands out one lock per session id so the load->decide->persist -// read-modify-write is serial within a session but parallel across sessions. -// -// Entries are reference-counted and evicted when the last holder releases, so -// the map stays bounded to sessions with in-flight operations rather than -// growing unbounded over the lifetime of a long-running daemon. 
-type keyedMutex struct { - mu sync.Mutex - locks map[domain.SessionID]*lockEntry -} - -type lockEntry struct { - mu sync.Mutex - refs int -} - -func (k *keyedMutex) lock(id domain.SessionID) func() { - k.mu.Lock() - if k.locks == nil { - k.locks = make(map[domain.SessionID]*lockEntry) - } - e, ok := k.locks[id] - if !ok { - e = &lockEntry{} - k.locks[id] = e - } - e.refs++ - k.mu.Unlock() - - e.mu.Lock() - return func() { - e.mu.Unlock() - k.mu.Lock() - e.refs-- - if e.refs == 0 { - delete(k.locks, id) - } - k.mu.Unlock() - } -} - -func (m *Manager) withLock(id domain.SessionID, fn func() error) error { - unlock := m.locks.lock(id) - defer unlock() - return fn() -} - -// transition is what a persisted write produced: the canonical before and after -// the full-row upsert. The ACT layer (react) derives the reaction from these. It -// is nil when the pipeline made no write. -// -// projectID is captured so reaction events fired downstream (Notifier.Notify in -// executeReaction and escalate) can populate OrchestratorEvent.ProjectID — the -// human-facing event router groups events by project. Empty when the record has -// no ProjectID (e.g. test-only seeded records that omit identity). -type transition struct { - beforeLC domain.CanonicalSessionLifecycle - afterLC domain.CanonicalSessionLifecycle - projectID domain.ProjectID -} - -// mutate runs the shared pipeline: load full row -> build next canonical -> -// Upsert full row (only if changed). decideFn returns the full next lifecycle -// and whether it changed anything; false is a clean no-op (no write), which is -// how failed-probe / unknown-fact inputs are dropped. -// -// On a write it returns the transition (before/after canonical) so the caller — -// which still holds the originating facts — can fire the mapped reaction. +// mutate runs the shared pipeline: load -> decideFn -> persist (only if changed). +// It returns whether a write happened. A stray observation for an unknown session +// is a clean no-op. 
func (m *Manager) mutate( ctx context.Context, id domain.SessionID, - decideFn func(cur domain.CanonicalSessionLifecycle, exists bool) (domain.CanonicalSessionLifecycle, bool, error), -) (*transition, error) { - var tr *transition - err := m.withLock(id, func() error { - rec, exists, err := m.store.Get(ctx, id) - if err != nil { - return err - } - cur := rec.Lifecycle - next, changed, err := decideFn(cur, exists) - if err != nil { - return err - } - if !changed { - return nil - } - rec.Lifecycle = m.prepareLifecycleWrite(next) - rec.UpdatedAt = m.clock() - if err := m.store.Upsert(ctx, rec, classifyEventType(cur, rec.Lifecycle, false)); err != nil { - return err - } - // ProjectID is captured straight from the record we already loaded at the - // top of this closure — identity is set once at OnSpawnInitiated and never - // mutated, so no second store roundtrip is needed for reaction events. - tr = &transition{beforeLC: cur, afterLC: rec.Lifecycle, projectID: rec.ProjectID} - return nil - }) - return tr, err -} - -func (m *Manager) prepareLifecycleWrite(next domain.CanonicalSessionLifecycle) domain.CanonicalSessionLifecycle { + fn func(cur domain.CanonicalSessionLifecycle) (domain.CanonicalSessionLifecycle, bool), +) (bool, error) { + m.mu.Lock() + defer m.mu.Unlock() + + rec, ok, err := m.store.GetSession(ctx, id) + if err != nil || !ok { + return false, err + } + next, changed := fn(rec.Lifecycle) + if !changed { + return false, nil + } next.Version = domain.LifecycleVersion - return next + rec.Lifecycle = next + rec.UpdatedAt = m.clock() + if err := m.store.UpdateSession(ctx, rec); err != nil { + return false, err + } + return true, nil } // ---- OBSERVE entrypoints ---- -// ApplyRuntimeObservation feeds the probe decider. Liveness always writes the -// runtime axis; the session axis follows the #1 composition rule; and a -// non-detecting verdict clears any stale detecting memory (#3) so the next -// probe doesn't read a phantom prior. 
+// ApplyRuntimeObservation feeds the probe decider. is_alive always tracks the +// verdict; the session state follows the runtime-write rule; a non-detecting +// verdict clears stale detecting memory. func (m *Manager) ApplyRuntimeObservation(ctx context.Context, id domain.SessionID, f ports.RuntimeFacts) error { - tr, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle, exists bool) (domain.CanonicalSessionLifecycle, bool, error) { - if !exists { - return cur, false, nil // nothing seeded; ignore stray probe - } - - d := decide.ResolveProbeDecision(runtimeFactsToProbeInput(f, cur, m.recentActivityWindow)) - + changed, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle) (domain.CanonicalSessionLifecycle, bool) { + d := decide.ResolveProbeDecision(probeInput(f, cur, m.window)) next := cur - changed := false - - if rt := runtimeSubstateFromFacts(f); cur.Runtime != rt { - next.Runtime = rt - changed = true + ch := false + if next.IsAlive != d.IsAlive { + next.IsAlive, ch = d.IsAlive, true } - // A terminal session is reopened only by an explicit Restore: an - // observation may refresh the runtime axis above but must touch neither - // the session axis nor the detecting memory. if !isTerminal(cur.Session.State) { - if shouldWriteSessionRuntime(d, cur) { - changed = setSessionIfChanged(&next, d.SessionState, d.SessionReason) || changed + if writeRuntimeSession(d, cur) { + ch = setSessionState(&next, d.SessionState, d.TerminationReason) || ch } - changed = setDetecting(&next, d.Detecting) || changed + ch = setDetecting(&next, d.Detecting) || ch } - - return next, changed, nil + return next, ch }) - if err != nil { + if err != nil || !changed { return err } - return m.react(ctx, id, tr, reactionContext{}) + return m.runReactions(ctx, id, reactionContent{}) } -// ApplySCMObservation maps PR facts onto the PR axis. A failed fetch is dropped -// (failed probe != "no PR"). 
An open or draft PR writes only the PR sub-state — -// the session axis stays owned by activity, and DeriveLegacyStatus surfaces the -// PR reason for display. A terminal PR (merged/closed) also parks the session. -func (m *Manager) ApplySCMObservation(ctx context.Context, id domain.SessionID, f ports.SCMFacts) error { - tr, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle, exists bool) (domain.CanonicalSessionLifecycle, bool, error) { - if !exists || !f.Fetched { - return cur, false, nil - } - - switch f.PRState { - case domain.PRDraft, domain.PROpen: - d := decide.ResolveOpenPRDecision(openPRInput(f)) - next := cur - changed := setPRIfChanged(&next, d, f) - return next, changed, nil - - case domain.PRMerged, domain.PRClosed: - d := decide.ResolveTerminalPRStateDecision(f.PRState) - next := cur - changed := setPRIfChanged(&next, d, f) - // A merge/close is a milestone that ends the work, so it parks the - // session axis (idle / merged_waiting_decision) even over an - // activity-owned needs_input/blocked — unlike the open-PR path, - // which leaves the session axis to activity. A terminal session is - // still never reopened. - if !isTerminal(cur.Session.State) { - changed = setSessionIfChanged(&next, d.SessionState, d.SessionReason) || changed - } - return next, changed, nil - - default: // none / unset: no PR-driven transition in split A - return cur, false, nil - } - }) - if err != nil { - return err - } - return m.react(ctx, id, tr, reactionContext{ciFailureLogTail: f.CIFailureLogTail}) -} - -// ApplyActivitySignal updates the activity axis. Only a valid-confidence signal -// is authoritative (stale/unavailable/probe_failure != idleness). It refreshes -// the persisted activity sub-state (the probe decider's RecentActivity input) -// and maps the classification onto the session axis. 
A valid signal is proof of -// life, so it may resolve a detecting session — clearing the quarantine memory -// so a later probe doesn't resume counting from a stale prior. +// ApplyActivitySignal updates the activity axis. Only a valid signal is +// authoritative, and it is proof of life: it may resolve a detecting session and +// move the session out of any non-terminal state. func (m *Manager) ApplyActivitySignal(ctx context.Context, id domain.SessionID, s ports.ActivitySignal) error { - tr, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle, exists bool) (domain.CanonicalSessionLifecycle, bool, error) { - if !exists || s.State != ports.SignalValid { - return cur, false, nil + if !s.Valid { + return nil + } + changed, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle) (domain.CanonicalSessionLifecycle, bool) { + if isTerminal(cur.Session.State) { + return cur, false } - next := cur - changed := false - - act := domain.ActivitySubstate{State: s.Activity, LastActivityAt: nowOr(s.Timestamp), Source: s.Source} + ch := false + act := domain.ActivitySubstate{State: s.State, LastActivityAt: nowOr(s.Timestamp), Source: s.Source} if !sameActivity(cur.Activity, act) { - next.Activity = act - changed = true + next.Activity, ch = act, true } - if st, rs, ok := activityToSession(s.Activity); ok && shouldWriteSessionActivity(cur) { - changed = setSessionIfChanged(&next, st, rs) || changed - // Proof of life that pulls the session out of detecting must also - // drop the quarantine memory (detecting memory only exists while - // detecting, so this is a no-op otherwise). 
- if cur.Detecting != nil { - next.Detecting = nil - changed = true + if st, ok := activityToSession(s.State); ok { + ch = setSessionState(&next, st, domain.TermNone) || ch + if next.Detecting != nil { + next.Detecting, ch = nil, true } } - - return next, changed, nil + if s.State != domain.ActivityExited && !next.IsAlive { + next.IsAlive, ch = true, true + } + return next, ch }) - if err != nil { + if err != nil || !changed { return err } - return m.react(ctx, id, tr, reactionContext{}) + return m.runReactions(ctx, id, reactionContent{}) } -// ---- mutation commands/outcomes reported by the Session Manager ---- +// ApplyPRObservation records the observed PR facts in the pr tables, terminates +// the session on a merge, and fires the PR-driven reactions. A failed fetch is +// dropped (failed probe != "PR closed"). +func (m *Manager) ApplyPRObservation(ctx context.Context, id domain.SessionID, o ports.PRObservation) error { + if !o.Fetched { + return nil + } + rec, ok, err := m.store.GetSession(ctx, id) + if err != nil || !ok { + return err + } + if err := m.writePR(ctx, id, o); err != nil { + return err + } -// OnSpawnInitiated seeds or reopens the full session record for a spawn-like -// mutation. It is the Session Manager's create/reopen command under the Writer -// contract: the SM builds the identity + initial canonical row, but only the LCM -// writes it. Fresh rows emit session_created; reopening a terminal row reuses -// the current row as the before-image and lets the classifier emit the schema -// event for the reopen. 
-func (m *Manager) OnSpawnInitiated(ctx context.Context, rec domain.SessionRecord) error { - return m.withLock(rec.ID, func() error { - cur := rec.Lifecycle - isInsert := true - if current, ok, err := m.store.Get(ctx, rec.ID); err != nil { - return err - } else if ok { - currentLC := current.Lifecycle - if !isTerminal(currentLC.Session.State) && !isTerminal(rec.Lifecycle.Session.State) { - return fmt.Errorf("lifecycle: OnSpawnInitiated for active session %q", rec.ID) + if o.Merged { + changed, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle) (domain.CanonicalSessionLifecycle, bool) { + if isTerminal(cur.Session.State) { + return cur, false } - cur = currentLC - isInsert = false - } - rec.Lifecycle = m.prepareLifecycleWrite(rec.Lifecycle) - if isInsert { - rec.Lifecycle.Revision = 0 - } else { - rec.Lifecycle.Revision = cur.Revision - } - now := m.clock() - if rec.CreatedAt.IsZero() { - rec.CreatedAt = now - } - rec.UpdatedAt = now - return m.store.Upsert(ctx, rec, classifyEventType(cur, rec.Lifecycle, isInsert)) - }) -} - -// OnSpawnCompleted records that a spawn finished: the runtime is up and the -// handles are known. Per the agreed rule it flips the runtime axis to alive and -// stores the handles in metadata, but leaves the session at not_started -// (display: spawning) — the agent "acknowledges" via the first activity signal. -func (m *Manager) OnSpawnCompleted(ctx context.Context, id domain.SessionID, o ports.SpawnOutcome) error { - return m.withLock(id, func() error { - rec, exists, err := m.store.Get(ctx, id) + next := cur + next.Session.State = domain.SessionTerminated + next.TerminationReason = domain.TermPRMerged + next.IsAlive = false + next.Detecting = nil + return next, true + }) if err != nil { return err } - if !exists { - // The SM seeds the initial lifecycle before spawning; a completion - // for an unseeded session is a contract violation, not a stray - // observation, so surface it rather than fabricating a record. 
- return fmt.Errorf("lifecycle: OnSpawnCompleted for unseeded session %q", id) - } - rt := domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning} - if rec.Lifecycle.Runtime != rt { - cur := rec.Lifecycle - next := cur - next.Runtime = rt - rec.Lifecycle = m.prepareLifecycleWrite(next) - rec.UpdatedAt = m.clock() - if err := m.store.Upsert(ctx, rec, classifyEventType(cur, rec.Lifecycle, false)); err != nil { - return err - } - } - if meta := spawnMetadata(o); !meta.IsZero() { - if err := m.store.PatchMetadata(ctx, id, meta); err != nil { - return err - } + if changed { + m.clearReactions(id) + return m.fireNotify(ctx, id, rec.ProjectID, reactions[rxMerged]) } return nil - }) + } + + return m.runReactions(ctx, id, prContent(o)) } -// OnKillRequested is the SM's explicit terminal-write authority (the one -// terminal path that does not go through the inferred-death decider). It writes -// the terminal session/runtime sub-states for the kill kind and clears any -// in-flight detecting memory. -func (m *Manager) OnKillRequested(ctx context.Context, id domain.SessionID, r ports.KillReason) error { - // An explicit user kill is a human action, not an inferred event, so it - // fires no reaction — the transition is discarded. - _, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle, exists bool) (domain.CanonicalSessionLifecycle, bool, error) { - if !exists { - // Killing an unknown/already-gone session is a benign race; no-op - // rather than fabricating a terminal record for a session we never - // knew about. - return cur, false, nil +// writePR upserts the scalar facts, records each check run, and replaces the +// comment set. PR-table CDC is emitted by the DB triggers. 
+func (m *Manager) writePR(ctx context.Context, id domain.SessionID, o ports.PRObservation) error { + now := m.clock() + if err := m.pr.UpsertPR(ctx, ports.PRRow{ + URL: o.URL, SessionID: string(id), Number: o.Number, + Draft: o.Draft, Merged: o.Merged, Closed: o.Closed, + CI: o.CI, Review: o.Review, Mergeability: o.Mergeability, UpdatedAt: now, + }); err != nil { + return err + } + for _, c := range o.Checks { + c.PRURL = o.URL + if c.CreatedAt.IsZero() { + c.CreatedAt = now + } + if err := m.pr.RecordCheck(ctx, c); err != nil { + return err } + } + return m.pr.ReplacePRComments(ctx, o.URL, o.Comments) +} - next := cur - changed := false +// ---- mutation commands from the Session Manager ---- - if sess := killSession(r.Kind); cur.Session != sess { - next.Session = sess - changed = true - } - if rt := killRuntime(r.Kind); cur.Runtime != rt { - next.Runtime = rt - changed = true - } - if cur.Detecting != nil { - next.Detecting = nil - changed = true - } - return next, changed, nil - }) +// OnSpawnCompleted marks a session live and folds in its handles. It serves a +// fresh spawn (not_started -> live) and a restore (terminal -> reopened): both +// land at not_started + is_alive, with the agent acknowledging via first activity. +func (m *Manager) OnSpawnCompleted(ctx context.Context, id domain.SessionID, o ports.SpawnOutcome) error { + m.mu.Lock() + defer m.mu.Unlock() + rec, ok, err := m.store.GetSession(ctx, id) if err != nil { return err } - // A kill is terminal but bypasses react()'s incident-over cleanup (it fires - // no reaction). Drop any escalation trackers here so a later duration-based - // TickEscalations can't emit reaction.escalated for a dead session. 
- m.clearSessionTrackers(ctx, id) - return nil + if !ok { + return fmt.Errorf("lifecycle: OnSpawnCompleted for unknown session %q", id) + } + rec.Lifecycle.Version = domain.LifecycleVersion + rec.Lifecycle.Session.State = domain.SessionNotStarted + rec.Lifecycle.TerminationReason = domain.TermNone + rec.Lifecycle.IsAlive = true + rec.Lifecycle.Detecting = nil + rec.Metadata = mergeMetadata(rec.Metadata, spawnMetadata(o)) + rec.UpdatedAt = m.clock() + return m.store.UpdateSession(ctx, rec) } -// ---- read-snapshot helpers ---- +// OnKillRequested is the explicit terminal-write path (the one terminal that does +// not go through the inferred-death decider). It fires no reaction — an explicit +// kill is a human action — but drops the session's ACT state. +func (m *Manager) OnKillRequested(ctx context.Context, id domain.SessionID, reason domain.TerminationReason) error { + _, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle) (domain.CanonicalSessionLifecycle, bool) { + if isTerminal(cur.Session.State) { + return cur, false + } + if reason == domain.TermNone { + reason = domain.TermManuallyKilled + } + next := cur + next.Session.State = domain.SessionTerminated + next.TerminationReason = reason + next.IsAlive = false + next.Detecting = nil + return next, true + }) + m.clearReactions(id) + return err +} -// RunningSessions returns a snapshot of every persisted session worth probing -// in the next reaper tick. "Worth probing" is wider than "runtime axis alive": -// it includes sessions in the Detecting quarantine, because a fresh probe is -// the only fact that can recover them (back to working) or escalate them -// (terminal killed). Filtering to runtime-axis-alive would silently park every -// Detecting session — a single failed probe would never get a second chance -// and recovery via runtime probe would be unreachable. -// -// The predicate is "not a final session state". 
Terminal session states (done, -// terminated) are excluded because Restore is the only path back; observations -// must not reopen them (#1 invariant). Sessions in earlier states — not_started, -// working, idle, needs_input, stuck, detecting — are all included. Those that -// lack runtime handle metadata (e.g. not_started before OnSpawnCompleted) are -// returned and harmlessly skipped by the reaper's per-session handle guard. -// -// The call only reads and copies, so it does not break the single-writer -// invariant; concurrent Apply* calls may move sessions in or out of the probe -// set between snapshots, which is correct — the next tick re-reads. -// -// When no lister has been wired (e.g. tests construct a bare Manager), the -// method returns nil so a goroutine attached to such a Manager degrades to a -// no-op rather than panicking. +// RunningSessions snapshots every non-terminal session for the reaper to probe. +// Detecting sessions are included — a fresh probe is the only fact that recovers +// or escalates them. func (m *Manager) RunningSessions(ctx context.Context) ([]domain.SessionRecord, error) { - if m.sessionLister == nil { - return nil, nil - } - all, err := m.sessionLister(ctx) + all, err := m.store.ListAllSessions(ctx) if err != nil { return nil, err } @@ -465,37 +263,28 @@ func (m *Manager) RunningSessions(ctx context.Context) ([]domain.SessionRecord, return out, nil } -// ---- diff helpers ---- +// ---- diff + metadata helpers ---- -// setSessionIfChanged sets next.Session only when the decided sub-state differs -// from the current next value; an empty decided state means "decider does not -// address the session axis" and is left untouched. -func setSessionIfChanged(next *domain.CanonicalSessionLifecycle, st domain.SessionState, rs domain.SessionReason) bool { +// setSessionState sets the state (and, for a terminal state, the reason) when it +// differs. An empty state means "decider doesn't address the session axis". 
+func setSessionState(next *domain.CanonicalSessionLifecycle, st domain.SessionState, reason domain.TerminationReason) bool { if st == "" { return false } - want := domain.SessionSubstate{State: st, Reason: rs} - if next.Session == want { - return false + changed := false + if next.Session.State != st { + next.Session.State, changed = st, true } - next.Session = want - return true -} - -// setPRIfChanged folds the decided PR sub-state plus the fact-borne PR identity -// (number/url) into next when it differs from the current next value. -func setPRIfChanged(next *domain.CanonicalSessionLifecycle, d decide.LifecycleDecision, f ports.SCMFacts) bool { - want := domain.PRSubstate{State: d.PRState, Reason: d.PRReason, Number: f.PRNumber, URL: f.PRURL} - if next.PR == want { - return false + want := domain.TermNone + if st == domain.SessionTerminated { + want = reason + } + if next.TerminationReason != want { + next.TerminationReason, changed = want, true } - next.PR = want - return true + return changed } -// setDetecting implements the detecting semantics on the full canonical row: -// set/replace when the decision carries memory, clear (#3) when it doesn't but -// canonical still holds stale memory, else leave untouched. 
func setDetecting(next *domain.CanonicalSessionLifecycle, d *domain.DetectingState) bool { if d != nil { if next.Detecting != nil && *next.Detecting == *d { @@ -512,27 +301,8 @@ func setDetecting(next *domain.CanonicalSessionLifecycle, d *domain.DetectingSta return false } -func classifyEventType(before, after domain.CanonicalSessionLifecycle, isInsert bool) ports.EventType { - switch { - case isInsert: - return ports.EventSessionCreated - case before.Session.State != after.Session.State && after.Session.State == domain.SessionTerminated: - return ports.EventSessionTerminated - case before.Session != after.Session: - return ports.EventSessionStateChanged - case before.PR != after.PR: - return ports.EventSessionPRUpdated - case before.Runtime != after.Runtime: - return ports.EventSessionRuntimeUpdated - case before.Activity != after.Activity: - return ports.EventSessionActivityUpdated - default: - return ports.EventSessionUpdated - } -} - -// sameActivity compares activity sub-states with time-aware equality (== on -// time.Time is monotonic-clock sensitive and would spuriously report changes). +// sameActivity compares with time-aware equality (== on time.Time is +// monotonic-clock sensitive and would spuriously report changes). func sameActivity(a, b domain.ActivitySubstate) bool { return a.State == b.State && a.Source == b.Source && a.LastActivityAt.Equal(b.LastActivityAt) } @@ -547,3 +317,21 @@ func spawnMetadata(o ports.SpawnOutcome) domain.SessionMetadata { Prompt: o.Prompt, } } + +// mergeMetadata overlays set fields of in onto base without clobbering an +// existing value with an empty one (a partial spawn write keeps the branch set +// at creation). 
+func mergeMetadata(base, in domain.SessionMetadata) domain.SessionMetadata { + set := func(dst *string, v string) { + if v != "" { + *dst = v + } + } + set(&base.Branch, in.Branch) + set(&base.WorkspacePath, in.WorkspacePath) + set(&base.RuntimeHandleID, in.RuntimeHandleID) + set(&base.RuntimeName, in.RuntimeName) + set(&base.AgentSessionID, in.AgentSessionID) + set(&base.Prompt, in.Prompt) + return base +} diff --git a/backend/internal/lifecycle/manager_parity_test.go b/backend/internal/lifecycle/manager_parity_test.go deleted file mode 100644 index 146dcc16..00000000 --- a/backend/internal/lifecycle/manager_parity_test.go +++ /dev/null @@ -1,144 +0,0 @@ -package lifecycle - -import ( - "context" - "testing" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" - "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" -) - -// TestStoreParity is the key contract test from the plan: it drives the REAL -// Lifecycle Manager through identical operation sequences against the in-memory -// fakeStore (the authoritative store semantics) and the SQLite-backed Store, -// then asserts the resulting canonical lifecycle is byte-identical. If the -// SQLite adapter honored the port exactly, the two managers cannot diverge. -// -// Both stores are seeded the same way (via the public Upsert insert path, so -// both start at revision 1) — this makes revision numbers, not just states, -// directly comparable. 
-func TestStoreParity(t *testing.T) { - seed := lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive) - seed.Activity = domain.ActivitySubstate{State: domain.ActivityActive, LastActivityAt: t0, Source: domain.SourceNative} - - cases := []struct { - name string - ops []func(*Manager) error - }{ - { - name: "runtime dead then activity signal", - ops: []func(*Manager) error{ - func(m *Manager) error { - return m.ApplyRuntimeObservation(context.Background(), sid, ports.RuntimeFacts{ - RuntimeState: ports.RuntimeProbeDead, ProcessState: ports.ProcessProbeDead, ObservedAt: t0, - }) - }, - func(m *Manager) error { - return m.ApplyActivitySignal(context.Background(), sid, ports.ActivitySignal{ - State: ports.SignalValid, Activity: domain.ActivityActive, Timestamp: t0, Source: domain.SourceHook, - }) - }, - }, - }, - { - name: "scm pr open then changes requested", - ops: []func(*Manager) error{ - func(m *Manager) error { - return m.ApplySCMObservation(context.Background(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, PRNumber: 7, PRURL: "http://x/7", - }) - }, - }, - }, - { - name: "kill request terminates", - ops: []func(*Manager) error{ - func(m *Manager) error { - return m.OnKillRequested(context.Background(), sid, ports.KillReason{Kind: ports.KillManual, Detail: "x"}) - }, - }, - }, - } - - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - fakeMgr, fakeS := newManager() - sqlMgr, sqlS := newSQLiteManager(t) - - seedViaUpsert(t, fakeS, seed) - seedViaUpsert(t, sqlS, seed) - - for i, op := range tc.ops { - errF := op(fakeMgr) - errS := op(sqlMgr) - if (errF == nil) != (errS == nil) { - t.Fatalf("op %d error divergence: fake=%v sqlite=%v", i, errF, errS) - } - } - - fl, okF, _ := fakeS.Load(context.Background(), sid) - sl, okS, _ := sqlS.Load(context.Background(), sid) - if okF != okS { - t.Fatalf("presence divergence: fake=%v sqlite=%v", okF, okS) - } - assertLifecycleEqual(t, fl, sl) - }) - } -} - -func 
newSQLiteManager(t *testing.T) (*Manager, *sqlite.Store) { - t.Helper() - db, err := sqlite.Open(t.TempDir()) - if err != nil { - t.Fatalf("open sqlite: %v", err) - } - t.Cleanup(func() { db.Close() }) - store := sqlite.NewStore(db) - return New(store, &recordingNotifier{}, &recordingMessenger{}), store -} - -func seedViaUpsert(t *testing.T, store ports.LifecycleStore, l domain.CanonicalSessionLifecycle) { - t.Helper() - rec := domain.SessionRecord{ - ID: sid, - ProjectID: "proj", - Kind: domain.KindWorker, - CreatedAt: t0, - UpdatedAt: t0, - Lifecycle: l, - } - if err := store.Upsert(context.Background(), rec, ports.EventSessionCreated); err != nil { - t.Fatalf("seed upsert: %v", err) - } -} - -func assertLifecycleEqual(t *testing.T, a, b domain.CanonicalSessionLifecycle) { - t.Helper() - if a.Revision != b.Revision { - t.Errorf("revision: fake=%d sqlite=%d", a.Revision, b.Revision) - } - if a.Session != b.Session { - t.Errorf("session: fake=%+v sqlite=%+v", a.Session, b.Session) - } - if a.PR != b.PR { - t.Errorf("pr: fake=%+v sqlite=%+v", a.PR, b.PR) - } - if a.Runtime != b.Runtime { - t.Errorf("runtime: fake=%+v sqlite=%+v", a.Runtime, b.Runtime) - } - if a.Activity.State != b.Activity.State || a.Activity.Source != b.Activity.Source || - !a.Activity.LastActivityAt.Equal(b.Activity.LastActivityAt) { - t.Errorf("activity: fake=%+v sqlite=%+v", a.Activity, b.Activity) - } - switch { - case a.Detecting == nil && b.Detecting == nil: - case a.Detecting == nil || b.Detecting == nil: - t.Errorf("detecting presence: fake=%v sqlite=%v", a.Detecting, b.Detecting) - default: - if a.Detecting.Attempts != b.Detecting.Attempts || a.Detecting.EvidenceHash != b.Detecting.EvidenceHash || - !a.Detecting.StartedAt.Equal(b.Detecting.StartedAt) { - t.Errorf("detecting: fake=%+v sqlite=%+v", a.Detecting, b.Detecting) - } - } -} diff --git a/backend/internal/lifecycle/manager_test.go b/backend/internal/lifecycle/manager_test.go index 96557e8f..7843f8af 100644 --- 
a/backend/internal/lifecycle/manager_test.go +++ b/backend/internal/lifecycle/manager_test.go @@ -2,8 +2,8 @@ package lifecycle import ( "context" - "errors" - "sync" + "fmt" + "strings" "testing" "time" @@ -11,605 +11,361 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) -var t0 = time.Date(2026, 5, 26, 12, 0, 0, 0, time.UTC) - -const sid domain.SessionID = "s1" - -func newManager() (*Manager, *fakeStore) { - store := newFakeStore() - return New(store, &recordingNotifier{}, &recordingMessenger{}), store -} - -func mustLoad(t *testing.T, store *fakeStore) domain.CanonicalSessionLifecycle { - t.Helper() - l, ok, err := store.Load(context.Background(), sid) - if err != nil || !ok { - t.Fatalf("load: ok=%v err=%v", ok, err) - } - return l -} - -// ---- ApplyRuntimeObservation + #1 composition + #3 detecting clear ---- - -func TestApplyRuntimeObservation(t *testing.T) { - aliveProbe := ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeAlive, ProcessState: ports.ProcessProbeAlive, ObservedAt: t0} - failedProbe := ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeFailed, ProcessState: ports.ProcessProbeAlive, ObservedAt: t0} - deadProbe := ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeDead, ProcessState: ports.ProcessProbeDead, ObservedAt: t0} - - tests := []struct { - name string - seed domain.CanonicalSessionLifecycle - facts ports.RuntimeFacts - wantSession domain.SessionState - wantReason domain.SessionReason - wantRuntime domain.RuntimeState - wantDisplay domain.SessionStatus - wantDetecting bool // expect non-nil detecting memory persisted - }{ - { - name: "healthy probe must not clobber an activity-owned needs_input (#1)", - seed: lc(domain.SessionNeedsInput, domain.ReasonAwaitingUserInput, domain.RuntimeAlive), - facts: aliveProbe, - wantSession: domain.SessionNeedsInput, - wantReason: domain.ReasonAwaitingUserInput, - wantRuntime: domain.RuntimeAlive, - wantDisplay: domain.StatusNeedsInput, - wantDetecting: false, - }, - { - 
name: "healthy probe recovers a liveness-owned detecting -> working and clears memory (#1 + #3)", - seed: detectingLC(), - facts: aliveProbe, - wantSession: domain.SessionWorking, - wantReason: domain.ReasonTaskInProgress, - wantRuntime: domain.RuntimeAlive, - wantDisplay: domain.StatusWorking, - wantDetecting: false, - }, - { - name: "failed probe routes to detecting and records memory", - seed: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive), - facts: failedProbe, - wantSession: domain.SessionDetecting, - wantReason: domain.ReasonProbeFailure, - wantRuntime: domain.RuntimeProbeFailed, - wantDisplay: domain.StatusDetecting, - wantDetecting: true, - }, - { - name: "dead+dead with no recent activity concludes killed and clears detecting (#3)", - seed: detectingLC(), - facts: deadProbe, - wantSession: domain.SessionTerminated, - wantReason: domain.ReasonRuntimeLost, - wantRuntime: domain.RuntimeExited, - wantDisplay: domain.StatusKilled, - wantDetecting: false, - }, - } +var ctx = context.Background() - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - mgr, store := newManager() - store.seed(sid, tt.seed) +// ---- fakes ---- - if err := mgr.ApplyRuntimeObservation(context.Background(), sid, tt.facts); err != nil { - t.Fatalf("apply: %v", err) - } +// fakeStore is a mini SessionStore + PRWriter: it derives PRFacts and recent +// check statuses from what the engine writes, so PR-reaction tests exercise the +// write path and the read-back together. 
+type fakeStore struct { + sessions map[domain.SessionID]domain.SessionRecord + pr map[domain.SessionID]ports.PRRow + comments map[string][]ports.PRComment + checks []ports.PRCheckRow + num int +} - l := mustLoad(t, store) - if l.Session.State != tt.wantSession || l.Session.Reason != tt.wantReason { - t.Errorf("session = %v/%v, want %v/%v", l.Session.State, l.Session.Reason, tt.wantSession, tt.wantReason) - } - if l.Runtime.State != tt.wantRuntime { - t.Errorf("runtime = %v, want %v", l.Runtime.State, tt.wantRuntime) - } - if got := domain.DeriveLegacyStatus(l); got != tt.wantDisplay { - t.Errorf("display = %v, want %v", got, tt.wantDisplay) - } - if (l.Detecting != nil) != tt.wantDetecting { - t.Errorf("detecting present = %v, want %v (%+v)", l.Detecting != nil, tt.wantDetecting, l.Detecting) - } - }) +func newFakeStore() *fakeStore { + return &fakeStore{ + sessions: map[domain.SessionID]domain.SessionRecord{}, + pr: map[domain.SessionID]ports.PRRow{}, + comments: map[string][]ports.PRComment{}, } } -func TestApplyRuntimeObservation_NoRecordIsNoOp(t *testing.T) { - mgr, store := newManager() - if err := mgr.ApplyRuntimeObservation(context.Background(), sid, ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeAlive, ProcessState: ports.ProcessProbeAlive, ObservedAt: t0}); err != nil { - t.Fatalf("apply: %v", err) - } - if _, ok, _ := store.Load(context.Background(), sid); ok { - t.Error("a probe for an unseeded session must not fabricate a record") +func (f *fakeStore) CreateSession(_ context.Context, rec domain.SessionRecord) (domain.SessionRecord, error) { + f.num++ + rec.ID = domain.SessionID(fmt.Sprintf("%s-%d", rec.ProjectID, f.num)) + f.sessions[rec.ID] = rec + return rec, nil +} +func (f *fakeStore) UpdateSession(_ context.Context, rec domain.SessionRecord) error { + f.sessions[rec.ID] = rec + return nil +} +func (f *fakeStore) GetSession(_ context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) { + r, ok := f.sessions[id] + return r, ok, 
nil +} +func (f *fakeStore) ListSessions(_ context.Context, p domain.ProjectID) ([]domain.SessionRecord, error) { + var out []domain.SessionRecord + for _, r := range f.sessions { + if r.ProjectID == p { + out = append(out, r) + } } + return out, nil } - -func TestApplyRuntimeObservation_DoesNotResurrectTerminal(t *testing.T) { - mgr, store := newManager() - store.seed(sid, lc(domain.SessionTerminated, domain.ReasonManuallyKilled, domain.RuntimeExited)) - - // A failed probe would normally route to detecting, but a terminal session - // must not be reopened by an observation (only an explicit Restore does). - if err := mgr.ApplyRuntimeObservation(context.Background(), sid, ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeFailed, ProcessState: ports.ProcessProbeAlive, ObservedAt: t0}); err != nil { - t.Fatalf("apply: %v", err) +func (f *fakeStore) ListAllSessions(_ context.Context) ([]domain.SessionRecord, error) { + out := make([]domain.SessionRecord, 0, len(f.sessions)) + for _, r := range f.sessions { + out = append(out, r) } - - l := mustLoad(t, store) - if l.Session.State != domain.SessionTerminated || l.Session.Reason != domain.ReasonManuallyKilled { - t.Errorf("session = %v/%v, want terminated/manually_killed (no resurrection)", l.Session.State, l.Session.Reason) + return out, nil +} +func (f *fakeStore) PRFactsForSession(_ context.Context, id domain.SessionID) (domain.PRFacts, error) { + r, ok := f.pr[id] + if !ok { + return domain.PRFacts{}, nil + } + facts := domain.PRFacts{ + URL: r.URL, Number: r.Number, Exists: true, + Draft: r.Draft, Merged: r.Merged, Closed: r.Closed, + CI: r.CI, Review: r.Review, Mergeability: r.Mergeability, + } + for _, c := range f.comments[r.URL] { + if !c.Resolved { + facts.ReviewComments = true + break + } } - if l.Detecting != nil { - t.Errorf("terminal session must not gain detecting memory, got %+v", l.Detecting) + return facts, nil +} +func (f *fakeStore) UpsertPR(_ context.Context, r ports.PRRow) error { + 
f.pr[domain.SessionID(r.SessionID)] = r + return nil +} +func (f *fakeStore) RecordCheck(_ context.Context, r ports.PRCheckRow) error { + f.checks = append(f.checks, r) + return nil +} +func (f *fakeStore) RecentCheckStatuses(_ context.Context, url, name string, limit int) ([]string, error) { + var out []string + for i := len(f.checks) - 1; i >= 0 && len(out) < limit; i-- { + if f.checks[i].PRURL == url && f.checks[i].Name == name { + out = append(out, f.checks[i].Status) + } } + return out, nil +} +func (f *fakeStore) ReplacePRComments(_ context.Context, url string, cs []ports.PRComment) error { + f.comments[url] = cs + return nil } -// ---- ApplyActivitySignal ---- +type fakeNotifier struct{ events []ports.Event } -func TestApplyActivitySignal(t *testing.T) { - tests := []struct { - name string - seed domain.CanonicalSessionLifecycle - signal ports.ActivitySignal - wantSession domain.SessionState - wantReason domain.SessionReason - checkReason bool - wantActivity domain.ActivityState - wantChanged bool - }{ - { - name: "valid waiting_input maps to needs_input", - seed: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive), - signal: ports.ActivitySignal{State: ports.SignalValid, Activity: domain.ActivityWaitingInput, Timestamp: t0, Source: domain.SourceHook}, - wantSession: domain.SessionNeedsInput, - wantActivity: domain.ActivityWaitingInput, - wantChanged: true, - }, - { - name: "valid active recovers needs_input -> working", - seed: lc(domain.SessionNeedsInput, domain.ReasonAwaitingUserInput, domain.RuntimeAlive), - signal: ports.ActivitySignal{State: ports.SignalValid, Activity: domain.ActivityActive, Timestamp: t0, Source: domain.SourceHook}, - wantSession: domain.SessionWorking, - wantActivity: domain.ActivityActive, - wantChanged: true, - }, - { - name: "valid idle maps to idle with a neutral reason", - seed: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive), - signal: ports.ActivitySignal{State: 
ports.SignalValid, Activity: domain.ActivityIdle, Timestamp: t0, Source: domain.SourceHook}, - wantSession: domain.SessionIdle, - wantReason: "", - checkReason: true, - wantActivity: domain.ActivityIdle, - wantChanged: true, - }, - { - name: "low-confidence signal is dropped (no idleness inferred)", - seed: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive), - signal: ports.ActivitySignal{State: ports.SignalProbeFailure, Activity: domain.ActivityIdle, Timestamp: t0, Source: domain.SourceHook}, - wantSession: domain.SessionWorking, - wantChanged: false, - }, - { - name: "valid activity resolves a detecting session (proof of life)", - seed: detectingLC(), - signal: ports.ActivitySignal{State: ports.SignalValid, Activity: domain.ActivityActive, Timestamp: t0, Source: domain.SourceHook}, - wantSession: domain.SessionWorking, - wantActivity: domain.ActivityActive, - wantChanged: true, - }, +func (f *fakeNotifier) Notify(_ context.Context, e ports.Event) error { + f.events = append(f.events, e) + return nil +} +func (f *fakeNotifier) last() string { + if len(f.events) == 0 { + return "" } + return f.events[len(f.events)-1].Type +} - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - mgr, store := newManager() - store.seed(sid, tt.seed) - - if err := mgr.ApplyActivitySignal(context.Background(), sid, tt.signal); err != nil { - t.Fatalf("apply: %v", err) - } - - l := mustLoad(t, store) - if l.Session.State != tt.wantSession { - t.Errorf("session = %v, want %v", l.Session.State, tt.wantSession) - } - if tt.checkReason && l.Session.Reason != tt.wantReason { - t.Errorf("session reason = %q, want %q", l.Session.Reason, tt.wantReason) - } - if tt.wantChanged && l.Revision != 1 { - t.Errorf("revision = %d, want 1 (expected a write)", l.Revision) - } - if !tt.wantChanged && l.Revision != 0 { - t.Errorf("revision = %d, want 0 (expected a no-op)", l.Revision) - } - if tt.wantChanged && tt.wantActivity != "" && l.Activity.State != 
tt.wantActivity { - t.Errorf("activity = %v, want %v", l.Activity.State, tt.wantActivity) - } - if tt.name == "valid activity resolves a detecting session (proof of life)" && l.Detecting != nil { - t.Errorf("resolving detecting must clear the quarantine memory, got %+v", l.Detecting) - } - }) - } -} - -// ---- ApplySCMObservation ---- - -func TestApplySCMObservation(t *testing.T) { - t.Run("failed fetch is a no-op (failed probe != no PR)", func(t *testing.T) { - mgr, store := newManager() - store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) - if err := mgr.ApplySCMObservation(context.Background(), sid, ports.SCMFacts{Fetched: false, PRState: domain.PROpen}); err != nil { - t.Fatalf("apply: %v", err) - } - if l := mustLoad(t, store); l.Revision != 0 || l.PR.State != "" { - t.Errorf("expected no-op, got revision=%d pr=%v", l.Revision, l.PR.State) - } - }) +type fakeMessenger struct{ msgs []string } - t.Run("open PR writes only the PR axis; session stays activity-owned", func(t *testing.T) { - mgr, store := newManager() - store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) - f := ports.SCMFacts{Fetched: true, PRState: domain.PROpen, CISummary: ports.CIFailing, PRNumber: 12, PRURL: "https://x/12"} - if err := mgr.ApplySCMObservation(context.Background(), sid, f); err != nil { - t.Fatalf("apply: %v", err) - } - l := mustLoad(t, store) - if l.PR.State != domain.PROpen || l.PR.Reason != domain.PRReasonCIFailing || l.PR.Number != 12 { - t.Errorf("pr = %+v, want open/ci_failing/#12", l.PR) - } - if l.Session.State != domain.SessionWorking { - t.Errorf("session = %v, want working (untouched)", l.Session.State) - } - if got := domain.DeriveLegacyStatus(l); got != domain.StatusCIFailed { - t.Errorf("display = %v, want ci_failed", got) - } - }) - - t.Run("draft PR writes draft or ci_failed without review states", func(t *testing.T) { - cases := []struct { - name string - facts ports.SCMFacts - 
wantReason domain.PRReason - wantStatus domain.SessionStatus - }{ - {"draft with failing CI", ports.SCMFacts{Fetched: true, PRState: domain.PRDraft, CISummary: ports.CIFailing}, domain.PRReasonCIFailing, domain.StatusCIFailed}, - {"draft via bool with open state", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, Draft: true}, domain.PRReasonInProgress, domain.StatusDraft}, - {"draft via bool with failing CI", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, Draft: true, CISummary: ports.CIFailing}, domain.PRReasonCIFailing, domain.StatusCIFailed}, - {"draft ignores review and merge facts", ports.SCMFacts{Fetched: true, PRState: domain.PRDraft, ReviewDecision: ports.ReviewApproved, Mergeability: ports.Mergeability{Mergeable: true}}, domain.PRReasonInProgress, domain.StatusDraft}, - } - for _, c := range cases { - t.Run(c.name, func(t *testing.T) { - mgr, store := newManager() - wantSession := domain.SessionSubstate{State: domain.SessionWorking, Reason: domain.ReasonTaskInProgress} - store.seed(sid, lc(wantSession.State, wantSession.Reason, domain.RuntimeAlive)) - if err := mgr.ApplySCMObservation(context.Background(), sid, c.facts); err != nil { - t.Fatalf("apply: %v", err) - } - l := mustLoad(t, store) - if l.PR.State != domain.PRDraft || l.PR.Reason != c.wantReason { - t.Errorf("pr = %v/%v, want draft/%v", l.PR.State, l.PR.Reason, c.wantReason) - } - if l.Session != wantSession { - t.Errorf("session = %+v, want untouched %+v", l.Session, wantSession) - } - if got := domain.DeriveLegacyStatus(l); got != c.wantStatus { - t.Errorf("display = %v, want %v", got, c.wantStatus) - } - }) - } - }) +func (f *fakeMessenger) Send(_ context.Context, _ domain.SessionID, m string) error { + f.msgs = append(f.msgs, m) + return nil +} - t.Run("merged PR parks the session and displays merged", func(t *testing.T) { - mgr, store := newManager() - seed := lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive) - seed.PR = domain.PRSubstate{State: 
domain.PROpen, Reason: domain.PRReasonInProgress, Number: 12} - store.seed(sid, seed) - f := ports.SCMFacts{Fetched: true, PRState: domain.PRMerged, PRNumber: 12} - if err := mgr.ApplySCMObservation(context.Background(), sid, f); err != nil { - t.Fatalf("apply: %v", err) - } - l := mustLoad(t, store) - if l.PR.State != domain.PRMerged || l.Session.Reason != domain.ReasonMergedWaitingDecision { - t.Errorf("got pr=%v session=%v, want merged + merged_waiting_decision", l.PR.State, l.Session.Reason) - } - if got := domain.DeriveLegacyStatus(l); got != domain.StatusMerged { - t.Errorf("display = %v, want merged", got) - } - }) +func newManager() (*Manager, *fakeStore, *fakeNotifier, *fakeMessenger) { + st, n, msg := newFakeStore(), &fakeNotifier{}, &fakeMessenger{} + return New(st, st, n, msg), st, n, msg +} - t.Run("open-PR review branches map to the PR axis", func(t *testing.T) { - cases := []struct { - name string - facts ports.SCMFacts - wantReason domain.PRReason - wantStatus domain.SessionStatus - }{ - {"changes requested", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewChangesRequested}, domain.PRReasonChangesRequested, domain.StatusChangesRequested}, - {"pending human comments", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, PendingComments: []ports.ReviewComment{{Author: "human", Body: "fix"}}}, domain.PRReasonChangesRequested, domain.StatusChangesRequested}, - {"pending bot comments", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, PendingComments: []ports.ReviewComment{{Author: "bot", Body: "fix", IsBot: true}}}, domain.PRReasonBotComments, domain.StatusChangesRequested}, - {"merge conflicts", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, Mergeability: ports.Mergeability{CIPassing: true, Approved: true, NoConflicts: false, Blockers: []string{"merge conflicts"}}}, domain.PRReasonMergeConflicts, domain.StatusPROpen}, - {"approved + mergeable", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, 
ReviewDecision: ports.ReviewApproved, Mergeability: ports.Mergeability{Mergeable: true}}, domain.PRReasonMergeReady, domain.StatusMergeable}, - {"review pending", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewPending}, domain.PRReasonReviewPending, domain.StatusReviewPending}, - } - for _, c := range cases { - t.Run(c.name, func(t *testing.T) { - mgr, store := newManager() - wantSession := domain.SessionSubstate{State: domain.SessionWorking, Reason: domain.ReasonTaskInProgress} - store.seed(sid, lc(wantSession.State, wantSession.Reason, domain.RuntimeAlive)) - if err := mgr.ApplySCMObservation(context.Background(), sid, c.facts); err != nil { - t.Fatalf("apply: %v", err) - } - l := mustLoad(t, store) - if l.PR.State != domain.PROpen || l.PR.Reason != c.wantReason { - t.Errorf("pr = %v/%v, want open/%v", l.PR.State, l.PR.Reason, c.wantReason) - } - if got := domain.DeriveLegacyStatus(l); got != c.wantStatus { - t.Errorf("display = %v, want %v", got, c.wantStatus) - } - }) - } - }) +func working(id domain.SessionID) domain.SessionRecord { + return domain.SessionRecord{ + ID: id, ProjectID: "mer", + Lifecycle: domain.CanonicalSessionLifecycle{ + Version: domain.LifecycleVersion, + Session: domain.SessionSubstate{State: domain.SessionWorking}, + IsAlive: true, + }, + } +} - t.Run("no PR is a no-op in split A", func(t *testing.T) { - mgr, store := newManager() - store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) - if err := mgr.ApplySCMObservation(context.Background(), sid, ports.SCMFacts{Fetched: true, PRState: domain.PRNone}); err != nil { - t.Fatalf("apply: %v", err) - } - if l := mustLoad(t, store); l.Revision != 0 { - t.Errorf("expected no-op, got revision=%d", l.Revision) - } - }) +func openPR(o ports.PRObservation) ports.PRObservation { + o.Fetched, o.URL, o.Number = true, "https://example/pr/1", 1 + return o } -// ---- mutation outcomes ---- +// ---- runtime observations ---- -func 
TestOnSpawnCompleted(t *testing.T) { - mgr, store := newManager() - store.seed(sid, lc(domain.SessionNotStarted, domain.ReasonSpawnRequested, domain.RuntimeUnknown)) +func TestRuntimeObservation_InferredDeath(t *testing.T) { + m, st, n, _ := newManager() + st.sessions["mer-1"] = working("mer-1") - out := ports.SpawnOutcome{ - Branch: "feat/x", - WorkspacePath: "/w/x", - RuntimeHandle: ports.RuntimeHandle{ID: "tmux:1", RuntimeName: "tmux"}, - AgentSessionID: "agent-1", + if err := m.ApplyRuntimeObservation(ctx, "mer-1", ports.RuntimeFacts{Runtime: ports.ProbeDead, Process: ports.ProbeDead}); err != nil { + t.Fatal(err) + } + got := st.sessions["mer-1"].Lifecycle + if got.Session.State != domain.SessionTerminated || got.TerminationReason != domain.TermRuntimeLost || got.IsAlive { + t.Fatalf("want terminated/runtime_lost/dead, got %+v", got) } - if err := mgr.OnSpawnCompleted(context.Background(), sid, out); err != nil { - t.Fatalf("apply: %v", err) + if n.last() != "reaction.agent-exited" { + t.Fatalf("want agent-exited notify, got %q", n.last()) } +} + +func TestRuntimeObservation_FailedProbeQuarantines(t *testing.T) { + m, st, _, _ := newManager() + st.sessions["mer-1"] = working("mer-1") - l := mustLoad(t, store) - if l.Runtime.State != domain.RuntimeAlive { - t.Errorf("runtime = %v, want alive", l.Runtime.State) + if err := m.ApplyRuntimeObservation(ctx, "mer-1", ports.RuntimeFacts{Runtime: ports.ProbeFailed, Process: ports.ProbeFailed}); err != nil { + t.Fatal(err) } - if l.Session.State != domain.SessionNotStarted { - t.Errorf("session = %v, want not_started (spawn does not assert acknowledgement)", l.Session.State) + got := st.sessions["mer-1"].Lifecycle + if got.Session.State != domain.SessionDetecting || !got.IsAlive || got.Detecting == nil { + t.Fatalf("failed probe should quarantine alive, got %+v", got) } - if got := domain.DeriveLegacyStatus(l); got != domain.StatusSpawning { - t.Errorf("display = %v, want spawning", got) +} + +func 
TestRuntimeObservation_RecoversDetecting(t *testing.T) { + m, st, _, _ := newManager() + rec := working("mer-1") + rec.Lifecycle.Session.State = domain.SessionDetecting + rec.Lifecycle.Detecting = &domain.DetectingState{Attempts: 1} + st.sessions["mer-1"] = rec + + if err := m.ApplyRuntimeObservation(ctx, "mer-1", ports.RuntimeFacts{Runtime: ports.ProbeAlive, Process: ports.ProbeAlive}); err != nil { + t.Fatal(err) } - meta, _ := store.GetMetadata(context.Background(), sid) - if meta.Branch != "feat/x" || meta.AgentSessionID != "agent-1" || meta.RuntimeName != "tmux" { - t.Errorf("metadata not recorded: %+v", meta) + got := st.sessions["mer-1"].Lifecycle + if got.Session.State != domain.SessionWorking || got.Detecting != nil { + t.Fatalf("healthy probe should recover to working, got %+v", got) } } -func TestOnSpawnInitiated_ActiveSessionRejected(t *testing.T) { - mgr, store := newManager() - store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) +// ---- activity signals ---- - err := mgr.OnSpawnInitiated(context.Background(), domain.SessionRecord{ - ID: sid, - ProjectID: domain.ProjectID("proj"), - Lifecycle: lc(domain.SessionNotStarted, domain.ReasonSpawnRequested, domain.RuntimeUnknown), - }) - if err == nil { - t.Fatal("OnSpawnInitiated should reject a non-terminal row on top of an active session") - } +func TestActivity_WaitingInputPagesHuman(t *testing.T) { + m, st, n, _ := newManager() + st.sessions["mer-1"] = working("mer-1") - got := mustLoad(t, store) - if got.Session.State != domain.SessionWorking || got.Revision != 0 { - t.Fatalf("active row should be unchanged, got %+v", got) + if err := m.ApplyActivitySignal(ctx, "mer-1", ports.ActivitySignal{Valid: true, State: domain.ActivityWaitingInput, Timestamp: time.Now()}); err != nil { + t.Fatal(err) + } + if st.sessions["mer-1"].Lifecycle.Session.State != domain.SessionNeedsInput { + t.Fatalf("want needs_input, got %v", st.sessions["mer-1"].Lifecycle.Session.State) + } + 
if n.last() != "reaction.agent-needs-input" { + t.Fatalf("want needs-input notify, got %q", n.last()) } } -func TestOnKillRequested(t *testing.T) { - tests := []struct { - name string - kind ports.LifecycleKillReason - wantReason domain.SessionReason - wantRuntime domain.RuntimeReason - wantDisplay domain.SessionStatus - }{ - {"manual", ports.KillManual, domain.ReasonManuallyKilled, domain.RuntimeReasonManualKillRequested, domain.StatusKilled}, - {"cleanup", ports.KillCleanup, domain.ReasonAutoCleanup, domain.RuntimeReasonAutoCleanup, domain.StatusCleanup}, - {"error", ports.KillError, domain.ReasonErrorInProcess, domain.RuntimeReasonProbeError, domain.StatusErrored}, +func TestActivity_InvalidIsIgnored(t *testing.T) { + m, st, _, _ := newManager() + st.sessions["mer-1"] = working("mer-1") + before := st.sessions["mer-1"] + + if err := m.ApplyActivitySignal(ctx, "mer-1", ports.ActivitySignal{Valid: false, State: domain.ActivityIdle}); err != nil { + t.Fatal(err) + } + if st.sessions["mer-1"] != before { + t.Fatal("invalid signal must not mutate the session") } +} - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - mgr, store := newManager() - store.seed(sid, detectingLC()) +// ---- PR observations ---- - if err := mgr.OnKillRequested(context.Background(), sid, ports.KillReason{Kind: tt.kind, Detail: "x"}); err != nil { - t.Fatalf("apply: %v", err) - } +func TestPR_CIFailingNudgesAgentWithLogs(t *testing.T) { + m, st, _, msg := newManager() + st.sessions["mer-1"] = working("mer-1") - l := mustLoad(t, store) - if l.Session.State != domain.SessionTerminated || l.Session.Reason != tt.wantReason { - t.Errorf("session = %v/%v, want terminated/%v", l.Session.State, l.Session.Reason, tt.wantReason) - } - if l.Runtime.Reason != tt.wantRuntime { - t.Errorf("runtime reason = %v, want %v", l.Runtime.Reason, tt.wantRuntime) - } - if l.Detecting != nil { - t.Errorf("kill must clear detecting memory, got %+v", l.Detecting) - } - if got := 
domain.DeriveLegacyStatus(l); got != tt.wantDisplay { - t.Errorf("display = %v, want %v", got, tt.wantDisplay) - } - }) + o := openPR(ports.PRObservation{CI: domain.CIFailing, Checks: []ports.PRCheckRow{{Name: "build", CommitHash: "c1", Status: "failed", LogTail: "boom"}}}) + if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { + t.Fatal(err) + } + if len(msg.msgs) != 1 || !strings.Contains(msg.msgs[0], "boom") { + t.Fatalf("want one CI nudge with log tail, got %v", msg.msgs) } } -func TestOnSpawnCompleted_UnseededErrors(t *testing.T) { - mgr, store := newManager() - err := mgr.OnSpawnCompleted(context.Background(), sid, ports.SpawnOutcome{Branch: "x"}) - if err == nil { - t.Error("OnSpawnCompleted for an unseeded session must error, not fabricate a record") +func TestPR_CIBrakeEscalatesAfterThreeFails(t *testing.T) { + m, st, n, msg := newManager() + st.sessions["mer-1"] = working("mer-1") + + for _, commit := range []string{"c1", "c2", "c3"} { + o := openPR(ports.PRObservation{CI: domain.CIFailing, Checks: []ports.PRCheckRow{{Name: "build", CommitHash: commit, Status: "failed", LogTail: "boom"}}}) + if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { + t.Fatal(err) + } } - if _, ok, _ := store.Load(context.Background(), sid); ok { - t.Error("no record should have been created") + if len(msg.msgs) != 2 { + t.Fatalf("want 2 nudges then escalate, got %d nudges", len(msg.msgs)) + } + if n.last() != "reaction.escalated" { + t.Fatalf("3rd failure should escalate, got %q", n.last()) } } -func TestOnKillRequested_UnseededIsNoOp(t *testing.T) { - mgr, store := newManager() - if err := mgr.OnKillRequested(context.Background(), sid, ports.KillReason{Kind: ports.KillManual}); err != nil { - t.Fatalf("kill of unknown session should be a benign no-op, got %v", err) +func TestPR_ReviewCommentsInjectedRegardlessOfAuthor(t *testing.T) { + m, st, _, msg := newManager() + st.sessions["mer-1"] = working("mer-1") + + o := openPR(ports.PRObservation{ + Review: 
domain.ReviewChangesRequest, + Comments: []ports.PRComment{{ID: "1", Author: "greptileai", Body: "use a constant here"}}, + }) + if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { + t.Fatal(err) } - if _, ok, _ := store.Load(context.Background(), sid); ok { - t.Error("killing an unknown session must not fabricate a terminal record") + if len(msg.msgs) != 1 || !strings.Contains(msg.msgs[0], "use a constant here") { + t.Fatalf("review feedback should be injected verbatim, got %v", msg.msgs) } } -// ---- fake store contract ---- +func TestPR_ApprovedAndGreenNotifies(t *testing.T) { + m, st, n, _ := newManager() + st.sessions["mer-1"] = working("mer-1") -func TestFakeStoreUpsertFullRow(t *testing.T) { - store := newFakeStore() - store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) - - rec, ok, err := store.Get(context.Background(), sid) - if err != nil || !ok { - t.Fatalf("seeded record missing: ok=%v err=%v", ok, err) + o := openPR(ports.PRObservation{Review: domain.ReviewApproved, Mergeability: domain.MergeMergeable}) + if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { + t.Fatal(err) } - rec.Lifecycle.Session = domain.SessionSubstate{State: domain.SessionIdle, Reason: domain.ReasonResearchComplete} - rec.Lifecycle.Runtime = domain.RuntimeSubstate{State: domain.RuntimeExited} - if err := store.Upsert(context.Background(), rec, ports.EventSessionStateChanged); err != nil { - t.Fatalf("upsert: %v", err) + if n.last() != "reaction.approved-and-green" { + t.Fatalf("want approved-and-green, got %q", n.last()) } +} + +func TestPR_MergeTerminatesSession(t *testing.T) { + m, st, n, _ := newManager() + st.sessions["mer-1"] = working("mer-1") - got, _, _ := store.Get(context.Background(), sid) - if got.Lifecycle.Session.State != domain.SessionIdle || got.Lifecycle.Runtime.State != domain.RuntimeExited { - t.Fatalf("upsert should replace the full canonical row, got %+v", got.Lifecycle) + o := 
openPR(ports.PRObservation{Merged: true}) + if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { + t.Fatal(err) } - if got.Lifecycle.Revision != 1 { - t.Fatalf("upsert should bump revision inside the store, got %d want 1", got.Lifecycle.Revision) + got := st.sessions["mer-1"].Lifecycle + if got.Session.State != domain.SessionTerminated || got.TerminationReason != domain.TermPRMerged { + t.Fatalf("merge should terminate with pr_merged, got %+v", got) + } + if n.last() != "reaction.pr-merged" { + t.Fatalf("want pr-merged notify, got %q", n.last()) } } -// ---- per-session serialisation under the race detector ---- - -func TestPerSessionSerialization(t *testing.T) { - mgr, store := newManager() - store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) +func TestPR_FailedFetchIsDropped(t *testing.T) { + m, st, _, msg := newManager() + st.sessions["mer-1"] = working("mer-1") - const n = 50 - var wg sync.WaitGroup - wg.Add(n) - for i := 0; i < n; i++ { - go func(i int) { - defer wg.Done() - _ = mgr.ApplyActivitySignal(context.Background(), sid, ports.ActivitySignal{ - State: ports.SignalValid, - Activity: domain.ActivityActive, - Timestamp: t0.Add(time.Duration(i) * time.Second), - Source: domain.SourceHook, - }) - }(i) + if err := m.ApplyPRObservation(ctx, "mer-1", ports.PRObservation{Fetched: false, CI: domain.CIFailing}); err != nil { + t.Fatal(err) } - wg.Wait() - - // Each goroutine writes a distinct LastActivityAt, so every call is a real - // change; with correct serialisation all n land without a lost update. 
- if l := mustLoad(t, store); l.Revision != n { - t.Errorf("revision = %d, want %d (lost update under concurrency)", l.Revision, n) + if len(msg.msgs) != 0 || len(st.pr) != 0 { + t.Fatal("a failed fetch must write nothing and fire nothing") } } -// ---- RunningSessions (reaper poll-set) ---- +// ---- explicit kill ---- -func TestRunningSessions_NoListerWired_ReturnsEmpty(t *testing.T) { - m, _ := newManager() - got, err := m.RunningSessions(context.Background()) - if err != nil { - t.Fatalf("RunningSessions: %v", err) +func TestKill_TerminatesWithoutReacting(t *testing.T) { + m, st, n, _ := newManager() + st.sessions["mer-1"] = working("mer-1") + + if err := m.OnKillRequested(ctx, "mer-1", domain.TermManuallyKilled); err != nil { + t.Fatal(err) + } + got := st.sessions["mer-1"].Lifecycle + if got.Session.State != domain.SessionTerminated || got.TerminationReason != domain.TermManuallyKilled || got.IsAlive { + t.Fatalf("want terminated/manually_killed/dead, got %+v", got) } - if len(got) != 0 { - t.Fatalf("expected empty slice when no lister wired, got %d records", len(got)) + if len(n.events) != 0 { + t.Fatal("an explicit kill must not fire a reaction") } } -func TestRunningSessions_ListerErrorPropagates(t *testing.T) { - m, _ := newManager() - wantErr := errors.New("boom") - m.WithSessionLister(func(_ context.Context) ([]domain.SessionRecord, error) { - return nil, wantErr - }) - _, err := m.RunningSessions(context.Background()) - if !errors.Is(err, wantErr) { - t.Fatalf("expected lister error to propagate, got %v", err) - } -} - -// TestRunningSessions_FilterIncludesProbableExcludesTerminal locks in the -// reaper poll-set predicate. The bug we are guarding against is filtering to -// "runtime.State == RuntimeAlive": detecting sessions (RuntimeMissing / -// RuntimeProbeFailed) would be silently parked, breaking the probe-driven -// recovery path proved by manager_test.go:59 and the dead+dead -> killed path -// proved by manager_test.go:79. 
-func TestRunningSessions_FilterIncludesProbableExcludesTerminal(t *testing.T) { - m, _ := newManager() - records := []domain.SessionRecord{ - {ID: "working-alive", Lifecycle: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)}, - {ID: "detecting-probefailed", Lifecycle: lc(domain.SessionDetecting, domain.ReasonProbeFailure, domain.RuntimeProbeFailed)}, - {ID: "detecting-missing", Lifecycle: lc(domain.SessionDetecting, domain.ReasonRuntimeLost, domain.RuntimeMissing)}, - {ID: "idle-alive", Lifecycle: lc(domain.SessionIdle, domain.ReasonResearchComplete, domain.RuntimeAlive)}, - {ID: "needs-input-alive", Lifecycle: lc(domain.SessionNeedsInput, domain.ReasonAwaitingUserInput, domain.RuntimeAlive)}, - {ID: "not-started", Lifecycle: lc(domain.SessionNotStarted, domain.ReasonSpawnRequested, domain.RuntimeUnknown)}, - {ID: "terminated", Lifecycle: lc(domain.SessionTerminated, domain.ReasonManuallyKilled, domain.RuntimeExited)}, - {ID: "done", Lifecycle: lc(domain.SessionDone, domain.ReasonPRMerged, domain.RuntimeExited)}, - } - m.WithSessionLister(func(_ context.Context) ([]domain.SessionRecord, error) { - return records, nil - }) +// ---- duration escalation ---- - got, err := m.RunningSessions(context.Background()) - if err != nil { - t.Fatalf("RunningSessions: %v", err) +func TestTickEscalations_DurationPagesHuman(t *testing.T) { + m, st, n, msg := newManager() + now := time.Now() + m.clock = func() time.Time { return now } + st.sessions["mer-1"] = working("mer-1") + + o := openPR(ports.PRObservation{Mergeability: domain.MergeConflicting}) + if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { + t.Fatal(err) } - gotIDs := map[domain.SessionID]bool{} - for _, r := range got { - gotIDs[r.ID] = true + if len(msg.msgs) != 1 { + t.Fatalf("merge-conflict should nudge once, got %d", len(msg.msgs)) } - wantIncluded := []domain.SessionID{ - "working-alive", "detecting-probefailed", "detecting-missing", - "idle-alive", "needs-input-alive", 
"not-started", + if err := m.TickEscalations(ctx, now.Add(16*time.Minute)); err != nil { + t.Fatal(err) } - for _, id := range wantIncluded { - if !gotIDs[id] { - t.Errorf("expected %q in poll set, missing", id) - } - } - wantExcluded := []domain.SessionID{"terminated", "done"} - for _, id := range wantExcluded { - if gotIDs[id] { - t.Errorf("expected %q NOT in poll set, found", id) - } + if n.last() != "reaction.escalated" { + t.Fatalf("unaddressed conflict should escalate after 15m, got %q", n.last()) } } -// ---- helpers ---- +func TestRunningSessions_ExcludesTerminal(t *testing.T) { + m, st, _, _ := newManager() + st.sessions["mer-1"] = working("mer-1") + dead := working("mer-2") + dead.Lifecycle.Session.State = domain.SessionTerminated + st.sessions["mer-2"] = dead -func lc(state domain.SessionState, reason domain.SessionReason, rt domain.RuntimeState) domain.CanonicalSessionLifecycle { - return domain.CanonicalSessionLifecycle{ - Version: domain.LifecycleVersion, - Session: domain.SessionSubstate{State: state, Reason: reason}, - Runtime: domain.RuntimeSubstate{State: rt}, + got, err := m.RunningSessions(ctx) + if err != nil { + t.Fatal(err) + } + if len(got) != 1 || got[0].ID != "mer-1" { + t.Fatalf("want only the live session, got %+v", got) } -} - -func detectingLC() domain.CanonicalSessionLifecycle { - l := lc(domain.SessionDetecting, domain.ReasonRuntimeLost, domain.RuntimeMissing) - l.Detecting = &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: "abc"} - return l } diff --git a/backend/internal/lifecycle/reaction_durability_test.go b/backend/internal/lifecycle/reaction_durability_test.go deleted file mode 100644 index 1866c8c9..00000000 --- a/backend/internal/lifecycle/reaction_durability_test.go +++ /dev/null @@ -1,140 +0,0 @@ -package lifecycle - -import ( - "context" - "testing" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" - 
"github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" -) - -// reactionStoreAdapter bridges the concrete *sqlite.Store to the lifecycle -// package's ReactionStore interface (string/row types <-> domain types). This is -// the same glue the composition root installs. -type reactionStoreAdapter struct{ s *sqlite.Store } - -func (a reactionStoreAdapter) LoadReactionTrackers(ctx context.Context) ([]PersistedTracker, error) { - rows, err := a.s.ListReactionTrackers(ctx) - if err != nil { - return nil, err - } - out := make([]PersistedTracker, len(rows)) - for i, r := range rows { - out[i] = PersistedTracker{ - SessionID: domain.SessionID(r.SessionID), - Key: r.ReactionKey, - Attempts: r.Attempts, - Escalated: r.Escalated, - FirstAttemptAt: r.FirstAttemptAt, - ProjectID: domain.ProjectID(r.ProjectID), - } - } - return out, nil -} - -func (a reactionStoreAdapter) SaveReactionTracker(ctx context.Context, t PersistedTracker) error { - return a.s.SaveReactionTracker(ctx, sqlite.ReactionTrackerRow{ - SessionID: string(t.SessionID), - ReactionKey: t.Key, - Attempts: t.Attempts, - Escalated: t.Escalated, - FirstAttemptAt: t.FirstAttemptAt, - ProjectID: string(t.ProjectID), - }) -} - -func (a reactionStoreAdapter) DeleteReactionTracker(ctx context.Context, id domain.SessionID, key string) error { - return a.s.DeleteReactionTracker(ctx, string(id), key) -} - -func (a reactionStoreAdapter) DeleteSessionReactionTrackers(ctx context.Context, id domain.SessionID) error { - return a.s.DeleteSessionReactionTrackers(ctx, string(id)) -} - -// TestReaction_DurabilitySurvivesRestart is the plan's reaction_trackers -// durability check: once a reaction has escalated, a daemon restart (a fresh -// Manager hydrated from the same store) must NOT re-fire the human page — the -// exact failure the in-memory-only version had. 
-func TestReaction_DurabilitySurvivesRestart(t *testing.T) { - db, err := sqlite.Open(t.TempDir()) - if err != nil { - t.Fatalf("open sqlite: %v", err) - } - t.Cleanup(func() { db.Close() }) - store := sqlite.NewStore(db) - adapter := reactionStoreAdapter{store} - - // --- first process lifetime: drive ci-failed to escalation --- - notf1 := &recordingNotifier{} - m1 := New(store, notf1, &recordingMessenger{}) - m1.clock = func() time.Time { return t0 } - if err := m1.WithReactionStore(context.Background(), adapter); err != nil { - t.Fatalf("hydrate m1: %v", err) - } - seedViaUpsert(t, store, lcOpenPR(domain.PRReasonReviewPending)) - - // ci-failed: retries 2, persistent → escalate on the third failure. - for i := 0; i < 4; i++ { - failCI(t, m1) - pendingCI(t, m1) - } - if c := notifyCount(notf1, "reaction.escalated"); c != 1 { - t.Fatalf("precondition: want one escalation in first lifetime, got %d", c) - } - - // --- simulated restart: a fresh Manager hydrated from the same store --- - notf2 := &recordingNotifier{} - msgr2 := &recordingMessenger{} - m2 := New(store, notf2, msgr2) - m2.clock = func() time.Time { return t0 } - if err := m2.WithReactionStore(context.Background(), adapter); err != nil { - t.Fatalf("hydrate m2: %v", err) - } - - // The ci-failed tracker rehydrates with escalated=true, so further failures - // are silenced: no new send-to-agent, no re-escalation. - failCI(t, m2) - if c := notifyCount(notf2, "reaction.escalated"); c != 0 { - t.Errorf("restart re-fired an already-escalated page: got %d escalations", c) - } - if len(msgr2.sent) != 0 { - t.Errorf("restart re-sent to agent despite escalated budget: got %d sends", len(msgr2.sent)) - } -} - -// TestReaction_DurabilityClearsOnIncidentOver proves the durable rows are -// removed when an incident resolves, so a later unrelated incident starts from a -// fresh budget rather than a stale escalated=true. 
-func TestReaction_DurabilityClearsOnIncidentOver(t *testing.T) { - db, err := sqlite.Open(t.TempDir()) - if err != nil { - t.Fatalf("open sqlite: %v", err) - } - t.Cleanup(func() { db.Close() }) - store := sqlite.NewStore(db) - adapter := reactionStoreAdapter{store} - - m := New(store, &recordingNotifier{}, &recordingMessenger{}) - m.clock = func() time.Time { return t0 } - if err := m.WithReactionStore(context.Background(), adapter); err != nil { - t.Fatalf("hydrate: %v", err) - } - seedViaUpsert(t, store, lcOpenPR(domain.PRReasonReviewPending)) - - failCI(t, m) - if rows, _ := store.ListReactionTrackers(context.Background()); len(rows) == 0 { - t.Fatalf("precondition: expected a persisted ci-failed tracker") - } - - // Approved+green ends the incident → recovered() clears every tracker. - if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewApproved, CISummary: ports.CIPassing, PRNumber: 7, - }); err != nil { - t.Fatalf("recover: %v", err) - } - if rows, _ := store.ListReactionTrackers(context.Background()); len(rows) != 0 { - t.Errorf("incident-over must clear durable trackers, got %d rows", len(rows)) - } -} diff --git a/backend/internal/lifecycle/reaction_store.go b/backend/internal/lifecycle/reaction_store.go deleted file mode 100644 index f8da7415..00000000 --- a/backend/internal/lifecycle/reaction_store.go +++ /dev/null @@ -1,94 +0,0 @@ -package lifecycle - -// reaction_store.go is the optional durability seam for the escalation engine. -// By default the Manager keeps escalation budgets in memory only (a restart -// resets them, which costs at most a few extra agent retries — never a missed -// human page). When a ReactionStore is wired via WithReactionStore the in-memory -// map becomes a write-through cache over durable rows, so a restart does NOT -// re-fire an already-escalated human notification. 
-// -// The interface uses lifecycle-local types so the package stays free of any -// storage dependency; the composition root adapts the concrete store to it -// (mirroring the cdc.OutboxStore adapter). - -import ( - "context" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" -) - -// PersistedTracker is the durable form of one (session,reaction) escalation -// budget — the storage-facing mirror of the in-memory reactionTracker. -type PersistedTracker struct { - SessionID domain.SessionID - Key string - Attempts int - Escalated bool - FirstAttemptAt time.Time - ProjectID domain.ProjectID -} - -// ReactionStore persists escalation budgets so they survive a daemon restart. -type ReactionStore interface { - LoadReactionTrackers(ctx context.Context) ([]PersistedTracker, error) - SaveReactionTracker(ctx context.Context, t PersistedTracker) error - DeleteReactionTracker(ctx context.Context, id domain.SessionID, key string) error - DeleteSessionReactionTrackers(ctx context.Context, id domain.SessionID) error -} - -// WithReactionStore makes escalation budgets durable: it hydrates the in-memory -// trackers from rs and turns on write-through for subsequent mutations. Like -// WithSessionLister it must be called BEFORE any reaper or Apply* dispatch -// starts, since it populates the tracker map without holding trackerMu against -// concurrent reactors. A hydration error is returned so the caller can decide -// whether to proceed with an empty (in-memory) budget set. 
-func (m *Manager) WithReactionStore(ctx context.Context, rs ReactionStore) error { - m.reactionStore = rs - rows, err := rs.LoadReactionTrackers(ctx) - if err != nil { - return err - } - for _, r := range rows { - m.trackers[trackerKey{id: r.SessionID, key: reactionKey(r.Key)}] = &reactionTracker{ - attempts: r.Attempts, - escalated: r.Escalated, - firstAttemptAt: r.FirstAttemptAt, - projectID: r.ProjectID, - } - } - return nil -} - -// persistTracker write-throughs one tracker's current state. Best-effort: a -// failed write degrades durability to the in-memory default (a restart may -// re-fire one page), so it must not break the synchronous dispatch path. The -// snapshot is taken by the caller under trackerMu and passed by value here so no -// DB I/O happens while the lock is held. -func (m *Manager) persistTracker(ctx context.Context, id domain.SessionID, key reactionKey, snap reactionTracker) { - if m.reactionStore == nil { - return - } - _ = m.reactionStore.SaveReactionTracker(ctx, PersistedTracker{ - SessionID: id, - Key: string(key), - Attempts: snap.attempts, - Escalated: snap.escalated, - FirstAttemptAt: snap.firstAttemptAt, - ProjectID: snap.projectID, - }) -} - -func (m *Manager) deletePersistedTracker(ctx context.Context, id domain.SessionID, key reactionKey) { - if m.reactionStore == nil { - return - } - _ = m.reactionStore.DeleteReactionTracker(ctx, id, string(key)) -} - -func (m *Manager) deletePersistedSessionTrackers(ctx context.Context, id domain.SessionID) { - if m.reactionStore == nil { - return - } - _ = m.reactionStore.DeleteSessionReactionTrackers(ctx, id) -} diff --git a/backend/internal/lifecycle/reactions.go b/backend/internal/lifecycle/reactions.go index ac4de400..94f149f4 100644 --- a/backend/internal/lifecycle/reactions.go +++ b/backend/internal/lifecycle/reactions.go @@ -1,460 +1,397 @@ package lifecycle -// reactions.go is the ACT layer: the reaction table, the per-(session,reaction) -// escalation engine, and the duration-driven 
TickEscalations the synchronous -// LCM can't wake itself for. Reactions fire from react() after a transition is -// persisted by the Apply* pipeline (see manager.go). +// reactions.go is the ACT layer: after a persisted transition the engine maps +// the session's (state, PR facts) to at most one reaction and dispatches it — +// nudging the agent or paging the human. Two reactions inject live content (CI +// logs, review comments) and re-fire when that content changes; the rest fire +// once on entry, with duration escalation driven by TickEscalations. // -// Dispatch is synchronous: react() runs Send/Notify inline. It is the single -// dispatch chokepoint, so moving it onto a worker goroutine later (once a daemon -// owns that goroutine's lifecycle) is a change confined to this one function. +// Budgets are in-memory: a restart re-arms them, which costs a few extra nudges, +// never a missed page. import ( "context" "fmt" + "strings" + "sync" "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) -// reactionKey names a row in the reaction table and a tracker bucket. 
type reactionKey string const ( - reactionCIFailed reactionKey = "ci-failed" - reactionChangesRequested reactionKey = "changes-requested" - reactionBugbotComments reactionKey = "bugbot-comments" - reactionMergeConflicts reactionKey = "merge-conflicts" - reactionAgentIdle reactionKey = "agent-idle" - reactionApprovedAndGreen reactionKey = "approved-and-green" - reactionAgentStuck reactionKey = "agent-stuck" - reactionNeedsInput reactionKey = "agent-needs-input" - reactionAgentExited reactionKey = "agent-exited" - reactionPRClosed reactionKey = "pr-closed" - reactionAllComplete reactionKey = "all-complete" + rxCIFailed reactionKey = "ci-failed" + rxReviewComments reactionKey = "review-comments" + rxMergeConflicts reactionKey = "merge-conflicts" + rxIdle reactionKey = "agent-idle" + rxApprovedGreen reactionKey = "approved-and-green" + rxStuck reactionKey = "agent-stuck" + rxNeedsInput reactionKey = "agent-needs-input" + rxExited reactionKey = "agent-exited" + rxPRClosed reactionKey = "pr-closed" + rxMerged reactionKey = "pr-merged" ) -type actionKind string - +// Brakes: stop auto-handling and page a human after this many failed attempts. const ( - actionSendToAgent actionKind = "send-to-agent" - actionNotify actionKind = "notify" - actionAutoMerge actionKind = "auto-merge" + ciBrakeRuns = 3 // last N runs of a failing check all failed + reviewMaxNudge = 3 // re-nudged the agent N times over new review feedback ) -// reactionConfig is one row of the reaction table (distillation §4.1/§4.2). -// -// - retries numeric escalation cap: escalate once attempts exceed it. -// - escalateAfter duration escalation: escalate once this elapses since the -// first attempt (fired by TickEscalations, since the LCM never polls). -// - persistent the tracker survives the status leaving the triggering -// state; it only resets when the incident is truly over (PR no longer open -// or the session terminal). 
Only ci-failed is persistent, so a flapping -// CI (fail→pending→fail) keeps draining one shared retry budget. +// reactionConfig is one row of the reaction table. toAgent reactions nudge the +// agent; the rest notify the human. escalateAfter (when set) drives a +// duration-based escalation via TickEscalations. type reactionConfig struct { - action actionKind + toAgent bool message string - priority ports.EventPriority eventType string - retries int + priority ports.Priority escalateAfter time.Duration - persistent bool } -// defaultReactions is the product's default behaviour (distillation §4.2). -// auto-merge is intentionally absent: approved-and-green is a notify, so the -// human decides to merge. The auto-merge action kind exists for opt-in configs, -// but no default row uses it. -var defaultReactions = map[reactionKey]reactionConfig{ - reactionCIFailed: { - action: actionSendToAgent, persistent: true, retries: 2, - message: "CI is failing on your PR. Review the failing output below and push a fix.", - eventType: "reaction.ci-failed", priority: ports.PriorityAction, - }, - reactionChangesRequested: { - action: actionSendToAgent, escalateAfter: 30 * time.Minute, - message: "A reviewer requested changes on your PR. Address the comments and push.", - eventType: "reaction.changes-requested", priority: ports.PriorityAction, - }, - reactionBugbotComments: { - action: actionSendToAgent, escalateAfter: 30 * time.Minute, - message: "An automated reviewer left comments on your PR. Address them and push.", - eventType: "reaction.bugbot-comments", priority: ports.PriorityAction, - }, - reactionMergeConflicts: { - action: actionSendToAgent, escalateAfter: 15 * time.Minute, - message: "Your PR has merge conflicts. Rebase onto the base branch and resolve them.", - eventType: "reaction.merge-conflicts", priority: ports.PriorityAction, - }, - reactionAgentIdle: { - action: actionSendToAgent, retries: 2, escalateAfter: 15 * time.Minute, - message: "You appear idle. 
Continue the task or explain what is blocking you.", - eventType: "reaction.agent-idle", priority: ports.PriorityWarning, - }, - reactionApprovedAndGreen: { - // notify-only: a green, approved PR is the human-decision path — the human - // decides to merge (no auto-merge by default). - action: actionNotify, priority: ports.PriorityAction, - message: "PR is approved and green — ready to merge.", - eventType: "reaction.approved-and-green", - }, - reactionAgentStuck: { - // §4.2 lists a threshold: 10m here; it is intentionally not gated — entry - // into stuck is already debounced upstream by the detecting->stuck - // quarantine (DETECTING_MAX_ATTEMPTS/DURATION), so a second timer would be - // redundant. - action: actionNotify, priority: ports.PriorityUrgent, - message: "Agent is stuck and needs attention.", - eventType: "reaction.agent-stuck", - }, - reactionNeedsInput: { - action: actionNotify, priority: ports.PriorityUrgent, - message: "Agent needs input to continue.", - eventType: "reaction.agent-needs-input", - }, - reactionAgentExited: { - action: actionNotify, priority: ports.PriorityUrgent, - message: "Agent process exited unexpectedly.", - eventType: "reaction.agent-exited", - }, - reactionPRClosed: { - action: actionNotify, priority: ports.PriorityAction, - message: "PR was closed without merging — decide: resume, learn, or terminate.", - eventType: "reaction.pr-closed", - }, - reactionAllComplete: { - action: actionNotify, priority: ports.PriorityInfo, - message: "PR merged — work complete.", - eventType: "reaction.all-complete", - }, +var reactions = map[reactionKey]reactionConfig{ + rxCIFailed: {toAgent: true, eventType: "reaction.ci-failed", priority: ports.PriorityAction, message: "CI is failing on your PR. Review the output below and push a fix."}, + rxReviewComments: {toAgent: true, eventType: "reaction.review-comments", priority: ports.PriorityAction, message: "A reviewer left feedback on your PR. 
Address it and push."}, + rxMergeConflicts: {toAgent: true, eventType: "reaction.merge-conflicts", priority: ports.PriorityAction, escalateAfter: 15 * time.Minute, message: "Your PR has merge conflicts. Rebase onto the base branch and resolve them."}, + rxIdle: {toAgent: true, eventType: "reaction.agent-idle", priority: ports.PriorityInfo, escalateAfter: 15 * time.Minute, message: "You appear idle. Continue the task or say what is blocking you."}, + rxApprovedGreen: {eventType: "reaction.approved-and-green", priority: ports.PriorityAction, message: "PR is approved and green — ready to merge."}, + rxStuck: {eventType: "reaction.agent-stuck", priority: ports.PriorityUrgent, message: "Agent is stuck and needs attention."}, + rxNeedsInput: {eventType: "reaction.agent-needs-input", priority: ports.PriorityUrgent, message: "Agent needs input to continue."}, + rxExited: {eventType: "reaction.agent-exited", priority: ports.PriorityUrgent, message: "Agent process exited unexpectedly."}, + rxPRClosed: {eventType: "reaction.pr-closed", priority: ports.PriorityAction, message: "PR was closed without merging."}, + rxMerged: {eventType: "reaction.pr-merged", priority: ports.PriorityInfo, message: "PR merged — work complete."}, } -// reactionEventFor maps a canonical record to the reaction it should drive, -// mirroring DeriveLegacyStatus but for the ACT layer. ok is false when the -// current state has no reaction. -// -// A closed PR derives to the idle display status, so it is detected from the PR -// axis directly before falling through to the status mapping. Bot review -// comments and merge conflicts are represented as PR reasons so the ACT layer -// can distinguish them from human-requested changes and plain open PRs. 
-func reactionEventFor(l domain.CanonicalSessionLifecycle) (reactionKey, bool) { - if l.PR.State == domain.PRClosed { - return reactionPRClosed, true - } - if isActivePRState(l.PR.State) { - switch l.PR.Reason { - case domain.PRReasonBotComments: - return reactionBugbotComments, true - case domain.PRReasonMergeConflicts: - return reactionMergeConflicts, true +// reactionContent carries the live material the feedback reactions inject. Empty +// for runtime/activity transitions; populated from a PR observation. +type reactionContent struct { + ciCheck string + ciCommit string + ciURL string + ciLogTail string + comments []string + reviewSig string +} + +// prContent extracts the CI failure + review feedback from a PR observation. +func prContent(o ports.PRObservation) reactionContent { + c := reactionContent{} + for _, ch := range o.Checks { + if ch.Status == "failed" { + c.ciCheck, c.ciCommit, c.ciLogTail, c.ciURL = ch.Name, ch.CommitHash, ch.LogTail, o.URL + break } } - switch domain.DeriveLegacyStatus(l) { - case domain.StatusCIFailed: - return reactionCIFailed, true - case domain.StatusChangesRequested: - return reactionChangesRequested, true - case domain.StatusApproved, domain.StatusMergeable: - return reactionApprovedAndGreen, true - case domain.StatusIdle: - return reactionAgentIdle, true - case domain.StatusStuck: - return reactionAgentStuck, true - case domain.StatusNeedsInput: - return reactionNeedsInput, true - case domain.StatusKilled: - // Inferred death only — an explicit user kill goes through - // OnKillRequested, which does not react. - return reactionAgentExited, true - case domain.StatusMerged: - return reactionAllComplete, true + var ids []string + for _, cm := range o.Comments { + if cm.Resolved { + continue + } + c.comments = append(c.comments, cm.Body) + ids = append(ids, cm.ID) } - return "", false + c.reviewSig = strings.Join(ids, ",") + return c } -// reactionContext carries fact-derived material the message templates need. 
The -// SCM path populates it (CI failure log tail); other paths pass the zero value. -type reactionContext struct { - ciFailureLogTail *string -} +// ---- in-memory escalation state ---- -// trackerKey buckets an escalation tracker by session and reaction. type trackerKey struct { id domain.SessionID key reactionKey } -// reactionTracker is the per-(session,reaction) escalation budget. It lives in -// memory on the Manager: a daemon restart resets budgets, which only ever costs -// a few extra agent retries before re-escalating — never a missed human -// notification. Keeping it out of the canonical store preserves the -// truth-vs-policy split (the store holds session truth; this is ACT policy). -// -// projectID is captured at first attempt so TickEscalations — which fires from -// the reaper and has no transition on hand — can still populate ProjectID on -// the escalation event. It is set once and never overwritten; reaction-bearing -// transitions for a given session id always carry the same projectID. -type reactionTracker struct { - attempts int - escalated bool - firstAttemptAt time.Time - projectID domain.ProjectID +type tracker struct { + attempts int + firstAt time.Time + escalated bool + seenSig bool + lastSig string + projectID domain.ProjectID } -// react fires the ACT layer after a persisted transition: clear the tracker for -// the reaction we left, then dispatch the reaction for the one we entered. It -// fires only on a genuine reaction change, so re-persisting the same state does -// not re-dispatch. Synchronous by design (see file header). -// -// Integration-time caveat: react runs AFTER withLock releases (deliberately, so -// a busy-waiting send-to-agent never holds the per-session mutex). Under a live -// daemon with concurrent observers (SCM poller + reaper + activity ingest) the -// afterLC snapshot can be stale by dispatch time — e.g. a ci-failed send firing -// after the session already moved to approved. 
Tests are single-threaded so it -// is not observable yet; when the daemon lands, give react a per-session -// ordering (a small react queue) or re-check the triggering state before -// dispatching. -func (m *Manager) react(ctx context.Context, id domain.SessionID, tr *transition, rc reactionContext) error { - if tr == nil { - return nil +type reactionState struct { + mu sync.Mutex + trackers map[trackerKey]*tracker + lastKey map[domain.SessionID]reactionKey +} + +func newReactionState() reactionState { + return reactionState{trackers: map[trackerKey]*tracker{}, lastKey: map[domain.SessionID]reactionKey{}} +} + +// trackerFor returns the (id,key) tracker, creating it on first use. Caller holds mu. +func (rs *reactionState) trackerFor(id domain.SessionID, key reactionKey) *tracker { + k := trackerKey{id, key} + t := rs.trackers[k] + if t == nil { + t = &tracker{} + rs.trackers[k] = t } - beforeKey, hadBefore := reactionEventFor(tr.beforeLC) - afterKey, hasAfter := reactionEventFor(tr.afterLC) - - changed := beforeKey != afterKey - - switch { - case incidentOver(tr.afterLC) || recovered(tr.afterLC): - // The PR-pipeline incident has ended — the PR resolved (merged/closed), - // the session went terminal, or it reached an approved/green state. Every - // tracker for this session is now stale, including a persistent ci-failed - // one. This is keyed on the state REACHED, not the one left: the recovery - // transition is typically review_pending->approved (beforeKey empty), so - // clearing only beforeKey would leak the ci-failed tracker and leave its - // escalated=true to silence a future regression. Clear them all. - m.clearSessionTrackers(ctx, id) - case hadBefore && (!hasAfter || changed): - // Within an unresolved open PR: a normal tracker resets when its state is - // left. 
A persistent one (ci-failed) is NOT cleared here — it must survive - // the ambiguous review_pending limbo (the fail->pending->fail flap, §4.2); - // it only resets via the recovery/incident-over branch above. - if !defaultReactions[beforeKey].persistent { - m.clearTracker(ctx, id, beforeKey) + return t +} + +func (m *Manager) clearReactions(id domain.SessionID) { + m.react.mu.Lock() + defer m.react.mu.Unlock() + for k := range m.react.trackers { + if k.id == id { + delete(m.react.trackers, k) } } + delete(m.react.lastKey, id) +} + +// ---- dispatch ---- + +// runReactions is the chokepoint called after every persisted transition. It +// runs unlocked (the write lock is already released) so a busy agent send never +// blocks the write path. +func (m *Manager) runReactions(ctx context.Context, id domain.SessionID, content reactionContent) error { + rec, ok, err := m.store.GetSession(ctx, id) + if err != nil || !ok { + return err + } + lc := rec.Lifecycle + project := rec.ProjectID - if hasAfter && (!hadBefore || changed) { - return m.executeReaction(ctx, id, tr.projectID, afterKey, rc) + if isTerminal(lc.Session.State) { + err := m.dispatch(ctx, id, project, terminalReaction(lc.TerminationReason)) + m.clearReactions(id) // incident over: drop budgets after the final notify + return err } - return nil -} -// incidentOver reports that a PR-pipeline incident has truly ended (PR no longer -// active, or the session terminal), so all trackers for the session may reset. -func incidentOver(l domain.CanonicalSessionLifecycle) bool { - return !isActivePRState(l.PR.State) || isTerminal(l.Session.State) -} + pr, err := m.store.PRFactsForSession(ctx, id) + if err != nil { + return err + } -func isActivePRState(s domain.PRState) bool { - return s == domain.PROpen || s == domain.PRDraft + // Feedback reactions inject live content and re-fire as it changes — only + // while the agent can actually act on it. 
+ if pr.Exists && !pr.Closed && !needsHuman(lc.Session.State) { + if pr.CI == domain.CIFailing && content.ciCheck != "" { + if err := m.handleCIFailure(ctx, id, project, content); err != nil { + return err + } + } + if hasReviewFeedback(pr) { + if err := m.handleReviewFeedback(ctx, id, project, content); err != nil { + return err + } + } + } + + return m.dispatch(ctx, id, project, reactionFor(lc, pr)) } -// recovered reports a genuinely-green open PR: an approved/mergeable state, which -// unambiguously means CI is no longer failing (the open-PR ladder ranks ci_failing -// above approved, so an approved display cannot coexist with failing CI). Unlike -// the ambiguous review_pending state — which may just be CI re-running — reaching -// this ends a ci-failed incident and re-arms its budget. Draft PRs are active, -// but not recoverable via review/merge state. -func recovered(l domain.CanonicalSessionLifecycle) bool { - if !isActivePRState(l.PR.State) || l.PR.State == domain.PRDraft { - return false +// dispatch fires the entry reaction for key, deduped so a steady state does not +// re-fire. Leaving a reaction drops its budget. +func (m *Manager) dispatch(ctx context.Context, id domain.SessionID, project domain.ProjectID, key reactionKey) error { + m.react.mu.Lock() + if m.react.lastKey[id] == key { + m.react.mu.Unlock() + return nil } - switch l.PR.Reason { - case domain.PRReasonApproved, domain.PRReasonMergeReady: - return true - default: - return false + if prev := m.react.lastKey[id]; prev != "" { + delete(m.react.trackers, trackerKey{id, prev}) } -} + m.react.lastKey[id] = key + m.react.mu.Unlock() -func (m *Manager) executeReaction(ctx context.Context, id domain.SessionID, projectID domain.ProjectID, key reactionKey, rc reactionContext) error { - cfg := defaultReactions[key] - switch cfg.action { - case actionNotify: - // notify reactions are human-attention terminals: fire once on the - // triggering transition, no retry/escalation budget. 
- return m.notifier.Notify(ctx, ports.OrchestratorEvent{ - Type: cfg.eventType, - Priority: cfg.priority, - SessionID: id, - ProjectID: projectID, - Message: cfg.message, - }) - case actionAutoMerge: - // Off by default: no default row maps here, and wiring a merge port is a - // later PR. An opt-in config could route a reaction here. + if key == "" { return nil - case actionSendToAgent: - return m.sendToAgent(ctx, id, projectID, key, cfg, rc) } - return nil + cfg := reactions[key] + if cfg.toAgent { + return m.fireAgentEntry(ctx, id, project, key, cfg) + } + return m.fireNotify(ctx, id, project, cfg) } -// sendToAgent runs the escalation engine for an auto send-to-agent reaction: -// count the attempt, escalate when the numeric cap or duration is exceeded -// (silencing further auto-dispatch), else inject the message via the messenger. -func (m *Manager) sendToAgent(ctx context.Context, id domain.SessionID, projectID domain.ProjectID, key reactionKey, cfg reactionConfig, rc reactionContext) error { - m.trackerMu.Lock() - tk := m.trackerFor(id, key) - // Capture projectID once so the duration-based TickEscalations path — which - // has no transition on hand — can still populate ProjectID on the escalation - // event. A non-empty incoming projectID always wins, in case the tracker was - // first created from an observation that lacked one. - if projectID != "" { - tk.projectID = projectID - } - if tk.escalated { - m.trackerMu.Unlock() - return nil // silenced until the condition clears the tracker +// reactionFor maps (session state, PR facts) to the reaction to enter. CI failure +// and review feedback return "" here — they are handled by the feedback path. 
+func reactionFor(lc domain.CanonicalSessionLifecycle, pr domain.PRFacts) reactionKey { + switch lc.Session.State { + case domain.SessionStuck: + return rxStuck + case domain.SessionNeedsInput: + return rxNeedsInput } - now := m.clock() - freshFirst := tk.firstAttemptAt.IsZero() - if freshFirst { - tk.firstAttemptAt = now + if pr.Exists { + if pr.Closed { + if !pr.Merged { + return rxPRClosed + } + return "" + } + switch { + case pr.CI == domain.CIFailing, hasReviewFeedback(pr): + return "" // feedback path + case pr.Mergeability == domain.MergeConflicting: + return rxMergeConflicts + case pr.Mergeability == domain.MergeMergeable, pr.Review == domain.ReviewApproved: + return rxApprovedGreen + } } - tk.attempts++ - escalateNow := shouldEscalate(tk, cfg, now) - if escalateNow { - tk.escalated = true + if lc.Session.State == domain.SessionIdle { + return rxIdle } - snap := *tk - m.trackerMu.Unlock() + return "" +} - // Write through the new budget (incl. escalated) before dispatching, so a - // crash between persist and notify re-fires at most the same page on restart. - m.persistTracker(ctx, id, key, snap) +func hasReviewFeedback(pr domain.PRFacts) bool { + return pr.Review == domain.ReviewChangesRequest || pr.ReviewComments +} - if escalateNow { - return m.escalate(ctx, id, snap.projectID, key) - } +func needsHuman(s domain.SessionState) bool { + return s == domain.SessionStuck || s == domain.SessionNeedsInput +} - if err := m.messenger.Send(ctx, id, composeMessage(cfg, rc)); err != nil { - // A delivery failure must not consume escalation budget: roll this - // attempt back so the next relevant transition retries from the same - // point rather than marching toward escalation on undelivered messages - // (distillation §4.3). 
- m.trackerMu.Lock() - tk.attempts-- - if freshFirst { - tk.firstAttemptAt = time.Time{} - } - rolled := *tk - m.trackerMu.Unlock() - m.persistTracker(ctx, id, key, rolled) - return err +// terminalReaction is the notify fired when a session reaches a terminal state by +// inferred death. An explicit kill goes through OnKillRequested (no reaction); +// auto_cleanup / pr_merged are notified elsewhere. +func terminalReaction(r domain.TerminationReason) reactionKey { + switch r { + case domain.TermRuntimeLost, domain.TermAgentProcessExited, domain.TermProbeFailure, domain.TermErrorInProcess: + return rxExited + default: + return "" } - return nil } -// shouldEscalate uses inclusive boundaries: escalate once the numeric cap is -// exceeded or once exactly escalateAfter has elapsed (don't wait for the next -// tick to cross a strict threshold). -func shouldEscalate(tk *reactionTracker, cfg reactionConfig, now time.Time) bool { - if cfg.retries > 0 && tk.attempts > cfg.retries { - return true - } - if cfg.escalateAfter > 0 && !tk.firstAttemptAt.IsZero() && now.Sub(tk.firstAttemptAt) >= cfg.escalateAfter { - return true - } - return false +// ---- feedback reactions (content-driven re-fire + brake) ---- + +func (m *Manager) handleCIFailure(ctx context.Context, id domain.SessionID, project domain.ProjectID, c reactionContent) error { + msg := reactions[rxCIFailed].message + "\n\nFailing output:\n" + c.ciLogTail + return m.fireFeedback(ctx, id, project, rxCIFailed, c.ciCommit, msg, func(int) (bool, error) { + st, err := m.pr.RecentCheckStatuses(ctx, c.ciURL, c.ciCheck, ciBrakeRuns) + if err != nil { + return false, err + } + return allFailed(st, ciBrakeRuns), nil + }) } -// escalate emits reaction.escalated and notifies the human. The caller has -// already set tracker.escalated under the lock, which silences further -// auto-dispatch for this reaction until the tracker clears. 
-func (m *Manager) escalate(ctx context.Context, id domain.SessionID, projectID domain.ProjectID, key reactionKey) error { - return m.notifier.Notify(ctx, ports.OrchestratorEvent{ - Type: "reaction.escalated", - Priority: ports.PriorityUrgent, - SessionID: id, - ProjectID: projectID, - Message: fmt.Sprintf("auto-handling of %q is exhausted and needs a human.", key), - Data: map[string]any{"reaction": string(key)}, +func (m *Manager) handleReviewFeedback(ctx context.Context, id domain.SessionID, project domain.ProjectID, c reactionContent) error { + msg := reactions[rxReviewComments].message + if len(c.comments) > 0 { + msg += "\n\n" + strings.Join(c.comments, "\n\n") + } + return m.fireFeedback(ctx, id, project, rxReviewComments, c.reviewSig, msg, func(attempts int) (bool, error) { + return attempts > reviewMaxNudge, nil }) } -func composeMessage(cfg reactionConfig, rc reactionContext) string { - if rc.ciFailureLogTail != nil && *rc.ciFailureLogTail != "" { - return cfg.message + "\n\nFailing output:\n" + *rc.ciFailureLogTail +// fireFeedback nudges the agent with fresh content, deduped by signature so the +// same content is not re-sent each poll. braked decides whether to escalate to a +// human instead (CI: history; review: attempt count). 
+func (m *Manager) fireFeedback(ctx context.Context, id domain.SessionID, project domain.ProjectID, key reactionKey, sig, message string, braked func(attempts int) (bool, error)) error { + m.react.mu.Lock() + t := m.react.trackerFor(id, key) + if project != "" { + t.projectID = project + } + if t.escalated || (t.seenSig && t.lastSig == sig) { + m.react.mu.Unlock() + return nil } - return cfg.message + t.seenSig, t.lastSig = true, sig + t.attempts++ + attempts, pid := t.attempts, t.projectID + m.react.lastKey[id] = key // feedback owns the slot so a later dispatch("") clears it + m.react.mu.Unlock() + + brake, err := braked(attempts) + if err != nil { + return err + } + if brake { + m.react.mu.Lock() + t.escalated = true + m.react.mu.Unlock() + return m.escalate(ctx, id, pid, key) + } + return m.messenger.Send(ctx, id, message) } -// trackerFor returns the tracker for (id,key), creating it on first use. The -// caller must hold trackerMu. -func (m *Manager) trackerFor(id domain.SessionID, key reactionKey) *reactionTracker { - k := trackerKey{id: id, key: key} - tk := m.trackers[k] - if tk == nil { - tk = &reactionTracker{} - m.trackers[k] = tk +// ---- entry reactions ---- + +// fireAgentEntry nudges the agent once on entry into a static reaction +// (idle/merge-conflicts); escalation is duration-based via TickEscalations. 
+func (m *Manager) fireAgentEntry(ctx context.Context, id domain.SessionID, project domain.ProjectID, key reactionKey, cfg reactionConfig) error { + m.react.mu.Lock() + t := m.react.trackerFor(id, key) + if project != "" { + t.projectID = project + } + if t.escalated { + m.react.mu.Unlock() + return nil + } + if t.firstAt.IsZero() { + t.firstAt = m.clock() } - return tk + t.attempts++ + m.react.mu.Unlock() + return m.messenger.Send(ctx, id, cfg.message) } -func (m *Manager) clearTracker(ctx context.Context, id domain.SessionID, key reactionKey) { - m.trackerMu.Lock() - delete(m.trackers, trackerKey{id: id, key: key}) - m.trackerMu.Unlock() - m.deletePersistedTracker(ctx, id, key) +func (m *Manager) fireNotify(ctx context.Context, id domain.SessionID, project domain.ProjectID, cfg reactionConfig) error { + return m.notifier.Notify(ctx, ports.Event{ + Type: cfg.eventType, Priority: cfg.priority, + SessionID: id, ProjectID: project, Message: cfg.message, + }) } -// clearSessionTrackers drops every tracker for a session — used when its -// incident is over, so no budget (and no stale escalated=true) survives into a -// later unrelated incident. -func (m *Manager) clearSessionTrackers(ctx context.Context, id domain.SessionID) { - m.trackerMu.Lock() - for k := range m.trackers { - if k.id == id { - delete(m.trackers, k) - } - } - m.trackerMu.Unlock() - m.deletePersistedSessionTrackers(ctx, id) +func (m *Manager) escalate(ctx context.Context, id domain.SessionID, project domain.ProjectID, key reactionKey) error { + return m.notifier.Notify(ctx, ports.Event{ + Type: "reaction.escalated", Priority: ports.PriorityUrgent, + SessionID: id, ProjectID: project, + Message: fmt.Sprintf("Automatic handling of %q is exhausted — needs a human.", key), + }) } -// TickEscalations fires the duration-based escalations the synchronous LCM -// cannot wake itself for. The reaper calls it on a timer; it escalates any -// not-yet-escalated tracker whose escalateAfter has elapsed. 
Notifications are -// sent outside the lock so agent/notifier latency never blocks tracker access. +// TickEscalations fires the duration-based escalations the synchronous engine +// cannot wake itself for. The reaper calls it on a timer. func (m *Manager) TickEscalations(ctx context.Context, now time.Time) error { type due struct { - id domain.SessionID - projectID domain.ProjectID - key reactionKey - snap reactionTracker + id domain.SessionID + project domain.ProjectID + key reactionKey } var fire []due - - m.trackerMu.Lock() - for k, tk := range m.trackers { - if tk.escalated { + m.react.mu.Lock() + for k, t := range m.react.trackers { + if t.escalated { continue } - cfg := defaultReactions[k.key] - if cfg.escalateAfter > 0 && !tk.firstAttemptAt.IsZero() && now.Sub(tk.firstAttemptAt) >= cfg.escalateAfter { - tk.escalated = true - fire = append(fire, due{id: k.id, projectID: tk.projectID, key: k.key, snap: *tk}) + cfg := reactions[k.key] + if cfg.escalateAfter > 0 && !t.firstAt.IsZero() && now.Sub(t.firstAt) >= cfg.escalateAfter { + t.escalated = true + fire = append(fire, due{k.id, t.projectID, k.key}) } } - m.trackerMu.Unlock() + m.react.mu.Unlock() for _, d := range fire { - m.persistTracker(ctx, d.id, d.key, d.snap) - if err := m.escalate(ctx, d.id, d.projectID, d.key); err != nil { + if err := m.escalate(ctx, d.id, d.project, d.key); err != nil { return err } } return nil } + +func allFailed(statuses []string, n int) bool { + if len(statuses) < n { + return false + } + for i := 0; i < n; i++ { + if statuses[i] != "failed" { + return false + } + } + return true +} diff --git a/backend/internal/lifecycle/reactions_test.go b/backend/internal/lifecycle/reactions_test.go deleted file mode 100644 index 637b1e5b..00000000 --- a/backend/internal/lifecycle/reactions_test.go +++ /dev/null @@ -1,616 +0,0 @@ -package lifecycle - -import ( - "context" - "fmt" - "strings" - "testing" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - 
"github.com/aoagents/agent-orchestrator/backend/internal/ports" -) - -// failingMessenger always fails delivery, counting attempts — used to assert a -// send failure does not consume escalation budget. -type failingMessenger struct{ attempts int } - -func (f *failingMessenger) Send(_ context.Context, _ domain.SessionID, _ string) error { - f.attempts++ - return fmt.Errorf("messenger unavailable") -} - -// newReactive wires a Manager with handles on the recording fakes so reaction -// tests can assert what was sent/notified. clock is pinned to t0 for -// deterministic escalation stamping. -func newReactive() (*Manager, *fakeStore, *recordingNotifier, *recordingMessenger) { - store := newFakeStore() - notf := &recordingNotifier{} - msgr := &recordingMessenger{} - m := New(store, notf, msgr) - m.clock = func() time.Time { return t0 } - return m, store, notf, msgr -} - -func lcOpenPR(reason domain.PRReason) domain.CanonicalSessionLifecycle { - l := lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive) - l.PR = domain.PRSubstate{State: domain.PROpen, Reason: reason, Number: 7} - return l -} - -func notifyCount(n *recordingNotifier, eventType string) int { - n.mu.Lock() - defer n.mu.Unlock() - c := 0 - for _, e := range n.events { - if e.Type == eventType { - c++ - } - } - return c -} - -func ctx() context.Context { return context.Background() } - -// ---- right reaction per transition ---- - -func TestReaction_CIFailedSendsToAgentWithLogTail(t *testing.T) { - m, store, notf, msgr := newReactive() - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - - tail := "build failed\nundefined: foo" - err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, CISummary: ports.CIFailing, - PRNumber: 7, CIFailureLogTail: &tail, - }) - if err != nil { - t.Fatalf("apply: %v", err) - } - - if len(msgr.sent) != 1 { - t.Fatalf("want 1 send, got %d", len(msgr.sent)) - } - if got := msgr.sent[0].Message; 
!strings.Contains(got, "CI is failing") || !strings.Contains(got, tail) { - t.Errorf("message missing base text or log tail: %q", got) - } - if notifyCount(notf, "reaction.escalated") != 0 { - t.Error("a first failure must not escalate") - } -} - -func TestReaction_BotAndHumanCommentsRouteSeparately(t *testing.T) { - tests := []struct { - name string - comments []ports.ReviewComment - wantMessage string - }{ - { - name: "bot comments -> bugbot-comments", - comments: []ports.ReviewComment{{Author: "bugbot", Body: "fix", IsBot: true}}, - wantMessage: "automated reviewer", - }, - { - name: "human comments -> changes-requested", - comments: []ports.ReviewComment{{Author: "reviewer", Body: "fix"}}, - wantMessage: "reviewer requested changes", - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - m, store, _, msgr := newReactive() - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - - if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, PendingComments: tt.comments, PRNumber: 7, - }); err != nil { - t.Fatalf("apply: %v", err) - } - - if len(msgr.sent) != 1 { - t.Fatalf("want one send, got %d", len(msgr.sent)) - } - if !strings.Contains(msgr.sent[0].Message, tt.wantMessage) { - t.Errorf("message %q does not contain %q", msgr.sent[0].Message, tt.wantMessage) - } - }) - } -} - -func TestReaction_MergeConflictsSendsToAgent(t *testing.T) { - m, store, _, msgr := newReactive() - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - - if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, PRNumber: 7, - Mergeability: ports.Mergeability{CIPassing: true, Approved: true, NoConflicts: false, Blockers: []string{"merge conflicts"}}, - }); err != nil { - t.Fatalf("apply: %v", err) - } - - if len(msgr.sent) != 1 { - t.Fatalf("want one send, got %d", len(msgr.sent)) - } - if !strings.Contains(msgr.sent[0].Message, "merge conflicts") { - t.Errorf("message = %q, 
want merge conflict nudge", msgr.sent[0].Message) - } -} - -func TestReaction_ApprovedAndGreenNotifiesNeverAutoMerges(t *testing.T) { - m, store, notf, msgr := newReactive() - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - - err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewApproved, - Mergeability: ports.Mergeability{Mergeable: true}, PRNumber: 7, - }) - if err != nil { - t.Fatalf("apply: %v", err) - } - - // approved-and-green is notify (human decides to merge); the agent is never - // messaged and no auto-merge fires. - if len(msgr.sent) != 0 { - t.Errorf("approved-and-green must not message the agent, got %d sends", len(msgr.sent)) - } - if notifyCount(notf, "reaction.approved-and-green") != 1 { - t.Errorf("want one approved-and-green notify, got events %+v", notf.events) - } -} - -func TestReaction_NotifyEventsForHardStates(t *testing.T) { - tests := []struct { - name string - apply func(m *Manager) - eventType string - }{ - { - name: "waiting_input -> agent-needs-input", - apply: func(m *Manager) { applyActivity(m, domain.ActivityWaitingInput) }, - eventType: "reaction.agent-needs-input", - }, - { - name: "blocked -> agent-stuck", - apply: func(m *Manager) { applyActivity(m, domain.ActivityBlocked) }, - eventType: "reaction.agent-stuck", - }, - } - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - m, store, notf, msgr := newReactive() - store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) - tc.apply(m) - if notifyCount(notf, tc.eventType) != 1 { - t.Errorf("want one %s, got events %+v", tc.eventType, notf.events) - } - if len(msgr.sent) != 0 { - t.Errorf("notify reaction must not message the agent, got %d", len(msgr.sent)) - } - }) - } -} - -func TestReaction_InferredDeathNotifiesAgentExited(t *testing.T) { - m, store, notf, _ := newReactive() - store.seed(sid, detectingLC()) - - err := 
m.ApplyRuntimeObservation(ctx(), sid, ports.RuntimeFacts{ - RuntimeState: ports.RuntimeProbeDead, ProcessState: ports.ProcessProbeDead, ObservedAt: t0, - }) - if err != nil { - t.Fatalf("apply: %v", err) - } - if l := mustLoad(t, store); domain.DeriveLegacyStatus(l) != domain.StatusKilled { - t.Fatalf("precondition: want killed, got %s", domain.DeriveLegacyStatus(l)) - } - if notifyCount(notf, "reaction.agent-exited") != 1 { - t.Errorf("want one agent-exited, got events %+v", notf.events) - } -} - -func TestReaction_PRClosedAndMerged(t *testing.T) { - tests := []struct { - name string - prState domain.PRState - eventType string - }{ - {"closed -> pr-closed", domain.PRClosed, "reaction.pr-closed"}, - {"merged -> all-complete", domain.PRMerged, "reaction.all-complete"}, - } - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - m, store, notf, _ := newReactive() - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: tc.prState, PRNumber: 7, - }) - if err != nil { - t.Fatalf("apply: %v", err) - } - if notifyCount(notf, tc.eventType) != 1 { - t.Errorf("want one %s, got events %+v", tc.eventType, notf.events) - } - }) - } -} - -func TestReaction_OnKillRequestedDoesNotReact(t *testing.T) { - m, store, notf, msgr := newReactive() - store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) - - if err := m.OnKillRequested(ctx(), sid, ports.KillReason{Kind: ports.KillManual}); err != nil { - t.Fatalf("kill: %v", err) - } - // An explicit human kill is not an inferred event: no agent-exited, no send. 
- if len(notf.events) != 0 || len(msgr.sent) != 0 { - t.Errorf("explicit kill must fire no reaction: notifies=%+v sends=%+v", notf.events, msgr.sent) - } -} - -// ---- escalation engine ---- - -func TestReaction_CIFailedNumericEscalation(t *testing.T) { - m, store, notf, msgr := newReactive() - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - - // ci-failed has retries 2 and is persistent, so the budget is shared across - // fail->pending->fail oscillations and escalates on the third failure. - failN := 4 - for i := 0; i < failN; i++ { - failCI(t, m) - pendingCI(t, m) // oscillate out (persistent tracker must NOT reset) - } - - if len(msgr.sent) != 2 { - t.Errorf("want 2 auto-sends before escalation, got %d", len(msgr.sent)) - } - if c := notifyCount(notf, "reaction.escalated"); c != 1 { - t.Errorf("want exactly one escalation, got %d", c) - } -} - -func TestReaction_DraftPRDoesNotEndCIFailedIncident(t *testing.T) { - m, store, _, _ := newReactive() - seed := lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive) - seed.PR = domain.PRSubstate{State: domain.PRDraft, Reason: domain.PRReasonInProgress, Number: 7} - store.seed(sid, seed) - - tail := "fail" - if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PRDraft, CISummary: ports.CIFailing, PRNumber: 7, CIFailureLogTail: &tail, - }); err != nil { - t.Fatalf("draft fail: %v", err) - } - if sessionTrackerCount(m, sid) == 0 { - t.Fatalf("precondition: expected a ci-failed tracker") - } - - if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PRDraft, CISummary: ports.CIPending, PRNumber: 7, - }); err != nil { - t.Fatalf("draft pending: %v", err) - } - if n := sessionTrackerCount(m, sid); n == 0 { - t.Errorf("draft PR is still active; ci-failed tracker should survive, got %d", n) - } -} - -func TestReaction_DurationEscalationFiresOnTick(t *testing.T) { - m, store, notf, msgr := newReactive() - store.seed(sid, 
lcOpenPR(domain.PRReasonReviewPending)) - - // changes-requested: send once now, then escalate by duration (30m) — which - // only the reaper's TickEscalations can fire (the LCM never polls). - err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewChangesRequested, PRNumber: 7, - }) - if err != nil { - t.Fatalf("apply: %v", err) - } - if len(msgr.sent) != 1 { - t.Fatalf("want one send on transition, got %d", len(msgr.sent)) - } - - if err := m.TickEscalations(ctx(), t0.Add(10*time.Minute)); err != nil { - t.Fatalf("tick: %v", err) - } - if notifyCount(notf, "reaction.escalated") != 0 { - t.Error("must not escalate before escalateAfter elapses") - } - - // Inclusive boundary: escalate at exactly escalateAfter (30m), not only past it. - if err := m.TickEscalations(ctx(), t0.Add(30*time.Minute)); err != nil { - t.Fatalf("tick: %v", err) - } - if notifyCount(notf, "reaction.escalated") != 1 { - t.Errorf("want one duration escalation at exactly 30m, got events %+v", notf.events) - } -} - -func TestReaction_KillClearsEscalationTrackers(t *testing.T) { - m, store, notf, _ := newReactive() - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - - // changes-requested creates a duration-based tracker. - if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewChangesRequested, PRNumber: 7, - }); err != nil { - t.Fatalf("apply: %v", err) - } - if sessionTrackerCount(m, sid) == 0 { - t.Fatalf("precondition: expected a tracker") - } - - if err := m.OnKillRequested(ctx(), sid, ports.KillReason{Kind: ports.KillManual}); err != nil { - t.Fatalf("kill: %v", err) - } - if n := sessionTrackerCount(m, sid); n != 0 { - t.Errorf("kill must clear trackers, %d left", n) - } - // A later duration tick must not escalate a dead session. 
- if err := m.TickEscalations(ctx(), t0.Add(time.Hour)); err != nil { - t.Fatalf("tick: %v", err) - } - if c := notifyCount(notf, "reaction.escalated"); c != 0 { - t.Errorf("killed session must not escalate, got %d", c) - } -} - -func TestReaction_SendFailureDoesNotBurnBudget(t *testing.T) { - store := newFakeStore() - notf := &recordingNotifier{} - fm := &failingMessenger{} - m := New(store, notf, fm) - m.clock = func() time.Time { return t0 } - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - - tail := "fail" - failing := ports.SCMFacts{Fetched: true, PRState: domain.PROpen, CISummary: ports.CIFailing, PRNumber: 7, CIFailureLogTail: &tail} - pending := ports.SCMFacts{Fetched: true, PRState: domain.PROpen, CISummary: ports.CIPending, ReviewDecision: ports.ReviewPending, PRNumber: 7} - - // ci-failed has retries 2; with every delivery failing, the budget is rolled - // back each time, so even 5 failures never escalate. - for i := 0; i < 5; i++ { - _ = m.ApplySCMObservation(ctx(), sid, failing) // returns the delivery error - _ = m.ApplySCMObservation(ctx(), sid, pending) - } - if fm.attempts < 5 { - t.Errorf("expected at least 5 send attempts, got %d", fm.attempts) - } - if c := notifyCount(notf, "reaction.escalated"); c != 0 { - t.Errorf("undelivered messages must not escalate, got %d", c) - } -} - -func TestReaction_NonPersistentTrackerClearsOnLeave(t *testing.T) { - m, store, _, msgr := newReactive() - store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) - - // agent-idle has retries 2 but is NOT persistent: leaving idle clears the - // tracker, so three idle incidents each send fresh and none escalate. 
- for i := 0; i < 3; i++ { - applyActivity(m, domain.ActivityIdle) - applyActivity(m, domain.ActivityActive) - } - if len(msgr.sent) != 3 { - t.Errorf("want 3 idle sends (budget reset each incident), got %d", len(msgr.sent)) - } -} - -func TestReaction_CIFailedRearmsOnGenuineRecovery(t *testing.T) { - m, store, notf, msgr := newReactive() - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - - // Drain the ci-failed budget to escalation (silenced thereafter). - for i := 0; i < 4; i++ { - failCI(t, m) - pendingCI(t, m) - } - if notifyCount(notf, "reaction.escalated") != 1 { - t.Fatalf("precondition: want one escalation, got %d", notifyCount(notf, "reaction.escalated")) - } - sentBefore := len(msgr.sent) - - // A genuine recovery (approved + green) ends the incident and re-arms the - // budget; a later regression must re-nudge the agent, not stay silenced. - if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewApproved, - Mergeability: ports.Mergeability{Mergeable: true}, PRNumber: 7, - }); err != nil { - t.Fatalf("recover: %v", err) - } - failCI(t, m) - - if len(msgr.sent) != sentBefore+1 { - t.Errorf("regression after recovery must re-nudge the agent: sends %d -> %d", sentBefore, len(msgr.sent)) - } -} - -func TestReaction_IncidentOverClearsAllSessionTrackers(t *testing.T) { - m, store, _, _ := newReactive() - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - - failCI(t, m) // creates a persistent ci-failed tracker - if sessionTrackerCount(m, sid) == 0 { - t.Fatalf("precondition: expected a ci-failed tracker") - } - - // Merging ends the incident; no tracker (and no stale escalated=true) may - // survive for the session. 
- if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PRMerged, PRNumber: 7, - }); err != nil { - t.Fatalf("merge: %v", err) - } - if n := sessionTrackerCount(m, sid); n != 0 { - t.Errorf("incident over must clear all trackers, %d left", n) - } -} - -// ---- ProjectID propagation (review R11) ---- - -// TestReaction_ProjectIDOnNotifyAndEscalateEvents asserts that both Notify call -// sites in reactions.go (executeReaction's notify and escalate) carry the -// record's ProjectID. The human-facing event router groups by project, so a -// missing id would land events in the wrong bucket. -func TestReaction_ProjectIDOnNotifyAndEscalateEvents(t *testing.T) { - const proj domain.ProjectID = "acme" - - t.Run("notify path -> ProjectID populated", func(t *testing.T) { - m, store, notf, _ := newReactive() - // Seed via Upsert (not the lifecycle-only seed helper) so the record carries - // the ProjectID that mutate's transition then propagates to react. - if err := store.Upsert(ctx(), domain.SessionRecord{ - ID: sid, ProjectID: proj, Lifecycle: lcOpenPR(domain.PRReasonReviewPending), - }, ports.EventSessionCreated); err != nil { - t.Fatalf("upsert: %v", err) - } - - // approved-and-green is a notify reaction; it fires once via executeReaction. 
- err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewApproved, - Mergeability: ports.Mergeability{Mergeable: true}, PRNumber: 7, - }) - if err != nil { - t.Fatalf("apply: %v", err) - } - - notf.mu.Lock() - defer notf.mu.Unlock() - var got *ports.OrchestratorEvent - for i := range notf.events { - if notf.events[i].Type == "reaction.approved-and-green" { - got = &notf.events[i] - break - } - } - if got == nil { - t.Fatalf("expected approved-and-green notify, got events: %+v", notf.events) - } - if got.ProjectID != proj { - t.Errorf("notify ProjectID = %q, want %q", got.ProjectID, proj) - } - if got.SessionID != sid { - t.Errorf("notify SessionID = %q, want %q", got.SessionID, sid) - } - }) - - t.Run("escalate path -> ProjectID populated (numeric cap)", func(t *testing.T) { - m, store, notf, _ := newReactive() - if err := store.Upsert(ctx(), domain.SessionRecord{ - ID: sid, ProjectID: proj, Lifecycle: lcOpenPR(domain.PRReasonReviewPending), - }, ports.EventSessionCreated); err != nil { - t.Fatalf("upsert: %v", err) - } - - // Drain the ci-failed budget to numeric escalation (sendToAgent -> escalate).
- for i := 0; i < 4; i++ { - failCI(t, m) - pendingCI(t, m) - } - - notf.mu.Lock() - defer notf.mu.Unlock() - var got *ports.OrchestratorEvent - for i := range notf.events { - if notf.events[i].Type == "reaction.escalated" { - got = &notf.events[i] - break - } - } - if got == nil { - t.Fatalf("expected reaction.escalated event, got events: %+v", notf.events) - } - if got.ProjectID != proj { - t.Errorf("escalate ProjectID = %q, want %q", got.ProjectID, proj) - } - }) - - t.Run("escalate path -> ProjectID populated (TickEscalations duration)", func(t *testing.T) { - m, store, notf, _ := newReactive() - if err := store.Upsert(ctx(), domain.SessionRecord{ - ID: sid, ProjectID: proj, Lifecycle: lcOpenPR(domain.PRReasonReviewPending), - }, ports.EventSessionCreated); err != nil { - t.Fatalf("upsert: %v", err) - } - - // changes-requested creates a duration-based tracker on the first send; - // TickEscalations fires escalate from a path with no transition on hand, - // so the tracker's captured ProjectID is what must surface on the event.
- if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewChangesRequested, PRNumber: 7, - }); err != nil { - t.Fatalf("apply: %v", err) - } - if err := m.TickEscalations(ctx(), t0.Add(30*time.Minute)); err != nil { - t.Fatalf("tick: %v", err) - } - - notf.mu.Lock() - defer notf.mu.Unlock() - var got *ports.OrchestratorEvent - for i := range notf.events { - if notf.events[i].Type == "reaction.escalated" { - got = &notf.events[i] - break - } - } - if got == nil { - t.Fatalf("expected duration-escalated event, got events: %+v", notf.events) - } - if got.ProjectID != proj { - t.Errorf("tick-escalate ProjectID = %q, want %q", got.ProjectID, proj) - } - }) -} - -func sessionTrackerCount(m *Manager, id domain.SessionID) int { - m.trackerMu.Lock() - defer m.trackerMu.Unlock() - c := 0 - for k := range m.trackers { - if k.id == id { - c++ - } - } - return c -} - -// ---- TickEscalations never writes canonical state ---- - -func TestTickEscalations_DoesNotPersist(t *testing.T) { - m, store, _, _ := newReactive() - store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) - if err := m.TickEscalations(ctx(), t0); err != nil { - t.Fatalf("tick: %v", err) - } - if l := mustLoad(t, store); l.Revision != 0 { - t.Errorf("TickEscalations must not write canonical state, got revision=%d", l.Revision) - } -} - -// ---- helpers ---- - -func applyActivity(m *Manager, a domain.ActivityState) { - _ = m.ApplyActivitySignal(ctx(), sid, ports.ActivitySignal{ - State: ports.SignalValid, Activity: a, Timestamp: t0, Source: domain.SourceHook, - }) -} - -func failCI(t *testing.T, m *Manager) { - t.Helper() - tail := "fail" - if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, CISummary: ports.CIFailing, PRNumber: 7, CIFailureLogTail: &tail, - }); err != nil { - t.Fatalf("failCI: %v", err) - } -} - -func pendingCI(t *testing.T, m *Manager) { -
t.Helper() - if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, CISummary: ports.CIPending, ReviewDecision: ports.ReviewPending, PRNumber: 7, - }); err != nil { - t.Fatalf("pendingCI: %v", err) - } -} diff --git a/backend/internal/observe/reaper/reaper.go b/backend/internal/observe/reaper/reaper.go index 579f1d63..7edee2b1 100644 --- a/backend/internal/observe/reaper/reaper.go +++ b/backend/internal/observe/reaper/reaper.go @@ -183,16 +183,16 @@ func (r *Reaper) probeOne(ctx context.Context, sess domain.SessionRecord, now ti // transient tmux/zellij outage hide a really-dead session, and a // transient adapter bug terminate a really-alive one. Report failed // and let the LCM's detecting quarantine arbitrate. - facts.RuntimeState = ports.RuntimeProbeFailed - facts.ProcessState = ports.ProcessProbeFailed + facts.Runtime = ports.ProbeFailed + facts.Process = ports.ProbeFailed r.logger.Debug("reaper: probe error reported as failed fact", "session", sess.ID, "runtime", handle.RuntimeName, "err", probeErr) case alive: - facts.RuntimeState = ports.RuntimeProbeAlive - facts.ProcessState = ports.ProcessProbeAlive + facts.Runtime = ports.ProbeAlive + facts.Process = ports.ProbeAlive default: - facts.RuntimeState = ports.RuntimeProbeDead - facts.ProcessState = ports.ProcessProbeDead + facts.Runtime = ports.ProbeDead + facts.Process = ports.ProbeDead } if err := r.lcm.ApplyRuntimeObservation(ctx, sess.ID, facts); err != nil { diff --git a/backend/internal/observe/reaper/reaper_test.go b/backend/internal/observe/reaper/reaper_test.go index 0d3b4d47..ffb3eed4 100644 --- a/backend/internal/observe/reaper/reaper_test.go +++ b/backend/internal/observe/reaper/reaper_test.go @@ -1,385 +1,115 @@ -package reaper_test +package reaper import ( "context" "errors" - "reflect" - "sync" + "io" + "log/slog" "testing" "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" - 
"github.com/aoagents/agent-orchestrator/backend/internal/observe/reaper" "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) -// ---- fakes ---- +var ctx = context.Background() -type aliveResult struct { - alive bool - err error -} - -// fakeRuntime is a programmable ports.Runtime. The reaper only calls IsAlive, -// but the interface requires the other methods so we stub them. -type fakeRuntime struct { - mu sync.Mutex - results map[string]aliveResult - probed []string -} - -var _ ports.Runtime = (*fakeRuntime)(nil) - -func (f *fakeRuntime) IsAlive(_ context.Context, h ports.RuntimeHandle) (bool, error) { - f.mu.Lock() - f.probed = append(f.probed, h.ID) - f.mu.Unlock() - r, ok := f.results[h.ID] - if !ok { - return false, errors.New("fakeRuntime: no programmed response for " + h.ID) - } - return r.alive, r.err -} - -func (f *fakeRuntime) Create(context.Context, ports.RuntimeConfig) (ports.RuntimeHandle, error) { - return ports.RuntimeHandle{}, nil -} -func (f *fakeRuntime) Destroy(context.Context, ports.RuntimeHandle) error { return nil } -func (f *fakeRuntime) SendMessage(context.Context, ports.RuntimeHandle, string) error { - return nil -} -func (f *fakeRuntime) GetOutput(context.Context, ports.RuntimeHandle, int) (string, error) { - return "", nil -} - -// fakeLCM records every reaper-facing call in order so tests can assert the -// exact sequence (TickEscalations -> RunningSessions -> ApplyRuntimeObservation). 
type fakeLCM struct { - mu sync.Mutex - sessions []domain.SessionRecord - calls []call - - runErr error - tickErr error - obsErr error -} - -type call struct { - Kind string - Now time.Time - Session domain.SessionID - Facts ports.RuntimeFacts + running []domain.SessionRecord + observed map[domain.SessionID]ports.RuntimeFacts + escalated int } -var _ ports.LifecycleManager = (*fakeLCM)(nil) - -func (l *fakeLCM) RunningSessions(_ context.Context) ([]domain.SessionRecord, error) { - l.mu.Lock() - defer l.mu.Unlock() - l.calls = append(l.calls, call{Kind: "RunningSessions"}) - if l.runErr != nil { - return nil, l.runErr - } - out := make([]domain.SessionRecord, len(l.sessions)) - copy(out, l.sessions) - return out, nil +func (l *fakeLCM) RunningSessions(context.Context) ([]domain.SessionRecord, error) { + return l.running, nil } - -func (l *fakeLCM) TickEscalations(_ context.Context, now time.Time) error { - l.mu.Lock() - defer l.mu.Unlock() - l.calls = append(l.calls, call{Kind: "TickEscalations", Now: now}) - return l.tickErr -} - func (l *fakeLCM) ApplyRuntimeObservation(_ context.Context, id domain.SessionID, f ports.RuntimeFacts) error { - l.mu.Lock() - defer l.mu.Unlock() - l.calls = append(l.calls, call{Kind: "ApplyRuntimeObservation", Session: id, Facts: f}) - return l.obsErr -} - -// unused methods on the LCM port — the reaper never invokes them. 
-func (l *fakeLCM) ApplySCMObservation(context.Context, domain.SessionID, ports.SCMFacts) error { + if l.observed == nil { + l.observed = map[domain.SessionID]ports.RuntimeFacts{} + } + l.observed[id] = f return nil } +func (l *fakeLCM) TickEscalations(context.Context, time.Time) error { l.escalated++; return nil } func (l *fakeLCM) ApplyActivitySignal(context.Context, domain.SessionID, ports.ActivitySignal) error { return nil } -func (l *fakeLCM) OnSpawnInitiated(context.Context, domain.SessionRecord) error { return nil } +func (l *fakeLCM) ApplyPRObservation(context.Context, domain.SessionID, ports.PRObservation) error { + return nil +} func (l *fakeLCM) OnSpawnCompleted(context.Context, domain.SessionID, ports.SpawnOutcome) error { return nil } -func (l *fakeLCM) OnKillRequested(context.Context, domain.SessionID, ports.KillReason) error { +func (l *fakeLCM) OnKillRequested(context.Context, domain.SessionID, domain.TerminationReason) error { return nil } -// ---- helpers ---- +type fakeRuntime struct { + alive bool + err error +} -func aliveSessionWith(id domain.SessionID, runtimeName, handleID string) domain.SessionRecord { - return domain.SessionRecord{ - ID: id, - Lifecycle: domain.CanonicalSessionLifecycle{ - Session: domain.SessionSubstate{State: domain.SessionWorking, Reason: domain.ReasonTaskInProgress}, - Runtime: domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning}, - }, - Metadata: domain.SessionMetadata{ - RuntimeHandleID: handleID, - RuntimeName: runtimeName, - }, - } +func (r fakeRuntime) Create(context.Context, ports.RuntimeConfig) (ports.RuntimeHandle, error) { + return ports.RuntimeHandle{}, nil +} +func (r fakeRuntime) Destroy(context.Context, ports.RuntimeHandle) error { return nil } +func (r fakeRuntime) IsAlive(context.Context, ports.RuntimeHandle) (bool, error) { + return r.alive, r.err } -// detectingSessionWith returns a session in the Detecting quarantine, the -// shape `Manager.RunningSessions` 
MUST include so a probe-alive can recover it -// (otherwise the reaper traps every session that hiccups once in detecting). -func detectingSessionWith(id domain.SessionID, runtimeName, handleID string) domain.SessionRecord { +func probableSession(id domain.SessionID) domain.SessionRecord { return domain.SessionRecord{ - ID: id, + ID: id, + Metadata: domain.SessionMetadata{RuntimeHandleID: "h1", RuntimeName: "tmux"}, Lifecycle: domain.CanonicalSessionLifecycle{ - Session: domain.SessionSubstate{State: domain.SessionDetecting, Reason: domain.ReasonProbeFailure}, - Runtime: domain.RuntimeSubstate{State: domain.RuntimeProbeFailed, Reason: domain.RuntimeReasonProbeError}, - }, - Metadata: domain.SessionMetadata{ - RuntimeHandleID: handleID, - RuntimeName: runtimeName, + Session: domain.SessionSubstate{State: domain.SessionWorking}, }, } } -// ---- tests ---- +func quietLogger() *slog.Logger { return slog.New(slog.NewTextHandler(io.Discard, nil)) } -func TestReaper_Tick(t *testing.T) { - now := time.Date(2026, 5, 28, 12, 0, 0, 0, time.UTC) - clock := func() time.Time { return now } - - type runtimeProbes struct { - name string - results map[string]aliveResult - } - - tests := []struct { - name string - sessions []domain.SessionRecord - runtimes []runtimeProbes - wantCalls []call - wantProbe map[string][]string // runtime name -> handle IDs probed, in order - }{ - { - // "No death applied" per the spec: the LCM does not receive a - // death-causing fact. It still receives the alive fact, because - // the reaper reports what it probed and the LCM is the one that - // diffs against canonical (a no-op when runtime is already alive, - // a recovery when the session was in Detecting). 
- name: "alive session: alive fact reported, no death applied, tick still fires", - sessions: []domain.SessionRecord{aliveSessionWith("s1", "tmux", "h1")}, - runtimes: []runtimeProbes{{name: "tmux", results: map[string]aliveResult{"h1": {alive: true}}}}, - wantCalls: []call{ - {Kind: "TickEscalations", Now: now}, - {Kind: "RunningSessions"}, - { - Kind: "ApplyRuntimeObservation", - Session: "s1", - Facts: ports.RuntimeFacts{ObservedAt: now, RuntimeState: ports.RuntimeProbeAlive, ProcessState: ports.ProcessProbeAlive}, - }, - }, - wantProbe: map[string][]string{"tmux": {"h1"}}, - }, - { - // Recovery path: a session in Detecting+probe_failed must be in - // the poll set so an alive probe can flow through and recover it. - // If the reaper filtered to runtime-axis-alive only, this session - // would be trapped in Detecting forever. - name: "detecting session: alive probe reported so LCM can recover from quarantine", - sessions: []domain.SessionRecord{detectingSessionWith("s1", "tmux", "h1")}, - runtimes: []runtimeProbes{{name: "tmux", results: map[string]aliveResult{"h1": {alive: true}}}}, - wantCalls: []call{ - {Kind: "TickEscalations", Now: now}, - {Kind: "RunningSessions"}, - { - Kind: "ApplyRuntimeObservation", - Session: "s1", - Facts: ports.RuntimeFacts{ObservedAt: now, RuntimeState: ports.RuntimeProbeAlive, ProcessState: ports.ProcessProbeAlive}, - }, - }, - wantProbe: map[string][]string{"tmux": {"h1"}}, - }, - { - name: "dead session: exactly one ApplyRuntimeObservation with Dead facts", - sessions: []domain.SessionRecord{aliveSessionWith("s1", "tmux", "h1")}, - runtimes: []runtimeProbes{{name: "tmux", results: map[string]aliveResult{"h1": {alive: false}}}}, - wantCalls: []call{ - {Kind: "TickEscalations", Now: now}, - {Kind: "RunningSessions"}, - { - Kind: "ApplyRuntimeObservation", - Session: "s1", - Facts: ports.RuntimeFacts{ObservedAt: now, RuntimeState: ports.RuntimeProbeDead, ProcessState: ports.ProcessProbeDead}, - }, - }, - wantProbe: 
map[string][]string{"tmux": {"h1"}}, - }, - { - name: "probe error: reported as failed fact, NOT collapsed to alive", - sessions: []domain.SessionRecord{aliveSessionWith("s1", "tmux", "h1")}, - runtimes: []runtimeProbes{{name: "tmux", results: map[string]aliveResult{"h1": {err: errors.New("boom")}}}}, - wantCalls: []call{ - {Kind: "TickEscalations", Now: now}, - {Kind: "RunningSessions"}, - { - Kind: "ApplyRuntimeObservation", - Session: "s1", - Facts: ports.RuntimeFacts{ObservedAt: now, RuntimeState: ports.RuntimeProbeFailed, ProcessState: ports.ProcessProbeFailed}, - }, - }, - wantProbe: map[string][]string{"tmux": {"h1"}}, - }, - { - name: "multi-runtime dispatch: tmux + zellij in same tick", - sessions: []domain.SessionRecord{ - aliveSessionWith("s1", "tmux", "ht"), - aliveSessionWith("s2", "zellij", "hz"), - }, - runtimes: []runtimeProbes{ - {name: "tmux", results: map[string]aliveResult{"ht": {alive: false}}}, - {name: "zellij", results: map[string]aliveResult{"hz": {alive: true}}}, - }, - wantCalls: []call{ - {Kind: "TickEscalations", Now: now}, - {Kind: "RunningSessions"}, - { - Kind: "ApplyRuntimeObservation", - Session: "s1", - Facts: ports.RuntimeFacts{ObservedAt: now, RuntimeState: ports.RuntimeProbeDead, ProcessState: ports.ProcessProbeDead}, - }, - { - Kind: "ApplyRuntimeObservation", - Session: "s2", - Facts: ports.RuntimeFacts{ObservedAt: now, RuntimeState: ports.RuntimeProbeAlive, ProcessState: ports.ProcessProbeAlive}, - }, - }, - wantProbe: map[string][]string{"tmux": {"ht"}, "zellij": {"hz"}}, - }, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - lcm := &fakeLCM{sessions: tc.sessions} - registry := reaper.MapRegistry{} - byName := map[string]*fakeRuntime{} - for _, r := range tc.runtimes { - rt := &fakeRuntime{results: r.results} - registry[r.name] = rt - byName[r.name] = rt - } - rp := reaper.New(lcm, registry, reaper.Config{Clock: clock, Tick: time.Hour}) - - if err := rp.Tick(context.Background()); err != nil { - 
t.Fatalf("Tick error: %v", err) - } - - if !reflect.DeepEqual(lcm.calls, tc.wantCalls) { - t.Errorf("LCM call log mismatch:\n got %#v\n want %#v", lcm.calls, tc.wantCalls) - } - - for name, want := range tc.wantProbe { - got := byName[name].probed - if !reflect.DeepEqual(got, want) { - t.Errorf("runtime %q probed handles mismatch: got %v want %v", name, got, want) - } - } - }) - } +func newReaper(lcm *fakeLCM, rt fakeRuntime) *Reaper { + return New(lcm, MapRegistry{"tmux": rt}, Config{Logger: quietLogger()}) } -// TestReaper_Loop verifies the background goroutine actually drives ticks and -// exits on context cancel without leaking. -func TestReaper_Loop(t *testing.T) { - now := time.Date(2026, 5, 28, 12, 0, 0, 0, time.UTC) - clock := func() time.Time { return now } - lcm := &fakeLCM{} - rp := reaper.New(lcm, reaper.MapRegistry{}, reaper.Config{Clock: clock, Tick: 5 * time.Millisecond}) - - ctx, cancel := context.WithCancel(context.Background()) - done := rp.Start(ctx) - - // Wait for at least two ticks so we know the loop is actually firing. 
- deadline := time.Now().Add(500 * time.Millisecond) - for time.Now().Before(deadline) { - lcm.mu.Lock() - n := countKind(lcm.calls, "TickEscalations") - lcm.mu.Unlock() - if n >= 2 { - break - } - time.Sleep(2 * time.Millisecond) +func TestTick_ReportsAliveProbe(t *testing.T) { + lcm := &fakeLCM{running: []domain.SessionRecord{probableSession("mer-1")}} + if err := newReaper(lcm, fakeRuntime{alive: true}).Tick(ctx); err != nil { + t.Fatal(err) } - cancel() - - select { - case <-done: - case <-time.After(time.Second): - t.Fatal("reaper goroutine did not exit within 1s of ctx cancel") - } - - lcm.mu.Lock() - defer lcm.mu.Unlock() - if got := countKind(lcm.calls, "TickEscalations"); got < 2 { - t.Errorf("expected at least 2 TickEscalations calls during loop, got %d", got) + if lcm.observed["mer-1"].Runtime != ports.ProbeAlive { + t.Fatalf("want alive probe, got %q", lcm.observed["mer-1"].Runtime) } } -func countKind(calls []call, kind string) int { - n := 0 - for _, c := range calls { - if c.Kind == kind { - n++ - } +func TestTick_ReportsProbeErrorAsFailed(t *testing.T) { + lcm := &fakeLCM{running: []domain.SessionRecord{probableSession("mer-1")}} + if err := newReaper(lcm, fakeRuntime{err: errors.New("tmux gone")}).Tick(ctx); err != nil { + t.Fatal(err) + } + if lcm.observed["mer-1"].Runtime != ports.ProbeFailed { + t.Fatalf("probe error must be reported as failed, got %q", lcm.observed["mer-1"].Runtime) } - return n } -// TestReaper_SkipsUnknownRuntime verifies the reaper does not panic and does not -// report a fact when a session references an unregistered runtime — the reaper -// only reports what it actually probed. 
-func TestReaper_SkipsUnknownRuntime(t *testing.T) { - now := time.Date(2026, 5, 28, 12, 0, 0, 0, time.UTC) - clock := func() time.Time { return now } - lcm := &fakeLCM{sessions: []domain.SessionRecord{aliveSessionWith("s1", "ghost", "h1")}} - rp := reaper.New(lcm, reaper.MapRegistry{}, reaper.Config{Clock: clock, Tick: time.Hour}) - - if err := rp.Tick(context.Background()); err != nil { - t.Fatalf("Tick error: %v", err) +func TestTick_FiresEscalationHeartbeat(t *testing.T) { + lcm := &fakeLCM{} + if err := newReaper(lcm, fakeRuntime{}).Tick(ctx); err != nil { + t.Fatal(err) } - - for _, c := range lcm.calls { - if c.Kind == "ApplyRuntimeObservation" { - t.Fatalf("unexpected ApplyRuntimeObservation for unknown-runtime session: %+v", c) - } + if lcm.escalated != 1 { + t.Fatalf("tick must drive TickEscalations once, got %d", lcm.escalated) } } -// TestReaper_SkipsMissingHandle verifies the reaper does not probe (and does not -// report) for sessions whose runtime handle metadata is missing — probing -// nothing returns no fact. 
-func TestReaper_SkipsMissingHandle(t *testing.T) { - now := time.Date(2026, 5, 28, 12, 0, 0, 0, time.UTC) - clock := func() time.Time { return now } - sess := aliveSessionWith("s1", "tmux", "h1") - sess.Metadata.RuntimeHandleID = "" - lcm := &fakeLCM{sessions: []domain.SessionRecord{sess}} - rt := &fakeRuntime{results: map[string]aliveResult{}} - rp := reaper.New(lcm, reaper.MapRegistry{"tmux": rt}, reaper.Config{Clock: clock, Tick: time.Hour}) - - if err := rp.Tick(context.Background()); err != nil { - t.Fatalf("Tick error: %v", err) - } - if len(rt.probed) != 0 { - t.Errorf("expected no probes for session without handle id, got %v", rt.probed) +func TestTick_SkipsSessionWithoutHandle(t *testing.T) { + noHandle := domain.SessionRecord{ID: "mer-1"} // no runtime metadata + lcm := &fakeLCM{running: []domain.SessionRecord{noHandle}} + if err := newReaper(lcm, fakeRuntime{alive: true}).Tick(ctx); err != nil { + t.Fatal(err) } - for _, c := range lcm.calls { - if c.Kind == "ApplyRuntimeObservation" { - t.Fatalf("unexpected ApplyRuntimeObservation: %+v", c) - } + if _, probed := lcm.observed["mer-1"]; probed { + t.Fatal("a session without a runtime handle must be skipped") } } diff --git a/backend/internal/ports/facts.go b/backend/internal/ports/facts.go index e1854fac..a3b3b397 100644 --- a/backend/internal/ports/facts.go +++ b/backend/internal/ports/facts.go @@ -1,9 +1,6 @@ -// Package ports declares the boundary contracts for the LCM + Session Manager -// lane: the inbound interfaces we implement, the outbound interfaces others -// implement for us, and the fact DTOs that cross those boundaries. -// -// These are the types the SCM poller, persistence adapter, and API layer build -// against, so they are committed and stabilised before the LCM/SM logic. 
+// Package ports declares the boundary contracts for the lifecycle lane: the +// inbound interfaces the engine implements, the outbound interfaces its adapters +// implement, and the plain DTOs that cross those edges. It holds no logic. package ports import ( @@ -12,122 +9,55 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/domain" ) -// SCMFacts is produced by the SCM poller and handed to ApplySCMObservation. -// -// Fetched is the failed-probe guard: when false, the GitHub query timed out or -// errored and the rest of the struct is meaningless — the LCM must NOT read it -// as "no PR / PR closed" (the SCM analogue of "failed probe != dead"). -// -// CIFailureLogTail is a pointer because it is only populated when CI is failing; -// it carries ~120 lines and we don't want it on the hot poll path otherwise. -type SCMFacts struct { - Fetched bool - ObservedAt time.Time - PRState domain.PRState - Draft bool - PRNumber int - PRURL string - CISummary CISummary - ReviewDecision ReviewDecision - Mergeability Mergeability - PendingComments []ReviewComment - CIFailureLogTail *string -} - -type CISummary string +// ProbeResult is a single liveness reading. "failed" (the probe errored/timed +// out) and "unknown" (ran but couldn't tell) are kept distinct from dead — both +// route to the detecting quarantine, never to a death conclusion. +type ProbeResult string const ( - CIPending CISummary = "pending" - CIPassing CISummary = "passing" - CIFailing CISummary = "failing" - CINone CISummary = "none" + ProbeAlive ProbeResult = "alive" + ProbeDead ProbeResult = "dead" + ProbeFailed ProbeResult = "failed" + ProbeUnknown ProbeResult = "unknown" ) -type ReviewDecision string - -const ( - ReviewApproved ReviewDecision = "approved" - ReviewChangesRequested ReviewDecision = "changes_requested" - ReviewPending ReviewDecision = "pending" - ReviewNone ReviewDecision = "none" -) - -// Mergeability is the structured "can this merge?" answer. 
CIPassing/Approved -// here overlap CISummary/ReviewDecision by design (different granularity); -// Mergeability is authoritative for the merge gate, the others for display. -type Mergeability struct { - Mergeable bool - CIPassing bool - Approved bool - NoConflicts bool - Blockers []string -} - -// ReviewComment carries IsBot so the decider can route bot review comments -// (bugbot-comments reaction) differently from human ones (changes-requested). -type ReviewComment struct { - Author string - Body string - IsBot bool - URL string -} - -// RuntimeFacts is produced by the reaper and handed to ApplyRuntimeObservation. +// RuntimeFacts is what the reaper reports each probe: is the runtime container +// up, and is the agent process inside it up. type RuntimeFacts struct { - ObservedAt time.Time - RuntimeState RuntimeProbe - ProcessState ProcessProbe + ObservedAt time.Time + Runtime ProbeResult + Process ProbeResult } -// RuntimeProbe / ProcessProbe keep "failed" (the probe call itself errored or -// timed out) distinct from "indeterminate" (the probe ran but couldn't tell) — -// they route differently in the decider. -type RuntimeProbe string - -const ( - RuntimeProbeAlive RuntimeProbe = "alive" - RuntimeProbeDead RuntimeProbe = "dead" - RuntimeProbeIndeterminate RuntimeProbe = "indeterminate" - RuntimeProbeFailed RuntimeProbe = "failed" -) - -type ProcessProbe string - -const ( - ProcessProbeAlive ProcessProbe = "alive" - ProcessProbeDead ProcessProbe = "dead" - ProcessProbeIndeterminate ProcessProbe = "indeterminate" - ProcessProbeFailed ProcessProbe = "failed" -) - -// ActivitySignal is pushed by agent hooks / the FS watcher. State is the -// confidence wrapper (so unavailable/probe_failure != idleness); Activity is -// the actual classification. +// ActivitySignal is pushed by the agent hooks. Only a Valid signal is +// authoritative; a stale/absent one is ignored rather than read as idleness. 
type ActivitySignal struct { - State SignalConfidence - Activity domain.ActivityState + Valid bool + State domain.ActivityState Timestamp time.Time Source domain.ActivitySource } -type SignalConfidence string - -const ( - SignalValid SignalConfidence = "valid" - SignalStale SignalConfidence = "stale" - SignalNull SignalConfidence = "null" - SignalUnavailable SignalConfidence = "unavailable" - SignalProbeFailure SignalConfidence = "probe_failure" -) +// PRObservation is what the SCM poller reports for one PR. Fetched is the +// failed-fetch guard: when false the rest is meaningless and the engine must not +// read it as "PR closed". Checks/Comments are the current full sets (the engine +// records the checks and replaces the comment set). +type PRObservation struct { + Fetched bool + URL string + Number int + Draft bool + Merged bool + Closed bool + CI domain.CIState + Review domain.ReviewDecision + Mergeability domain.Mergeability + Checks []PRCheckRow + Comments []PRComment +} -// SpawnOutcome is what the Session Manager reports to the LCM after a spawn. -// RuntimeHandle is the same structured handle the Runtime port returns, so no -// ad-hoc string encoding is needed for later Destroy/SendMessage calls. -// -// Prompt is the assembled launch prompt persisted as metadata so Restore can -// fall back to a fresh launch (Agent.GetLaunchCommand) when the agent's native -// session id was never captured — without it Restore would have nothing to -// resume and nothing to re-seed a fresh run with. +// SpawnOutcome is what the Session Manager reports once a spawn is live: the +// handles needed for later teardown/restore. type SpawnOutcome struct { Branch string WorkspacePath string @@ -136,17 +66,41 @@ type SpawnOutcome struct { Prompt string } -// KillReason is what the Session Manager reports to the LCM when a kill is -// requested. Kind drives whether the terminal state is killed/cleanup/errored. 
-type KillReason struct { - Kind LifecycleKillReason - Detail string +// ---- store row DTOs (shared by the PRWriter port and its sqlite adapter) ---- + +// PRRow is the scalar PR facts row. +type PRRow struct { + URL string + SessionID string + Number int + Draft bool + Merged bool + Closed bool + CI domain.CIState + Review domain.ReviewDecision + Mergeability domain.Mergeability + UpdatedAt time.Time } -type LifecycleKillReason string +// PRCheckRow is one CI check run (one row per check name per commit). +type PRCheckRow struct { + PRURL string + Name string + CommitHash string + Status string + URL string + LogTail string + CreatedAt time.Time +} -const ( - KillManual LifecycleKillReason = "manual" - KillCleanup LifecycleKillReason = "cleanup" - KillError LifecycleKillReason = "error" -) +// PRComment is one review comment. Review feedback is injected into the agent +// regardless of author, so there is no bot/human distinction. +type PRComment struct { + ID string + Author string + File string + Line int + Body string + Resolved bool + CreatedAt time.Time +} diff --git a/backend/internal/ports/inbound.go b/backend/internal/ports/inbound.go index 58ec2015..00223ae9 100644 --- a/backend/internal/ports/inbound.go +++ b/backend/internal/ports/inbound.go @@ -7,73 +7,45 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/domain" ) -// LifecycleManager is the inbound contract we implement. Every Apply* method -// runs the same synchronous pipeline: load canonical -> pure decide -> diff -> -// persist (full-row Upsert) -> if the status transitioned, fire reactions. The LCM -// never polls; observers (SCM poller, reaper, activity ingest) call in. -// -// Concurrency: the LCM serialises per session, so concurrent Apply* calls for -// the same session do not race the load/decide/persist read-modify-write. +// LifecycleManager is the inbound contract the engine implements. 
Observers +// (reaper, SCM poller, activity hooks) and the Session Manager call in; the LCM +// is the sole writer of canonical transitions and the only place reactions fire. type LifecycleManager interface { - // Raw-fact entrypoints (each runs decide internally). - ApplySCMObservation(ctx context.Context, id domain.SessionID, f SCMFacts) error ApplyRuntimeObservation(ctx context.Context, id domain.SessionID, f RuntimeFacts) error ApplyActivitySignal(ctx context.Context, id domain.SessionID, s ActivitySignal) error + ApplyPRObservation(ctx context.Context, id domain.SessionID, o PRObservation) error - // Mutation commands/outcomes reported by the Session Manager. - OnSpawnInitiated(ctx context.Context, rec domain.SessionRecord) error + // OnSpawnCompleted marks a session live and records its handles. It works for + // a fresh spawn (not_started -> live) and a restore (terminal -> reopened). OnSpawnCompleted(ctx context.Context, id domain.SessionID, o SpawnOutcome) error - OnKillRequested(ctx context.Context, id domain.SessionID, r KillReason) error + OnKillRequested(ctx context.Context, id domain.SessionID, reason domain.TerminationReason) error - // Reaper heartbeat that drives duration-based escalation (a non-polling - // LCM can't wake itself to fire a "30m elapsed" escalation). + // TickEscalations fires the duration-based escalations the synchronous LCM + // can't wake itself for; the reaper calls it on a timer. TickEscalations(ctx context.Context, now time.Time) error - - // RunningSessions returns a snapshot of every session whose runtime axis is - // alive. The reaper calls it once per tick to decide whom to probe. It is a - // read snapshot — the slice and its elements are safe for the caller to - // iterate without holding any LCM lock — and does not violate the - // single-writer invariant (the reaper never writes; it reports facts back - // through ApplyRuntimeObservation). 
+ // RunningSessions snapshots every non-terminal session for the reaper to probe. RunningSessions(ctx context.Context) ([]domain.SessionRecord, error) } -// SessionManager is the inbound contract called by the API layer and CLI. It -// owns explicit mutations (spawn/kill/restore/cleanup) and never writes -// sessions directly — it routes mutation commands/outcomes to the LCM. +// SessionManager is the inbound contract the API/CLI call for explicit +// mutations. It drives the runtime/agent/workspace plugins and routes canonical +// writes to the LCM. type SessionManager interface { Spawn(ctx context.Context, cfg SpawnConfig) (domain.Session, error) - Kill(ctx context.Context, id domain.SessionID, opts KillOptions) (KillResult, error) + Kill(ctx context.Context, id domain.SessionID, reason domain.TerminationReason) (freed bool, err error) + Restore(ctx context.Context, id domain.SessionID) (domain.Session, error) List(ctx context.Context, project domain.ProjectID) ([]domain.Session, error) Get(ctx context.Context, id domain.SessionID) (domain.Session, error) Send(ctx context.Context, id domain.SessionID, message string) error - Restore(ctx context.Context, id domain.SessionID) (domain.Session, error) - Cleanup(ctx context.Context, project domain.ProjectID) (CleanupResult, error) + Cleanup(ctx context.Context, project domain.ProjectID) ([]domain.SessionID, error) } type SpawnConfig struct { ProjectID domain.ProjectID IssueID domain.IssueID Kind domain.SessionKind + Harness domain.AgentHarness Branch string Prompt string AgentRules string - // OpenTerminal is reserved for a later lane (open a terminal tab on spawn). - // Spawn does NOT honor it yet — setting it has no effect. - OpenTerminal bool -} - -type KillOptions struct { - Reason LifecycleKillReason - Detail string -} - -type KillResult struct { - ID domain.SessionID - WorkspaceFreed bool -} - -type CleanupResult struct { - Cleaned []domain.SessionID - Skipped []domain.SessionID // e.g. 
paths that still held uncommitted work } diff --git a/backend/internal/ports/outbound.go b/backend/internal/ports/outbound.go index c64a1e6d..d180f538 100644 --- a/backend/internal/ports/outbound.go +++ b/backend/internal/ports/outbound.go @@ -6,86 +6,63 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/domain" ) -// LifecycleStore is Tom's persistence adapter for session records. -// -// Writer contract: the Lifecycle Manager (LCM) is the sole logical writer of -// sessions. Controllers, the Session Manager, observers, and other goroutines -// must route mutations to the LCM; no other goroutine writes sessions directly. -// The LCM serializes mutations and calls Upsert with the full SessionRecord and -// the classified event_type. The storage layer owns Revision++ and performs the -// full-row insert-or-update; the older sparse merge-patch model is gone. -// -// List/Get return persistence records (no derived status); the Session Manager -// hydrates them into domain.Session by attaching DeriveLegacyStatus on read. -type LifecycleStore interface { - // Upsert inserts or replaces the full session row and bumps Revision inside - // the storage layer. Only the LCM may call it. - Upsert(ctx context.Context, rec domain.SessionRecord, eventType EventType) error - Load(ctx context.Context, id domain.SessionID) (domain.CanonicalSessionLifecycle, bool, error) - List(ctx context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) - GetMetadata(ctx context.Context, id domain.SessionID) (domain.SessionMetadata, error) - PatchMetadata(ctx context.Context, id domain.SessionID, meta domain.SessionMetadata) error - - // Get returns a single full record (with identity) by id. Load is - // lifecycle-only, so readers use this to build the read-model and reconstruct - // teardown handles for Kill/Restore on one id. 
- Get(ctx context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) +// SessionStore persists session records and serves the derived read-model's PR +// facts. The Session Manager creates rows; the Lifecycle Manager is the sole +// writer of canonical transitions thereafter. +type SessionStore interface { + CreateSession(ctx context.Context, rec domain.SessionRecord) (domain.SessionRecord, error) + UpdateSession(ctx context.Context, rec domain.SessionRecord) error + GetSession(ctx context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) + ListSessions(ctx context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) + ListAllSessions(ctx context.Context) ([]domain.SessionRecord, error) + // PRFactsForSession returns the PR facts that drive a session's display + // status: the most-recently-updated non-closed PR, else the most recent. + // Zero value (Exists=false) means the session has no PR. + PRFactsForSession(ctx context.Context, id domain.SessionID) (domain.PRFacts, error) } -// EventType is the schema-level event label attached to each Upsert. -type EventType string - -const ( - EventSessionCreated EventType = "session_created" - EventSessionTerminated EventType = "session_terminated" - EventSessionStateChanged EventType = "session_state_changed" - EventSessionPRUpdated EventType = "session_pr_updated" - EventSessionRuntimeUpdated EventType = "session_runtime_updated" - EventSessionAttentionUpdated EventType = "session_attention_updated" - EventSessionActivityUpdated EventType = "session_activity_updated" - EventSessionDisplayUpdated EventType = "session_display_updated" - EventSessionUpdated EventType = "session_updated" -) +// PRWriter records the PR facts a PR observation carries. The pr table's own DB +// triggers emit the CDC; this just writes the rows. 
+type PRWriter interface { + UpsertPR(ctx context.Context, r PRRow) error + RecordCheck(ctx context.Context, r PRCheckRow) error + RecentCheckStatuses(ctx context.Context, prURL, name string, limit int) ([]string, error) + ReplacePRComments(ctx context.Context, prURL string, comments []PRComment) error +} -// Notifier delivers events to the human (desktop/Slack later). Push, never pull. +// Notifier delivers an event to the human (desktop/Slack later). Push, never poll. type Notifier interface { - Notify(ctx context.Context, event OrchestratorEvent) error + Notify(ctx context.Context, event Event) error +} + +// AgentMessenger injects a message into a running agent (busy-detecting until the +// agent is ready). Used by the auto-nudge reactions. +type AgentMessenger interface { + Send(ctx context.Context, id domain.SessionID, message string) error } -type EventPriority string +type Priority string const ( - PriorityUrgent EventPriority = "urgent" - PriorityAction EventPriority = "action" - PriorityWarning EventPriority = "warning" - PriorityInfo EventPriority = "info" + PriorityUrgent Priority = "urgent" + PriorityAction Priority = "action" + PriorityInfo Priority = "info" ) -type OrchestratorEvent struct { +// Event is a human-facing notification produced by a reaction. +type Event struct { Type string - Priority EventPriority + Priority Priority SessionID domain.SessionID ProjectID domain.ProjectID Message string - Data map[string]any -} - -// AgentMessenger injects a message into a running agent. The implementation -// busy-detects (waits for the agent to be idle/ready) and verifies delivery, -// which is why activity-detection accuracy matters. -type AgentMessenger interface { - Send(ctx context.Context, id domain.SessionID, message string) error } -// The runtime/agent/workspace plugin ports are co-owned with the coding-agents -// lane; the method sets below are the minimum the Session Manager spawn/kill -// pipelines call. 
They will be fleshed out alongside the tmux/claude-code impls. +// ---- runtime / agent / workspace plugin ports (used by the Session Manager) ---- type Runtime interface { Create(ctx context.Context, cfg RuntimeConfig) (RuntimeHandle, error) Destroy(ctx context.Context, handle RuntimeHandle) error - SendMessage(ctx context.Context, handle RuntimeHandle, message string) error - GetOutput(ctx context.Context, handle RuntimeHandle, lines int) (string, error) IsAlive(ctx context.Context, handle RuntimeHandle) (bool, error) } @@ -104,10 +81,6 @@ type RuntimeHandle struct { type Agent interface { GetLaunchCommand(cfg AgentConfig) string GetEnvironment(cfg AgentConfig) map[string]string - // ProbeProcess returns the agent process liveness classification - // (alive/dead/indeterminate/failed) — not a boolean and not an activity - // state. Activity classification arrives separately via ActivitySignal. - ProbeProcess(ctx context.Context, handle RuntimeHandle) (ProcessProbe, error) GetRestoreCommand(agentSessionID string) string } @@ -120,7 +93,6 @@ type AgentConfig struct { type Workspace interface { Create(ctx context.Context, cfg WorkspaceConfig) (WorkspaceInfo, error) Destroy(ctx context.Context, info WorkspaceInfo) error - List(ctx context.Context, project domain.ProjectID) ([]WorkspaceInfo, error) Restore(ctx context.Context, cfg WorkspaceConfig) (WorkspaceInfo, error) } diff --git a/backend/internal/session/fakes_test.go b/backend/internal/session/fakes_test.go deleted file mode 100644 index 033f6de7..00000000 --- a/backend/internal/session/fakes_test.go +++ /dev/null @@ -1,400 +0,0 @@ -package session - -import ( - "context" - "fmt" - "sync" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" -) - -// callLog records the cross-fake call order so tests can assert pipeline -// sequencing (e.g. 
OnKillRequested before Runtime.Destroy before Workspace.Destroy). -type callLog struct { - mu sync.Mutex - calls []string -} - -func (c *callLog) add(s string) { - c.mu.Lock() - defer c.mu.Unlock() - c.calls = append(c.calls, s) -} - -func (c *callLog) snapshot() []string { - c.mu.Lock() - defer c.mu.Unlock() - out := make([]string, len(c.calls)) - copy(out, c.calls) - return out -} - -// indexOf returns the position of the first call equal to name, or -1. -func (c *callLog) indexOf(name string) int { - for i, s := range c.snapshot() { - if s == name { - return i - } - } - return -1 -} - -// ---- fakeStore: in-memory LifecycleStore with full-row Upsert + Get ---- - -type fakeStore struct { - mu sync.Mutex - records map[domain.SessionID]*domain.SessionRecord - metadata map[domain.SessionID]domain.SessionMetadata -} - -var _ ports.LifecycleStore = (*fakeStore)(nil) - -func newFakeStore() *fakeStore { - return &fakeStore{ - records: map[domain.SessionID]*domain.SessionRecord{}, - metadata: map[domain.SessionID]domain.SessionMetadata{}, - } -} - -func (s *fakeStore) Upsert(_ context.Context, rec domain.SessionRecord, _ ports.EventType) error { - s.mu.Lock() - defer s.mu.Unlock() - if existing, ok := s.records[rec.ID]; ok { - if rec.Lifecycle.Revision != existing.Lifecycle.Revision { - return fmt.Errorf("revision mismatch for %s: have %d, want %d", rec.ID, rec.Lifecycle.Revision, existing.Lifecycle.Revision) - } - rec.Lifecycle.Revision = existing.Lifecycle.Revision + 1 - } else { - if rec.Lifecycle.Revision != 0 { - return fmt.Errorf("revision mismatch for insert %s: have %d, want 0", rec.ID, rec.Lifecycle.Revision) - } - rec.Lifecycle.Revision = 1 - } - if rec.Lifecycle.Version == 0 { - rec.Lifecycle.Version = domain.LifecycleVersion - } - r := rec - s.records[rec.ID] = &r - return nil -} - -func (s *fakeStore) Get(_ context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) { - s.mu.Lock() - defer s.mu.Unlock() - rec, ok := s.records[id] - if !ok { - 
return domain.SessionRecord{}, false, nil - } - return s.withMetadata(*rec), true, nil -} - -func (s *fakeStore) Load(_ context.Context, id domain.SessionID) (domain.CanonicalSessionLifecycle, bool, error) { - s.mu.Lock() - defer s.mu.Unlock() - rec, ok := s.records[id] - if !ok { - return domain.CanonicalSessionLifecycle{}, false, nil - } - return rec.Lifecycle, true, nil -} - -func (s *fakeStore) List(_ context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) { - s.mu.Lock() - defer s.mu.Unlock() - var out []domain.SessionRecord - for _, rec := range s.records { - if rec.ProjectID == project { - out = append(out, s.withMetadata(*rec)) - } - } - return out, nil -} - -func (s *fakeStore) GetMetadata(_ context.Context, id domain.SessionID) (domain.SessionMetadata, error) { - s.mu.Lock() - defer s.mu.Unlock() - return s.metadata[id], nil -} - -func (s *fakeStore) PatchMetadata(_ context.Context, id domain.SessionID, meta domain.SessionMetadata) error { - s.mu.Lock() - defer s.mu.Unlock() - s.metadata[id] = mergeSessionMetadata(s.metadata[id], meta) - return nil -} - -// mergeSessionMetadata applies meta onto dst with the store's "empty = leave -// unchanged" semantics, so partial patches do not clobber earlier values. -func mergeSessionMetadata(dst, meta domain.SessionMetadata) domain.SessionMetadata { - if meta.Branch != "" { - dst.Branch = meta.Branch - } - if meta.WorkspacePath != "" { - dst.WorkspacePath = meta.WorkspacePath - } - if meta.RuntimeHandleID != "" { - dst.RuntimeHandleID = meta.RuntimeHandleID - } - if meta.RuntimeName != "" { - dst.RuntimeName = meta.RuntimeName - } - if meta.AgentSessionID != "" { - dst.AgentSessionID = meta.AgentSessionID - } - if meta.Prompt != "" { - dst.Prompt = meta.Prompt - } - return dst -} - -// withMetadata attaches the separately-stored metadata to a record copy (a real -// store would return them together). Caller holds s.mu. 
-func (s *fakeStore) withMetadata(rec domain.SessionRecord) domain.SessionRecord { - rec.Metadata = s.metadata[rec.ID] - return rec -} - -// ---- fakeRuntime ---- - -type fakeRuntime struct { - log *callLog - createErr error - alive bool - - created []ports.RuntimeConfig - destroyed []ports.RuntimeHandle - sent []string -} - -var _ ports.Runtime = (*fakeRuntime)(nil) - -func (r *fakeRuntime) Create(_ context.Context, cfg ports.RuntimeConfig) (ports.RuntimeHandle, error) { - r.log.add("Runtime.Create") - if r.createErr != nil { - return ports.RuntimeHandle{}, r.createErr - } - r.created = append(r.created, cfg) - return ports.RuntimeHandle{ID: "rt-" + string(cfg.SessionID), RuntimeName: "tmux"}, nil -} - -func (r *fakeRuntime) Destroy(_ context.Context, h ports.RuntimeHandle) error { - r.log.add("Runtime.Destroy") - r.destroyed = append(r.destroyed, h) - return nil -} - -func (r *fakeRuntime) SendMessage(_ context.Context, _ ports.RuntimeHandle, message string) error { - r.sent = append(r.sent, message) - return nil -} - -func (r *fakeRuntime) GetOutput(_ context.Context, _ ports.RuntimeHandle, _ int) (string, error) { - return "", nil -} - -func (r *fakeRuntime) IsAlive(_ context.Context, _ ports.RuntimeHandle) (bool, error) { - return r.alive, nil -} - -// ---- fakeAgent ---- - -type fakeAgent struct { - env map[string]string -} - -var _ ports.Agent = (*fakeAgent)(nil) - -func (a *fakeAgent) GetLaunchCommand(_ ports.AgentConfig) string { return "claude" } - -func (a *fakeAgent) GetEnvironment(_ ports.AgentConfig) map[string]string { return cloneMap(a.env) } - -func (a *fakeAgent) ProbeProcess(_ context.Context, _ ports.RuntimeHandle) (ports.ProcessProbe, error) { - return ports.ProcessProbeAlive, nil -} - -func (a *fakeAgent) GetRestoreCommand(agentSessionID string) string { - return "claude --resume " + agentSessionID -} - -// ---- fakeWorkspace (with worktree-remove refusal mode) ---- - -type fakeWorkspace struct { - log *callLog - createErr error - refuse 
map[string]bool // path -> still registered after prune (uncommitted work) - created []ports.WorkspaceConfig - destroyed []ports.WorkspaceInfo - restoredID []domain.SessionID -} - -var _ ports.Workspace = (*fakeWorkspace)(nil) - -func (w *fakeWorkspace) Create(_ context.Context, cfg ports.WorkspaceConfig) (ports.WorkspaceInfo, error) { - w.log.add("Workspace.Create") - if w.createErr != nil { - return ports.WorkspaceInfo{}, w.createErr - } - w.created = append(w.created, cfg) - return workspaceFor(cfg), nil -} - -func (w *fakeWorkspace) Destroy(_ context.Context, info ports.WorkspaceInfo) error { - w.log.add("Workspace.Destroy") - if w.refuse[info.Path] { - // Worktree-remove safety: after `git worktree prune` the path is still - // registered, so it may hold the agent's uncommitted work — refuse. - return fmt.Errorf("workspace: refusing to rm -rf %s: still registered after prune", info.Path) - } - w.destroyed = append(w.destroyed, info) - return nil -} - -func (w *fakeWorkspace) List(_ context.Context, _ domain.ProjectID) ([]ports.WorkspaceInfo, error) { - return nil, nil -} - -func (w *fakeWorkspace) Restore(_ context.Context, cfg ports.WorkspaceConfig) (ports.WorkspaceInfo, error) { - w.log.add("Workspace.Restore") - w.restoredID = append(w.restoredID, cfg.SessionID) - return workspaceFor(cfg), nil -} - -func workspaceFor(cfg ports.WorkspaceConfig) ports.WorkspaceInfo { - return ports.WorkspaceInfo{ - Path: "/tmp/ws/" + string(cfg.SessionID), - Branch: cfg.Branch, - SessionID: cfg.SessionID, - ProjectID: cfg.ProjectID, - } -} - -// ---- recordingMessenger ---- - -type recordingMessenger struct { - sent []struct { - ID domain.SessionID - Message string - } -} - -var _ ports.AgentMessenger = (*recordingMessenger)(nil) - -func (m *recordingMessenger) Send(_ context.Context, id domain.SessionID, message string) error { - m.sent = append(m.sent, struct { - ID domain.SessionID - Message string - }{id, message}) - return nil -} - -// ---- noopNotifier ---- - -type 
noopNotifier struct{} - -var _ ports.Notifier = (*noopNotifier)(nil) - -func (noopNotifier) Notify(_ context.Context, _ ports.OrchestratorEvent) error { return nil } - -// ---- recordingLCM: wraps the REAL lifecycle.Manager and logs SM-facing calls ---- - -type recordingLCM struct { - log *callLog - inner ports.LifecycleManager - - // onSpawnErr, when set, makes OnSpawnCompleted fail (without touching the - // inner manager) so tests can exercise the SM's post-spawn failure paths. - onSpawnErr error -} - -var _ ports.LifecycleManager = (*recordingLCM)(nil) - -func (l *recordingLCM) OnSpawnInitiated(ctx context.Context, rec domain.SessionRecord) error { - l.log.add("OnSpawnInitiated") - return l.inner.OnSpawnInitiated(ctx, rec) -} - -func (l *recordingLCM) OnSpawnCompleted(ctx context.Context, id domain.SessionID, o ports.SpawnOutcome) error { - l.log.add("OnSpawnCompleted") - if l.onSpawnErr != nil { - return l.onSpawnErr - } - return l.inner.OnSpawnCompleted(ctx, id, o) -} - -func (l *recordingLCM) OnKillRequested(ctx context.Context, id domain.SessionID, r ports.KillReason) error { - l.log.add("OnKillRequested") - return l.inner.OnKillRequested(ctx, id, r) -} - -func (l *recordingLCM) ApplySCMObservation(ctx context.Context, id domain.SessionID, f ports.SCMFacts) error { - return l.inner.ApplySCMObservation(ctx, id, f) -} - -func (l *recordingLCM) ApplyRuntimeObservation(ctx context.Context, id domain.SessionID, f ports.RuntimeFacts) error { - return l.inner.ApplyRuntimeObservation(ctx, id, f) -} - -func (l *recordingLCM) ApplyActivitySignal(ctx context.Context, id domain.SessionID, s ports.ActivitySignal) error { - return l.inner.ApplyActivitySignal(ctx, id, s) -} - -func (l *recordingLCM) TickEscalations(ctx context.Context, now time.Time) error { - return l.inner.TickEscalations(ctx, now) -} - -func (l *recordingLCM) RunningSessions(ctx context.Context) ([]domain.SessionRecord, error) { - return l.inner.RunningSessions(ctx) -} - -// ---- harness: wires the SM 
against the fakes + the real LCM ---- - -type harness struct { - sm *Manager - store *fakeStore - runtime *fakeRuntime - agent *fakeAgent - workspace *fakeWorkspace - messenger *recordingMessenger - lcm *recordingLCM - log *callLog -} - -var fixedTime = time.Date(2026, 5, 27, 12, 0, 0, 0, time.UTC) - -func newHarness(id domain.SessionID) *harness { - log := &callLog{} - store := newFakeStore() - rt := &fakeRuntime{log: log, alive: true} - ag := &fakeAgent{env: map[string]string{"BASE": "1"}} - ws := &fakeWorkspace{log: log, refuse: map[string]bool{}} - msg := &recordingMessenger{} - - lcm := &recordingLCM{log: log, inner: lifecycle.New(store, noopNotifier{}, msg)} - - sm := New(Deps{ - Runtime: rt, - Agent: ag, - Workspace: ws, - Store: store, - Messenger: msg, - Lifecycle: lcm, - Clock: func() time.Time { return fixedTime }, - NewID: func(ports.SpawnConfig) domain.SessionID { return id }, - }) - - return &harness{sm: sm, store: store, runtime: rt, agent: ag, workspace: ws, messenger: msg, lcm: lcm, log: log} -} - -func cloneMap(in map[string]string) map[string]string { - if in == nil { - return nil - } - out := make(map[string]string, len(in)) - for k, v := range in { - out[k] = v - } - return out -} diff --git a/backend/internal/session/manager.go b/backend/internal/session/manager.go index dce63305..d7350f5f 100644 --- a/backend/internal/session/manager.go +++ b/backend/internal/session/manager.go @@ -1,75 +1,53 @@ -// Package session implements ports.SessionManager: the explicit-mutation half -// of the lane. The SM is impure plumbing — it drives the Runtime/Agent/Workspace -// plugins to create and tear down sessions, and routes mutation commands and -// outcomes to the LCM (OnSpawnInitiated / OnSpawnCompleted / OnKillRequested). -// -// It NEVER writes sessions directly: observed transitions and explicit -// canonical mutations are the LCM's job under the Writer contract. 
The SM is the -// single producer of the derived display status, attached on read in List/Get -// and never persisted. +// Package session implements ports.SessionManager: the explicit-mutation half of +// the lane. It drives the runtime/agent/workspace plugins to create and tear +// down sessions, routes canonical writes to the LCM, and is the single producer +// of the derived display status (attached on read in List/Get). package session import ( "context" - "crypto/rand" - "encoding/hex" "errors" "fmt" - "strconv" "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) -// ErrNotFound is returned by Get/Restore when no record exists for the id. -var ErrNotFound = errors.New("session: not found") - -// ErrNotRestorable is returned by Restore when the session is not torn down. -// Restoring a live session would spin up a second runtime/workspace for the same -// id, duplicating the agent and risking data loss. -var ErrNotRestorable = errors.New("session: not restorable (not terminal)") - -// ErrIncompleteTeardownMetadata is returned when a record's teardown handles are -// missing (empty workspace path or runtime handle), so calling a real adapter's -// Destroy could act on empty args — an unsafe delete. The teardown is skipped. -var ErrIncompleteTeardownMetadata = errors.New("session: incomplete teardown metadata") +var ( + ErrNotFound = errors.New("session: not found") + ErrNotRestorable = errors.New("session: not restorable (not terminal)") + ErrIncompleteHandle = errors.New("session: incomplete teardown handle") +) -// Env vars a spawned process reads to learn who it is (distillation §5.4). +// Env vars a spawned process reads to learn who it is. const ( EnvSessionID = "AO_SESSION_ID" EnvProjectID = "AO_PROJECT_ID" EnvIssueID = "AO_ISSUE_ID" ) -// Manager implements ports.SessionManager against the outbound ports. 
Every -// dependency is an interface so the SM runs entirely against fakes in tests. +// Manager implements ports.SessionManager over the outbound ports. type Manager struct { runtime ports.Runtime agent ports.Agent workspace ports.Workspace - store ports.LifecycleStore + store ports.SessionStore messenger ports.AgentMessenger lcm ports.LifecycleManager - - clock func() time.Time - newID func(ports.SpawnConfig) domain.SessionID + clock func() time.Time } var _ ports.SessionManager = (*Manager)(nil) -// Deps groups the SM's collaborators. Clock and NewID are optional (defaulted) -// so production wiring only supplies the ports. type Deps struct { Runtime ports.Runtime Agent ports.Agent Workspace ports.Workspace - Store ports.LifecycleStore + Store ports.SessionStore Messenger ports.AgentMessenger Lifecycle ports.LifecycleManager - - Clock func() time.Time - NewID func(ports.SpawnConfig) domain.SessionID + Clock func() time.Time } func New(d Deps) *Manager { @@ -81,38 +59,27 @@ func New(d Deps) *Manager { messenger: d.Messenger, lcm: d.Lifecycle, clock: d.Clock, - newID: d.NewID, } if m.clock == nil { m.clock = time.Now } - if m.newID == nil { - m.newID = defaultNewID - } return m } -// ---- Spawn ---- - -// Spawn runs the create pipeline in spec order: workspace -> runtime -> route -// seed command to the LCM -> report completion to the LCM. The record is seeded LATE (after the runtime is up), so a -// failure before the seed leaves no record for Cleanup to reclaim — hence each -// step eagerly rolls back the steps that already succeeded. +// Spawn creates the session row (which assigns the "{project}-{n}" id), then the +// workspace and runtime, then reports completion to the LCM. A failure after the +// row exists routes it to a terminal errored state and rolls back what was built. 
func (m *Manager) Spawn(ctx context.Context, cfg ports.SpawnConfig) (domain.Session, error) { - id := m.newID(cfg) - if _, ok, err := m.store.Get(ctx, id); err != nil { - return domain.Session{}, fmt.Errorf("spawn %s: check existing: %w", id, err) - } else if ok { - return domain.Session{}, fmt.Errorf("spawn %s: already exists", id) + rec, err := m.store.CreateSession(ctx, seedRecord(cfg, m.clock())) + if err != nil { + return domain.Session{}, fmt.Errorf("spawn: create: %w", err) } + id := rec.ID - ws, err := m.workspace.Create(ctx, ports.WorkspaceConfig{ - ProjectID: cfg.ProjectID, - SessionID: id, - Branch: cfg.Branch, - }) + ws, err := m.workspace.Create(ctx, ports.WorkspaceConfig{ProjectID: cfg.ProjectID, SessionID: id, Branch: cfg.Branch}) if err != nil { - return domain.Session{}, fmt.Errorf("spawn %s: workspace create: %w", id, err) + m.markErrored(ctx, id) + return domain.Session{}, fmt.Errorf("spawn %s: workspace: %w", id, err) } agentCfg := ports.AgentConfig{SessionID: id, WorkspacePath: ws.Path, Prompt: buildPrompt(cfg)} @@ -123,121 +90,127 @@ func (m *Manager) Spawn(ctx context.Context, cfg ports.SpawnConfig) (domain.Sess Env: spawnEnv(m.agent.GetEnvironment(agentCfg), id, cfg.ProjectID, cfg.IssueID), }) if err != nil { - m.rollbackWorkspace(ctx, ws) // nothing seeded yet - return domain.Session{}, fmt.Errorf("spawn %s: runtime create: %w", id, err) - } - - if err := m.lcm.OnSpawnInitiated(ctx, seedRecord(id, cfg, m.clock())); err != nil { - m.rollbackRuntime(ctx, handle) - m.rollbackWorkspace(ctx, ws) - return domain.Session{}, fmt.Errorf("spawn %s: on spawn initiated: %w", id, err) + _ = m.workspace.Destroy(ctx, ws) + m.markErrored(ctx, id) + return domain.Session{}, fmt.Errorf("spawn %s: runtime: %w", id, err) } - // Prompt is persisted via OnSpawnCompleted -> spawnMetadata so a later Restore - // can fall back to a fresh launch if the agent's native session id was never - // captured (the capture path is a separate hook that may never have run). 
outcome := ports.SpawnOutcome{Branch: ws.Branch, WorkspacePath: ws.Path, RuntimeHandle: handle, Prompt: agentCfg.Prompt} if err := m.lcm.OnSpawnCompleted(ctx, id, outcome); err != nil { - // The record is seeded but the runtime/workspace are about to be torn - // down. The store has no delete, so route the orphan to a terminal - // errored state (best effort) rather than strand a phantom "spawning". - _ = m.lcm.OnKillRequested(ctx, id, ports.KillReason{Kind: ports.KillError, Detail: "spawn completion failed"}) - m.rollbackRuntime(ctx, handle) - m.rollbackWorkspace(ctx, ws) - return domain.Session{}, fmt.Errorf("spawn %s: on spawn completed: %w", id, err) + _ = m.runtime.Destroy(ctx, handle) + _ = m.workspace.Destroy(ctx, ws) + m.markErrored(ctx, id) + return domain.Session{}, fmt.Errorf("spawn %s: completed: %w", id, err) } - return m.Get(ctx, id) } -// rollback* are best-effort: the caller already has the originating failure, and -// there is no logger at this layer, so a secondary teardown error is dropped -// rather than masking the real cause. -func (m *Manager) rollbackWorkspace(ctx context.Context, ws ports.WorkspaceInfo) { - _ = m.workspace.Destroy(ctx, ws) -} - -func (m *Manager) rollbackRuntime(ctx context.Context, h ports.RuntimeHandle) { - _ = m.runtime.Destroy(ctx, h) +// markErrored best-effort parks an orphaned spawn in a terminal errored state +// (the store has no delete; a phantom "spawning" row is worse than a terminal one). +func (m *Manager) markErrored(ctx context.Context, id domain.SessionID) { + _ = m.lcm.OnKillRequested(ctx, id, domain.TermErrorInProcess) } -// ---- Kill ---- - -// Kill records terminal intent with the LCM FIRST, then tears down the runtime -// and workspace. There is no separate Agent stop: the agent runs inside the -// runtime, so Runtime.Destroy stops it. 
The workspace teardown honors the -// worktree-remove safety — a refusal (path still registered after prune, so it -// may hold uncommitted work) surfaces as an error with WorkspaceFreed=false and -// is never forced. -func (m *Manager) Kill(ctx context.Context, id domain.SessionID, opts ports.KillOptions) (ports.KillResult, error) { - rec, ok, err := m.store.Get(ctx, id) +// Kill records terminal intent with the LCM, then tears down the runtime and +// workspace. A workspace teardown refused by the worktree-remove safety +// (uncommitted work) surfaces as an error with freed=false and is never forced. +func (m *Manager) Kill(ctx context.Context, id domain.SessionID, reason domain.TerminationReason) (bool, error) { + rec, ok, err := m.store.GetSession(ctx, id) if err != nil { - return ports.KillResult{ID: id}, fmt.Errorf("kill %s: %w", id, err) + return false, fmt.Errorf("kill %s: %w", id, err) } if !ok { - // Already gone: benign race, mirrors LCM.OnKillRequested's no-op. - return ports.KillResult{ID: id}, nil + return false, nil // already gone: benign race } - meta, err := m.store.GetMetadata(ctx, id) - if err != nil { - return ports.KillResult{ID: id}, fmt.Errorf("kill %s: metadata: %w", id, err) + handle := runtimeHandle(rec.Metadata) + ws := workspaceInfo(rec) + if handle.ID == "" || ws.Path == "" { + return false, fmt.Errorf("kill %s: %w", id, ErrIncompleteHandle) + } + if err := m.lcm.OnKillRequested(ctx, id, reason); err != nil { + return false, fmt.Errorf("kill %s: %w", id, err) + } + if err := m.runtime.Destroy(ctx, handle); err != nil { + return false, fmt.Errorf("kill %s: runtime: %w", id, err) + } + if err := m.workspace.Destroy(ctx, ws); err != nil { + return false, fmt.Errorf("kill %s: workspace: %w", id, err) } + return true, nil +} - // Validate the teardown handles BEFORE recording intent or touching an - // adapter: a corrupted/partially-seeded record with empty handles must never - // reach Destroy (empty path / handle could be an unsafe 
delete). - rtHandle := runtimeHandle(meta) - wsInfo := workspaceInfo(rec, meta) - if !validRuntimeHandle(rtHandle) { - return ports.KillResult{ID: id}, fmt.Errorf("kill %s: %w: runtime handle", id, ErrIncompleteTeardownMetadata) +// Restore relaunches a torn-down session in its workspace. The fallible I/O runs +// before any canonical write, so a failure never resurrects the row or destroys +// the worktree (it may hold the agent's prior work). +func (m *Manager) Restore(ctx context.Context, id domain.SessionID) (domain.Session, error) { + rec, ok, err := m.store.GetSession(ctx, id) + if err != nil { + return domain.Session{}, fmt.Errorf("restore %s: %w", id, err) } - if !validWorkspaceInfo(wsInfo) { - return ports.KillResult{ID: id}, fmt.Errorf("kill %s: %w: workspace path", id, ErrIncompleteTeardownMetadata) + if !ok { + return domain.Session{}, fmt.Errorf("restore %s: %w", id, ErrNotFound) + } + if !isTerminal(rec.Lifecycle.Session.State) { + return domain.Session{}, fmt.Errorf("restore %s: %w", id, ErrNotRestorable) + } + meta := rec.Metadata + if meta.AgentSessionID == "" && meta.Prompt == "" { + return domain.Session{}, fmt.Errorf("restore %s: nothing to resume from", id) } - if err := m.lcm.OnKillRequested(ctx, id, ports.KillReason{Kind: opts.Reason, Detail: opts.Detail}); err != nil { - return ports.KillResult{ID: id}, fmt.Errorf("kill %s: on kill requested: %w", id, err) + ws, err := m.workspace.Restore(ctx, ports.WorkspaceConfig{ProjectID: rec.ProjectID, SessionID: id, Branch: meta.Branch}) + if err != nil { + return domain.Session{}, fmt.Errorf("restore %s: workspace: %w", id, err) + } + agentCfg := ports.AgentConfig{SessionID: id, WorkspacePath: ws.Path, Prompt: meta.Prompt} + launch := m.agent.GetRestoreCommand(meta.AgentSessionID) + if meta.AgentSessionID == "" { + launch = m.agent.GetLaunchCommand(agentCfg) } - if err := m.runtime.Destroy(ctx, rtHandle); err != nil { - return ports.KillResult{ID: id}, fmt.Errorf("kill %s: runtime destroy: %w", id, 
err) + handle, err := m.runtime.Create(ctx, ports.RuntimeConfig{ + SessionID: id, + WorkspacePath: ws.Path, + LaunchCommand: launch, + Env: spawnEnv(m.agent.GetEnvironment(agentCfg), id, rec.ProjectID, rec.IssueID), + }) + if err != nil { + return domain.Session{}, fmt.Errorf("restore %s: runtime: %w", id, err) } - if err := m.workspace.Destroy(ctx, wsInfo); err != nil { - return ports.KillResult{ID: id, WorkspaceFreed: false}, fmt.Errorf("kill %s: workspace destroy: %w", id, err) + outcome := ports.SpawnOutcome{Branch: ws.Branch, WorkspacePath: ws.Path, RuntimeHandle: handle, AgentSessionID: meta.AgentSessionID, Prompt: meta.Prompt} + if err := m.lcm.OnSpawnCompleted(ctx, id, outcome); err != nil { + _ = m.runtime.Destroy(ctx, handle) + return domain.Session{}, fmt.Errorf("restore %s: completed: %w", id, err) } - return ports.KillResult{ID: id, WorkspaceFreed: true}, nil + return m.Get(ctx, id) } -// ---- read-model ---- - -// List builds the read-model for a project: stored records with the display -// status derived on read. The SM is the single producer of that status. 
func (m *Manager) List(ctx context.Context, project domain.ProjectID) ([]domain.Session, error) { - recs, err := m.store.List(ctx, project) + recs, err := m.store.ListSessions(ctx, project) if err != nil { return nil, fmt.Errorf("list %s: %w", project, err) } out := make([]domain.Session, 0, len(recs)) for _, rec := range recs { - out = append(out, toSession(rec)) + s, err := m.toSession(ctx, rec) + if err != nil { + return nil, err + } + out = append(out, s) } return out, nil } func (m *Manager) Get(ctx context.Context, id domain.SessionID) (domain.Session, error) { - rec, ok, err := m.store.Get(ctx, id) + rec, ok, err := m.store.GetSession(ctx, id) if err != nil { return domain.Session{}, fmt.Errorf("get %s: %w", id, err) } if !ok { return domain.Session{}, fmt.Errorf("get %s: %w", id, ErrNotFound) } - return toSession(rec), nil + return m.toSession(ctx, rec) } -// ---- Send ---- - -// Send routes a message to the running agent through the AgentMessenger, which -// busy-detects and verifies delivery. func (m *Manager) Send(ctx context.Context, id domain.SessionID, message string) error { if err := m.messenger.Send(ctx, id, message); err != nil { return fmt.Errorf("send %s: %w", id, err) @@ -245,156 +218,64 @@ func (m *Manager) Send(ctx context.Context, id domain.SessionID, message string) return nil } -// ---- Restore ---- - -// Restore relaunches a previously torn-down session in its workspace. The -// fallible I/O (workspace restore + runtime create) runs first so a failure -// touches no canonical state and never destroys the worktree (it may hold the -// agent's prior work). Only once the runtime is up do we reopen the lifecycle: -// resetting a terminal session is an explicit mutation routed to the LCM (the -// LCM's observe path would never resurrect a terminal session), and the PR axis -// is cleared. OnSpawnCompleted then flips the runtime to alive. 
-func (m *Manager) Restore(ctx context.Context, id domain.SessionID) (domain.Session, error) { - rec, ok, err := m.store.Get(ctx, id) - if err != nil { - return domain.Session{}, fmt.Errorf("restore %s: %w", id, err) - } - if !ok { - return domain.Session{}, fmt.Errorf("restore %s: %w", id, ErrNotFound) - } - // Only a torn-down session may be restored. Reopening a live one would spawn a - // duplicate runtime/workspace for the same id and reset its lifecycle. - if !isTerminalSession(rec.Lifecycle.Session.State) { - return domain.Session{}, fmt.Errorf("restore %s: %w", id, ErrNotRestorable) - } - meta, err := m.store.GetMetadata(ctx, id) - if err != nil { - return domain.Session{}, fmt.Errorf("restore %s: metadata: %w", id, err) - } - - // Resume is only possible with the agent's captured session id; without it we - // fall back to a fresh launch using the seeded prompt persisted at spawn time - // (the agent's id-capture path is a separate hook that may never have run, so - // "no id" is the common case rather than an error). If neither is available - // there is nothing to relaunch from — fail early, before any I/O. 
- agentSessionID := meta.AgentSessionID - seededPrompt := meta.Prompt - if agentSessionID == "" && seededPrompt == "" { - return domain.Session{}, fmt.Errorf("restore %s: no agent session id or seeded prompt (cannot resume or relaunch)", id) - } - - ws, err := m.workspace.Restore(ctx, ports.WorkspaceConfig{ - ProjectID: rec.ProjectID, - SessionID: id, - Branch: meta.Branch, - }) - if err != nil { - return domain.Session{}, fmt.Errorf("restore %s: workspace restore: %w", id, err) - } - - agentCfg := ports.AgentConfig{SessionID: id, WorkspacePath: ws.Path, Prompt: seededPrompt} - launchCommand := m.agent.GetRestoreCommand(agentSessionID) - if agentSessionID == "" { - launchCommand = m.agent.GetLaunchCommand(agentCfg) - } - handle, err := m.runtime.Create(ctx, ports.RuntimeConfig{ - SessionID: id, - WorkspacePath: ws.Path, - LaunchCommand: launchCommand, - Env: spawnEnv(m.agent.GetEnvironment(agentCfg), id, rec.ProjectID, rec.IssueID), - }) - if err != nil { - return domain.Session{}, fmt.Errorf("restore %s: runtime create: %w", id, err) - } - - // Past this point the runtime is live: a failure must tear it back down (but - // never the workspace, which holds the agent's prior work) so we don't strand - // a process while parking the session in a terminal lifecycle. 
- reopen := rec - reopen.Lifecycle.Session = domain.SessionSubstate{State: domain.SessionNotStarted, Reason: domain.ReasonSpawnRequested} - reopen.Lifecycle.PR = domain.PRSubstate{State: domain.PRNone, Reason: domain.PRReasonClearedOnRestore} - reopen.Lifecycle.Runtime = domain.RuntimeSubstate{State: domain.RuntimeUnknown, Reason: domain.RuntimeReasonSpawnIncomplete} - reopen.Lifecycle.Detecting = nil - if err := m.lcm.OnSpawnInitiated(ctx, reopen); err != nil { - m.rollbackRuntime(ctx, handle) - return domain.Session{}, fmt.Errorf("restore %s: on spawn initiated: %w", id, err) - } - - outcome := ports.SpawnOutcome{ - Branch: ws.Branch, - WorkspacePath: ws.Path, - RuntimeHandle: handle, - AgentSessionID: agentSessionID, - Prompt: seededPrompt, - } - if err := m.lcm.OnSpawnCompleted(ctx, id, outcome); err != nil { - m.rollbackRuntime(ctx, handle) - // Re-upsert the original record to undo the reopen; the store will - // assign the next revision. - if revertErr := m.lcm.OnSpawnInitiated(ctx, rec); revertErr != nil { - return domain.Session{}, fmt.Errorf("restore %s: revert after spawn completed failure: %w (original error: %v)", id, revertErr, err) - } - if !rec.Metadata.IsZero() { - if revertErr := m.store.PatchMetadata(ctx, id, rec.Metadata); revertErr != nil { - return domain.Session{}, fmt.Errorf("restore %s: revert metadata after spawn completed failure: %w (original error: %v)", id, revertErr, err) - } - } - return domain.Session{}, fmt.Errorf("restore %s: on spawn completed: %w", id, err) - } - return m.Get(ctx, id) -} - -// ---- Cleanup ---- - // Cleanup reclaims the workspaces of terminal sessions in a project. A workspace -// whose teardown is refused by the worktree-remove safety (uncommitted work) is -// skipped, never forced. Runtime teardown is best-effort (a terminal session's -// runtime is usually already gone); the workspace result decides cleaned/skipped. 
-func (m *Manager) Cleanup(ctx context.Context, project domain.ProjectID) (ports.CleanupResult, error) { - recs, err := m.store.List(ctx, project) +// whose teardown is refused (uncommitted work) is skipped, never forced. +func (m *Manager) Cleanup(ctx context.Context, project domain.ProjectID) ([]domain.SessionID, error) { + recs, err := m.store.ListSessions(ctx, project) if err != nil { - return ports.CleanupResult{}, fmt.Errorf("cleanup %s: %w", project, err) + return nil, fmt.Errorf("cleanup %s: %w", project, err) } - var res ports.CleanupResult + var cleaned []domain.SessionID for _, rec := range recs { - if !isTerminalSession(rec.Lifecycle.Session.State) { + if !isTerminal(rec.Lifecycle.Session.State) { continue } - meta, err := m.store.GetMetadata(ctx, rec.ID) - if err != nil { - return res, fmt.Errorf("cleanup %s: metadata %s: %w", project, rec.ID, err) - } - wsInfo := workspaceInfo(rec, meta) - if !validWorkspaceInfo(wsInfo) { - // No workspace path to reclaim — skip rather than hand empty args to a - // real adapter's Destroy (an unsafe delete). 
- res.Skipped = append(res.Skipped, rec.ID) + ws := workspaceInfo(rec) + if ws.Path == "" { continue } - if rtHandle := runtimeHandle(meta); validRuntimeHandle(rtHandle) { - _ = m.runtime.Destroy(ctx, rtHandle) // best effort; usually already gone + if h := runtimeHandle(rec.Metadata); h.ID != "" { + _ = m.runtime.Destroy(ctx, h) // best effort; usually already gone } - if err := m.workspace.Destroy(ctx, wsInfo); err != nil { - res.Skipped = append(res.Skipped, rec.ID) - continue + if err := m.workspace.Destroy(ctx, ws); err != nil { + continue // skipped: uncommitted work } - res.Cleaned = append(res.Cleaned, rec.ID) + cleaned = append(cleaned, rec.ID) } - return res, nil + return cleaned, nil } // ---- helpers ---- -func toSession(rec domain.SessionRecord) domain.Session { - return domain.Session{SessionRecord: rec, Status: domain.DeriveLegacyStatus(rec.Lifecycle)} +func (m *Manager) toSession(ctx context.Context, rec domain.SessionRecord) (domain.Session, error) { + pr, err := m.store.PRFactsForSession(ctx, rec.ID) + if err != nil { + return domain.Session{}, fmt.Errorf("pr facts %s: %w", rec.ID, err) + } + return domain.Session{SessionRecord: rec, Status: domain.DeriveStatus(rec.Lifecycle, pr)}, nil } -func isTerminalSession(s domain.SessionState) bool { +func isTerminal(s domain.SessionState) bool { return s == domain.SessionDone || s == domain.SessionTerminated } -// buildPrompt assembles the spawn prompt from the explicit config only; the full -// 3-layer assembly (base protocol + config-derived + user rules) lands later. 
+func seedRecord(cfg ports.SpawnConfig, now time.Time) domain.SessionRecord { + return domain.SessionRecord{ + ProjectID: cfg.ProjectID, + IssueID: cfg.IssueID, + Kind: cfg.Kind, + CreatedAt: now, + UpdatedAt: now, + Lifecycle: domain.CanonicalSessionLifecycle{ + Version: domain.LifecycleVersion, + Session: domain.SessionSubstate{State: domain.SessionNotStarted}, + Harness: cfg.Harness, + }, + } +} + +// buildPrompt assembles the spawn prompt from the explicit config (the full +// 3-layer assembly lands later). func buildPrompt(cfg ports.SpawnConfig) string { switch { case cfg.AgentRules == "": @@ -406,8 +287,6 @@ func buildPrompt(cfg ports.SpawnConfig) string { } } -// spawnEnv overlays the AO_* identity vars onto the agent's environment without -// mutating the map the agent returned. func spawnEnv(base map[string]string, id domain.SessionID, project domain.ProjectID, issue domain.IssueID) map[string]string { env := make(map[string]string, len(base)+3) for k, v := range base { @@ -419,70 +298,15 @@ func spawnEnv(base map[string]string, id domain.SessionID, project domain.Projec return env } -func seedRecord(id domain.SessionID, cfg ports.SpawnConfig, now time.Time) domain.SessionRecord { - return domain.SessionRecord{ - ID: id, - ProjectID: cfg.ProjectID, - IssueID: cfg.IssueID, - Kind: cfg.Kind, - CreatedAt: now, - UpdatedAt: now, - Lifecycle: domain.CanonicalSessionLifecycle{ - Version: domain.LifecycleVersion, - Session: domain.SessionSubstate{State: domain.SessionNotStarted, Reason: domain.ReasonSpawnRequested}, - Runtime: domain.RuntimeSubstate{State: domain.RuntimeUnknown, Reason: domain.RuntimeReasonSpawnIncomplete}, - PR: domain.PRSubstate{State: domain.PRNone, Reason: domain.PRReasonNotCreated}, - }, - } -} - -// runtimeHandle / workspaceInfo reconstruct teardown handles from the metadata -// the LCM persisted in OnSpawnCompleted (the metadata-key contract is shared -// with the lifecycle package). 
func runtimeHandle(meta domain.SessionMetadata) ports.RuntimeHandle { - return ports.RuntimeHandle{ - ID: meta.RuntimeHandleID, - RuntimeName: meta.RuntimeName, - } + return ports.RuntimeHandle{ID: meta.RuntimeHandleID, RuntimeName: meta.RuntimeName} } -func workspaceInfo(rec domain.SessionRecord, meta domain.SessionMetadata) ports.WorkspaceInfo { +func workspaceInfo(rec domain.SessionRecord) ports.WorkspaceInfo { return ports.WorkspaceInfo{ - Path: meta.WorkspacePath, - Branch: meta.Branch, + Path: rec.Metadata.WorkspacePath, + Branch: rec.Metadata.Branch, SessionID: rec.ID, ProjectID: rec.ProjectID, } } - -// validRuntimeHandle reports whether the handle identifies a runtime to destroy. -// An adapter needs the handle id to target the right process; an empty handle -// would be ambiguous, so we refuse to call Destroy with one. -func validRuntimeHandle(h ports.RuntimeHandle) bool { - return h.ID != "" -} - -// validWorkspaceInfo reports whether there is a concrete path to reclaim. An -// empty path handed to a worktree-remove could resolve to an unsafe target. 
-func validWorkspaceInfo(w ports.WorkspaceInfo) bool { - return w.Path != "" -} - -func defaultNewID(cfg ports.SpawnConfig) domain.SessionID { - base := string(cfg.IssueID) - if base == "" { - base = string(cfg.Kind) - } - if base == "" { - base = "session" - } - return domain.SessionID(base + "-" + randHex(4)) -} - -func randHex(n int) string { - b := make([]byte, n) - if _, err := rand.Read(b); err != nil { - return strconv.FormatInt(time.Now().UnixNano(), 16) - } - return hex.EncodeToString(b) -} diff --git a/backend/internal/session/manager_test.go b/backend/internal/session/manager_test.go index c0c98cf7..669e0c25 100644 --- a/backend/internal/session/manager_test.go +++ b/backend/internal/session/manager_test.go @@ -3,630 +3,294 @@ package session import ( "context" "errors" + "fmt" "testing" + "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) -const ( - testProject = domain.ProjectID("proj") - testIssue = domain.IssueID("42") -) - -func spawnCfg() ports.SpawnConfig { - return ports.SpawnConfig{ - ProjectID: testProject, - IssueID: testIssue, - Kind: domain.KindWorker, - Branch: "feat/42", - Prompt: "do the thing", - AgentRules: "be careful", - } -} - -func TestSpawn_HappyPath(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() +var ctx = context.Background() - sess, err := h.sm.Spawn(ctx, spawnCfg()) - if err != nil { - t.Fatalf("spawn: %v", err) - } - - // Display status is derived (single producer) — a freshly spawned, not_started - // session shows as spawning. - if sess.Status != domain.StatusSpawning { - t.Errorf("status = %q, want %q", sess.Status, domain.StatusSpawning) - } +// ---- fakes ---- - // Record seeded by the LCM with identity + initial lifecycle, then OnSpawnCompleted flipped - // the runtime axis to alive. 
- rec, ok, err := h.store.Get(ctx, "sess-1") - if err != nil || !ok { - t.Fatalf("get seeded record: ok=%v err=%v", ok, err) - } - if rec.ProjectID != testProject || rec.IssueID != testIssue || rec.Kind != domain.KindWorker { - t.Errorf("identity = %+v, want proj/42/worker", rec) - } - if !rec.CreatedAt.Equal(fixedTime) { - t.Errorf("createdAt = %v, want %v", rec.CreatedAt, fixedTime) - } - if got := rec.Lifecycle.Session; got.State != domain.SessionNotStarted || got.Reason != domain.ReasonSpawnRequested { - t.Errorf("session substate = %+v, want not_started/spawn_requested", got) - } - if got := rec.Lifecycle.Runtime; got.State != domain.RuntimeAlive || got.Reason != domain.RuntimeReasonProcessRunning { - t.Errorf("runtime substate = %+v, want alive/process_running", got) - } +type fakeStore struct { + sessions map[domain.SessionID]domain.SessionRecord + pr map[domain.SessionID]domain.PRFacts + num int +} - // Pipeline order: workspace -> runtime -> LCM seed command -> LCM completion. - wantOrder := []string{"Workspace.Create", "Runtime.Create", "OnSpawnInitiated", "OnSpawnCompleted"} - if got := h.log.snapshot(); !equalStrings(got, wantOrder) { - t.Errorf("call order = %v, want %v", got, wantOrder) - } +func newFakeStore() *fakeStore { + return &fakeStore{sessions: map[domain.SessionID]domain.SessionRecord{}, pr: map[domain.SessionID]domain.PRFacts{}} +} - // Identity env wired onto the runtime config, layered over the agent's env. 
- if len(h.runtime.created) != 1 { - t.Fatalf("runtime.created = %d, want 1", len(h.runtime.created)) - } - env := h.runtime.created[0].Env - for k, want := range map[string]string{ - EnvSessionID: "sess-1", - EnvProjectID: "proj", - EnvIssueID: "42", - "BASE": "1", - } { - if env[k] != want { - t.Errorf("env[%q] = %q, want %q", k, env[k], want) +func (f *fakeStore) CreateSession(_ context.Context, rec domain.SessionRecord) (domain.SessionRecord, error) { + f.num++ + rec.ID = domain.SessionID(fmt.Sprintf("%s-%d", rec.ProjectID, f.num)) + f.sessions[rec.ID] = rec + return rec, nil +} +func (f *fakeStore) UpdateSession(_ context.Context, rec domain.SessionRecord) error { + f.sessions[rec.ID] = rec + return nil +} +func (f *fakeStore) GetSession(_ context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) { + r, ok := f.sessions[id] + return r, ok, nil +} +func (f *fakeStore) ListSessions(_ context.Context, p domain.ProjectID) ([]domain.SessionRecord, error) { + var out []domain.SessionRecord + for _, r := range f.sessions { + if r.ProjectID == p { + out = append(out, r) } } - - // Handles persisted to metadata for later teardown/restore. The prompt is - // persisted too so a later Restore that finds no captured agent session id - // can still fall back to a fresh launch using the same prompt. - meta, _ := h.store.GetMetadata(ctx, "sess-1") - want := domain.SessionMetadata{ - Branch: "feat/42", - WorkspacePath: "/tmp/ws/sess-1", - RuntimeHandleID: "rt-sess-1", - RuntimeName: "tmux", - Prompt: "do the thing\n\nbe careful", - } - if meta != want { - t.Errorf("metadata = %+v, want %+v", meta, want) - } + return out, nil } - -func TestSpawn_RuntimeCreateFailure_RollsBack(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - h.runtime.createErr = errors.New("boom") - - _, err := h.sm.Spawn(ctx, spawnCfg()) - if err == nil { - t.Fatal("spawn: want error, got nil") - } - - // No record seeded for a spawn that never completed. 
- if _, ok, _ := h.store.Get(ctx, "sess-1"); ok { - t.Error("record was seeded despite runtime-create failure") - } - // The already-created workspace was rolled back (eager rollback), since a - // late-seeded record means Cleanup could never find this orphan. - if len(h.workspace.destroyed) != 1 || h.workspace.destroyed[0].Path != "/tmp/ws/sess-1" { - t.Errorf("workspace.destroyed = %+v, want the created worktree", h.workspace.destroyed) - } - // LCM never told a spawn completed. - if h.log.indexOf("OnSpawnCompleted") != -1 { - t.Error("OnSpawnCompleted should not fire on a failed spawn") +func (f *fakeStore) ListAllSessions(_ context.Context) ([]domain.SessionRecord, error) { + out := make([]domain.SessionRecord, 0, len(f.sessions)) + for _, r := range f.sessions { + out = append(out, r) } + return out, nil } - -func TestSpawn_ExistingSessionIDRejectedBeforeWork(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - if err := h.store.Upsert(ctx, domain.SessionRecord{ - ID: "sess-1", - ProjectID: testProject, - Lifecycle: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.PRNone, ""), - }, ports.EventSessionCreated); err != nil { - t.Fatalf("seed existing row: %v", err) - } - - _, err := h.sm.Spawn(ctx, spawnCfg()) - if err == nil { - t.Fatal("spawn: want error for existing session id, got nil") - } - if len(h.workspace.created) != 0 { - t.Error("workspace should not be created when session id already exists") - } - if len(h.runtime.created) != 0 { - t.Error("runtime should not be created when session id already exists") - } - if h.log.indexOf("OnSpawnInitiated") != -1 || h.log.indexOf("OnSpawnCompleted") != -1 { - t.Error("LCM should not be called when session id already exists") - } +func (f *fakeStore) PRFactsForSession(_ context.Context, id domain.SessionID) (domain.PRFacts, error) { + return f.pr[id], nil } -func TestSpawn_OnSpawnCompletedFailure_RoutesOrphanToErrored(t *testing.T) { - h := newHarness("sess-1") - ctx := 
context.Background() - h.lcm.onSpawnErr = errors.New("lcm boom") - - _, err := h.sm.Spawn(ctx, spawnCfg()) - if err == nil { - t.Fatal("spawn: want error, got nil") - } - - // Runtime + workspace are torn down on the failure path. - if len(h.runtime.destroyed) != 1 { - t.Errorf("runtime.destroyed = %d, want 1", len(h.runtime.destroyed)) - } - if len(h.workspace.destroyed) != 1 { - t.Errorf("workspace.destroyed = %d, want 1", len(h.workspace.destroyed)) - } - // The record was already seeded and the store has no delete, so the orphan is - // routed to a terminal errored state (via OnKillRequested(KillError)) rather - // than stranded forever as "spawning". - rec, ok, _ := h.store.Get(ctx, "sess-1") - if !ok { - t.Fatal("seeded record vanished; expected it parked as errored") - } - if got := rec.Lifecycle.Session; got.State != domain.SessionTerminated || got.Reason != domain.ReasonErrorInProcess { - t.Errorf("session substate = %+v, want terminated/error_in_process", got) - } - if status := domain.DeriveLegacyStatus(rec.Lifecycle); status != domain.StatusErrored { - t.Errorf("status = %q, want errored", status) - } +// fakeLCM is the minimal lifecycle the Session Manager drives: it persists the +// spawn/kill canonical writes into the store so Get reflects them. +type fakeLCM struct { + store *fakeStore + completed int } -func TestKill_OrderingAndTerminalState(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil { - t.Fatalf("spawn: %v", err) - } - - res, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}) - if err != nil { - t.Fatalf("kill: %v", err) - } - if !res.WorkspaceFreed { - t.Error("WorkspaceFreed = false, want true") - } - - // Intent recorded with the LCM BEFORE any teardown, runtime before workspace. 
- iKill := h.log.indexOf("OnKillRequested") - iRT := h.log.indexOf("Runtime.Destroy") - iWS := h.log.indexOf("Workspace.Destroy") - if !(iKill >= 0 && iKill < iRT && iRT < iWS) { - t.Errorf("kill order indices: OnKillRequested=%d Runtime.Destroy=%d Workspace.Destroy=%d (want ascending)", iKill, iRT, iWS) - } - - // Terminal canonical written by the LCM; display derives to killed. - rec, _, _ := h.store.Get(ctx, "sess-1") - if got := rec.Lifecycle.Session; got.State != domain.SessionTerminated || got.Reason != domain.ReasonManuallyKilled { - t.Errorf("session substate = %+v, want terminated/manually_killed", got) - } - if status := domain.DeriveLegacyStatus(rec.Lifecycle); status != domain.StatusKilled { - t.Errorf("status = %q, want killed", status) - } +func (l *fakeLCM) OnSpawnCompleted(_ context.Context, id domain.SessionID, o ports.SpawnOutcome) error { + l.completed++ + rec := l.store.sessions[id] + rec.Lifecycle.Session.State = domain.SessionNotStarted + rec.Lifecycle.IsAlive = true + rec.Lifecycle.TerminationReason = domain.TermNone + rec.Metadata = domain.SessionMetadata{ + Branch: o.Branch, WorkspacePath: o.WorkspacePath, + RuntimeHandleID: o.RuntimeHandle.ID, RuntimeName: o.RuntimeHandle.RuntimeName, + AgentSessionID: o.AgentSessionID, Prompt: o.Prompt, + } + l.store.sessions[id] = rec + return nil } - -func TestKill_WorktreeRemoveRefusalSurfaced(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil { - t.Fatalf("spawn: %v", err) - } - // The worktree path is still registered after prune (uncommitted work). - h.workspace.refuse["/tmp/ws/sess-1"] = true - - res, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}) - if err == nil { - t.Fatal("kill: want refusal error, got nil") - } - if res.WorkspaceFreed { - t.Error("WorkspaceFreed = true, want false on refusal") - } - // The refusal must be honored — the path is never force-deleted. 
- if len(h.workspace.destroyed) != 0 { - t.Errorf("workspace.destroyed = %+v, want none (refused)", h.workspace.destroyed) - } - // Runtime still torn down and intent still recorded — only the worktree is spared. - if h.log.indexOf("Runtime.Destroy") == -1 || h.log.indexOf("OnKillRequested") == -1 { - t.Error("runtime teardown / kill intent should still happen on a workspace refusal") - } +func (l *fakeLCM) OnKillRequested(_ context.Context, id domain.SessionID, reason domain.TerminationReason) error { + rec := l.store.sessions[id] + rec.Lifecycle.Session.State = domain.SessionTerminated + rec.Lifecycle.TerminationReason = reason + rec.Lifecycle.IsAlive = false + l.store.sessions[id] = rec + return nil } - -func TestKill_IncompleteMetadata_RefusesTeardown(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - // A record with no teardown metadata (empty runtime handle + workspace path), - // e.g. a partially-seeded or corrupted record. - if err := h.store.Upsert(ctx, domain.SessionRecord{ - ID: "sess-1", ProjectID: testProject, - Lifecycle: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.PRNone, ""), - }, ports.EventSessionCreated); err != nil { - t.Fatalf("upsert: %v", err) - } - - if _, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}); !errors.Is(err, ErrIncompleteTeardownMetadata) { - t.Fatalf("kill: err = %v, want ErrIncompleteTeardownMetadata", err) - } - // Nothing destroyed with empty args, and no intent recorded. 
- if len(h.runtime.destroyed) != 0 || len(h.workspace.destroyed) != 0 { - t.Errorf("teardown ran despite incomplete metadata: rt=%v ws=%v", h.runtime.destroyed, h.workspace.destroyed) - } - if h.log.indexOf("OnKillRequested") != -1 { - t.Error("kill intent recorded despite incomplete metadata") - } +func (l *fakeLCM) ApplyRuntimeObservation(context.Context, domain.SessionID, ports.RuntimeFacts) error { + return nil +} +func (l *fakeLCM) ApplyActivitySignal(context.Context, domain.SessionID, ports.ActivitySignal) error { + return nil +} +func (l *fakeLCM) ApplyPRObservation(context.Context, domain.SessionID, ports.PRObservation) error { + return nil +} +func (l *fakeLCM) TickEscalations(context.Context, time.Time) error { return nil } +func (l *fakeLCM) RunningSessions(context.Context) ([]domain.SessionRecord, error) { + return nil, nil } -func TestCleanup_IncompleteMetadata_Skipped(t *testing.T) { - h := newHarness("unused") - ctx := context.Background() - // Terminal session but no workspace path persisted — must be skipped, never - // handed to Destroy with an empty path. 
- if err := h.store.Upsert(ctx, domain.SessionRecord{ - ID: "orphan-1", ProjectID: testProject, - Lifecycle: lc(domain.SessionTerminated, domain.ReasonManuallyKilled, domain.PRNone, ""), - }, ports.EventSessionCreated); err != nil { - t.Fatalf("upsert: %v", err) - } +type fakeRuntime struct { + createErr error + created, destroyed int +} - res, err := h.sm.Cleanup(ctx, testProject) - if err != nil { - t.Fatalf("cleanup: %v", err) - } - if !equalIDSet(res.Skipped, []domain.SessionID{"orphan-1"}) { - t.Errorf("skipped = %v, want [orphan-1]", res.Skipped) - } - if len(res.Cleaned) != 0 { - t.Errorf("cleaned = %v, want none", res.Cleaned) - } - if len(h.workspace.destroyed) != 0 { - t.Errorf("workspace.destroyed = %v, want none (empty path must not reach Destroy)", h.workspace.destroyed) +func (r *fakeRuntime) Create(context.Context, ports.RuntimeConfig) (ports.RuntimeHandle, error) { + if r.createErr != nil { + return ports.RuntimeHandle{}, r.createErr } + r.created++ + return ports.RuntimeHandle{ID: "h1", RuntimeName: "tmux"}, nil +} +func (r *fakeRuntime) Destroy(context.Context, ports.RuntimeHandle) error { r.destroyed++; return nil } +func (r *fakeRuntime) IsAlive(context.Context, ports.RuntimeHandle) (bool, error) { + return true, nil } -func TestRestore_LiveSession_Rejected(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil { - t.Fatalf("spawn: %v", err) - } - // The session is live (never torn down). Capture an agent id so the only thing - // blocking restore is the non-terminal lifecycle, not missing metadata. 
- if err := h.store.PatchMetadata(ctx, "sess-1", domain.SessionMetadata{AgentSessionID: "agent-xyz"}); err != nil { - t.Fatalf("patch metadata: %v", err) - } - createdBefore := len(h.runtime.created) - restoresBefore := len(h.workspace.restoredID) +type fakeAgent struct{} - if _, err := h.sm.Restore(ctx, "sess-1"); !errors.Is(err, ErrNotRestorable) { - t.Fatalf("restore: err = %v, want ErrNotRestorable", err) - } - // No second runtime/workspace spun up for the still-live session. - if len(h.runtime.created) != createdBefore { - t.Error("runtime created for a live-session restore") - } - if len(h.workspace.restoredID) != restoresBefore { - t.Error("workspace restored for a live-session restore") - } +func (fakeAgent) GetLaunchCommand(ports.AgentConfig) string { return "launch" } +func (fakeAgent) GetEnvironment(ports.AgentConfig) map[string]string { + return map[string]string{"X": "1"} } +func (fakeAgent) GetRestoreCommand(id string) string { return "resume " + id } -func TestListAndGet_DeriveStatus(t *testing.T) { - cases := []struct { - name string - lc domain.CanonicalSessionLifecycle - want domain.SessionStatus - }{ - {"not_started", lc(domain.SessionNotStarted, domain.ReasonSpawnRequested, domain.PRNone, ""), domain.StatusSpawning}, - {"working", lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.PRNone, ""), domain.StatusWorking}, - {"idle", lc(domain.SessionIdle, domain.ReasonResearchComplete, domain.PRNone, ""), domain.StatusIdle}, - {"needs_input", lc(domain.SessionNeedsInput, domain.ReasonAwaitingUserInput, domain.PRNone, ""), domain.StatusNeedsInput}, - {"pr_ci_failed", lc(domain.SessionWorking, domain.ReasonFixingCI, domain.PROpen, domain.PRReasonCIFailing), domain.StatusCIFailed}, - {"pr_merged", lc(domain.SessionIdle, domain.ReasonMergedWaitingDecision, domain.PRMerged, domain.PRReasonMerged), domain.StatusMerged}, - {"killed", lc(domain.SessionTerminated, domain.ReasonManuallyKilled, domain.PRNone, ""), domain.StatusKilled}, - } +type 
fakeWorkspace struct { + destroyErr error + destroyed int +} - h := newHarness("unused") - ctx := context.Background() - for _, c := range cases { - if err := h.store.Upsert(ctx, domain.SessionRecord{ID: domain.SessionID(c.name), ProjectID: testProject, Lifecycle: c.lc}, ports.EventSessionCreated); err != nil { - t.Fatalf("upsert %s: %v", c.name, err) - } - } +func (w *fakeWorkspace) Create(_ context.Context, cfg ports.WorkspaceConfig) (ports.WorkspaceInfo, error) { + return ports.WorkspaceInfo{Path: "/ws/" + string(cfg.SessionID), Branch: cfg.Branch, SessionID: cfg.SessionID, ProjectID: cfg.ProjectID}, nil +} +func (w *fakeWorkspace) Destroy(context.Context, ports.WorkspaceInfo) error { + w.destroyed++ + return w.destroyErr +} +func (w *fakeWorkspace) Restore(ctx context.Context, cfg ports.WorkspaceConfig) (ports.WorkspaceInfo, error) { + return w.Create(ctx, cfg) +} - // Get derives per-record. - for _, c := range cases { - got, err := h.sm.Get(ctx, domain.SessionID(c.name)) - if err != nil { - t.Fatalf("get %s: %v", c.name, err) - } - if got.Status != c.want { - t.Errorf("get %s: status = %q, want %q", c.name, got.Status, c.want) - } - } +type fakeMessenger struct{ msgs []string } - // List derives for every record in the project. 
- got, err := h.sm.List(ctx, testProject) - if err != nil { - t.Fatalf("list: %v", err) - } - if len(got) != len(cases) { - t.Fatalf("list len = %d, want %d", len(got), len(cases)) - } - byID := map[domain.SessionID]domain.SessionStatus{} - for _, s := range got { - byID[s.ID] = s.Status - } - for _, c := range cases { - if byID[domain.SessionID(c.name)] != c.want { - t.Errorf("list %s: status = %q, want %q", c.name, byID[domain.SessionID(c.name)], c.want) - } - } +func (m *fakeMessenger) Send(_ context.Context, _ domain.SessionID, msg string) error { + m.msgs = append(m.msgs, msg) + return nil } -func TestGet_NotFound(t *testing.T) { - h := newHarness("sess-1") - if _, err := h.sm.Get(context.Background(), "missing"); !errors.Is(err, ErrNotFound) { - t.Errorf("get missing: err = %v, want ErrNotFound", err) - } +func newManager() (*Manager, *fakeStore, *fakeRuntime, *fakeWorkspace) { + st := newFakeStore() + rt := &fakeRuntime{} + ws := &fakeWorkspace{} + m := New(Deps{ + Runtime: rt, Agent: fakeAgent{}, Workspace: ws, + Store: st, Messenger: &fakeMessenger{}, Lifecycle: &fakeLCM{store: st}, + }) + return m, st, rt, ws } -func TestSend_RoutesToMessenger(t *testing.T) { - h := newHarness("sess-1") - if err := h.sm.Send(context.Background(), "sess-1", "hello"); err != nil { - t.Fatalf("send: %v", err) - } - if len(h.messenger.sent) != 1 || h.messenger.sent[0].ID != "sess-1" || h.messenger.sent[0].Message != "hello" { - t.Errorf("messenger.sent = %+v, want one {sess-1, hello}", h.messenger.sent) +func seedTerminal(st *fakeStore, id domain.SessionID, meta domain.SessionMetadata) { + st.sessions[id] = domain.SessionRecord{ + ID: id, ProjectID: "mer", Metadata: meta, + Lifecycle: domain.CanonicalSessionLifecycle{Session: domain.SessionSubstate{State: domain.SessionTerminated}}, } } -func TestRestore_RelaunchesWithResumeCommand(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil { - 
t.Fatalf("spawn: %v", err) - } - if _, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}); err != nil { - t.Fatalf("kill: %v", err) - } - // The agent's resume id is captured in metadata (here set explicitly). - if err := h.store.PatchMetadata(ctx, "sess-1", domain.SessionMetadata{AgentSessionID: "agent-xyz"}); err != nil { - t.Fatalf("patch metadata: %v", err) - } +// ---- tests ---- - sess, err := h.sm.Restore(ctx, "sess-1") - if err != nil { - t.Fatalf("restore: %v", err) - } +func TestSpawn_AssignsIDAndGoesLive(t *testing.T) { + m, st, rt, _ := newManager() - // Reopened: terminal session reset to a fresh spawn, PR cleared, runtime alive. - if sess.Status != domain.StatusSpawning { - t.Errorf("status = %q, want spawning", sess.Status) - } - rec, _, _ := h.store.Get(ctx, "sess-1") - if got := rec.Lifecycle.Session; got.State != domain.SessionNotStarted || got.Reason != domain.ReasonSpawnRequested { - t.Errorf("session substate = %+v, want not_started/spawn_requested", got) + s, err := m.Spawn(ctx, ports.SpawnConfig{ProjectID: "mer", Kind: domain.KindWorker, Prompt: "do it"}) + if err != nil { + t.Fatal(err) } - if got := rec.Lifecycle.PR; got.State != domain.PRNone || got.Reason != domain.PRReasonClearedOnRestore { - t.Errorf("pr substate = %+v, want none/cleared_on_restore", got) + if s.ID != "mer-1" { + t.Fatalf("store should assign mer-1, got %q", s.ID) } - if rec.Lifecycle.Runtime.State != domain.RuntimeAlive { - t.Errorf("runtime state = %q, want alive", rec.Lifecycle.Runtime.State) + if s.Status != domain.StatusSpawning { + t.Fatalf("fresh session displays spawning, got %q", s.Status) } - - // Relaunched via the agent's resume command (created[0] is the original spawn). 
- if len(h.runtime.created) != 2 { - t.Fatalf("runtime.created = %d, want 2 (spawn + restore)", len(h.runtime.created)) + if rt.created != 1 { + t.Fatalf("runtime not created") } - if got := h.runtime.created[1].LaunchCommand; got != "claude --resume agent-xyz" { - t.Errorf("restore launch command = %q, want resume", got) - } - if h.log.indexOf("Workspace.Restore") == -1 { - t.Error("Workspace.Restore was not called") + if st.sessions["mer-1"].Metadata.RuntimeHandleID != "h1" { + t.Fatal("spawn handle not folded into the row") } } -func TestRestore_NoAgentSessionID_FreshLaunchFallback(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil { - t.Fatalf("spawn: %v", err) - } - if _, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}); err != nil { - t.Fatalf("kill: %v", err) - } - // No agent session id was ever captured (the capture hook is a separate - // path that may never have run), but Spawn persisted the prompt, so Restore - // must fall back to a fresh launch instead of failing. - createdBefore := len(h.runtime.created) +func TestSpawn_RollsBackOnRuntimeFailure(t *testing.T) { + m, st, _, ws := newManager() + m.runtime = &fakeRuntime{createErr: errors.New("boom")} - sess, err := h.sm.Restore(ctx, "sess-1") - if err != nil { - t.Fatalf("restore: %v", err) + if _, err := m.Spawn(ctx, ports.SpawnConfig{ProjectID: "mer"}); err == nil { + t.Fatal("expected spawn to fail") } - if sess.Status != domain.StatusSpawning { - t.Errorf("status = %q, want spawning", sess.Status) + if ws.destroyed != 1 { + t.Fatal("workspace should be rolled back") } - if len(h.runtime.created) != createdBefore+1 { - t.Fatalf("runtime.created grew by %d, want 1 (fresh-launch fallback)", len(h.runtime.created)-createdBefore) - } - // Fresh launch uses GetLaunchCommand (returns "claude" in the fake) — not - // the resume command, which would have read "claude --resume ". 
- if got := h.runtime.created[createdBefore].LaunchCommand; got != "claude" { - t.Errorf("restore launch command = %q, want fresh-launch %q", got, "claude") + if st.sessions["mer-1"].Lifecycle.Session.State != domain.SessionTerminated { + t.Fatal("orphaned spawn should be parked terminal") } } -func TestRestore_NoIDAndNoPrompt_Errors(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - // Seed a terminal record directly without any metadata — no agent session id, - // no prompt. Restore has nothing to resume and nothing to relaunch from, so - // it must fail early without touching workspace/runtime. - if err := h.store.Upsert(ctx, domain.SessionRecord{ - ID: "sess-1", ProjectID: testProject, - Lifecycle: lc(domain.SessionTerminated, domain.ReasonManuallyKilled, domain.PRNone, ""), - }, ports.EventSessionCreated); err != nil { - t.Fatalf("upsert: %v", err) - } - beforeRestores := len(h.workspace.restoredID) - beforeCreated := len(h.runtime.created) +func TestKill_TearsDownRuntimeAndWorkspace(t *testing.T) { + m, st, rt, ws := newManager() + st.sessions["mer-1"] = mkLive("mer-1") - if _, err := h.sm.Restore(ctx, "sess-1"); err == nil { - t.Fatal("restore: want error for missing agent session id and prompt, got nil") + freed, err := m.Kill(ctx, "mer-1", domain.TermManuallyKilled) + if err != nil || !freed { + t.Fatalf("kill should free the workspace: freed=%v err=%v", freed, err) } - if len(h.workspace.restoredID) != beforeRestores { - t.Error("workspace was touched despite a doomed restore") - } - if len(h.runtime.created) != beforeCreated { - t.Error("runtime was created despite a doomed restore") - } - // The session stays terminal — a failed restore does not reopen it. 
- rec, _, _ := h.store.Get(ctx, "sess-1") - if rec.Lifecycle.Session.State != domain.SessionTerminated { - t.Errorf("session state = %q, want terminated (unchanged)", rec.Lifecycle.Session.State) + if rt.destroyed != 1 || ws.destroyed != 1 { + t.Fatal("kill should destroy runtime and workspace") } } -func TestRestore_OnSpawnCompletedFailure_RollsBackRuntime(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil { - t.Fatalf("spawn: %v", err) - } - if _, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}); err != nil { - t.Fatalf("kill: %v", err) +func TestKill_RefusesIncompleteHandle(t *testing.T) { + m, st, _, _ := newManager() + st.sessions["mer-1"] = domain.SessionRecord{ // live, but no teardown handles + ID: "mer-1", ProjectID: "mer", + Lifecycle: domain.CanonicalSessionLifecycle{Session: domain.SessionSubstate{State: domain.SessionWorking}, IsAlive: true}, } - if err := h.store.PatchMetadata(ctx, "sess-1", domain.SessionMetadata{AgentSessionID: "agent-xyz"}); err != nil { - t.Fatalf("patch metadata: %v", err) - } - beforeMeta, _ := h.store.GetMetadata(ctx, "sess-1") - - // Fail the post-create LCM call; capture teardown counts just before restore. 
- h.lcm.onSpawnErr = errors.New("lcm boom") - before, _, _ := h.store.Get(ctx, "sess-1") - destroyedBefore := len(h.runtime.destroyed) - wsDestroyedBefore := len(h.workspace.destroyed) - if _, err := h.sm.Restore(ctx, "sess-1"); err == nil { - t.Fatal("restore: want error, got nil") - } - - rec, _, _ := h.store.Get(ctx, "sess-1") - if got := rec.Lifecycle.Session; got.State != domain.SessionTerminated || got.Reason != domain.ReasonManuallyKilled { - t.Fatalf("restore failure should restore terminal lifecycle, got %+v", got) - } - if rec.Lifecycle.Revision != before.Lifecycle.Revision+2 { - t.Fatalf("restore failure should advance revision twice, got %d want %d", rec.Lifecycle.Revision, before.Lifecycle.Revision+2) - } - afterMeta, _ := h.store.GetMetadata(ctx, "sess-1") - if afterMeta != beforeMeta { - t.Fatalf("restore failure should restore metadata, got %+v want %+v", afterMeta, beforeMeta) - } - - // The runtime created during restore is torn back down so no process is - // stranded; the workspace is left intact (it holds the agent's prior work). - if len(h.runtime.destroyed) != destroyedBefore+1 { - t.Errorf("runtime.destroyed grew by %d, want 1 (restore rollback)", len(h.runtime.destroyed)-destroyedBefore) - } - if len(h.workspace.destroyed) != wsDestroyedBefore { - t.Errorf("workspace was destroyed on restore rollback; it must be preserved") + if _, err := m.Kill(ctx, "mer-1", domain.TermManuallyKilled); !errors.Is(err, ErrIncompleteHandle) { + t.Fatalf("want ErrIncompleteHandle, got %v", err) } } -func TestCleanup_SkipsUncommittedWork(t *testing.T) { - h := newHarness("unused") - ctx := context.Background() - - // Two terminal sessions (reclaimable) + one working session (must be ignored). 
- seedTerminal(t, h, "done-1", "/tmp/ws/done-1") - seedTerminal(t, h, "dirty-1", "/tmp/ws/dirty-1") - if err := h.store.Upsert(ctx, domain.SessionRecord{ - ID: "live-1", ProjectID: testProject, - Lifecycle: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.PRNone, ""), - }, ports.EventSessionCreated); err != nil { - t.Fatalf("upsert live: %v", err) - } - // dirty-1's worktree still holds uncommitted work — Destroy refuses it. - h.workspace.refuse["/tmp/ws/dirty-1"] = true +func TestRestore_ReopensTerminal(t *testing.T) { + m, st, rt, _ := newManager() + seedTerminal(st, "mer-1", domain.SessionMetadata{WorkspacePath: "/ws/mer-1", Branch: "b", AgentSessionID: "agent-x"}) - res, err := h.sm.Cleanup(ctx, testProject) + s, err := m.Restore(ctx, "mer-1") if err != nil { - t.Fatalf("cleanup: %v", err) + t.Fatal(err) } - - if !equalIDSet(res.Cleaned, []domain.SessionID{"done-1"}) { - t.Errorf("cleaned = %v, want [done-1]", res.Cleaned) - } - if !equalIDSet(res.Skipped, []domain.SessionID{"dirty-1"}) { - t.Errorf("skipped = %v, want [dirty-1]", res.Skipped) + if s.Status != domain.StatusSpawning { + t.Fatalf("restored session displays spawning, got %q", s.Status) } - // The live session was never a candidate. 
- if contains(res.Cleaned, "live-1") || contains(res.Skipped, "live-1") { - t.Error("non-terminal session must not be cleaned or skipped") + if rt.created != 1 { + t.Fatal("restore should relaunch the runtime") } } -// ---- test helpers ---- +func TestRestore_RefusesLiveSession(t *testing.T) { + m, st, _, _ := newManager() + st.sessions["mer-1"] = mkLive("mer-1") -func lc(s domain.SessionState, r domain.SessionReason, prs domain.PRState, prr domain.PRReason) domain.CanonicalSessionLifecycle { - return domain.CanonicalSessionLifecycle{ - Version: domain.LifecycleVersion, - Session: domain.SessionSubstate{State: s, Reason: r}, - PR: domain.PRSubstate{State: prs, Reason: prr}, - Runtime: domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning}, + if _, err := m.Restore(ctx, "mer-1"); !errors.Is(err, ErrNotRestorable) { + t.Fatalf("want ErrNotRestorable, got %v", err) } } -func seedTerminal(t *testing.T, h *harness, id domain.SessionID, wsPath string) { - t.Helper() - ctx := context.Background() - if err := h.store.Upsert(ctx, domain.SessionRecord{ - ID: id, ProjectID: testProject, - Lifecycle: lc(domain.SessionTerminated, domain.ReasonManuallyKilled, domain.PRNone, ""), - }, ports.EventSessionCreated); err != nil { - t.Fatalf("upsert %s: %v", id, err) +func TestList_DerivesStatusFromPRFacts(t *testing.T) { + m, st, _, _ := newManager() + st.sessions["mer-1"] = mkLive("mer-1") + st.pr["mer-1"] = domain.PRFacts{Exists: true, CI: domain.CIFailing} + + list, err := m.List(ctx, "mer") + if err != nil { + t.Fatal(err) } - if err := h.store.PatchMetadata(ctx, id, domain.SessionMetadata{WorkspacePath: wsPath}); err != nil { - t.Fatalf("patch metadata %s: %v", id, err) + if len(list) != 1 || list[0].Status != domain.StatusCIFailed { + t.Fatalf("status should reflect PR facts, got %+v", list) } } -func equalStrings(a, b []string) bool { - if len(a) != len(b) { - return false +func TestCleanup_ReclaimsTerminalWorkspaces(t *testing.T) { + m, 
st, _, ws := newManager() + seedTerminal(st, "mer-1", domain.SessionMetadata{WorkspacePath: "/ws/mer-1"}) + st.sessions["mer-2"] = mkLive("mer-2") // live: must be skipped + + cleaned, err := m.Cleanup(ctx, "mer") + if err != nil { + t.Fatal(err) } - for i := range a { - if a[i] != b[i] { - return false - } + if len(cleaned) != 1 || cleaned[0] != "mer-1" { + t.Fatalf("only the terminal session should be reclaimed, got %v", cleaned) } - return true -} - -func contains(ids []domain.SessionID, id domain.SessionID) bool { - for _, x := range ids { - if x == id { - return true - } + if ws.destroyed != 1 { + t.Fatal("the live session's workspace must not be destroyed") } - return false } -func equalIDSet(got, want []domain.SessionID) bool { - if len(got) != len(want) { - return false - } - for _, w := range want { - if !contains(got, w) { - return false - } +func mkLive(id domain.SessionID) domain.SessionRecord { + return domain.SessionRecord{ + ID: id, ProjectID: "mer", + Metadata: domain.SessionMetadata{WorkspacePath: "/ws/" + string(id), RuntimeHandleID: "h1", RuntimeName: "tmux"}, + Lifecycle: domain.CanonicalSessionLifecycle{Session: domain.SessionSubstate{State: domain.SessionWorking}, IsAlive: true}, } - return true } diff --git a/backend/lifecycle_wiring.go b/backend/lifecycle_wiring.go index 3836baf6..f69b1ce4 100644 --- a/backend/lifecycle_wiring.go +++ b/backend/lifecycle_wiring.go @@ -11,116 +11,132 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" ) -// lifecycleStack owns the running LCM + reaper. The LCM is the sole writer into -// the store (every Apply*/On* call ends in store.Upsert, which the CDC pipeline -// then drains); the reaper is the OBSERVE-layer timer that probes live runtimes -// and reports facts back through the LCM. Together with the CDC substrate this -// makes the write path live end-to-end: LCM -> store -> outbox -> JSONL -> -// broadcaster. +// lifecycleStack owns the running LCM + reaper. 
The LCM is the sole writer of +// canonical transitions; the reaper is the OBSERVE-layer timer that probes live +// runtimes and reports facts back through it. type lifecycleStack struct { LCM *lifecycle.Manager reaperDone <-chan struct{} } -// startLifecycle constructs the LCM over store, makes escalation budgets durable, -// teaches it to enumerate sessions for the reaper, and starts the reaper loop. +// startLifecycle constructs the LCM over the store adapter and starts the reaper. // The goroutine stops when ctx is cancelled; Stop waits for it to drain. // -// TEMPORARY STUBS (replace as the daemon lane lands the real collaborators): -// -// - noopNotifier — swap for the production notifier multiplexer once the -// notifier plugins (desktop/Slack/webhook) are ported. Wire it where -// noopNotifier{} is passed to lifecycle.New below. -// - noopMessenger — swap for the AgentMessenger backed by the runtime/agent -// plugins (it injects a prompt into the live agent pane). Wire it at the -// same lifecycle.New call site. -// - reaper.MapRegistry{} — empty runtime registry, so the reaper probes -// nothing. Register the real runtime adapters (tmux/process) keyed by -// runtime name once those plugins exist: reaper.MapRegistry{"tmux": rt}. +// TEMPORARY STUBS (replace as the daemon lane lands the collaborators): +// - noopNotifier — swap for the notifier multiplexer (desktop/Slack/webhook). +// - noopMessenger — swap for the runtime/agent-plugin-backed AgentMessenger. +// - reaper.MapRegistry{} — empty runtime registry, so the reaper ticks +// escalations but probes nothing until the runtime plugins exist. func startLifecycle(ctx context.Context, store *sqlite.Store, logger *slog.Logger) (*lifecycleStack, error) { - // TODO(daemon-lane): replace noopNotifier{}/noopMessenger{} with the real - // notifier multiplexer and the plugin-backed AgentMessenger. 
- lcm := lifecycle.New(store, noopNotifier{}, noopMessenger{}) - - // Durable escalation budgets (flaw #3 fix): hydrate from the store and turn - // on write-through so a restart does not re-fire an already-escalated page. - // Must run before the reaper starts dispatching TickEscalations. - if err := lcm.WithReactionStore(ctx, lifecycleReactionStore{store}); err != nil { - return nil, err - } - - // The reaper's RunningSessions snapshot needs to see every session; ListAll - // spans all projects (the per-project List would hide cross-project work). - lcm.WithSessionLister(store.ListAll) - - // TODO(daemon-lane): pass the real runtime registry so the reaper actually - // probes live panes. With an empty registry it ticks escalations but probes - // nothing, which is correct until runtimes exist. + a := storeAdapter{store} + lcm := lifecycle.New(a, a, noopNotifier{}, noopMessenger{}) rp := reaper.New(lcm, reaper.MapRegistry{}, reaper.Config{Logger: logger}) - return &lifecycleStack{LCM: lcm, reaperDone: rp.Start(ctx)}, nil } // Stop waits for the reaper goroutine to exit (the caller must have cancelled the // ctx passed to startLifecycle). -func (l *lifecycleStack) Stop() { - <-l.reaperDone -} - -// noopNotifier satisfies ports.Notifier by dropping every event. TEMPORARY: the -// daemon lane replaces this with the notifier multiplexer over the real notifier -// plugins. Until then human-facing notifications are silently discarded — the -// write path and CDC still work, only the human push is absent. -type noopNotifier struct{} +func (l *lifecycleStack) Stop() { <-l.reaperDone } -func (noopNotifier) Notify(context.Context, ports.OrchestratorEvent) error { return nil } +// storeAdapter bridges *sqlite.Store to the engine's ports. It embeds the store +// (so CreateSession/UpdateSession/GetSession/ListSessions/ListAllSessions and +// RecentCheckStatuses promote directly) and adds the PR conversions + the +// PRFacts read-model the display status needs. 
+type storeAdapter struct{ *sqlite.Store } -// noopMessenger satisfies ports.AgentMessenger by dropping every send. TEMPORARY: -// replace with the runtime/agent-plugin-backed messenger that injects prompts -// into the live agent pane. Until then auto-nudge reactions are no-ops. -type noopMessenger struct{} - -func (noopMessenger) Send(context.Context, domain.SessionID, string) error { return nil } - -// lifecycleReactionStore bridges the concrete *sqlite.Store to the lifecycle -// package's ReactionStore interface (string/row types <-> domain types). It is -// the production twin of the reactionStoreAdapter used in the lifecycle tests. -type lifecycleReactionStore struct{ store *sqlite.Store } +var ( + _ ports.SessionStore = storeAdapter{} + _ ports.PRWriter = storeAdapter{} +) -func (a lifecycleReactionStore) LoadReactionTrackers(ctx context.Context) ([]lifecycle.PersistedTracker, error) { - rows, err := a.store.ListReactionTrackers(ctx) +// PRFactsForSession picks the PR that drives display status — the most-recently +// updated non-closed PR, else the most recent — and folds in whether it has +// unresolved review comments. 
+func (a storeAdapter) PRFactsForSession(ctx context.Context, id domain.SessionID) (domain.PRFacts, error) { + rows, err := a.Store.ListPRsBySession(ctx, string(id)) // newest first + if err != nil { + return domain.PRFacts{}, err + } + if len(rows) == 0 { + return domain.PRFacts{}, nil + } + pick := rows[0] + for _, r := range rows { + if r.State == "draft" || r.State == "open" { + pick = r + break + } + } + facts := domain.PRFacts{ + URL: pick.URL, Number: int(pick.Number), Exists: true, + Draft: pick.State == "draft", Merged: pick.State == "merged", Closed: pick.State == "closed", + CI: domain.CIState(pick.CIState), + Review: domain.ReviewDecision(pick.ReviewDecision), + Mergeability: domain.Mergeability(pick.Mergeability), + } + comments, err := a.Store.ListPRComments(ctx, pick.URL) if err != nil { - return nil, err + return domain.PRFacts{}, err } - out := make([]lifecycle.PersistedTracker, len(rows)) - for i, r := range rows { - out[i] = lifecycle.PersistedTracker{ - SessionID: domain.SessionID(r.SessionID), - Key: r.ReactionKey, - Attempts: r.Attempts, - Escalated: r.Escalated, - FirstAttemptAt: r.FirstAttemptAt, - ProjectID: domain.ProjectID(r.ProjectID), + for _, c := range comments { + if !c.Resolved { + facts.ReviewComments = true + break } } - return out, nil + return facts, nil +} + +func (a storeAdapter) UpsertPR(ctx context.Context, r ports.PRRow) error { + return a.Store.UpsertPR(ctx, sqlite.PRRow{ + URL: r.URL, SessionID: r.SessionID, Number: int64(r.Number), + State: prState(r), + ReviewDecision: string(r.Review), + CIState: string(r.CI), + Mergeability: string(r.Mergeability), + UpdatedAt: r.UpdatedAt, + }) } -func (a lifecycleReactionStore) SaveReactionTracker(ctx context.Context, t lifecycle.PersistedTracker) error { - return a.store.SaveReactionTracker(ctx, sqlite.ReactionTrackerRow{ - SessionID: string(t.SessionID), - ReactionKey: t.Key, - Attempts: t.Attempts, - Escalated: t.Escalated, - FirstAttemptAt: t.FirstAttemptAt, - ProjectID: 
string(t.ProjectID), +func (a storeAdapter) RecordCheck(ctx context.Context, r ports.PRCheckRow) error { + return a.Store.RecordCheck(ctx, sqlite.PRCheckRow{ + PRURL: r.PRURL, Name: r.Name, CommitHash: r.CommitHash, + Status: r.Status, URL: r.URL, LogTail: r.LogTail, CreatedAt: r.CreatedAt, }) } -func (a lifecycleReactionStore) DeleteReactionTracker(ctx context.Context, id domain.SessionID, key string) error { - return a.store.DeleteReactionTracker(ctx, string(id), key) +func (a storeAdapter) ReplacePRComments(ctx context.Context, prURL string, comments []ports.PRComment) error { + rows := make([]sqlite.PRCommentRow, len(comments)) + for i, c := range comments { + rows[i] = sqlite.PRCommentRow{ + PRURL: prURL, CommentID: c.ID, Author: c.Author, File: c.File, + Line: int64(c.Line), Body: c.Body, Resolved: c.Resolved, CreatedAt: c.CreatedAt, + } + } + return a.Store.ReplacePRComments(ctx, prURL, rows) } -func (a lifecycleReactionStore) DeleteSessionReactionTrackers(ctx context.Context, id domain.SessionID) error { - return a.store.DeleteSessionReactionTrackers(ctx, string(id)) +// prState collapses the PR's bools into the single pr.state column value. +func prState(r ports.PRRow) string { + switch { + case r.Merged: + return "merged" + case r.Closed: + return "closed" + case r.Draft: + return "draft" + default: + return "open" + } } + +// noopNotifier / noopMessenger are TEMPORARY stubs (see startLifecycle): the +// write path and CDC work without them; only the human push / agent nudge are +// absent until the real plugins are wired. 
+type noopNotifier struct{} + +func (noopNotifier) Notify(context.Context, ports.Event) error { return nil } + +type noopMessenger struct{} + +func (noopMessenger) Send(context.Context, domain.SessionID, string) error { return nil } diff --git a/backend/main.go b/backend/main.go index 8db058ea..c4d4da52 100644 --- a/backend/main.go +++ b/backend/main.go @@ -53,19 +53,18 @@ func run() error { // lane and are wired there once their collaborators (Notifier, AgentMessenger, // and the runtime/agent/workspace plugins) have production implementations; // here we stand up the persistence + change-delivery foundation they build on. - db, err := sqlite.Open(cfg.DataDir) + store, err := sqlite.Open(cfg.DataDir) if err != nil { return fmt.Errorf("open store: %w", err) } - defer db.Close() - store := sqlite.NewStore(db) + defer store.Close() // signal.NotifyContext cancels ctx on SIGINT/SIGTERM, which drives the // graceful shutdown inside Server.Run. ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) defer stop() - cdcPipe, err := startCDC(ctx, store, cfg.DataDir, log) + cdcPipe, err := startCDC(ctx, store, log) if err != nil { return err } diff --git a/backend/main_test.go b/backend/main_test.go deleted file mode 100644 index c8f32541..00000000 --- a/backend/main_test.go +++ /dev/null @@ -1,134 +0,0 @@ -package main - -import ( - "context" - "encoding/json" - "testing" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" - "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" -) - -// These tests cover the composition-root adapters in cdc_wiring.go directly -// (package main otherwise has no test coverage): the outboxAdapter mapping the -// store's OutboxEvent to cdc.PendingEvent, and the snapshotSource rebuilding -// full-state events from the sessions table. 
- -func newWiringStore(t *testing.T) *sqlite.Store { - t.Helper() - db, err := sqlite.Open(t.TempDir()) - if err != nil { - t.Fatalf("open: %v", err) - } - t.Cleanup(func() { db.Close() }) - return sqlite.NewStore(db) -} - -func wiringRec(id string) domain.SessionRecord { - now := time.Now().UTC() - return domain.SessionRecord{ - ID: domain.SessionID(id), ProjectID: "proj", Kind: domain.KindWorker, CreatedAt: now, UpdatedAt: now, - Lifecycle: domain.CanonicalSessionLifecycle{ - Session: domain.SessionSubstate{State: domain.SessionWorking, Reason: domain.ReasonTaskInProgress}, - PR: domain.PRSubstate{State: domain.PRNone, Reason: domain.PRReasonNotCreated}, - Runtime: domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning}, - Activity: domain.ActivitySubstate{State: domain.ActivityActive, LastActivityAt: now, Source: domain.SourceNative}, - }, - } -} - -func TestOutboxAdapterMapsPendingEvents(t *testing.T) { - ctx := context.Background() - store := newWiringStore(t) - a := outboxAdapter{store} - - if err := store.Upsert(ctx, wiringRec("s1"), ports.EventSessionCreated); err != nil { - t.Fatalf("upsert: %v", err) - } - - pending, err := a.ListUnsent(ctx, 10) - if err != nil { - t.Fatalf("list unsent: %v", err) - } - if len(pending) != 1 { - t.Fatalf("want 1 pending event, got %d", len(pending)) - } - pe := pending[0] - if pe.Seq != 1 || pe.SessionID != "s1" || pe.EventType != string(ports.EventSessionCreated) || pe.Revision != 1 { - t.Fatalf("unexpected mapping: %+v", pe) - } - if pe.Payload == "" { - t.Fatal("payload should carry the marshaled record") - } - - // MarkSent must clear it from the unsent set. 
- if err := a.MarkSent(ctx, pe.OutboxID, time.Now().UTC()); err != nil { - t.Fatalf("mark sent: %v", err) - } - again, err := a.ListUnsent(ctx, 10) - if err != nil { - t.Fatalf("list unsent 2: %v", err) - } - if len(again) != 0 { - t.Fatalf("sent event should not reappear, got %d", len(again)) - } -} - -func TestSnapshotSourceRebuildsState(t *testing.T) { - ctx := context.Background() - store := newWiringStore(t) - s := snapshotSource{store} - - // Empty store: no events, maxSeq 0. - events, maxSeq, err := s.Snapshot(ctx) - if err != nil { - t.Fatalf("empty snapshot: %v", err) - } - if len(events) != 0 || maxSeq != 0 { - t.Fatalf("empty store should yield no events and maxSeq 0, got %d events maxSeq %d", len(events), maxSeq) - } - - // Two canonical writes (seq 1,2) across two sessions. - if err := store.Upsert(ctx, wiringRec("s1"), ports.EventSessionCreated); err != nil { - t.Fatalf("upsert s1: %v", err) - } - if err := store.Upsert(ctx, wiringRec("s2"), ports.EventSessionCreated); err != nil { - t.Fatalf("upsert s2: %v", err) - } - - events, maxSeq, err = s.Snapshot(ctx) - if err != nil { - t.Fatalf("snapshot: %v", err) - } - if maxSeq != 2 { - t.Fatalf("maxSeq = %d, want 2 (change_log high-water)", maxSeq) - } - if len(events) != 2 { - t.Fatalf("want one event per session (2), got %d", len(events)) - } - for _, e := range events { - if e.Seq != maxSeq { - t.Errorf("snapshot event seq = %d, want resume watermark %d", e.Seq, maxSeq) - } - if e.EventType != "session_snapshot" { - t.Errorf("event type = %q, want session_snapshot", e.EventType) - } - // Payload must be a parseable full record at the persisted revision with - // metadata excluded and the schema version stamped. 
- var rec domain.SessionRecord - if err := json.Unmarshal([]byte(e.Payload), &rec); err != nil { - t.Fatalf("payload not a SessionRecord: %v", err) - } - if rec.Lifecycle.Version != domain.LifecycleVersion { - t.Errorf("payload version = %d, want %d", rec.Lifecycle.Version, domain.LifecycleVersion) - } - if rec.Lifecycle.Revision != 1 { - t.Errorf("payload revision = %d, want 1", rec.Lifecycle.Revision) - } - if !rec.Metadata.IsZero() { - t.Errorf("snapshot payload must exclude metadata, got %v", rec.Metadata) - } - } -} diff --git a/backend/wiring_test.go b/backend/wiring_test.go new file mode 100644 index 00000000..74b314b0 --- /dev/null +++ b/backend/wiring_test.go @@ -0,0 +1,71 @@ +package main + +import ( + "context" + "sync" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/cdc" + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" +) + +// TestWiring_WriteFlowsToBroadcaster exercises the real boot path end to end: +// a lifecycle write -> sqlite -> DB trigger -> change_log -> CDC poller -> +// broadcaster, through the production storeAdapter and cdcSource. 
+func TestWiring_WriteFlowsToBroadcaster(t *testing.T) { + ctx := context.Background() + store, err := sqlite.Open(t.TempDir()) + if err != nil { + t.Fatal(err) + } + defer store.Close() + + a := storeAdapter{store} + lcm := lifecycle.New(a, a, noopNotifier{}, noopMessenger{}) + + bcast := cdc.NewBroadcaster() + poller := cdc.NewPoller(cdcSource{store}, bcast, cdc.PollerConfig{}) + if err := poller.SeekToHead(ctx); err != nil { + t.Fatal(err) + } + + var mu sync.Mutex + var got []cdc.Event + bcast.Subscribe(func(e cdc.Event) { mu.Lock(); got = append(got, e); mu.Unlock() }) + + if err := store.UpsertProject(ctx, sqlite.ProjectRow{ID: "mer", Path: "/repo/mer"}); err != nil { + t.Fatal(err) + } + rec, err := store.CreateSession(ctx, domain.SessionRecord{ + ProjectID: "mer", Kind: domain.KindWorker, + Lifecycle: domain.CanonicalSessionLifecycle{Version: domain.LifecycleVersion, Session: domain.SessionSubstate{State: domain.SessionNotStarted}}, + }) + if err != nil { + t.Fatal(err) + } + // A real transition through the engine, which writes the row and fires the + // is_alive/activity_state CDC trigger. + if err := lcm.ApplyActivitySignal(ctx, rec.ID, ports.ActivitySignal{Valid: true, State: domain.ActivityActive, Timestamp: time.Now()}); err != nil { + t.Fatal(err) + } + + if err := poller.Poll(ctx); err != nil { + t.Fatal(err) + } + + mu.Lock() + defer mu.Unlock() + var sawSession bool + for _, e := range got { + if e.SessionID == string(rec.ID) { + sawSession = true + } + } + if !sawSession { + t.Fatalf("expected a change_log event for %s to reach the broadcaster, got %d events", rec.ID, len(got)) + } +} From 0a69b8429ee89156107f6a2d7f6a608e2bf60bdc Mon Sep 17 00:00:00 2001 From: prateek Date: Sun, 31 May 2026 07:18:22 +0530 Subject: [PATCH 08/10] docs(config): drop stale CDC-JSONL mention in resolveDataDir CDC is trigger-driven in the SQLite DB now; there is no JSONL log. 
Co-Authored-By: Claude Opus 4.8 --- backend/internal/config/config.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/internal/config/config.go b/backend/internal/config/config.go index 68aab00e..719e7524 100644 --- a/backend/internal/config/config.go +++ b/backend/internal/config/config.go @@ -149,9 +149,9 @@ func resolveRunFilePath() (string, error) { return filepath.Join(dir, "agent-orchestrator", "running.json"), nil } -// resolveDataDir picks where durable state (SQLite DB, CDC JSONL) lives. An -// explicit AO_DATA_DIR wins; otherwise it sits under the per-user state -// directory alongside running.json. +// resolveDataDir picks where durable state (the SQLite DB) lives. An explicit +// AO_DATA_DIR wins; otherwise it sits under the per-user state directory +// alongside running.json. func resolveDataDir() (string, error) { if p, ok := os.LookupEnv("AO_DATA_DIR"); ok && p != "" { return p, nil From 0dbd304e5832045b050b7e7679a563cd494d3107 Mon Sep 17 00:00:00 2001 From: prateek Date: Sun, 31 May 2026 16:14:02 +0530 Subject: [PATCH 09/10] fix(backend): drain CDC/lifecycle goroutines without deadlocking on non-signal exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit lcStack.Stop()/cdcPipe.Stop() block on done channels that close only when ctx is cancelled, but the deferred stop() that cancels ctx ran last (LIFO) — so any non-signal exit (e.g. a listener bind error) hung the daemon forever. Cancel ctx first, then drain, explicitly after srv.Run instead of via defer. Also refresh the startup comments that still described the removed outbox/JSONL/janitor flow. 
Co-Authored-By: Claude Opus 4.8 --- backend/main.go | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/backend/main.go b/backend/main.go index c4d4da52..60d9e26e 100644 --- a/backend/main.go +++ b/backend/main.go @@ -47,12 +47,13 @@ func run() error { return err } - // Open the durable store and bring up the CDC substrate (outbox publisher, - // JSONL consumer + broadcaster, outbox janitor). The LCM/Session Manager and - // the HTTP API routes that drive and read this store are owned by the daemon - // lane and are wired there once their collaborators (Notifier, AgentMessenger, - // and the runtime/agent/workspace plugins) have production implementations; - // here we stand up the persistence + change-delivery foundation they build on. + // Open the durable store and bring up the CDC substrate: the DB triggers + // capture changes into change_log, the poller tails it, and the broadcaster + // fans events out to the SSE transport. The LCM/Session Manager and the HTTP + // API routes that drive and read this store are owned by the daemon lane and + // are wired there once their collaborators (Notifier, AgentMessenger, and the + // runtime/agent/workspace plugins) have production implementations; here we + // stand up the persistence + change-delivery foundation they build on. store, err := sqlite.Open(cfg.DataDir) if err != nil { return fmt.Errorf("open store: %w", err) @@ -60,7 +61,7 @@ func run() error { defer store.Close() // signal.NotifyContext cancels ctx on SIGINT/SIGTERM, which drives the - // graceful shutdown inside Server.Run. + // graceful shutdown inside Server.Run and stops the background goroutines. 
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) defer stop() @@ -68,17 +69,12 @@ func run() error { if err != nil { return err } - defer func() { - if err := cdcPipe.Stop(); err != nil { - log.Error("cdc pipeline shutdown", "err", err) - } - }() // Bring up the Lifecycle Manager (sole store writer) and the reaper (OBSERVE - // timer). This makes the write path live end-to-end: LCM.Upsert -> store -> - // outbox -> CDC JSONL -> broadcaster. The collaborators it needs that don't - // yet have production implementations (Notifier, AgentMessenger, runtime - // registry) are stubbed in lifecycle_wiring.go with TODO markers. + // timer). This makes the write path live end-to-end: LCM write -> store -> DB + // trigger -> change_log -> poller -> broadcaster. The collaborators it needs + // that don't yet have production implementations (Notifier, AgentMessenger, + // runtime registry) are stubbed in lifecycle_wiring.go with TODO markers. // // NOT wired here yet — both await collaborators the daemon lane owns: // - Session Manager: session.New needs Runtime/Agent/Workspace plugins to @@ -90,11 +86,21 @@ func run() error { // the SM work since the routes call into it. lcStack, err := startLifecycle(ctx, store, log) if err != nil { - return fmt.Errorf("start lifecycle: %w", err) + return err } - defer lcStack.Stop() - return srv.Run(ctx) + runErr := srv.Run(ctx) + + // Shut the background goroutines down in order: cancel the context FIRST so + // their loops exit, then wait for them to drain. Doing this explicitly (not + // via defer) avoids the LIFO trap where a Stop() that blocks on ctx-cancel + // runs before the cancel — which would hang any non-signal exit path. + stop() + lcStack.Stop() + if err := cdcPipe.Stop(); err != nil { + log.Error("cdc pipeline shutdown", "err", err) + } + return runErr } // newLogger returns the daemon's slog logger. 
It writes to stderr so the From 70aab5eb26352869842b4a96db615422f55acf55 Mon Sep 17 00:00:00 2001 From: prateek Date: Sun, 31 May 2026 17:02:47 +0530 Subject: [PATCH 10/10] feat(backend): atomic PR-observation write + CDC on check status updates Addresses review on PR-observation persistence: - pr_checks now has an AFTER UPDATE CDC trigger (guarded on status change), so a check flipping in_progress->failed on the same commit emits change_log instead of updating silently. Restores symmetry with the sessions/pr triggers. - writePR persists scalar facts + checks + comments in ONE transaction via Store.WritePRObservation, so a mid-write failure can't leave the pr row (and its CDC event) committed while checks/comments are partial. Collapses the PRWriter port's three write methods into one WritePR. - db.go: record why modernc.org/sqlite (pure-Go, CGO-free static binary) at the import site. Regression tests for both the update-trigger (emit on change, suppress no-op re-poll) and the transactional write. go test -race ./... green. 
Co-Authored-By: Claude Opus 4.8 --- backend/internal/lifecycle/manager.go | 22 +++-- backend/internal/lifecycle/manager_test.go | 14 +-- backend/internal/ports/outbound.go | 8 +- backend/internal/storage/sqlite/db.go | 3 + .../storage/sqlite/migrations/0001_init.sql | 19 ++++ .../internal/storage/sqlite/pr_cdc_test.go | 86 +++++++++++++++++++ backend/internal/storage/sqlite/pr_store.go | 74 +++++++++++++--- backend/lifecycle_wiring.go | 43 +++++----- 8 files changed, 212 insertions(+), 57 deletions(-) create mode 100644 backend/internal/storage/sqlite/pr_cdc_test.go diff --git a/backend/internal/lifecycle/manager.go b/backend/internal/lifecycle/manager.go index 5c58f0a2..f61d38b4 100644 --- a/backend/internal/lifecycle/manager.go +++ b/backend/internal/lifecycle/manager.go @@ -176,27 +176,31 @@ func (m *Manager) ApplyPRObservation(ctx context.Context, id domain.SessionID, o return m.runReactions(ctx, id, prContent(o)) } -// writePR upserts the scalar facts, records each check run, and replaces the -// comment set. PR-table CDC is emitted by the DB triggers. +// writePR persists the observation's scalar facts, check runs, and comment set +// in one atomic store call. PR-table CDC is emitted by the DB triggers. 
func (m *Manager) writePR(ctx context.Context, id domain.SessionID, o ports.PRObservation) error { now := m.clock() - if err := m.pr.UpsertPR(ctx, ports.PRRow{ + row := ports.PRRow{ URL: o.URL, SessionID: string(id), Number: o.Number, Draft: o.Draft, Merged: o.Merged, Closed: o.Closed, CI: o.CI, Review: o.Review, Mergeability: o.Mergeability, UpdatedAt: now, - }); err != nil { - return err } - for _, c := range o.Checks { + checks := make([]ports.PRCheckRow, len(o.Checks)) + for i, c := range o.Checks { c.PRURL = o.URL if c.CreatedAt.IsZero() { c.CreatedAt = now } - if err := m.pr.RecordCheck(ctx, c); err != nil { - return err + checks[i] = c + } + comments := make([]ports.PRComment, len(o.Comments)) + for i, c := range o.Comments { + if c.CreatedAt.IsZero() { + c.CreatedAt = now } + comments[i] = c } - return m.pr.ReplacePRComments(ctx, o.URL, o.Comments) + return m.pr.WritePR(ctx, row, checks, comments) } // ---- mutation commands from the Session Manager ---- diff --git a/backend/internal/lifecycle/manager_test.go b/backend/internal/lifecycle/manager_test.go index 7843f8af..4ae9aaaf 100644 --- a/backend/internal/lifecycle/manager_test.go +++ b/backend/internal/lifecycle/manager_test.go @@ -82,12 +82,10 @@ func (f *fakeStore) PRFactsForSession(_ context.Context, id domain.SessionID) (d } return facts, nil } -func (f *fakeStore) UpsertPR(_ context.Context, r ports.PRRow) error { - f.pr[domain.SessionID(r.SessionID)] = r - return nil -} -func (f *fakeStore) RecordCheck(_ context.Context, r ports.PRCheckRow) error { - f.checks = append(f.checks, r) +func (f *fakeStore) WritePR(_ context.Context, pr ports.PRRow, checks []ports.PRCheckRow, comments []ports.PRComment) error { + f.pr[domain.SessionID(pr.SessionID)] = pr + f.checks = append(f.checks, checks...) 
+ f.comments[pr.URL] = comments return nil } func (f *fakeStore) RecentCheckStatuses(_ context.Context, url, name string, limit int) ([]string, error) { @@ -99,10 +97,6 @@ func (f *fakeStore) RecentCheckStatuses(_ context.Context, url, name string, lim } return out, nil } -func (f *fakeStore) ReplacePRComments(_ context.Context, url string, cs []ports.PRComment) error { - f.comments[url] = cs - return nil -} type fakeNotifier struct{ events []ports.Event } diff --git a/backend/internal/ports/outbound.go b/backend/internal/ports/outbound.go index d180f538..75a24bf0 100644 --- a/backend/internal/ports/outbound.go +++ b/backend/internal/ports/outbound.go @@ -24,10 +24,12 @@ type SessionStore interface { // PRWriter records the PR facts a PR observation carries. The pr table's own DB // triggers emit the CDC; this just writes the rows. type PRWriter interface { - UpsertPR(ctx context.Context, r PRRow) error - RecordCheck(ctx context.Context, r PRCheckRow) error + // WritePR persists a full PR observation — scalar facts, check runs, and the + // replacement comment set — in one transaction, so the rows and the CDC + // events they emit are all-or-nothing. + WritePR(ctx context.Context, pr PRRow, checks []PRCheckRow, comments []PRComment) error + // RecentCheckStatuses reads the last `limit` runs of a check (the CI brake). RecentCheckStatuses(ctx context.Context, prURL, name string, limit int) ([]string, error) - ReplacePRComments(ctx context.Context, prURL string, comments []PRComment) error } // Notifier delivers an event to the human (desktop/Slack later). Push, never poll. 
diff --git a/backend/internal/storage/sqlite/db.go b/backend/internal/storage/sqlite/db.go index 63f6b7dd..8b001d11 100644 --- a/backend/internal/storage/sqlite/db.go +++ b/backend/internal/storage/sqlite/db.go @@ -11,6 +11,9 @@ import ( "path/filepath" "github.com/pressly/goose/v3" + // modernc.org/sqlite is the pure-Go (CGO-free) SQLite driver — chosen so the + // daemon cross-compiles and ships as a static binary with no libsqlite/CGO + // toolchain dependency, at the cost of some raw throughput vs a C-backed driver. _ "modernc.org/sqlite" ) diff --git a/backend/internal/storage/sqlite/migrations/0001_init.sql b/backend/internal/storage/sqlite/migrations/0001_init.sql index 6534816d..9d5a6a22 100644 --- a/backend/internal/storage/sqlite/migrations/0001_init.sql +++ b/backend/internal/storage/sqlite/migrations/0001_init.sql @@ -202,6 +202,25 @@ BEGIN END; -- +goose StatementEnd +-- A re-polled check can change status on the same commit (in_progress -> failed) +-- via UpsertPRCheck's ON CONFLICT DO UPDATE. Without this trigger that status +-- transition would update the row silently, so CDC consumers would never see it. +-- Guarded on the status so a no-op re-poll emits nothing. 
+-- +goose StatementBegin +CREATE TRIGGER pr_checks_cdc_update +AFTER UPDATE ON pr_checks +WHEN OLD.status <> NEW.status +BEGIN + INSERT INTO change_log (project_id, session_id, event_type, payload, created_at) + VALUES ( + (SELECT s.project_id FROM pr p JOIN sessions s ON s.id = p.session_id WHERE p.url = NEW.pr_url), + (SELECT session_id FROM pr WHERE url = NEW.pr_url), + 'pr_check_recorded', + json_object('pr', NEW.pr_url, 'name', NEW.name, 'commit', NEW.commit_hash, 'status', NEW.status), + NEW.created_at); +END; +-- +goose StatementEnd + -- +goose Down -- +goose StatementBegin DROP TABLE change_log; diff --git a/backend/internal/storage/sqlite/pr_cdc_test.go b/backend/internal/storage/sqlite/pr_cdc_test.go new file mode 100644 index 00000000..8c8f7ea2 --- /dev/null +++ b/backend/internal/storage/sqlite/pr_cdc_test.go @@ -0,0 +1,86 @@ +package sqlite + +import ( + "context" + "strings" + "testing" + "time" +) + +// A check can change status on the same commit (in_progress -> failed) via +// UpsertPRCheck's ON CONFLICT DO UPDATE. CDC must emit on that transition, not +// only on the first insert — otherwise live clients never see the status change. 
+func TestPRChecksCDC_EmitsOnInsertAndStatusUpdate(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + seedProject(t, s, "mer") + rec, err := s.CreateSession(ctx, sampleRecord("mer")) + if err != nil { + t.Fatal(err) + } + url := "https://example/pr/1" + if err := s.UpsertPR(ctx, PRRow{URL: url, SessionID: string(rec.ID), Number: 1}); err != nil { + t.Fatal(err) + } + + now := time.Now() + mustCheck := func(status string) { + if err := s.RecordCheck(ctx, PRCheckRow{PRURL: url, Name: "build", CommitHash: "c1", Status: status, CreatedAt: now}); err != nil { + t.Fatal(err) + } + } + mustCheck("in_progress") // insert -> event + mustCheck("failed") // status change on same commit (update) -> event + mustCheck("failed") // no-op re-poll (status unchanged) -> NO event + + rows, err := s.ReadChangeLogAfter(ctx, 0, 100) + if err != nil { + t.Fatal(err) + } + var checkEvents []ChangeLogRow + for _, r := range rows { + if r.EventType == "pr_check_recorded" { + checkEvents = append(checkEvents, r) + } + } + if len(checkEvents) != 2 { + t.Fatalf("want 2 check CDC events (insert + status change, no-op suppressed), got %d", len(checkEvents)) + } + if !strings.Contains(checkEvents[1].Payload, `"status":"failed"`) { + t.Fatalf("the update event should carry the new status, got %q", checkEvents[1].Payload) + } +} + +// WritePRObservation persists scalar facts, checks, and comments in one tx; all +// three should be queryable afterward. 
+func TestWritePRObservation_PersistsScalarsChecksAndComments(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + seedProject(t, s, "mer") + rec, err := s.CreateSession(ctx, sampleRecord("mer")) + if err != nil { + t.Fatal(err) + } + url := "https://example/pr/7" + now := time.Now() + + err = s.WritePRObservation(ctx, + PRRow{URL: url, SessionID: string(rec.ID), Number: 7, CIState: "failing", UpdatedAt: now}, + []PRCheckRow{{PRURL: url, Name: "build", CommitHash: "c1", Status: "failed", CreatedAt: now}}, + []PRCommentRow{{PRURL: url, CommentID: "1", Author: "reviewer", Body: "use a const", CreatedAt: now}}, + ) + if err != nil { + t.Fatal(err) + } + + pr, ok, err := s.GetPR(ctx, url) + if err != nil || !ok || pr.CIState != "failing" { + t.Fatalf("scalar facts not persisted: ok=%v ci=%q err=%v", ok, pr.CIState, err) + } + if checks, _ := s.ListChecks(ctx, url); len(checks) != 1 || checks[0].Status != "failed" { + t.Fatalf("check not persisted: %+v", checks) + } + if comments, _ := s.ListPRComments(ctx, url); len(comments) != 1 || comments[0].Body != "use a const" { + t.Fatalf("comment not persisted: %+v", comments) + } +} diff --git a/backend/internal/storage/sqlite/pr_store.go b/backend/internal/storage/sqlite/pr_store.go index 4170da4d..8b41396c 100644 --- a/backend/internal/storage/sqlite/pr_store.go +++ b/backend/internal/storage/sqlite/pr_store.go @@ -27,18 +27,7 @@ type PRRow struct { // fields default to their "nothing known yet" value so a partial row is valid // against the CHECK constraints (matches the domain zero values none/unknown). 
func (s *Store) UpsertPR(ctx context.Context, r PRRow) error { - if r.State == "" { - r.State = "open" - } - if r.ReviewDecision == "" { - r.ReviewDecision = "none" - } - if r.CIState == "" { - r.CIState = "unknown" - } - if r.Mergeability == "" { - r.Mergeability = "unknown" - } + r = r.withDefaults() s.writeMu.Lock() defer s.writeMu.Unlock() return s.qw.UpsertPR(ctx, gen.UpsertPRParams{ @@ -53,6 +42,67 @@ func (s *Store) UpsertPR(ctx context.Context, r PRRow) error { }) } +// WritePRObservation persists a full PR observation — scalar facts, check runs, +// and the replacement comment set — in one write transaction, so the rows and +// the change_log events their triggers emit are committed all-or-nothing. The +// scalar PR upsert runs first so the checks'/comments' CDC triggers can resolve +// the session id from the pr row within the same transaction. +func (s *Store) WritePRObservation(ctx context.Context, pr PRRow, checks []PRCheckRow, comments []PRCommentRow) error { + pr = pr.withDefaults() + s.writeMu.Lock() + defer s.writeMu.Unlock() + return s.inTx(ctx, "write pr observation", func(q *gen.Queries) error { + if err := q.UpsertPR(ctx, gen.UpsertPRParams{ + Url: pr.URL, SessionID: pr.SessionID, Number: pr.Number, + PrState: pr.State, ReviewDecision: pr.ReviewDecision, + CiState: pr.CIState, Mergeability: pr.Mergeability, UpdatedAt: pr.UpdatedAt, + }); err != nil { + return err + } + for _, c := range checks { + if c.Status == "" { + c.Status = "unknown" + } + if err := q.UpsertPRCheck(ctx, gen.UpsertPRCheckParams{ + PrUrl: c.PRURL, Name: c.Name, CommitHash: c.CommitHash, + Status: c.Status, Url: c.URL, LogTail: c.LogTail, CreatedAt: c.CreatedAt, + }); err != nil { + return err + } + } + if err := q.DeletePRComments(ctx, pr.URL); err != nil { + return err + } + for _, cm := range comments { + if err := q.UpsertPRComment(ctx, gen.UpsertPRCommentParams{ + PrUrl: pr.URL, CommentID: cm.CommentID, Author: cm.Author, File: cm.File, + Line: cm.Line, Body: cm.Body, 
Resolved: boolToInt(cm.Resolved), CreatedAt: cm.CreatedAt, + }); err != nil { + return fmt.Errorf("comment %q: %w", cm.CommentID, err) + } + } + return nil + }) +} + +// withDefaults fills empty enum fields with their "nothing known yet" value so a +// partial row satisfies the CHECK constraints (matches UpsertPR). +func (r PRRow) withDefaults() PRRow { + if r.State == "" { + r.State = "open" + } + if r.ReviewDecision == "" { + r.ReviewDecision = "none" + } + if r.CIState == "" { + r.CIState = "unknown" + } + if r.Mergeability == "" { + r.Mergeability = "unknown" + } + return r +} + // GetPR returns the PR facts for a URL, or ok=false if absent. func (s *Store) GetPR(ctx context.Context, url string) (PRRow, bool, error) { p, err := s.qr.GetPR(ctx, url) diff --git a/backend/lifecycle_wiring.go b/backend/lifecycle_wiring.go index f69b1ce4..d736d653 100644 --- a/backend/lifecycle_wiring.go +++ b/backend/lifecycle_wiring.go @@ -87,33 +87,30 @@ func (a storeAdapter) PRFactsForSession(ctx context.Context, id domain.SessionID return facts, nil } -func (a storeAdapter) UpsertPR(ctx context.Context, r ports.PRRow) error { - return a.Store.UpsertPR(ctx, sqlite.PRRow{ - URL: r.URL, SessionID: r.SessionID, Number: int64(r.Number), - State: prState(r), - ReviewDecision: string(r.Review), - CIState: string(r.CI), - Mergeability: string(r.Mergeability), - UpdatedAt: r.UpdatedAt, - }) -} - -func (a storeAdapter) RecordCheck(ctx context.Context, r ports.PRCheckRow) error { - return a.Store.RecordCheck(ctx, sqlite.PRCheckRow{ - PRURL: r.PRURL, Name: r.Name, CommitHash: r.CommitHash, - Status: r.Status, URL: r.URL, LogTail: r.LogTail, CreatedAt: r.CreatedAt, - }) -} - -func (a storeAdapter) ReplacePRComments(ctx context.Context, prURL string, comments []ports.PRComment) error { - rows := make([]sqlite.PRCommentRow, len(comments)) +func (a storeAdapter) WritePR(ctx context.Context, pr ports.PRRow, checks []ports.PRCheckRow, comments []ports.PRComment) error { + row := sqlite.PRRow{ + 
URL: pr.URL, SessionID: pr.SessionID, Number: int64(pr.Number), + State: prState(pr), + ReviewDecision: string(pr.Review), + CIState: string(pr.CI), + Mergeability: string(pr.Mergeability), + UpdatedAt: pr.UpdatedAt, + } + checkRows := make([]sqlite.PRCheckRow, len(checks)) + for i, c := range checks { + checkRows[i] = sqlite.PRCheckRow{ + PRURL: c.PRURL, Name: c.Name, CommitHash: c.CommitHash, + Status: c.Status, URL: c.URL, LogTail: c.LogTail, CreatedAt: c.CreatedAt, + } + } + commentRows := make([]sqlite.PRCommentRow, len(comments)) for i, c := range comments { - rows[i] = sqlite.PRCommentRow{ - PRURL: prURL, CommentID: c.ID, Author: c.Author, File: c.File, + commentRows[i] = sqlite.PRCommentRow{ + PRURL: pr.URL, CommentID: c.ID, Author: c.Author, File: c.File, Line: int64(c.Line), Body: c.Body, Resolved: c.Resolved, CreatedAt: c.CreatedAt, } } - return a.Store.ReplacePRComments(ctx, prURL, rows) + return a.Store.WritePRObservation(ctx, row, checkRows, commentRows) } // prState collapses the PR's bools into the single pr.state column value.