diff --git a/.gitignore b/.gitignore index e5ea212a..425b31d7 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,15 @@ vendor/ /backend/backend agent-orchestrator.yaml +# Backend runtime data artifacts (SQLite store + WAL, CDC event log). +# Created at AO_DATA_DIR (outside the repo by default); ignored here so a +# data dir pointed at the tree never gets committed. +*.db +*.db-shm +*.db-wal +session-events.jsonl +session-events.jsonl.* + # Environment .env .env.* diff --git a/backend/cdc_wiring.go b/backend/cdc_wiring.go new file mode 100644 index 00000000..d824cbab --- /dev/null +++ b/backend/cdc_wiring.go @@ -0,0 +1,64 @@ +package main + +import ( + "context" + "encoding/json" + "log/slog" + + "github.com/aoagents/agent-orchestrator/backend/internal/cdc" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" +) + +// cdcPipeline owns the running CDC poller and the broadcaster the SSE transport +// subscribes to. The DB triggers write change_log; the poller tails it and fans +// each new event out through the broadcaster. Durable catch-up is the client's +// job (it reads change_log from its own Last-Event-ID), so the poller only +// pushes live events and re-seeks to head on restart. +type cdcPipeline struct { + Broadcaster *cdc.Broadcaster + done <-chan struct{} +} + +// startCDC seeks the poller to the current head and starts its loop. It stops +// when ctx is cancelled; Stop waits for it to drain. +func startCDC(ctx context.Context, store *sqlite.Store, logger *slog.Logger) (*cdcPipeline, error) { + bcast := cdc.NewBroadcaster() + poller := cdc.NewPoller(cdcSource{store}, bcast, cdc.PollerConfig{Logger: logger}) + if err := poller.SeekToHead(ctx); err != nil { + return nil, err + } + return &cdcPipeline{Broadcaster: bcast, done: poller.Start(ctx)}, nil +} + +// Stop waits for the poller goroutine to exit (the caller must have cancelled the +// ctx passed to startCDC). 
+func (p *cdcPipeline) Stop() error { + <-p.done + return nil +} + +// cdcSource adapts *sqlite.Store's change_log reads to cdc.Source. +type cdcSource struct{ store *sqlite.Store } + +func (s cdcSource) EventsAfter(ctx context.Context, after int64, limit int) ([]cdc.Event, error) { + rows, err := s.store.ReadChangeLogAfter(ctx, after, limit) + if err != nil { + return nil, err + } + out := make([]cdc.Event, len(rows)) + for i, r := range rows { + out[i] = cdc.Event{ + Seq: r.Seq, + ProjectID: r.ProjectID, + SessionID: r.SessionID, + Type: cdc.EventType(r.EventType), + Payload: json.RawMessage(r.Payload), + CreatedAt: r.CreatedAt, + } + } + return out, nil +} + +func (s cdcSource) LatestSeq(ctx context.Context) (int64, error) { + return s.store.MaxChangeLogSeq(ctx) +} diff --git a/backend/go.mod b/backend/go.mod index 311cea28..88ca590c 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -1,5 +1,25 @@ module github.com/aoagents/agent-orchestrator/backend -go 1.22 +go 1.25.7 -require github.com/go-chi/chi/v5 v5.1.0 +require ( + github.com/go-chi/chi/v5 v5.1.0 + github.com/pressly/goose/v3 v3.27.1 + modernc.org/sqlite v1.51.0 +) + +require ( + github.com/dustin/go-humanize v1.0.1 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/mattn/go-isatty v0.0.21 // indirect + github.com/mfridman/interpolate v0.0.2 // indirect + github.com/ncruces/go-strftime v1.0.0 // indirect + github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect + github.com/sethvargo/go-retry v0.3.0 // indirect + go.uber.org/multierr v1.11.0 // indirect + golang.org/x/sync v0.20.0 // indirect + golang.org/x/sys v0.43.0 // indirect + modernc.org/libc v1.72.3 // indirect + modernc.org/mathutil v1.7.1 // indirect + modernc.org/memory v1.11.0 // indirect +) diff --git a/backend/go.sum b/backend/go.sum index 823cdbb1..89f83929 100644 --- a/backend/go.sum +++ b/backend/go.sum @@ -1,2 +1,68 @@ +github.com/davecgh/go-spew v1.1.1 
h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= +github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/go-chi/chi/v5 v5.1.0 h1:acVI1TYaD+hhedDJ3r54HyA6sExp3HfXq7QWEEY/xMw= github.com/go-chi/chi/v5 v5.1.0/go.mod h1:DslCQbL2OYiznFReuXYUmQ2hGd1aDpCnlMNITLSKoi8= +github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs= +github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= +github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= +github.com/mattn/go-isatty v0.0.21 h1:xYae+lCNBP7QuW4PUnNG61ffM4hVIfm+zUzDuSzYLGs= +github.com/mattn/go-isatty v0.0.21/go.mod h1:ZXfXG4SQHsB/w3ZeOYbR0PrPwLy+n6xiMrJlRFqopa4= +github.com/mfridman/interpolate v0.0.2 h1:pnuTK7MQIxxFz1Gr+rjSIx9u7qVjf5VOoM/u6BbAxPY= +github.com/mfridman/interpolate v0.0.2/go.mod h1:p+7uk6oE07mpE/Ik1b8EckO0O4ZXiGAfshKBWLUM9Xg= +github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w= +github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pressly/goose/v3 v3.27.1 h1:6uEvcprBybDmW4hcz3gYujhARhye+GoWKhEWyzD5sh4= +github.com/pressly/goose/v3 v3.27.1/go.mod h1:maruOxsPnIG2yHHyo8UqKWXYKFcH7Q76csUV7+7KYoM= +github.com/remyoudompheng/bigfft 
v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= +github.com/sethvargo/go-retry v0.3.0 h1:EEt31A35QhrcRZtrYFDTBg91cqZVnFL2navjDrah2SE= +github.com/sethvargo/go-retry v0.3.0/go.mod h1:mNX17F0C/HguQMyMyJxcnU471gOZGxCLyYaFyAZraas= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8= +golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= +golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI= +golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= +golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +modernc.org/cc/v4 v4.28.2 h1:3tQ0lf2ADtoby2EtSP+J7IE2SHwEJdP8ioR59wx7XpY= +modernc.org/cc/v4 v4.28.2/go.mod h1:OnovgIhbbMXMu1aISnJ0wvVD1KnW+cAUJkIrAWh+kVI= +modernc.org/ccgo/v4 v4.34.0 h1:yRLPFZieg532OT4rp4JFNIVcquwalMX26G95WQDqwCQ= +modernc.org/ccgo/v4 v4.34.0/go.mod h1:AS5WYMyBakQ+fhsHhtP8mWB82KTGPkNNJDGfGQCe0/A= +modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM= +modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU= 
+modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI= +modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito= +modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo= +modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY= +modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks= +modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI= +modernc.org/libc v1.72.3 h1:ZnDF4tXn4NBXFutMMQC4vtbTFSXhhKzR73fv0beZEAU= +modernc.org/libc v1.72.3/go.mod h1:dn0dZNnnn1clLyvRxLxYExxiKRZIRENOfqQ8XEeg4Qs= +modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU= +modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg= +modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI= +modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw= +modernc.org/opt v0.2.0 h1:tGyef5ApycA7FSEOMraay9SaTk5zmbx7Tu+cJs4QKZg= +modernc.org/opt v0.2.0/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns= +modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w= +modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE= +modernc.org/sqlite v1.51.0 h1:aH/MMSoayAIhozZ7uJbVTT9QO/VhzBf0J9tymmmuC/U= +modernc.org/sqlite v1.51.0/go.mod h1:tcNzv5p84E0skkmJn038y+hWJbLQXQqEnQfeh5r2JLM= +modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0= +modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A= +modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= +modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= diff --git a/backend/internal/cdc/broadcast.go b/backend/internal/cdc/broadcast.go new file mode 100644 index 00000000..b914f766 --- /dev/null +++ b/backend/internal/cdc/broadcast.go @@ -0,0 +1,66 @@ +package cdc + +import ( + "log/slog" + "sync" +) + +// Broadcaster is 
the in-process fan-out the poller feeds. Subscribers (the +// WS/SSE transport, wired in the frontend task) register a callback; every +// polled Event is delivered to all current subscribers. It is the single seam +// between the CDC poller and live delivery, so the transport can be built and +// swapped without touching the poller. +type Broadcaster struct { + mu sync.RWMutex + nextID int + subs map[int]func(Event) + logger *slog.Logger +} + +// NewBroadcaster returns an empty Broadcaster ready for subscriptions. +func NewBroadcaster() *Broadcaster { + return &Broadcaster{subs: map[int]func(Event){}, logger: slog.Default()} +} + +// Subscribe registers fn and returns an unsubscribe function. fn is called +// synchronously from the poller loop, so it must not block; a transport that +// needs buffering should push onto its own channel inside fn. +func (b *Broadcaster) Subscribe(fn func(Event)) (unsubscribe func()) { + b.mu.Lock() + id := b.nextID + b.nextID++ + b.subs[id] = fn + b.mu.Unlock() + return func() { + b.mu.Lock() + delete(b.subs, id) + b.mu.Unlock() + } +} + +// SubscriberCount reports the number of current subscribers. +func (b *Broadcaster) SubscriberCount() int { + b.mu.RLock() + defer b.mu.RUnlock() + return len(b.subs) +} + +// Publish delivers e to every current subscriber. A panicking subscriber is +// recovered and logged so one bad callback can't kill the poller goroutine or +// starve the other subscribers. 
+func (b *Broadcaster) Publish(e Event) { + b.mu.RLock() + defer b.mu.RUnlock() + for _, fn := range b.subs { + b.deliver(fn, e) + } +} + +func (b *Broadcaster) deliver(fn func(Event), e Event) { + defer func() { + if r := recover(); r != nil { + b.logger.Error("cdc broadcaster: subscriber panicked", "seq", e.Seq, "panic", r) + } + }() + fn(e) +} diff --git a/backend/internal/cdc/cdc_test.go b/backend/internal/cdc/cdc_test.go new file mode 100644 index 00000000..d72370f4 --- /dev/null +++ b/backend/internal/cdc/cdc_test.go @@ -0,0 +1,192 @@ +package cdc_test + +import ( + "context" + "encoding/json" + "sync" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/cdc" + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" +) + +// storeSource adapts sqlite.Store to cdc.Source — the same glue the daemon wires. +type storeSource struct{ s *sqlite.Store } + +func (a storeSource) EventsAfter(ctx context.Context, after int64, limit int) ([]cdc.Event, error) { + rows, err := a.s.ReadChangeLogAfter(ctx, after, limit) + if err != nil { + return nil, err + } + out := make([]cdc.Event, len(rows)) + for i, r := range rows { + out[i] = cdc.Event{ + Seq: r.Seq, + ProjectID: r.ProjectID, + SessionID: r.SessionID, + Type: cdc.EventType(r.EventType), + Payload: json.RawMessage(r.Payload), + CreatedAt: r.CreatedAt, + } + } + return out, nil +} + +func (a storeSource) LatestSeq(ctx context.Context) (int64, error) { return a.s.MaxChangeLogSeq(ctx) } + +func newStore(t *testing.T) *sqlite.Store { + t.Helper() + s, err := sqlite.Open(t.TempDir()) + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = s.Close() }) + return s +} + +func seedSession(t *testing.T, s *sqlite.Store) domain.SessionRecord { + t.Helper() + ctx := context.Background() + now := time.Now().UTC().Truncate(time.Second) + if err := s.UpsertProject(ctx, sqlite.ProjectRow{ID: "mer", Path: "/m", 
RegisteredAt: now}); err != nil { + t.Fatal(err) + } + r, err := s.CreateSession(ctx, domain.SessionRecord{ + ProjectID: "mer", Kind: domain.KindWorker, + Lifecycle: domain.CanonicalSessionLifecycle{ + Session: domain.SessionSubstate{State: domain.SessionWorking}, + Activity: domain.ActivitySubstate{State: domain.ActivityActive, LastActivityAt: now, Source: domain.SourceNative}, + }, + CreatedAt: now, UpdatedAt: now, + }) + if err != nil { + t.Fatal(err) + } + return r +} + +// TestE2E_StoreWriteToBroadcast drives the whole path: a store write fires a DB +// trigger that appends to change_log; the poller reads it and broadcasts. +func TestE2E_StoreWriteToBroadcast(t *testing.T) { + ctx := context.Background() + s := newStore(t) + r := seedSession(t, s) // -> session_created (seq 1) + + r.Lifecycle.Session.State = domain.SessionIdle + if err := s.UpdateSession(ctx, r); err != nil { // -> session_updated (seq 2) + t.Fatal(err) + } + if err := s.UpsertPR(ctx, sqlite.PRRow{URL: "pr1", SessionID: string(r.ID), State: "open", UpdatedAt: r.UpdatedAt}); err != nil { // -> pr_created (seq 3) + t.Fatal(err) + } + + var got []cdc.Event + bc := cdc.NewBroadcaster() + bc.Subscribe(func(e cdc.Event) { got = append(got, e) }) + p := cdc.NewPoller(storeSource{s}, bc, cdc.PollerConfig{}) // StartSeq 0: read from the top + if err := p.Poll(ctx); err != nil { + t.Fatal(err) + } + + if len(got) != 3 { + t.Fatalf("delivered %d events, want 3", len(got)) + } + for i, e := range got { + if e.Seq != int64(i+1) { + t.Fatalf("event %d seq=%d, want %d", i, e.Seq, i+1) + } + if e.ProjectID != "mer" { + t.Fatalf("event %d project=%q, want mer", i, e.ProjectID) + } + } + if got[0].Type != cdc.EventSessionCreated || got[1].Type != cdc.EventSessionUpdated || got[2].Type != cdc.EventPRCreated { + t.Fatalf("types = %s, %s, %s", got[0].Type, got[1].Type, got[2].Type) + } + // the trigger-built JSON payload survives as a usable RawMessage. 
+ var payload map[string]any + if err := json.Unmarshal(got[0].Payload, &payload); err != nil { + t.Fatalf("payload not JSON: %v", err) + } + if payload["id"] != string(r.ID) || payload["state"] != "working" { + t.Fatalf("payload = %v", payload) + } + + // idempotent: a second poll with no new rows delivers nothing more. + if err := p.Poll(ctx); err != nil { + t.Fatal(err) + } + if len(got) != 3 { + t.Fatalf("re-poll delivered extra events: %d", len(got)) + } +} + +// TestE2E_ConcurrentPollerLiveDelivery runs the poller as a goroutine (the daemon +// model) and asserts every store change is delivered exactly once, in order. +func TestE2E_ConcurrentPollerLiveDelivery(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + s := newStore(t) + r := seedSession(t, s) // seq 1 + + var mu sync.Mutex + var got []cdc.Event + bc := cdc.NewBroadcaster() + bc.Subscribe(func(e cdc.Event) { mu.Lock(); got = append(got, e); mu.Unlock() }) + + p := cdc.NewPoller(storeSource{s}, bc, cdc.PollerConfig{}) // from the top + done := p.Start(ctx) + + const n = 6 + for i := 0; i < n; i++ { + r.Lifecycle.IsAlive = i%2 == 0 // toggles is_alive -> sessions_cdc_update fires + if err := s.UpdateSession(ctx, r); err != nil { + t.Fatal(err) + } + } + want := 1 + n // session_created + n updates + + deadline := time.Now().Add(5 * time.Second) + for { + mu.Lock() + c := len(got) + mu.Unlock() + if c >= want { + break + } + if time.Now().After(deadline) { + t.Fatalf("timed out: delivered %d/%d", c, want) + } + time.Sleep(20 * time.Millisecond) + } + cancel() + <-done + + mu.Lock() + defer mu.Unlock() + if len(got) != want { + t.Fatalf("delivered %d events, want %d", len(got), want) + } + for i, e := range got { + if e.Seq != int64(i+1) { + t.Fatalf("event %d has seq %d, want %d (out-of-order/duplicate)", i, e.Seq, i+1) + } + } +} + +// TestBroadcasterRecoversPanickingSubscriber: one panicking subscriber must not +// kill delivery to the others (or crash the 
poller goroutine). +func TestBroadcasterRecoversPanickingSubscriber(t *testing.T) { + bc := cdc.NewBroadcaster() + good := 0 + bc.Subscribe(func(cdc.Event) { panic("boom") }) + bc.Subscribe(func(cdc.Event) { good++ }) + + bc.Publish(cdc.Event{Seq: 1}) // must not panic + bc.Publish(cdc.Event{Seq: 2}) + + if good != 2 { + t.Fatalf("good subscriber got %d, want 2 (panic was not isolated)", good) + } +} diff --git a/backend/internal/cdc/event.go b/backend/internal/cdc/event.go new file mode 100644 index 00000000..04f52648 --- /dev/null +++ b/backend/internal/cdc/event.go @@ -0,0 +1,40 @@ +// Package cdc is the change-data-capture delivery layer. Change events are +// captured durably by SQLite triggers into the change_log table (see the storage +// migrations); this package POLLS that log and fans new events out, in order, to +// in-process subscribers (the WS/SSE transport, wired in the frontend task). +// +// There is no durable outbox/JSONL/janitor machinery: the change_log table IS +// the durable, ordered source of truth, and clients catch up by reading it from +// their own offset (SSE Last-Event-ID). The poller + broadcaster here are only +// the LIVE push on top of that. +package cdc + +import ( + "encoding/json" + "time" +) + +// EventType mirrors the event_type values the DB triggers write. +type EventType string + +const ( + EventSessionCreated EventType = "session_created" + EventSessionUpdated EventType = "session_updated" + EventPRCreated EventType = "pr_created" + EventPRUpdated EventType = "pr_updated" + EventPRCheckRecorded EventType = "pr_check_recorded" +) + +// Event is one CDC change read from change_log. Seq is the monotonic ordering + +// idempotency key (consumers dedup by it). SessionID is empty for project-level +// events. Payload is the trigger-built JSON, kept raw so a typed transport can +// narrow it by Type (the discriminated-union decode lives at the transport edge, +// not here). 
+type Event struct { + Seq int64 `json:"seq"` + ProjectID string `json:"projectId"` + SessionID string `json:"sessionId,omitempty"` + Type EventType `json:"type"` + Payload json.RawMessage `json:"payload"` + CreatedAt time.Time `json:"createdAt"` +} diff --git a/backend/internal/cdc/poller.go b/backend/internal/cdc/poller.go new file mode 100644 index 00000000..c824def3 --- /dev/null +++ b/backend/internal/cdc/poller.go @@ -0,0 +1,123 @@ +package cdc + +import ( + "context" + "fmt" + "log/slog" + "time" +) + +// DefaultPollInterval is how often the poller checks change_log for new rows. +// Polling (rather than fs-notify or a DB hook) keeps it dependency-free; at this +// cadence live updates stay well under a human-perceptible delay. +const DefaultPollInterval = 100 * time.Millisecond + +// DefaultBatch bounds how many events one poll drains. +const DefaultBatch = 512 + +// Source is the poller's view of the durable log: read events after a seq, and +// the current head seq. The storage layer implements it (the change_log table). +type Source interface { + EventsAfter(ctx context.Context, after int64, limit int) ([]Event, error) + LatestSeq(ctx context.Context) (int64, error) +} + +// Poller tails change_log and fans each new event out through the Broadcaster, +// in seq order. It holds only an in-memory cursor (lastSeq): it is the LIVE push +// path, while durable catch-up is the client's job (read change_log from its own +// offset). A restart re-seeks to head, so the poller never re-broadcasts history +// to a freshly-started broadcaster. +type Poller struct { + src Source + bcast *Broadcaster + interval time.Duration + batch int + logger *slog.Logger + lastSeq int64 +} + +// PollerConfig holds optional knobs; zero values fall back to defaults. StartSeq +// is the cursor to begin from; production wiring leaves it 0 and calls +// SeekToHead, tests set it to read from the beginning. 
+type PollerConfig struct { + Interval time.Duration + Batch int + Logger *slog.Logger + StartSeq int64 +} + +// NewPoller constructs a Poller over src, fanning out through bcast. +func NewPoller(src Source, bcast *Broadcaster, cfg PollerConfig) *Poller { + p := &Poller{ + src: src, + bcast: bcast, + interval: cfg.Interval, + batch: cfg.Batch, + logger: cfg.Logger, + lastSeq: cfg.StartSeq, + } + if p.interval <= 0 { + p.interval = DefaultPollInterval + } + if p.batch <= 0 { + p.batch = DefaultBatch + } + if p.logger == nil { + p.logger = slog.Default() + } + return p +} + +// SeekToHead moves the cursor to the current head, so the poller only broadcasts +// events created from now on (clients catch up on older events via the store). +func (p *Poller) SeekToHead(ctx context.Context) error { + seq, err := p.src.LatestSeq(ctx) + if err != nil { + return fmt.Errorf("cdc poller seek: %w", err) + } + p.lastSeq = seq + return nil +} + +// Start runs the poll loop until ctx is cancelled; the returned channel closes +// when the loop has exited. +func (p *Poller) Start(ctx context.Context) <-chan struct{} { + done := make(chan struct{}) + go func() { + defer close(done) + t := time.NewTicker(p.interval) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + if err := p.Poll(ctx); err != nil { + p.logger.Error("cdc poller: poll failed", "err", err) + } + } + } + }() + return done +} + +// Poll drains one batch of new events and broadcasts them in seq order, +// advancing the cursor. Exported so tests (and a daemon) can drive a cycle +// synchronously. 
+func (p *Poller) Poll(ctx context.Context) error { + evs, err := p.src.EventsAfter(ctx, p.lastSeq, p.batch) + if err != nil { + return fmt.Errorf("cdc poller: read after %d: %w", p.lastSeq, err) + } + for _, e := range evs { + if e.Seq <= p.lastSeq { + continue // idempotent guard + } + p.bcast.Publish(e) + p.lastSeq = e.Seq + } + return nil +} + +// LastSeq returns the poller's current cursor (the highest seq broadcast). +func (p *Poller) LastSeq() int64 { return p.lastSeq } diff --git a/backend/internal/config/config.go b/backend/internal/config/config.go index d6765dba..719e7524 100644 --- a/backend/internal/config/config.go +++ b/backend/internal/config/config.go @@ -47,6 +47,9 @@ type Config struct { // RunFilePath is where the PID + port handshake file (running.json) is // written so the Electron supervisor can discover and reap the daemon. RunFilePath string + // DataDir is the directory holding durable state (the SQLite database and + // the CDC JSONL log). It is created on first use by the storage layer. + DataDir string } // Addr returns the host:port the HTTP server binds. It uses net.JoinHostPort so @@ -65,6 +68,7 @@ func (c Config) Addr() string { // AO_REQUEST_TIMEOUT per-request timeout (Go duration > 0, default 60s) // AO_SHUTDOWN_TIMEOUT shutdown deadline (Go duration > 0, default 10s) // AO_RUN_FILE running.json path (default /running.json) +// AO_DATA_DIR durable state dir (default /data) // // The bind host is not configurable: the daemon is loopback-only by design. func Load() (Config, error) { @@ -108,6 +112,12 @@ func Load() (Config, error) { } cfg.RunFilePath = runFile + dataDir, err := resolveDataDir() + if err != nil { + return Config{}, err + } + cfg.DataDir = dataDir + return cfg, nil } @@ -138,3 +148,17 @@ func resolveRunFilePath() (string, error) { } return filepath.Join(dir, "agent-orchestrator", "running.json"), nil } + +// resolveDataDir picks where durable state (the SQLite DB) lives. 
An explicit +// AO_DATA_DIR wins; otherwise it sits under the per-user state directory +// alongside running.json. +func resolveDataDir() (string, error) { + if p, ok := os.LookupEnv("AO_DATA_DIR"); ok && p != "" { + return p, nil + } + dir, err := os.UserConfigDir() + if err != nil { + return "", fmt.Errorf("resolve state dir: %w", err) + } + return filepath.Join(dir, "agent-orchestrator", "data"), nil +} diff --git a/backend/internal/domain/decide/decide.go b/backend/internal/domain/decide/decide.go index c46df18d..be195aef 100644 --- a/backend/internal/domain/decide/decide.go +++ b/backend/internal/domain/decide/decide.go @@ -1,7 +1,11 @@ // Package decide is the pure DECIDE core: total, deterministic, zero I/O. It -// collapses observed facts (plus the prior detecting/activity memory) into one -// LifecycleDecision. Every function here must remain side-effect free so the -// whole status truth-table can be tested in isolation. +// collapses observed liveness facts (plus the prior detecting memory) into one +// LifecycleDecision. Every function here is side-effect free so the whole +// liveness truth-table can be tested in isolation. +// +// PR-driven behaviour is NOT here: PR display status is derived by +// domain.DeriveStatus from the pr table, and PR-driven nudges are the reaction +// engine's job. decide is only about liveness + the anti-flap quarantine. package decide import ( @@ -30,158 +34,57 @@ const ( // terminal this decider may reach without quarantine); // - a *failed* probe (timeout/error) is never read as death — it routes to // detecting, as does any disagreement between the two probes; -// - only runtime-dead + process-dead + no-recent-activity reaches killed. +// - only runtime-down + process-dead + no-recent-activity reaches terminal. 
func ResolveProbeDecision(in ProbeInput) LifecycleDecision { if in.KillRequested { + reason := in.KillReason + if reason == "" { + reason = domain.TermManuallyKilled + } return LifecycleDecision{ - Status: domain.StatusKilled, - Evidence: "manual kill requested", - SessionState: domain.SessionTerminated, - SessionReason: domain.ReasonManuallyKilled, + Evidence: "manual kill requested", + SessionState: domain.SessionTerminated, + TerminationReason: reason, + IsAlive: false, } } - if in.RuntimeFailed || in.ProcessFailed || in.Runtime == domain.RuntimeProbeFailed { - ev := fmt.Sprintf("probe_failed runtime=%s runtimeFailed=%t process=%s processFailed=%t", - in.Runtime, in.RuntimeFailed, in.Process, in.ProcessFailed) - return detecting(in, domain.ReasonProbeFailure, ev) + if in.RuntimeFailed || in.ProcessFailed { + ev := fmt.Sprintf("probe_failed runtimeFailed=%t process=%s processFailed=%t", in.RuntimeFailed, in.Process, in.ProcessFailed) + return detecting(in, ev) } - switch in.Runtime { - case domain.RuntimeAlive: + if in.RuntimeAlive { if in.Process == ProcessDead { // Runtime up but the agent process is gone: probes disagree. ev := fmt.Sprintf("disagree runtime=alive process=%s recentActivity=%t", in.Process, in.RecentActivity) - return detecting(in, domain.ReasonAgentProcessExited, ev) - } - return LifecycleDecision{ - Status: domain.StatusWorking, - Evidence: fmt.Sprintf("alive runtime=alive process=%s", in.Process), - SessionState: domain.SessionWorking, - SessionReason: domain.ReasonTaskInProgress, - } - - case domain.RuntimeExited, domain.RuntimeMissing: - // Runtime is gone. Death is only concluded when the process is *also* - // confirmed dead AND nothing has been heard from the agent recently; - // any other shape is ambiguous and quarantines. 
- if in.Process == ProcessAlive || in.RecentActivity { - ev := fmt.Sprintf("disagree runtime=%s process=%s recentActivity=%t", in.Runtime, in.Process, in.RecentActivity) - return detecting(in, domain.ReasonRuntimeLost, ev) - } - if in.Process == ProcessDead { - return LifecycleDecision{ - Status: domain.StatusKilled, - Evidence: fmt.Sprintf("dead runtime=%s process=dead recentActivity=false", in.Runtime), - SessionState: domain.SessionTerminated, - SessionReason: domain.ReasonRuntimeLost, - } - } - // Process indeterminate: cannot confirm death, so quarantine. - ev := fmt.Sprintf("runtime_lost runtime=%s process=%s recentActivity=false", in.Runtime, in.Process) - return detecting(in, domain.ReasonRuntimeLost, ev) - - default: - // unknown (not yet probed): ambiguous, never conclude death. - ev := fmt.Sprintf("runtime_unknown runtime=%s process=%s recentActivity=%t", in.Runtime, in.Process, in.RecentActivity) - return detecting(in, domain.ReasonRuntimeLost, ev) - } -} - -// ResolveOpenPRDecision walks the PR pipeline ladder. CI failure dominates -// everything. Draft PRs then surface as draft and do not enter the review or -// merge states. Open PRs continue through requested changes, approval/merge -// states, pending review, stalled (idle-beyond-threshold), then plain open. -func ResolveOpenPRDecision(in OpenPRInput) LifecycleDecision { - // evidence is a stable, timestamp-free summary " # " - // for logs/traceability; it folds in the PR identity inputs (Number/URL). 
- evidence := func(cond string) string { - s := cond - if in.Number > 0 { - s += fmt.Sprintf(" #%d", in.Number) - } - if in.URL != "" { - s += " " + in.URL + return detecting(in, ev) } - return s - } - prState := domain.PROpen - if in.Draft { - prState = domain.PRDraft - } - base := func(status domain.SessionStatus, cond string, prReason domain.PRReason, ss domain.SessionState, sr domain.SessionReason) LifecycleDecision { return LifecycleDecision{ - Status: status, - Evidence: evidence(cond), - SessionState: ss, - SessionReason: sr, - PRState: prState, - PRReason: prReason, + Evidence: fmt.Sprintf("alive runtime=alive process=%s", in.Process), + SessionState: domain.SessionWorking, + IsAlive: true, } } - switch { - case in.CIFailing: - return base(domain.StatusCIFailed, "ci_failing", domain.PRReasonCIFailing, domain.SessionWorking, domain.ReasonFixingCI) - case in.Draft: - return base(domain.StatusDraft, "draft", domain.PRReasonInProgress, domain.SessionWorking, domain.ReasonPRCreated) - case in.ChangesRequested: - return base(domain.StatusChangesRequested, "changes_requested", domain.PRReasonChangesRequested, domain.SessionWorking, domain.ReasonResolvingReviewComments) - case in.BotComments: - return base(domain.StatusChangesRequested, "bot_comments", domain.PRReasonBotComments, domain.SessionWorking, domain.ReasonResolvingReviewComments) - case in.MergeConflicts: - return base(domain.StatusPROpen, "merge_conflicts", domain.PRReasonMergeConflicts, domain.SessionWorking, domain.ReasonPRCreated) - case in.Mergeable: - // Mergeability is the authoritative merge gate, so it already folds in - // "approved if review is required". Checking it before Approved means a - // PR on a no-required-review repo (mergeable, not formally approved) is - // still surfaced as ready-to-merge instead of falling through to PR_OPEN. 
- return base(domain.StatusMergeable, "merge_ready", domain.PRReasonMergeReady, domain.SessionIdle, domain.ReasonAwaitingExternalReview) - case in.Approved: - return base(domain.StatusApproved, "approved", domain.PRReasonApproved, domain.SessionIdle, domain.ReasonAwaitingExternalReview) - case in.ReviewPending: - return base(domain.StatusReviewPending, "review_pending", domain.PRReasonReviewPending, domain.SessionIdle, domain.ReasonAwaitingExternalReview) - case in.IdleBeyond: - // A PR open but quiet past the stuck threshold needs a human nudge. - return base(domain.StatusStuck, "idle_beyond", domain.PRReasonInProgress, domain.SessionStuck, domain.ReasonAwaitingUserInput) - default: - return base(domain.StatusPROpen, "pr_open", domain.PRReasonInProgress, domain.SessionWorking, domain.ReasonPRCreated) + // Runtime is gone. Death is only concluded when the process is *also* + // confirmed dead AND nothing has been heard from the agent recently; any + // other shape is ambiguous and quarantines. + if in.Process == ProcessAlive || in.RecentActivity { + ev := fmt.Sprintf("disagree runtime=down process=%s recentActivity=%t", in.Process, in.RecentActivity) + return detecting(in, ev) } -} - -// ResolveTerminalPRStateDecision handles merged/closed PRs. A merge parks the -// session idle awaiting a human's post-merge decision; a close drops to idle. -// none/open are not terminal — callers should route those to the open-PR or -// probe deciders — but the function stays total for safety. 
-func ResolveTerminalPRStateDecision(pr domain.PRState) LifecycleDecision { - switch pr { - case domain.PRMerged: + if in.Process == ProcessDead { return LifecycleDecision{ - Status: domain.StatusMerged, - Evidence: "pr merged", - SessionState: domain.SessionIdle, - SessionReason: domain.ReasonMergedWaitingDecision, - PRState: domain.PRMerged, - PRReason: domain.PRReasonMerged, - } - case domain.PRClosed: - return LifecycleDecision{ - Status: domain.StatusIdle, - Evidence: "pr closed unmerged", - SessionState: domain.SessionIdle, - SessionReason: domain.ReasonAwaitingUserInput, - PRState: domain.PRClosed, - PRReason: domain.PRReasonClosedUnmerged, - } - default: - return LifecycleDecision{ - Status: domain.StatusWorking, - Evidence: fmt.Sprintf("non-terminal pr state=%s", pr), - SessionState: domain.SessionWorking, - SessionReason: domain.ReasonTaskInProgress, - PRState: pr, + Evidence: "dead runtime=down process=dead recentActivity=false", + SessionState: domain.SessionTerminated, + TerminationReason: domain.TermRuntimeLost, + IsAlive: false, } } + // Process indeterminate: cannot confirm death, so quarantine. + ev := fmt.Sprintf("runtime_lost runtime=down process=%s recentActivity=false", in.Process) + return detecting(in, ev) } // CreateDetectingDecision advances or escalates the anti-flap quarantine. @@ -189,9 +92,10 @@ func ResolveTerminalPRStateDecision(pr domain.PRState) LifecycleDecision { // The attempt counter climbs only while the (timestamp-stripped) evidence hash // is unchanged and resets the moment the evidence moves; StartedAt is preserved // across the whole detecting episode so the duration cap is a real wall-clock -// safety net even when the evidence keeps flapping. Escalation to stuck fires -// at DetectingMaxAttempts consecutive unchanged ticks OR DetectingMaxDuration -// elapsed since first entering detecting. +// safety net even when the evidence keeps flapping. 
Escalation to stuck fires at +// DetectingMaxAttempts consecutive unchanged ticks OR DetectingMaxDuration +// elapsed since first entering detecting. Detecting/stuck leave IsAlive true: +// the probe was ambiguous, so the session is not confirmed dead. func CreateDetectingDecision(in DetectingInput) LifecycleDecision { hash := HashEvidence(in.Evidence) @@ -207,19 +111,17 @@ func CreateDetectingDecision(in DetectingInput) LifecycleDecision { escalate := attempts >= DetectingMaxAttempts || !in.Now.Before(startedAt.Add(DetectingMaxDuration)) if escalate { return LifecycleDecision{ - Status: domain.StatusStuck, - Evidence: in.Evidence, - SessionState: domain.SessionStuck, - SessionReason: in.ProposedReason, + Evidence: in.Evidence, + SessionState: domain.SessionStuck, + IsAlive: true, } } return LifecycleDecision{ - Status: domain.StatusDetecting, - Evidence: in.Evidence, - Detecting: &domain.DetectingState{Attempts: attempts, StartedAt: startedAt, EvidenceHash: hash}, - SessionState: domain.SessionDetecting, - SessionReason: in.ProposedReason, + Evidence: in.Evidence, + Detecting: &domain.DetectingState{Attempts: attempts, StartedAt: startedAt, EvidenceHash: hash}, + SessionState: domain.SessionDetecting, + IsAlive: true, } } @@ -237,38 +139,20 @@ func HashEvidence(evidence string) string { } // timestampPatterns is the list of regexes HashEvidence applies (in order) to -// delete the time-varying parts of an evidence string before hashing, so the -// same ambiguous signal restamped with a new clock value hashes equal and the -// detecting counter keeps climbing instead of resetting every tick. -// -// Order matters: the full datetime form is removed first so its embedded -// HH:MM:SS isn't half-eaten by the bare time-of-day pattern that follows. -// -// 1. full ISO-8601 / RFC3339 datetime — date, a T or space separator, -// HH:MM:SS, optional fractional seconds, optional Z or ±HH:MM offset. -// e.g. "2026-05-26T12:00:00Z", "2026-05-26 12:00:00.218+05:30" -// 2. 
a bare time-of-day, e.g. "12:00:00" or "12:00:00.218" -// 3. a bare unix epoch — any 10-13 digit run (seconds or millis), e.g. -// "1716724800". This is broad enough to also clobber a same-width numeric -// ID if one ever appears in evidence; evidence is decider-authored, so keep -// IDs out of evidence strings to preserve hash fidelity. +// delete the time-varying parts of an evidence string before hashing. var timestampPatterns = []*regexp.Regexp{ regexp.MustCompile(`\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?`), regexp.MustCompile(`\d{2}:\d{2}:\d{2}(?:\.\d+)?`), regexp.MustCompile(`\b\d{10,13}\b`), } -// detecting adapts a probe verdict into the shared anti-flap path. It packages -// the proposed reason + evidence (plus the prior counter from the same probe -// input) into a DetectingInput and defers to CreateDetectingDecision, so every +// detecting packages a probe verdict into the shared anti-flap path, so every // probe-driven ambiguity is counted and escalated by the identical quarantine // logic instead of each probe branch re-implementing the counter. 
-func detecting(in ProbeInput, reason domain.SessionReason, evidence string) LifecycleDecision { +func detecting(in ProbeInput, evidence string) LifecycleDecision { return CreateDetectingDecision(DetectingInput{ - Evidence: evidence, - ProposedState: domain.SessionDetecting, - ProposedReason: reason, - Prior: in.Prior, - Now: in.Now, + Evidence: evidence, + Prior: in.Prior, + Now: in.Now, }) } diff --git a/backend/internal/domain/decide/decide_test.go b/backend/internal/domain/decide/decide_test.go index 1a815959..bc25af55 100644 --- a/backend/internal/domain/decide/decide_test.go +++ b/backend/internal/domain/decide/decide_test.go @@ -7,570 +7,158 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/domain" ) -var t0 = time.Date(2026, 5, 26, 12, 0, 0, 0, time.UTC) +var t0 = time.Date(2026, 5, 31, 12, 0, 0, 0, time.UTC) func TestResolveProbeDecision(t *testing.T) { tests := []struct { - name string - in ProbeInput - wantStatus domain.SessionStatus - wantState domain.SessionState - wantReason domain.SessionReason - wantDetect bool // expect non-nil Detecting memory - wantTermNil bool // expect terminal (Detecting must be nil) - }{ - { - name: "kill requested short-circuits to terminal killed", - in: ProbeInput{KillRequested: true, Runtime: domain.RuntimeAlive, Process: ProcessAlive, Now: t0}, - wantStatus: domain.StatusKilled, - wantState: domain.SessionTerminated, - wantReason: domain.ReasonManuallyKilled, - wantTermNil: true, - }, - { - name: "kill requested wins even over a dead+dead probe", - in: ProbeInput{KillRequested: true, Runtime: domain.RuntimeMissing, Process: ProcessDead, Now: t0}, - wantStatus: domain.StatusKilled, - wantState: domain.SessionTerminated, - wantReason: domain.ReasonManuallyKilled, - wantTermNil: true, - }, - { - name: "runtime probe failed routes to detecting, never death", - in: ProbeInput{Runtime: domain.RuntimeMissing, RuntimeFailed: true, Process: ProcessDead, Now: t0}, - wantStatus: domain.StatusDetecting, - 
wantState: domain.SessionDetecting, - wantReason: domain.ReasonProbeFailure, - wantDetect: true, - }, - { - name: "process probe failed routes to detecting", - in: ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessDead, ProcessFailed: true, Now: t0}, - wantStatus: domain.StatusDetecting, - wantState: domain.SessionDetecting, - wantReason: domain.ReasonProbeFailure, - wantDetect: true, - }, - { - name: "runtime state probe_failed routes to detecting", - in: ProbeInput{Runtime: domain.RuntimeProbeFailed, Process: ProcessIndeterminate, Now: t0}, - wantStatus: domain.StatusDetecting, - wantState: domain.SessionDetecting, - wantReason: domain.ReasonProbeFailure, - wantDetect: true, - }, - { - name: "runtime alive + process alive is working", - in: ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessAlive, Now: t0}, - wantStatus: domain.StatusWorking, - wantState: domain.SessionWorking, - wantReason: domain.ReasonTaskInProgress, - }, - { - name: "runtime alive + process indeterminate leans alive", - in: ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessIndeterminate, Now: t0}, - wantStatus: domain.StatusWorking, - wantState: domain.SessionWorking, - wantReason: domain.ReasonTaskInProgress, - }, - { - name: "runtime alive + process dead disagree -> detecting (agent_process_exited)", - in: ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessDead, Now: t0}, - wantStatus: domain.StatusDetecting, - wantState: domain.SessionDetecting, - wantReason: domain.ReasonAgentProcessExited, - wantDetect: true, - }, - { - name: "runtime dead + process alive disagree -> detecting (runtime_lost)", - in: ProbeInput{Runtime: domain.RuntimeExited, Process: ProcessAlive, Now: t0}, - wantStatus: domain.StatusDetecting, - wantState: domain.SessionDetecting, - wantReason: domain.ReasonRuntimeLost, - wantDetect: true, - }, - { - name: "runtime dead + recent activity disagree -> detecting (runtime_lost)", - in: ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessDead, 
RecentActivity: true, Now: t0}, - wantStatus: domain.StatusDetecting, - wantState: domain.SessionDetecting, - wantReason: domain.ReasonRuntimeLost, - wantDetect: true, - }, - { - name: "runtime dead + process indeterminate cannot confirm -> detecting", - in: ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessIndeterminate, Now: t0}, - wantStatus: domain.StatusDetecting, - wantState: domain.SessionDetecting, - wantReason: domain.ReasonRuntimeLost, - wantDetect: true, - }, - { - name: "runtime exited + process dead + no activity -> killed terminal", - in: ProbeInput{Runtime: domain.RuntimeExited, Process: ProcessDead, Now: t0}, - wantStatus: domain.StatusKilled, - wantState: domain.SessionTerminated, - wantReason: domain.ReasonRuntimeLost, - wantTermNil: true, - }, - { - name: "runtime missing + process dead + no activity -> killed terminal", - in: ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessDead, Now: t0}, - wantStatus: domain.StatusKilled, - wantState: domain.SessionTerminated, - wantReason: domain.ReasonRuntimeLost, - wantTermNil: true, - }, - { - name: "runtime unknown is ambiguous -> detecting (runtime_lost)", - in: ProbeInput{Runtime: domain.RuntimeUnknown, Process: ProcessDead, Now: t0}, - wantStatus: domain.StatusDetecting, - wantState: domain.SessionDetecting, - wantReason: domain.ReasonRuntimeLost, - wantDetect: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := ResolveProbeDecision(tt.in) - if got.Status != tt.wantStatus { - t.Errorf("Status = %q, want %q", got.Status, tt.wantStatus) - } - if got.SessionState != tt.wantState { - t.Errorf("SessionState = %q, want %q", got.SessionState, tt.wantState) - } - if got.SessionReason != tt.wantReason { - t.Errorf("SessionReason = %q, want %q", got.SessionReason, tt.wantReason) - } - if tt.wantDetect && got.Detecting == nil { - t.Errorf("expected non-nil Detecting memory, got nil") - } - if tt.wantTermNil && got.Detecting != nil { - t.Errorf("terminal 
decision must carry nil Detecting, got %+v", got.Detecting) - } - }) - } -} - -func TestResolveOpenPRDecision(t *testing.T) { - tests := []struct { - name string - in OpenPRInput - wantStatus domain.SessionStatus - wantPR domain.PRReason - wantPRState domain.PRState - wantState domain.SessionState + name string + in ProbeInput + wantState domain.SessionState + wantReason domain.TerminationReason + wantAlive bool + wantDetect bool // expect a detecting verdict (first attempt -> SessionDetecting) }{ { - name: "ci failing dominates everything", - in: OpenPRInput{CIFailing: true, ChangesRequested: true, Approved: true, Mergeable: true}, - wantStatus: domain.StatusCIFailed, - wantPR: domain.PRReasonCIFailing, - wantState: domain.SessionWorking, - }, - { - name: "draft with failing CI maps to ci_failed", - in: OpenPRInput{Draft: true, CIFailing: true, ChangesRequested: true, Approved: true, Mergeable: true}, - wantStatus: domain.StatusCIFailed, - wantPR: domain.PRReasonCIFailing, - wantPRState: domain.PRDraft, - wantState: domain.SessionWorking, - }, - { - name: "draft ignores review and merge states", - in: OpenPRInput{Draft: true, ChangesRequested: true, Approved: true, Mergeable: true, ReviewPending: true, IdleBeyond: true}, - wantStatus: domain.StatusDraft, - wantPR: domain.PRReasonInProgress, - wantPRState: domain.PRDraft, - wantState: domain.SessionWorking, - }, - { - name: "changes requested before approval states", - in: OpenPRInput{ChangesRequested: true, Approved: true, Mergeable: true}, - wantStatus: domain.StatusChangesRequested, - wantPR: domain.PRReasonChangesRequested, - wantState: domain.SessionWorking, - }, - { - name: "bot comments get distinct PR reason", - in: OpenPRInput{BotComments: true, Approved: true, Mergeable: true}, - wantStatus: domain.StatusChangesRequested, - wantPR: domain.PRReasonBotComments, - wantState: domain.SessionWorking, - }, - { - name: "merge conflicts get distinct PR reason", - in: OpenPRInput{MergeConflicts: true, Approved: 
true}, - wantStatus: domain.StatusPROpen, - wantPR: domain.PRReasonMergeConflicts, - wantState: domain.SessionWorking, - }, - { - name: "approved + mergeable -> mergeable", - in: OpenPRInput{Approved: true, Mergeable: true}, - wantStatus: domain.StatusMergeable, - wantPR: domain.PRReasonMergeReady, - wantState: domain.SessionIdle, - }, - { - name: "mergeable without formal approval (no required review) -> mergeable", - in: OpenPRInput{Mergeable: true}, - wantStatus: domain.StatusMergeable, - wantPR: domain.PRReasonMergeReady, - wantState: domain.SessionIdle, - }, - { - name: "approved but not mergeable -> approved", - in: OpenPRInput{Approved: true}, - wantStatus: domain.StatusApproved, - wantPR: domain.PRReasonApproved, - wantState: domain.SessionIdle, - }, - { - name: "review pending", - in: OpenPRInput{ReviewPending: true}, - wantStatus: domain.StatusReviewPending, - wantPR: domain.PRReasonReviewPending, - wantState: domain.SessionIdle, + name: "kill requested -> terminated with reason", + in: ProbeInput{KillRequested: true, KillReason: domain.TermManuallyKilled, Now: t0}, + wantState: domain.SessionTerminated, wantReason: domain.TermManuallyKilled, wantAlive: false, }, { - name: "idle beyond threshold -> stuck", - in: OpenPRInput{IdleBeyond: true}, - wantStatus: domain.StatusStuck, - wantPR: domain.PRReasonInProgress, - wantState: domain.SessionStuck, + name: "kill requested without reason defaults to manually_killed", + in: ProbeInput{KillRequested: true, Now: t0}, + wantState: domain.SessionTerminated, wantReason: domain.TermManuallyKilled, wantAlive: false, }, { - name: "review pending wins over idle-beyond", - in: OpenPRInput{ReviewPending: true, IdleBeyond: true}, - wantStatus: domain.StatusReviewPending, - wantPR: domain.PRReasonReviewPending, - wantState: domain.SessionIdle, + name: "runtime probe failed -> detecting (not death)", + in: ProbeInput{RuntimeFailed: true, Now: t0}, + wantState: domain.SessionDetecting, wantAlive: true, wantDetect: true, }, { 
- name: "nothing set -> plain open", - in: OpenPRInput{}, - wantStatus: domain.StatusPROpen, - wantPR: domain.PRReasonInProgress, - wantState: domain.SessionWorking, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := ResolveOpenPRDecision(tt.in) - if got.Status != tt.wantStatus { - t.Errorf("Status = %q, want %q", got.Status, tt.wantStatus) - } - if got.PRReason != tt.wantPR { - t.Errorf("PRReason = %q, want %q", got.PRReason, tt.wantPR) - } - wantPRState := tt.wantPRState - if wantPRState == "" { - wantPRState = domain.PROpen - } - if got.PRState != wantPRState { - t.Errorf("PRState = %q, want %q", got.PRState, wantPRState) - } - if got.SessionState != tt.wantState { - t.Errorf("SessionState = %q, want %q", got.SessionState, tt.wantState) - } - }) - } -} - -func TestResolveOpenPRDecisionEvidence(t *testing.T) { - tests := []struct { - name string - in OpenPRInput - want string - }{ - { - name: "condition with PR number and URL", - in: OpenPRInput{CIFailing: true, Number: 123, URL: "https://example.com/pr/123"}, - want: "ci_failing #123 https://example.com/pr/123", + name: "process probe failed -> detecting", + in: ProbeInput{RuntimeAlive: true, ProcessFailed: true, Now: t0}, + wantState: domain.SessionDetecting, wantAlive: true, wantDetect: true, }, { - name: "condition with number only", - in: OpenPRInput{Approved: true, Mergeable: true, Number: 7}, - want: "merge_ready #7", + name: "runtime alive + process alive -> working", + in: ProbeInput{RuntimeAlive: true, Process: ProcessAlive, Now: t0}, + wantState: domain.SessionWorking, wantAlive: true, }, { - name: "no identity falls back to the bare condition", - in: OpenPRInput{}, - want: "pr_open", + name: "runtime alive + process indeterminate -> working", + in: ProbeInput{RuntimeAlive: true, Process: ProcessIndeterminate, Now: t0}, + wantState: domain.SessionWorking, wantAlive: true, }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := 
ResolveOpenPRDecision(tt.in).Evidence; got != tt.want { - t.Errorf("Evidence = %q, want %q", got, tt.want) - } - }) - } -} - -func TestDecidersDeriveConsistently(t *testing.T) { - // Every decision a decider produces must be self-consistent: the display - // Status it reports must equal what DeriveLegacyStatus produces from the - // canonical (session, pr) sub-states it emits. This locks the deciders and - // the display-derivation against drifting apart. - // - // The ResolveTerminalPRStateDecision none/open default is intentionally - // excluded — it is a documented no-op for misuse, not a real verdict. - var decisions []LifecycleDecision - - for _, in := range []OpenPRInput{ - {Draft: true, CIFailing: true}, - {Draft: true, ChangesRequested: true, Approved: true, Mergeable: true, ReviewPending: true, IdleBeyond: true}, - {CIFailing: true}, - {ChangesRequested: true}, - {BotComments: true}, - {MergeConflicts: true}, - {Approved: true, Mergeable: true}, - {Mergeable: true}, - {Approved: true}, - {ReviewPending: true}, - {IdleBeyond: true}, - {}, - } { - decisions = append(decisions, ResolveOpenPRDecision(in)) - } - - decisions = append(decisions, - ResolveTerminalPRStateDecision(domain.PRMerged), - ResolveTerminalPRStateDecision(domain.PRClosed), - ) - - for _, in := range []ProbeInput{ - {KillRequested: true, Now: t0}, - {Runtime: domain.RuntimeAlive, Process: ProcessAlive, Now: t0}, - {Runtime: domain.RuntimeMissing, Process: ProcessIndeterminate, Now: t0}, - {Runtime: domain.RuntimeExited, Process: ProcessDead, Now: t0}, - } { - decisions = append(decisions, ResolveProbeDecision(in)) - } - - for _, d := range decisions { - l := domain.CanonicalSessionLifecycle{ - Session: domain.SessionSubstate{State: d.SessionState, Reason: d.SessionReason}, - PR: domain.PRSubstate{State: d.PRState, Reason: d.PRReason}, - } - if got := domain.DeriveLegacyStatus(l); got != d.Status { - t.Errorf("decision %+v: Status=%q but DeriveLegacyStatus=%q", d, d.Status, got) - } - } -} - 
-func TestResolveTerminalPRStateDecision(t *testing.T) { - tests := []struct { - name string - pr domain.PRState - wantStatus domain.SessionStatus - wantState domain.SessionState - wantReason domain.SessionReason - wantPR domain.PRReason - }{ { - name: "merged parks idle awaiting decision", - pr: domain.PRMerged, - wantStatus: domain.StatusMerged, - wantState: domain.SessionIdle, - wantReason: domain.ReasonMergedWaitingDecision, - wantPR: domain.PRReasonMerged, + name: "runtime alive + process dead -> detecting (disagree)", + in: ProbeInput{RuntimeAlive: true, Process: ProcessDead, Now: t0}, + wantState: domain.SessionDetecting, wantAlive: true, wantDetect: true, }, { - name: "closed drops to idle", - pr: domain.PRClosed, - wantStatus: domain.StatusIdle, - wantState: domain.SessionIdle, - wantReason: domain.ReasonAwaitingUserInput, - wantPR: domain.PRReasonClosedUnmerged, + name: "runtime down + process dead + no activity -> terminated runtime_lost", + in: ProbeInput{RuntimeAlive: false, Process: ProcessDead, RecentActivity: false, Now: t0}, + wantState: domain.SessionTerminated, wantReason: domain.TermRuntimeLost, wantAlive: false, }, { - name: "non-terminal none is a working no-op", - pr: domain.PRNone, - wantStatus: domain.StatusWorking, - wantState: domain.SessionWorking, - wantReason: domain.ReasonTaskInProgress, + name: "runtime down + process alive -> detecting (disagree)", + in: ProbeInput{RuntimeAlive: false, Process: ProcessAlive, Now: t0}, + wantState: domain.SessionDetecting, wantAlive: true, wantDetect: true, }, { - name: "non-terminal open is a working no-op", - pr: domain.PROpen, - wantStatus: domain.StatusWorking, - wantState: domain.SessionWorking, - wantReason: domain.ReasonTaskInProgress, + name: "runtime down + process dead + recent activity -> detecting", + in: ProbeInput{RuntimeAlive: false, Process: ProcessDead, RecentActivity: true, Now: t0}, + wantState: domain.SessionDetecting, wantAlive: true, wantDetect: true, }, { - name: "non-terminal 
draft is a working no-op", - pr: domain.PRDraft, - wantStatus: domain.StatusWorking, - wantState: domain.SessionWorking, - wantReason: domain.ReasonTaskInProgress, + name: "runtime down + process indeterminate -> detecting", + in: ProbeInput{RuntimeAlive: false, Process: ProcessIndeterminate, Now: t0}, + wantState: domain.SessionDetecting, wantAlive: true, wantDetect: true, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got := ResolveTerminalPRStateDecision(tt.pr) - if got.Status != tt.wantStatus { - t.Errorf("Status = %q, want %q", got.Status, tt.wantStatus) + d := ResolveProbeDecision(tt.in) + if d.SessionState != tt.wantState { + t.Errorf("state = %q, want %q", d.SessionState, tt.wantState) } - if got.SessionState != tt.wantState { - t.Errorf("SessionState = %q, want %q", got.SessionState, tt.wantState) + if d.TerminationReason != tt.wantReason { + t.Errorf("reason = %q, want %q", d.TerminationReason, tt.wantReason) } - if got.SessionReason != tt.wantReason { - t.Errorf("SessionReason = %q, want %q", got.SessionReason, tt.wantReason) + if d.IsAlive != tt.wantAlive { + t.Errorf("isAlive = %v, want %v", d.IsAlive, tt.wantAlive) } - if tt.wantPR != "" && got.PRReason != tt.wantPR { - t.Errorf("PRReason = %q, want %q", got.PRReason, tt.wantPR) + if tt.wantDetect && d.Detecting == nil { + t.Errorf("expected detecting memory, got nil") } }) } } func TestCreateDetectingDecision(t *testing.T) { - const ev = "runtime_lost runtime=missing process=indeterminate" - hash := HashEvidence(ev) - - t.Run("first entry records attempt 1 and stays detecting", func(t *testing.T) { - got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Now: t0}) - if got.Status != domain.StatusDetecting || got.SessionState != domain.SessionDetecting { - t.Fatalf("want detecting, got Status=%q State=%q", got.Status, got.SessionState) - } - if got.Detecting == nil || got.Detecting.Attempts != 1 { - t.Fatalf("want attempts=1, got 
%+v", got.Detecting) - } - if !got.Detecting.StartedAt.Equal(t0) { - t.Errorf("StartedAt = %v, want %v", got.Detecting.StartedAt, t0) - } - if got.Detecting.EvidenceHash != hash { - t.Errorf("EvidenceHash = %q, want %q", got.Detecting.EvidenceHash, hash) - } - if got.SessionReason != domain.ReasonRuntimeLost { - t.Errorf("SessionReason = %q, want %q", got.SessionReason, domain.ReasonRuntimeLost) + t.Run("first entry sets attempts 1", func(t *testing.T) { + d := CreateDetectingDecision(DetectingInput{Evidence: "runtime down", Now: t0}) + if d.SessionState != domain.SessionDetecting || d.Detecting == nil || d.Detecting.Attempts != 1 { + t.Fatalf("got %+v", d) } }) - - t.Run("unchanged evidence climbs the counter", func(t *testing.T) { - prior := &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: hash} - got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(time.Minute)}) - if got.Detecting == nil || got.Detecting.Attempts != 2 { - t.Fatalf("want attempts=2, got %+v", got.Detecting) - } - if !got.Detecting.StartedAt.Equal(t0) { - t.Errorf("StartedAt must be preserved, got %v", got.Detecting.StartedAt) + t.Run("same evidence climbs the counter", func(t *testing.T) { + prior := &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: HashEvidence("runtime down")} + d := CreateDetectingDecision(DetectingInput{Evidence: "runtime down", Prior: prior, Now: t0.Add(time.Second)}) + if d.Detecting == nil || d.Detecting.Attempts != 2 { + t.Fatalf("attempts = %+v, want 2", d.Detecting) } }) - - t.Run("escalates to stuck on the third unchanged tick", func(t *testing.T) { - prior := &domain.DetectingState{Attempts: DetectingMaxAttempts - 1, StartedAt: t0, EvidenceHash: hash} - got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(time.Minute)}) - if got.Status != domain.StatusStuck || got.SessionState != 
domain.SessionStuck { - t.Fatalf("want stuck, got Status=%q State=%q", got.Status, got.SessionState) - } - if got.Detecting != nil { - t.Errorf("stuck decision must drop detecting memory, got %+v", got.Detecting) - } - if got.SessionReason != domain.ReasonRuntimeLost { - t.Errorf("escalation should carry the why, got %q", got.SessionReason) - } - }) - - t.Run("changing evidence resets the counter but preserves StartedAt", func(t *testing.T) { - prior := &domain.DetectingState{Attempts: DetectingMaxAttempts - 1, StartedAt: t0, EvidenceHash: hash} - got := CreateDetectingDecision(DetectingInput{Evidence: "different evidence", ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(time.Minute)}) - if got.Status != domain.StatusDetecting { - t.Fatalf("changed evidence should stay detecting, got %q", got.Status) - } - if got.Detecting == nil || got.Detecting.Attempts != 1 { - t.Fatalf("counter should reset to 1, got %+v", got.Detecting) - } - if !got.Detecting.StartedAt.Equal(t0) { - t.Errorf("StartedAt must survive an evidence change, got %v", got.Detecting.StartedAt) + t.Run("changed evidence resets the counter", func(t *testing.T) { + prior := &domain.DetectingState{Attempts: 2, StartedAt: t0, EvidenceHash: HashEvidence("runtime down")} + d := CreateDetectingDecision(DetectingInput{Evidence: "process dead", Prior: prior, Now: t0.Add(time.Second)}) + if d.Detecting == nil || d.Detecting.Attempts != 1 { + t.Fatalf("attempts = %+v, want 1 (evidence changed)", d.Detecting) } }) - - t.Run("duration cap escalates even below the attempt count", func(t *testing.T) { - prior := &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: hash} - got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(DetectingMaxDuration)}) - if got.Status != domain.StatusStuck { - t.Fatalf("want stuck from duration cap, got %q", got.Status) + t.Run("escalates to stuck at the attempt cap", func(t 
*testing.T) { + prior := &domain.DetectingState{Attempts: DetectingMaxAttempts - 1, StartedAt: t0, EvidenceHash: HashEvidence("runtime down")} + d := CreateDetectingDecision(DetectingInput{Evidence: "runtime down", Prior: prior, Now: t0.Add(time.Second)}) + if d.SessionState != domain.SessionStuck { + t.Fatalf("state = %q, want stuck", d.SessionState) } }) - - t.Run("duration cap fires even when evidence keeps flapping", func(t *testing.T) { - prior := &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: hash} - got := CreateDetectingDecision(DetectingInput{Evidence: "ever-changing", ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(DetectingMaxDuration + time.Minute)}) - if got.Status != domain.StatusStuck { - t.Fatalf("duration cap must override a reset counter, got %q", got.Status) + t.Run("escalates to stuck past the duration cap", func(t *testing.T) { + prior := &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: HashEvidence("runtime down")} + d := CreateDetectingDecision(DetectingInput{Evidence: "runtime down", Prior: prior, Now: t0.Add(DetectingMaxDuration + time.Second)}) + if d.SessionState != domain.SessionStuck { + t.Fatalf("state = %q, want stuck (duration cap)", d.SessionState) } }) } func TestProbeDetectingEscalationFlow(t *testing.T) { - // An unchanging ambiguous probe should escalate to stuck after exactly - // DetectingMaxAttempts ticks. 
- in := ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessIndeterminate, Now: t0} - d := ResolveProbeDecision(in) + in := ProbeInput{RuntimeAlive: false, Process: ProcessIndeterminate, Now: t0} + var prior *domain.DetectingState for i := 1; i < DetectingMaxAttempts; i++ { - if d.Status != domain.StatusDetecting { - t.Fatalf("tick %d: expected detecting, got %q", i, d.Status) - } - in.Prior = d.Detecting + in.Prior = prior in.Now = t0.Add(time.Duration(i) * time.Second) - d = ResolveProbeDecision(in) + d := ResolveProbeDecision(in) + if d.SessionState != domain.SessionDetecting { + t.Fatalf("attempt %d: state = %q, want detecting", i, d.SessionState) + } + prior = d.Detecting } - if d.Status != domain.StatusStuck { - t.Fatalf("expected escalation to stuck after %d ticks, got %q", DetectingMaxAttempts, d.Status) + in.Prior = prior + in.Now = t0.Add(time.Hour) + if d := ResolveProbeDecision(in); d.SessionState != domain.SessionStuck { + t.Fatalf("final attempt: state = %q, want stuck", d.SessionState) } } func TestHashEvidence(t *testing.T) { - t.Run("identical strings hash identically", func(t *testing.T) { - if HashEvidence("same input") != HashEvidence("same input") { - t.Error("identical evidence must hash equal") - } - }) - - t.Run("different evidence hashes differently", func(t *testing.T) { - if HashEvidence("runtime_lost") == HashEvidence("agent_process_exited") { - t.Error("distinct evidence must hash differently") - } - }) - - t.Run("only the timestamp differs -> equal hash", func(t *testing.T) { - a := "probe failed at 2026-05-26T12:00:00Z runtime=missing" - b := "probe failed at 2026-05-26T12:05:43.218Z runtime=missing" - if HashEvidence(a) != HashEvidence(b) { - t.Errorf("restamped evidence should hash equal:\n a=%q\n b=%q", a, b) - } - }) - - t.Run("bare time-of-day stripped", func(t *testing.T) { - if HashEvidence("idle since 12:00:00") != HashEvidence("idle since 13:30:59") { - t.Error("time-of-day differences should be stripped") - } - }) - - 
t.Run("unix epoch stripped", func(t *testing.T) { - if HashEvidence("last seen 1716724800") != HashEvidence("last seen 1716728400") { - t.Error("epoch differences should be stripped") - } - }) - - t.Run("a real content change still changes the hash", func(t *testing.T) { - a := "probe at 2026-05-26T12:00:00Z runtime=missing" - b := "probe at 2026-05-26T12:00:00Z runtime=alive" - if HashEvidence(a) == HashEvidence(b) { - t.Error("non-timestamp content change must change the hash") - } - }) - - t.Run("whitespace differences are normalised", func(t *testing.T) { - if HashEvidence("runtime=missing process=dead") != HashEvidence("runtime=missing process=dead") { - t.Error("collapsed whitespace should hash equal") - } - }) + // timestamp-only differences hash equal; a real change differs. + a := HashEvidence("runtime down at 2026-05-31T12:00:00Z") + b := HashEvidence("runtime down at 2026-05-31T13:30:45Z") + if a != b { + t.Errorf("restamped evidence should hash equal") + } + c := HashEvidence("process dead at 2026-05-31T12:00:00Z") + if a == c { + t.Errorf("different evidence should hash differently") + } } diff --git a/backend/internal/domain/decide/types.go b/backend/internal/domain/decide/types.go index 1666fae7..832fab6f 100644 --- a/backend/internal/domain/decide/types.go +++ b/backend/internal/domain/decide/types.go @@ -6,39 +6,34 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/domain" ) -// LifecycleDecision is the output of every decider: the derived display status -// plus the canonical sub-state values to persist, the human-readable evidence, -// and the (possibly updated) detecting memory. +// LifecycleDecision is the output of a decider: the canonical session sub-state +// to persist (state, the liveness bool, and — only for a terminal state — the +// termination reason), the human-readable evidence, and the (possibly updated) +// detecting memory. 
The display status is NOT here — it is derived on read by +// domain.DeriveStatus from the persisted lifecycle plus the pr table. // -// Zero-value sub-state fields mean "this decider does not address that -// sub-state — leave it unchanged", NOT "set it to the empty value". SessionState -// is always populated, but the probe/detecting/kill paths legitimately leave -// PRState/PRReason empty: a liveness verdict knows nothing about the PR. When -// the LCM folds a decision into the next full canonical row it must therefore -// leave empty PRState/PRReason unchanged rather than writing them through — -// writing PRNone on a routine probe tick would clobber a live PR. Detecting is -// nil-by-default; the LCM explicitly clears stale detecting memory when a probe -// verdict leaves detecting. +// PR facts are likewise not here: a liveness verdict knows nothing about the PR, +// and PR-driven display/reactions are handled off the pr table, not the session +// state machine. type LifecycleDecision struct { - Status domain.SessionStatus - Evidence string - Detecting *domain.DetectingState - SessionState domain.SessionState - SessionReason domain.SessionReason - PRState domain.PRState - PRReason domain.PRReason + Evidence string + Detecting *domain.DetectingState + SessionState domain.SessionState + TerminationReason domain.TerminationReason // set only when SessionState is terminated + IsAlive bool } -// ProbeInput reconciles runtime + process liveness. A *failed* probe (timeout -// or error) is distinct from a "dead" verdict and must route to detecting, -// never to a death conclusion. KillRequested short-circuits to terminal. +// ProbeInput reconciles runtime + process liveness. A *failed* probe (timeout or +// error) is distinct from a "dead" verdict and must route to detecting, never to +// a death conclusion. KillRequested short-circuits to terminal with KillReason. 
type ProbeInput struct { - Runtime domain.RuntimeState - RuntimeFailed bool + RuntimeAlive bool // the runtime probe reports the backing runtime is up + RuntimeFailed bool // the runtime probe itself failed (timeout/error) — not "dead" Process ProcessLiveness ProcessFailed bool RecentActivity bool KillRequested bool + KillReason domain.TerminationReason // the terminal reason when KillRequested Prior *domain.DetectingState Now time.Time } @@ -52,28 +47,11 @@ const ( ProcessIndeterminate ProcessLiveness = "indeterminate" ) -// OpenPRInput drives the PR pipeline ladder for an open or draft PR. -type OpenPRInput struct { - Draft bool - CIFailing bool - ChangesRequested bool - BotComments bool - MergeConflicts bool - Approved bool - Mergeable bool - ReviewPending bool - IdleBeyond bool // idle past the stuck threshold - Number int - URL string -} - -// DetectingInput feeds the quarantine counter. Evidence is hashed with +// DetectingInput feeds the anti-flap quarantine counter. Evidence is hashed with // timestamps stripped, so "same ambiguous signal" keeps the counter climbing // while any real change resets it. type DetectingInput struct { - Evidence string - ProposedState domain.SessionState - ProposedReason domain.SessionReason - Prior *domain.DetectingState - Now time.Time + Evidence string + Prior *domain.DetectingState + Now time.Time } diff --git a/backend/internal/domain/lifecycle.go b/backend/internal/domain/lifecycle.go index fca87b6b..a82ea85a 100644 --- a/backend/internal/domain/lifecycle.go +++ b/backend/internal/domain/lifecycle.go @@ -11,30 +11,35 @@ import "time" // Greenfield: we start at 1 and carry no migration/synthesis code. const LifecycleVersion = 1 -// CanonicalSessionLifecycle is the ONLY thing persisted for a session's state. -// The display status is derived from it on read (see DeriveLegacyStatus) and is -// never stored — this prevents canonical truth and display from drifting. 
+// CanonicalSessionLifecycle is the ONLY lifecycle state persisted for a session. +// The display status is derived from it (plus the session's PR facts, which live +// in the separate pr table) on read — see DeriveStatus — and is never stored, so +// canonical truth and display cannot drift. // -// Three orthogonal (state, reason) sub-states describe the session, its PR, and -// its runtime. Activity and Detecting are decider *inputs* that must survive -// between observations (they are read back by the pure decide core), so they -// live in the persisted record too. +// PR facts are deliberately NOT here: a session can own several PRs over its +// life, and PR state is owned by the pr table. The runtime axis is collapsed to +// a single IsAlive boolean. Activity and Detecting are decider *inputs* that +// must survive between observations, so they live in the persisted record. type CanonicalSessionLifecycle struct { // Version is the Go-only schema-shape constant for this record. It is not // persisted and is not part of the CDC payload. Version int - // Revision is the per-write monotonic counter. The storage layer's Upsert - // bumps it when the full row is persisted; the LCM does not. - Revision int `json:"revision"` - Session SessionSubstate `json:"session"` - PR PRSubstate `json:"pr"` - Runtime RuntimeSubstate `json:"runtime"` - - // Activity is the last-known agent activity. It arrives on a different - // cadence (ApplyActivitySignal) than runtime probes (the reaper), so the - // probe decider reads it from here to answer "was there recent activity?". + + Session SessionSubstate `json:"session"` Activity ActivitySubstate `json:"activity"` + // TerminationReason is set only when Session.State is terminated; '' otherwise. + TerminationReason TerminationReason `json:"terminationReason,omitempty"` + + // IsAlive is the single liveness fact: is the runtime/process backing this + // session still up? 
It replaces the old runtime (state, reason) axis — the + // nuance the probe decider needs (failed-probe != dead, anti-flap) lives in + // the decide core's inputs, not in a persisted enum. + IsAlive bool `json:"isAlive"` + + // Harness is the agent harness the session runs (claude-code, codex, ...). + Harness AgentHarness `json:"harness,omitempty"` + // Detecting is the anti-flap quarantine memory. It is non-nil only while // the session is in the detecting state; it carries the attempt counter, // the first-entry time, and a hash of the (timestamp-stripped) evidence so @@ -42,6 +47,18 @@ type CanonicalSessionLifecycle struct { Detecting *DetectingState `json:"detecting,omitempty"` } +// ---- agent harness ---- + +// AgentHarness identifies which agent CLI/runtime a session drives. +type AgentHarness string + +const ( + HarnessClaudeCode AgentHarness = "claude-code" + HarnessCodex AgentHarness = "codex" + HarnessAider AgentHarness = "aider" + HarnessOpenCode AgentHarness = "opencode" +) + // ---- session sub-state ---- type SessionState string @@ -57,99 +74,75 @@ const ( SessionTerminated SessionState = "terminated" ) -type SessionReason string +// TerminationReason is the typed "why" for a terminated session — the only +// state that carries a reason. Empty for every non-terminal state. It decides +// the terminal display status (killed / cleanup / errored). The PR-pipeline +// "why" (fixing CI, awaiting review, …) is NOT here; it is derived on read from +// the pr table, not persisted on the session. 
+type TerminationReason string const ( - ReasonSpawnRequested SessionReason = "spawn_requested" - ReasonAgentAcknowledged SessionReason = "agent_acknowledged" - ReasonTaskInProgress SessionReason = "task_in_progress" - ReasonPRCreated SessionReason = "pr_created" - ReasonFixingCI SessionReason = "fixing_ci" - ReasonResolvingReviewComments SessionReason = "resolving_review_comments" - ReasonAwaitingUserInput SessionReason = "awaiting_user_input" - ReasonAwaitingExternalReview SessionReason = "awaiting_external_review" - ReasonResearchComplete SessionReason = "research_complete" - ReasonMergedWaitingDecision SessionReason = "merged_waiting_decision" - ReasonManuallyKilled SessionReason = "manually_killed" - ReasonPRMerged SessionReason = "pr_merged" - ReasonAutoCleanup SessionReason = "auto_cleanup" - ReasonRuntimeLost SessionReason = "runtime_lost" - ReasonAgentProcessExited SessionReason = "agent_process_exited" - ReasonProbeFailure SessionReason = "probe_failure" - ReasonErrorInProcess SessionReason = "error_in_process" + TermNone TerminationReason = "" + TermManuallyKilled TerminationReason = "manually_killed" + TermRuntimeLost TerminationReason = "runtime_lost" + TermAgentProcessExited TerminationReason = "agent_process_exited" + TermProbeFailure TerminationReason = "probe_failure" + TermErrorInProcess TerminationReason = "error_in_process" + TermAutoCleanup TerminationReason = "auto_cleanup" + TermPRMerged TerminationReason = "pr_merged" ) type SessionSubstate struct { - State SessionState `json:"state"` - Reason SessionReason `json:"reason"` + State SessionState `json:"state"` } -// ---- PR sub-state ---- - -type PRState string - -const ( - PRNone PRState = "none" - PRDraft PRState = "draft" - PROpen PRState = "open" - PRMerged PRState = "merged" - PRClosed PRState = "closed" -) +// ---- PR facts (NOT persisted on the session; sourced from the pr table) ---- + +// PRFacts is the per-session PR snapshot the status/reaction derivation reads +// from the pr 
table. It is the decider input that replaces the old persisted PR +// axis. The zero value (Exists=false) means "no PR", which derivation treats as +// "session has no PR". +type PRFacts struct { + URL string + Number int + Exists bool + Draft bool + Merged bool + Closed bool + CI CIState + Review ReviewDecision + Mergeability Mergeability + ReviewComments bool // has unresolved review comments (any author) to address +} -type PRReason string +type CIState string const ( - PRReasonNotCreated PRReason = "not_created" - PRReasonInProgress PRReason = "in_progress" - PRReasonCIFailing PRReason = "ci_failing" - PRReasonReviewPending PRReason = "review_pending" - PRReasonChangesRequested PRReason = "changes_requested" - PRReasonBotComments PRReason = "bot_comments" - PRReasonMergeConflicts PRReason = "merge_conflicts" - PRReasonApproved PRReason = "approved" - PRReasonMergeReady PRReason = "merge_ready" - PRReasonMerged PRReason = "merged" - PRReasonClosedUnmerged PRReason = "closed_unmerged" - PRReasonClearedOnRestore PRReason = "cleared_on_restore" + CIUnknown CIState = "unknown" + CIPending CIState = "pending" + CIPassing CIState = "passing" + CIFailing CIState = "failing" ) -type PRSubstate struct { - State PRState `json:"state"` - Reason PRReason `json:"reason"` - Number int `json:"number,omitempty"` - URL string `json:"url,omitempty"` -} - -// ---- runtime sub-state ---- - -type RuntimeState string +type ReviewDecision string const ( - RuntimeUnknown RuntimeState = "unknown" - RuntimeAlive RuntimeState = "alive" - RuntimeExited RuntimeState = "exited" - RuntimeMissing RuntimeState = "missing" - RuntimeProbeFailed RuntimeState = "probe_failed" + ReviewNone ReviewDecision = "none" + ReviewApproved ReviewDecision = "approved" + ReviewChangesRequest ReviewDecision = "changes_requested" + ReviewRequired ReviewDecision = "review_required" ) -type RuntimeReason string +type Mergeability string const ( - RuntimeReasonSpawnIncomplete RuntimeReason = "spawn_incomplete" - 
RuntimeReasonProcessRunning RuntimeReason = "process_running" - RuntimeReasonProcessMissing RuntimeReason = "process_missing" - RuntimeReasonTmuxMissing RuntimeReason = "tmux_missing" - RuntimeReasonManualKillRequested RuntimeReason = "manual_kill_requested" - RuntimeReasonPRMergedCleanup RuntimeReason = "pr_merged_cleanup" - RuntimeReasonAutoCleanup RuntimeReason = "auto_cleanup" - RuntimeReasonProbeError RuntimeReason = "probe_error" + MergeUnknown Mergeability = "unknown" + MergeMergeable Mergeability = "mergeable" + MergeConflicting Mergeability = "conflicting" + MergeBlocked Mergeability = "blocked" + MergeUnstable Mergeability = "unstable" ) -type RuntimeSubstate struct { - State RuntimeState `json:"state"` - Reason RuntimeReason `json:"reason"` -} - // ---- activity sub-state (decider input) ---- type ActivityState string diff --git a/backend/internal/domain/session.go b/backend/internal/domain/session.go index 578cca40..2b81088a 100644 --- a/backend/internal/domain/session.go +++ b/backend/internal/domain/session.go @@ -17,16 +17,41 @@ const ( KindOrchestrator SessionKind = "orchestrator" ) +// SessionMetadata is the typed, off-canonical metadata for a session: the +// operational handles and seed inputs the Session Manager and reaper need but +// that are NOT part of the canonical lifecycle. The set of fields is fixed here +// (no free-form keys), so what a session can carry is a compile-time fact, and +// it is folded into the sessions row off the CDC path. +// +// Empty fields mean "unset": the LCM merges metadata without overwriting a +// stored value with an empty one, so a partial write (spawn setting only the +// runtime handle) does not clobber a value set earlier (the branch at creation). 
+type SessionMetadata struct { + Branch string `json:"branch,omitempty"` + WorkspacePath string `json:"workspacePath,omitempty"` + RuntimeHandleID string `json:"runtimeHandleId,omitempty"` + RuntimeName string `json:"runtimeName,omitempty"` + AgentSessionID string `json:"agentSessionId,omitempty"` + Prompt string `json:"prompt,omitempty"` +} + +// IsZero reports whether no metadata field is set. +func (m SessionMetadata) IsZero() bool { return m == SessionMetadata{} } + // SessionRecord is the PERSISTENCE shape: identity, canonical lifecycle, and // metadata — everything the store holds, and nothing derived. The store reads // and writes records; it never produces the derived display status. +// +// Metadata is json:"-" on purpose: it lives off the canonical path, so it must +// never ride along in the change_log / snapshot payloads. Enforcing that at the +// type level means no caller has to remember to scrub it before marshalling. type SessionRecord struct { ID SessionID `json:"id"` ProjectID ProjectID `json:"projectId"` IssueID IssueID `json:"issueId,omitempty"` Kind SessionKind `json:"kind"` Lifecycle CanonicalSessionLifecycle `json:"lifecycle"` - Metadata map[string]string `json:"metadata,omitempty"` + Metadata SessionMetadata `json:"-"` CreatedAt time.Time `json:"createdAt"` UpdatedAt time.Time `json:"updatedAt"` } diff --git a/backend/internal/domain/status.go b/backend/internal/domain/status.go index 1cc4404d..3ae1e00c 100644 --- a/backend/internal/domain/status.go +++ b/backend/internal/domain/status.go @@ -1,7 +1,8 @@ package domain // SessionStatus is the single-word DISPLAY status the dashboard renders. It is -// derived from the canonical lifecycle on read and never persisted. +// derived from the canonical lifecycle (plus the session's PR facts) on read and +// never persisted. type SessionStatus string const ( @@ -26,27 +27,27 @@ const ( StatusTerminated SessionStatus = "terminated" ) -// DeriveLegacyStatus is the ONLY producer of the display status. 
It must stay a -// pure, total function of the canonical record. +// DeriveStatus is the ONLY producer of the display status. It is a pure, total +// function of the canonical record plus the session's PR facts (read from the pr +// table by the caller, since PR state is no longer persisted on the session). // // Order matters: // 1. Terminal / hard session states (done, terminated, needs_input, stuck, // detecting, not_started) map directly — these OUTRANK PR facts. -// 2. Otherwise a merged PR wins. -// 3. Otherwise a draft PR maps to draft, except CI failure still dominates. -// 4. Otherwise an open PR maps by its reason. -// 5. Otherwise fall through to the SOFT session state (idle/working). +// 2. Otherwise, if the session has a PR: a merged PR wins, else the PR pipeline +// ladder (CI failure dominates, then draft/review/merge states). +// 3. Otherwise fall through to the SOFT session state (idle/working). // // So "PR facts dominate session facts" applies only to the soft states: an idle // or working session with an open, CI-failing PR displays as ci_failed — but a -// session that is stuck or needs_input shows that regardless of PR state, since -// it needs a human either way. -func DeriveLegacyStatus(l CanonicalSessionLifecycle) SessionStatus { +// session that is stuck or needs_input shows that regardless, since it needs a +// human either way. 
+func DeriveStatus(l CanonicalSessionLifecycle, pr PRFacts) SessionStatus { switch l.Session.State { case SessionDone: return StatusDone case SessionTerminated: - return terminatedStatus(l.Session.Reason) + return terminatedStatus(l.TerminationReason) case SessionNeedsInput: return StatusNeedsInput case SessionStuck: @@ -57,16 +58,13 @@ func DeriveLegacyStatus(l CanonicalSessionLifecycle) SessionStatus { return StatusSpawning } - if l.PR.State == PRMerged { - return StatusMerged - } - - if l.PR.State == PRDraft { - return draftPRStatus(l.PR.Reason) - } - - if l.PR.State == PROpen { - return openPRStatus(l.PR.Reason) + if pr.Exists { + if pr.Merged { + return StatusMerged + } + if !pr.Closed { + return prPipelineStatus(pr) + } } if l.Session.State == SessionIdle { @@ -75,37 +73,35 @@ func DeriveLegacyStatus(l CanonicalSessionLifecycle) SessionStatus { return StatusWorking } -func terminatedStatus(r SessionReason) SessionStatus { +func terminatedStatus(r TerminationReason) SessionStatus { switch r { - case ReasonManuallyKilled, ReasonRuntimeLost, ReasonAgentProcessExited: + case TermManuallyKilled, TermRuntimeLost, TermAgentProcessExited: return StatusKilled - case ReasonAutoCleanup, ReasonPRMerged: + case TermAutoCleanup, TermPRMerged: return StatusCleanup - case ReasonErrorInProcess, ReasonProbeFailure: + case TermErrorInProcess, TermProbeFailure: return StatusErrored default: return StatusTerminated } } -func draftPRStatus(r PRReason) SessionStatus { - if r == PRReasonCIFailing { +// prPipelineStatus maps an open/draft PR's facts to a display status, preserving +// the old ladder: CI failure dominates everything, then draft, then the review / +// merge states. 
+func prPipelineStatus(pr PRFacts) SessionStatus { + switch { + case pr.CI == CIFailing: return StatusCIFailed - } - return StatusDraft -} - -func openPRStatus(r PRReason) SessionStatus { - switch r { - case PRReasonCIFailing: - return StatusCIFailed - case PRReasonChangesRequested, PRReasonBotComments: + case pr.Draft: + return StatusDraft + case pr.Review == ReviewChangesRequest || pr.ReviewComments: return StatusChangesRequested - case PRReasonApproved: - return StatusApproved - case PRReasonMergeReady: + case pr.Mergeability == MergeMergeable: return StatusMergeable - case PRReasonReviewPending: + case pr.Review == ReviewApproved: + return StatusApproved + case pr.Review == ReviewRequired: return StatusReviewPending default: return StatusPROpen diff --git a/backend/internal/domain/status_test.go b/backend/internal/domain/status_test.go index 09854998..57512577 100644 --- a/backend/internal/domain/status_test.go +++ b/backend/internal/domain/status_test.go @@ -2,117 +2,58 @@ package domain import "testing" -func TestDeriveLegacyStatus(t *testing.T) { +func TestDeriveStatus(t *testing.T) { + // sess builds a non-terminal lifecycle (no reason). + sess := func(s SessionState) CanonicalSessionLifecycle { + return CanonicalSessionLifecycle{Session: SessionSubstate{State: s}} + } + // term builds a terminated lifecycle carrying a TerminationReason. 
+ term := func(r TerminationReason) CanonicalSessionLifecycle { + return CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionTerminated}, TerminationReason: r} + } + openPR := func(mut func(*PRFacts)) PRFacts { + f := PRFacts{Exists: true, CI: CIUnknown, Review: ReviewNone, Mergeability: MergeUnknown} + if mut != nil { + mut(&f) + } + return f + } + tests := []struct { name string in CanonicalSessionLifecycle + pr PRFacts want SessionStatus }{ - { - name: "not_started maps to spawning", - in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionNotStarted, Reason: ReasonSpawnRequested}}, - want: StatusSpawning, - }, - { - name: "terminated+manually_killed maps to killed", - in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionTerminated, Reason: ReasonManuallyKilled}}, - want: StatusKilled, - }, - { - name: "terminated+auto_cleanup maps to cleanup", - in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionTerminated, Reason: ReasonAutoCleanup}}, - want: StatusCleanup, - }, - { - name: "terminated+error maps to errored", - in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionTerminated, Reason: ReasonErrorInProcess}}, - want: StatusErrored, - }, - { - name: "hard state needs_input maps directly", - in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionNeedsInput}}, - want: StatusNeedsInput, - }, - { - name: "merged PR dominates an idle session", - in: CanonicalSessionLifecycle{ - Session: SessionSubstate{State: SessionIdle}, - PR: PRSubstate{State: PRMerged}, - }, - want: StatusMerged, - }, - { - name: "open PR with failing CI dominates idle session", - in: CanonicalSessionLifecycle{ - Session: SessionSubstate{State: SessionIdle}, - PR: PRSubstate{State: PROpen, Reason: PRReasonCIFailing}, - }, - want: StatusCIFailed, - }, - { - name: "draft PR with failing CI maps to ci_failed", - in: CanonicalSessionLifecycle{ - Session: SessionSubstate{State: SessionWorking}, - PR: 
PRSubstate{State: PRDraft, Reason: PRReasonCIFailing}, - }, - want: StatusCIFailed, - }, - { - name: "draft PR ignores review and merge reasons", - in: CanonicalSessionLifecycle{ - Session: SessionSubstate{State: SessionWorking}, - PR: PRSubstate{State: PRDraft, Reason: PRReasonMergeReady}, - }, - want: StatusDraft, - }, - { - name: "open PR bot comments display as changes_requested", - in: CanonicalSessionLifecycle{ - Session: SessionSubstate{State: SessionWorking}, - PR: PRSubstate{State: PROpen, Reason: PRReasonBotComments}, - }, - want: StatusChangesRequested, - }, - { - name: "open PR merge conflicts display as plain open", - in: CanonicalSessionLifecycle{ - Session: SessionSubstate{State: SessionWorking}, - PR: PRSubstate{State: PROpen, Reason: PRReasonMergeConflicts}, - }, - want: StatusPROpen, - }, - { - name: "open PR approved", - in: CanonicalSessionLifecycle{ - Session: SessionSubstate{State: SessionWorking}, - PR: PRSubstate{State: PROpen, Reason: PRReasonApproved}, - }, - want: StatusApproved, - }, - { - name: "open PR merge_ready maps to mergeable", - in: CanonicalSessionLifecycle{ - Session: SessionSubstate{State: SessionWorking}, - PR: PRSubstate{State: PROpen, Reason: PRReasonMergeReady}, - }, - want: StatusMergeable, - }, - { - name: "no PR falls through to idle", - in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionIdle}}, - want: StatusIdle, - }, - { - name: "no PR falls through to working", - in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionWorking}}, - want: StatusWorking, - }, + {"not_started maps to spawning", sess(SessionNotStarted), PRFacts{}, StatusSpawning}, + {"terminated+manually_killed -> killed", term(TermManuallyKilled), PRFacts{}, StatusKilled}, + {"terminated+runtime_lost -> killed", term(TermRuntimeLost), PRFacts{}, StatusKilled}, + {"terminated+auto_cleanup -> cleanup", term(TermAutoCleanup), PRFacts{}, StatusCleanup}, + {"terminated+pr_merged -> cleanup", term(TermPRMerged), PRFacts{}, 
StatusCleanup}, + {"terminated+error -> errored", term(TermErrorInProcess), PRFacts{}, StatusErrored}, + {"needs_input maps directly", sess(SessionNeedsInput), PRFacts{}, StatusNeedsInput}, + {"stuck dominates any PR", sess(SessionStuck), openPR(func(f *PRFacts) { f.CI = CIFailing }), StatusStuck}, + + {"no PR + idle -> idle", sess(SessionIdle), PRFacts{}, StatusIdle}, + {"no PR + working -> working", sess(SessionWorking), PRFacts{}, StatusWorking}, + + {"merged PR dominates idle session", sess(SessionIdle), PRFacts{Exists: true, Merged: true}, StatusMerged}, + {"open PR failing CI -> ci_failed", sess(SessionIdle), openPR(func(f *PRFacts) { f.CI = CIFailing }), StatusCIFailed}, + {"draft PR failing CI -> ci_failed (CI dominates)", sess(SessionWorking), openPR(func(f *PRFacts) { f.Draft = true; f.CI = CIFailing }), StatusCIFailed}, + {"draft PR ignores review state -> draft", sess(SessionWorking), openPR(func(f *PRFacts) { f.Draft = true; f.Review = ReviewApproved }), StatusDraft}, + {"open PR changes_requested", sess(SessionWorking), openPR(func(f *PRFacts) { f.Review = ReviewChangesRequest }), StatusChangesRequested}, + {"open PR review comments -> changes_requested", sess(SessionWorking), openPR(func(f *PRFacts) { f.ReviewComments = true }), StatusChangesRequested}, + {"open PR mergeable", sess(SessionWorking), openPR(func(f *PRFacts) { f.Mergeability = MergeMergeable }), StatusMergeable}, + {"open PR approved", sess(SessionWorking), openPR(func(f *PRFacts) { f.Review = ReviewApproved }), StatusApproved}, + {"open PR review required -> review_pending", sess(SessionWorking), openPR(func(f *PRFacts) { f.Review = ReviewRequired }), StatusReviewPending}, + {"open PR no signal -> pr_open", sess(SessionWorking), openPR(nil), StatusPROpen}, + {"closed PR falls through to soft state", sess(SessionIdle), PRFacts{Exists: true, Closed: true}, StatusIdle}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := DeriveLegacyStatus(tt.in); got != tt.want { 
- t.Errorf("DeriveLegacyStatus() = %q, want %q", got, tt.want) + if got := DeriveStatus(tt.in, tt.pr); got != tt.want { + t.Errorf("DeriveStatus() = %q, want %q", got, tt.want) } }) } diff --git a/backend/internal/lifecycle/decide_bridge.go b/backend/internal/lifecycle/decide_bridge.go index 501d12ac..4f88cbe5 100644 --- a/backend/internal/lifecycle/decide_bridge.go +++ b/backend/internal/lifecycle/decide_bridge.go @@ -8,236 +8,102 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) -// defaultRecentActivityWindow is how fresh the last activity signal must be for -// the probe decider to treat the agent as "recently active" (which keeps an -// ambiguous dead-runtime probe in detecting instead of concluding death). +// defaultRecentActivityWindow is how fresh the last activity must be for the +// probe decider to treat the agent as "recently active" — which keeps an +// ambiguous dead-runtime probe in detecting instead of concluding death. const defaultRecentActivityWindow = 60 * time.Second -// ---- fact translation: ports DTOs -> pure decide inputs ---- - -// runtimeFactsToProbeInput maps a raw RuntimeFacts (plus the prior detecting -// memory and last-known activity read back from canonical) into the probe -// decider's input. KillRequested is always false here: the inferred-death path -// never carries an explicit kill — that arrives via OnKillRequested. -func runtimeFactsToProbeInput(f ports.RuntimeFacts, cur domain.CanonicalSessionLifecycle, window time.Duration) decide.ProbeInput { - rt, rtFailed := runtimeProbeToState(f.RuntimeState) - proc, procFailed := processProbeToLiveness(f.ProcessState) +// probeInput maps a raw RuntimeFacts (plus the prior detecting memory and last +// activity) into the pure decider's input. A failed/unknown probe is reported as +// such, never as a death — that routes to the detecting quarantine. 
+func probeInput(f ports.RuntimeFacts, cur domain.CanonicalSessionLifecycle, window time.Duration) decide.ProbeInput { now := nowOr(f.ObservedAt) - return decide.ProbeInput{ - Runtime: rt, - RuntimeFailed: rtFailed, - Process: proc, - ProcessFailed: procFailed, - RecentActivity: hasRecentActivity(cur.Activity, now, window), - Prior: cur.Detecting, - Now: now, - } -} -func runtimeProbeToState(p ports.RuntimeProbe) (domain.RuntimeState, bool) { - switch p { - case ports.RuntimeProbeAlive: - return domain.RuntimeAlive, false - case ports.RuntimeProbeDead: - return domain.RuntimeExited, false - case ports.RuntimeProbeFailed: - return domain.RuntimeProbeFailed, true - default: // indeterminate / unset: ambiguous, never a death conclusion - return domain.RuntimeUnknown, false + var runtimeAlive, runtimeFailed bool + switch f.Runtime { + case ports.ProbeAlive: + runtimeAlive = true + case ports.ProbeFailed, ports.ProbeUnknown: + runtimeFailed = true // ambiguous: quarantine, never conclude death } -} -func processProbeToLiveness(p ports.ProcessProbe) (decide.ProcessLiveness, bool) { - switch p { - case ports.ProcessProbeAlive: - return decide.ProcessAlive, false - case ports.ProcessProbeDead: - return decide.ProcessDead, false - case ports.ProcessProbeFailed: - return decide.ProcessIndeterminate, true - default: // indeterminate / unset - return decide.ProcessIndeterminate, false + var process decide.ProcessLiveness + var processFailed bool + switch f.Process { + case ports.ProbeAlive: + process = decide.ProcessAlive + case ports.ProbeDead: + process = decide.ProcessDead + case ports.ProbeFailed: + process, processFailed = decide.ProcessIndeterminate, true + default: + process = decide.ProcessIndeterminate } -} -// runtimeSubstateFromFacts derives the runtime sub-state to persist. Liveness -// always owns this axis, so it is written on every runtime observation -// regardless of what the session axis does. 
-func runtimeSubstateFromFacts(f ports.RuntimeFacts) domain.RuntimeSubstate { - switch f.RuntimeState { - case ports.RuntimeProbeAlive: - return domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning} - case ports.RuntimeProbeDead: - return domain.RuntimeSubstate{State: domain.RuntimeExited, Reason: domain.RuntimeReasonTmuxMissing} - case ports.RuntimeProbeFailed: - return domain.RuntimeSubstate{State: domain.RuntimeProbeFailed, Reason: domain.RuntimeReasonProbeError} - case ports.RuntimeProbeIndeterminate: - // Probe ran but couldn't tell — distinct from a probe error, so no - // probe_error reason; the ambiguity is carried by RuntimeUnknown alone. - return domain.RuntimeSubstate{State: domain.RuntimeUnknown} - default: // unset - return domain.RuntimeSubstate{State: domain.RuntimeUnknown} + return decide.ProbeInput{ + RuntimeAlive: runtimeAlive, + RuntimeFailed: runtimeFailed, + Process: process, + ProcessFailed: processFailed, + RecentActivity: hasRecentActivity(cur.Activity, now, window), + Prior: cur.Detecting, + Now: now, } } -// hasRecentActivity answers the probe decider's "was the agent heard from -// recently?" question. Sticky states (waiting_input/blocked) count as recent -// because they mean a live-but-paused agent; an explicit exited signal never -// counts; otherwise we age the last-activity timestamp against the window. +// hasRecentActivity answers the decider's "heard from the agent recently?" +// question. Sticky states (waiting_input/blocked) count as recent (a live-but- +// paused agent); an explicit exited never counts; else age the timestamp. 
func hasRecentActivity(a domain.ActivitySubstate, now time.Time, window time.Duration) bool { - if a.State == domain.ActivityExited { + switch { + case a.State == domain.ActivityExited: return false - } - if a.State.IsSticky() { + case a.State.IsSticky(): return true - } - if a.LastActivityAt.IsZero() { + case a.LastActivityAt.IsZero(): return false + default: + return now.Sub(a.LastActivityAt) <= window } - return now.Sub(a.LastActivityAt) <= window -} - -// openPRInput maps SCM facts onto the open-PR ladder. IdleBeyond is always false -// in split A — the idle-duration signal is owned by the escalation engine -// (split B); the synchronous LCM has no clock of its own here. -func openPRInput(f ports.SCMFacts) decide.OpenPRInput { - hasBotComments, hasHumanComments := classifyPendingComments(f.PendingComments) - return decide.OpenPRInput{ - Draft: f.PRState == domain.PRDraft || f.Draft, - CIFailing: f.CISummary == ports.CIFailing, - ChangesRequested: f.ReviewDecision == ports.ReviewChangesRequested || hasHumanComments, - BotComments: hasBotComments, - MergeConflicts: hasMergeConflicts(f.Mergeability), - Approved: f.ReviewDecision == ports.ReviewApproved, - Mergeable: f.Mergeability.Mergeable, - ReviewPending: f.ReviewDecision == ports.ReviewPending, - Number: f.PRNumber, - URL: f.PRURL, - } -} - -func classifyPendingComments(comments []ports.ReviewComment) (hasBot, hasHuman bool) { - for _, c := range comments { - if c.IsBot { - hasBot = true - } else { - hasHuman = true - } - } - return hasBot, hasHuman -} - -func hasMergeConflicts(m ports.Mergeability) bool { - return !m.Mergeable && !m.NoConflicts && (m.CIPassing || m.Approved || len(m.Blockers) > 0) } -// ---- activity -> session axis mapping (activity owns working/idle/waiting) ---- - -// activityToSession maps an activity classification onto the session sub-state. 
-// exited returns ok=false: an exit signal must NOT write a terminal session -// state — only the probe pipeline (via detecting) may conclude inferred death. -func activityToSession(a domain.ActivityState) (domain.SessionState, domain.SessionReason, bool) { +// activityToSession maps an activity classification onto the session state. +// exited returns ok=false: only the probe pipeline may conclude death. +func activityToSession(a domain.ActivityState) (domain.SessionState, bool) { switch a { case domain.ActivityActive: - return domain.SessionWorking, domain.ReasonTaskInProgress, true - case domain.ActivityReady: - // ready = the agent finished a unit and is waiting for more work. - return domain.SessionIdle, domain.ReasonResearchComplete, true - case domain.ActivityIdle: - // plain inactivity carries no completion claim, so no specific reason - // (research_complete here would read misleadingly in diagnostics). - return domain.SessionIdle, "", true + return domain.SessionWorking, true + case domain.ActivityReady, domain.ActivityIdle: + return domain.SessionIdle, true case domain.ActivityWaitingInput: - return domain.SessionNeedsInput, domain.ReasonAwaitingUserInput, true + return domain.SessionNeedsInput, true case domain.ActivityBlocked: - return domain.SessionStuck, domain.ReasonAwaitingUserInput, true - default: // exited / unset - return "", "", false + return domain.SessionStuck, true + default: + return "", false } } -// ---- composition predicates: who may write the session axis ---- - -// isTerminal reports a final session state that must not be resurrected by an -// observation (only an explicit Restore reopens a terminal session). +// isTerminal reports a final session state — reopened only by an explicit +// Restore, never by an observation. 
func isTerminal(s domain.SessionState) bool { return s == domain.SessionDone || s == domain.SessionTerminated } -// isLivenessOwned reports whether the current session sub-state was set by the -// liveness/death axis (the probe pipeline) and may therefore be recovered by a -// later healthy probe. detecting is always liveness-owned; a stuck/terminated -// state is liveness-owned only when its reason came from a death inference. -func isLivenessOwned(s domain.SessionSubstate) bool { - if s.State == domain.SessionDetecting { - return true - } - switch s.Reason { - case domain.ReasonRuntimeLost, domain.ReasonAgentProcessExited, domain.ReasonProbeFailure: - return true - } - return false -} - -// shouldWriteSessionRuntime is the #1 composition rule for ApplyRuntimeObservation. -// A death-axis verdict (detecting/stuck/terminal) always writes — it overrides -// activity because a (maybe) dead agent can't be working/waiting. A healthy -// "working" verdict only writes when it is recovering a liveness-owned state -// (e.g. detecting -> working); it must NOT clobber an activity-owned -// needs_input/blocked/idle the activity axis is responsible for. -func shouldWriteSessionRuntime(d decide.LifecycleDecision, cur domain.CanonicalSessionLifecycle) bool { +// writeRuntimeSession reports whether a probe verdict may write the session +// state. A death-axis verdict (detecting/stuck/terminated) always writes; a +// healthy "working" verdict only recovers a detecting session — it must not +// clobber an activity-owned idle/needs_input. +func writeRuntimeSession(d decide.LifecycleDecision, cur domain.CanonicalSessionLifecycle) bool { if isTerminal(cur.Session.State) { - // A terminal session is only reopened by an explicit Restore — never by - // an observation. Even a death-axis verdict (e.g. detecting) must not - // resurrect it; the runtime axis is still patched separately. 
return false } if d.SessionState == domain.SessionWorking { - return isLivenessOwned(cur.Session) + return cur.Session.State == domain.SessionDetecting } return true } -// shouldWriteSessionActivity is the mirror rule for ApplyActivitySignal: the -// activity axis owns working/idle/waiting. A valid activity signal is direct -// proof of life, so it is allowed to RESOLVE a detecting session (pull it out of -// the liveness quarantine) — but it must not resurrect a terminal session, and -// it leaves a liveness-escalated stuck state to the probe pipeline (stuck is a -// deliberate human-facing escalation, not a transient quarantine). -func shouldWriteSessionActivity(cur domain.CanonicalSessionLifecycle) bool { - if isTerminal(cur.Session.State) { - return false - } - if cur.Session.State == domain.SessionDetecting { - return true - } - return !isLivenessOwned(cur.Session) -} - -// ---- explicit-kill mapping (SM's terminal-write authority) ---- - -func killSession(k ports.LifecycleKillReason) domain.SessionSubstate { - switch k { - case ports.KillManual: - return domain.SessionSubstate{State: domain.SessionTerminated, Reason: domain.ReasonManuallyKilled} - case ports.KillCleanup: - return domain.SessionSubstate{State: domain.SessionTerminated, Reason: domain.ReasonAutoCleanup} - default: // error - return domain.SessionSubstate{State: domain.SessionTerminated, Reason: domain.ReasonErrorInProcess} - } -} - -func killRuntime(k ports.LifecycleKillReason) domain.RuntimeSubstate { - switch k { - case ports.KillManual: - return domain.RuntimeSubstate{State: domain.RuntimeExited, Reason: domain.RuntimeReasonManualKillRequested} - case ports.KillCleanup: - return domain.RuntimeSubstate{State: domain.RuntimeExited, Reason: domain.RuntimeReasonAutoCleanup} - default: // error - return domain.RuntimeSubstate{State: domain.RuntimeExited, Reason: domain.RuntimeReasonProbeError} - } -} - func nowOr(t time.Time) time.Time { if t.IsZero() { return time.Now() diff --git 
a/backend/internal/lifecycle/fakes_test.go b/backend/internal/lifecycle/fakes_test.go deleted file mode 100644 index 5bacb49a..00000000 --- a/backend/internal/lifecycle/fakes_test.go +++ /dev/null @@ -1,149 +0,0 @@ -package lifecycle - -import ( - "context" - "fmt" - "sync" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" -) - -// fakeStore is an in-memory LifecycleStore that faithfully applies full-row -// Upsert semantics so tests assert against the real persisted canonical. -type fakeStore struct { - mu sync.Mutex - records map[domain.SessionID]*domain.SessionRecord - metadata map[domain.SessionID]map[string]string -} - -var _ ports.LifecycleStore = (*fakeStore)(nil) - -func newFakeStore() *fakeStore { - return &fakeStore{ - records: map[domain.SessionID]*domain.SessionRecord{}, - metadata: map[domain.SessionID]map[string]string{}, - } -} - -// seed installs a starting lifecycle for a session id (bypassing the patch path). 
-func (s *fakeStore) seed(id domain.SessionID, l domain.CanonicalSessionLifecycle) { - s.mu.Lock() - defer s.mu.Unlock() - if l.Version == 0 { - l.Version = domain.LifecycleVersion - } - s.records[id] = &domain.SessionRecord{ID: id, Lifecycle: l} -} - -func (s *fakeStore) Load(_ context.Context, id domain.SessionID) (domain.CanonicalSessionLifecycle, bool, error) { - s.mu.Lock() - defer s.mu.Unlock() - rec, ok := s.records[id] - if !ok { - return domain.CanonicalSessionLifecycle{}, false, nil - } - return rec.Lifecycle, true, nil -} - -func (s *fakeStore) Upsert(_ context.Context, rec domain.SessionRecord, _ ports.EventType) error { - s.mu.Lock() - defer s.mu.Unlock() - if existing, ok := s.records[rec.ID]; ok { - if rec.Lifecycle.Revision != existing.Lifecycle.Revision { - return fmt.Errorf("revision mismatch for %s: have %d, want %d", rec.ID, rec.Lifecycle.Revision, existing.Lifecycle.Revision) - } - rec.Lifecycle.Revision = existing.Lifecycle.Revision + 1 - } else { - if rec.Lifecycle.Revision != 0 { - return fmt.Errorf("revision mismatch for insert %s: have %d, want 0", rec.ID, rec.Lifecycle.Revision) - } - rec.Lifecycle.Revision = 1 - } - if rec.Lifecycle.Version == 0 { - rec.Lifecycle.Version = domain.LifecycleVersion - } - r := rec - s.records[rec.ID] = &r - return nil -} - -func (s *fakeStore) Get(_ context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) { - s.mu.Lock() - defer s.mu.Unlock() - rec, ok := s.records[id] - if !ok { - return domain.SessionRecord{}, false, nil - } - return *rec, true, nil -} - -func (s *fakeStore) List(_ context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) { - s.mu.Lock() - defer s.mu.Unlock() - var out []domain.SessionRecord - for _, rec := range s.records { - if rec.ProjectID == project { - out = append(out, *rec) - } - } - return out, nil -} - -func (s *fakeStore) GetMetadata(_ context.Context, id domain.SessionID) (map[string]string, error) { - s.mu.Lock() - defer s.mu.Unlock() - out 
:= map[string]string{} - for k, v := range s.metadata[id] { - out[k] = v - } - return out, nil -} - -func (s *fakeStore) PatchMetadata(_ context.Context, id domain.SessionID, kv map[string]string) error { - s.mu.Lock() - defer s.mu.Unlock() - if s.metadata[id] == nil { - s.metadata[id] = map[string]string{} - } - for k, v := range kv { - s.metadata[id][k] = v - } - return nil -} - -// recordingNotifier captures emitted events for assertions. -type recordingNotifier struct { - mu sync.Mutex - events []ports.OrchestratorEvent -} - -var _ ports.Notifier = (*recordingNotifier)(nil) - -func (n *recordingNotifier) Notify(_ context.Context, e ports.OrchestratorEvent) error { - n.mu.Lock() - defer n.mu.Unlock() - n.events = append(n.events, e) - return nil -} - -// recordingMessenger captures messages injected into agents. -type recordingMessenger struct { - mu sync.Mutex - sent []struct { - ID domain.SessionID - Message string - } -} - -var _ ports.AgentMessenger = (*recordingMessenger)(nil) - -func (a *recordingMessenger) Send(_ context.Context, id domain.SessionID, message string) error { - a.mu.Lock() - defer a.mu.Unlock() - a.sent = append(a.sent, struct { - ID domain.SessionID - Message string - }{id, message}) - return nil -} diff --git a/backend/internal/lifecycle/manager.go b/backend/internal/lifecycle/manager.go index b5751e86..f61d38b4 100644 --- a/backend/internal/lifecycle/manager.go +++ b/backend/internal/lifecycle/manager.go @@ -1,13 +1,8 @@ // Package lifecycle implements ports.LifecycleManager: the synchronous -// observe->decide->persist reducer. Every Apply*/On* entrypoint runs the same -// pipeline under a per-session lock — load the full canonical record, run the -// matching pure decider, classify the resulting change, and persist the full -// row through the store. The store owns Revision++; the LCM never polls and -// never writes the display status (that is derived on read). 
-// -// After a transition is persisted, the Apply* paths fire the mapped reaction -// (the ACT layer: reaction table + escalation engine) via the react() chokepoint -// in reactions.go. +// observe -> decide -> persist reducer. Every Apply*/On* entrypoint loads the +// session, runs the pure decider, and persists the full row under a single write +// lock. The DB triggers emit the CDC; the engine never writes the change log. +// After a transition it fires the mapped reaction (see reactions.go). package lifecycle import ( @@ -21,441 +16,245 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) -// Metadata keys OnSpawnCompleted records for the spawned session's handles. -// -// MetaPrompt is the assembled launch prompt, persisted so a Restore that finds -// no captured agent session id can still fall back to a fresh launch with the -// same prompt rather than failing. -const ( - MetaBranch = "branch" - MetaWorkspacePath = "workspacePath" - MetaRuntimeHandleID = "runtimeHandleId" - MetaRuntimeName = "runtimeName" - MetaAgentSessionID = "agentSessionId" - MetaPrompt = "prompt" -) - -// Manager is the LCM. The Apply* pipeline persists a transition and then fires -// the mapped reaction via Notifier/AgentMessenger (see reactions.go). +// Manager is the lifecycle engine. mu serialises the load->decide->persist +// read-modify-write across sessions; reactions dispatch after the lock releases +// so a slow agent send never blocks the write path. type Manager struct { - store ports.LifecycleStore + store ports.SessionStore + pr ports.PRWriter notifier ports.Notifier messenger ports.AgentMessenger - recentActivityWindow time.Duration - locks keyedMutex + mu sync.Mutex + window time.Duration + clock func() time.Time - // trackers hold per-(session,reaction) escalation budgets (ACT policy, not - // canonical state). trackerMu guards them: react() touches them from the - // caller's goroutine, TickEscalations from the reaper's. 
clock is the time - // source for escalation stamping (overridable in tests). - trackers map[trackerKey]*reactionTracker - trackerMu sync.Mutex - clock func() time.Time - - // sessionLister returns every session known to persistence so RunningSessions - // can filter by runtime axis without coupling the LCM to a cross-project - // store API the Tom-store does not yet expose. The daemon (lane #10) injects - // the production lister via WithSessionLister; until then, the call returns - // no sessions so a reaper attached to an unwired Manager is a clean no-op - // rather than a panic. - sessionLister func(ctx context.Context) ([]domain.SessionRecord, error) + // in-memory ACT state (policy, not canonical truth — reset on restart). + react reactionState } var _ ports.LifecycleManager = (*Manager)(nil) -func New(store ports.LifecycleStore, notifier ports.Notifier, messenger ports.AgentMessenger) *Manager { +func New(store ports.SessionStore, pr ports.PRWriter, notifier ports.Notifier, messenger ports.AgentMessenger) *Manager { return &Manager{ - store: store, - notifier: notifier, - messenger: messenger, - recentActivityWindow: defaultRecentActivityWindow, - trackers: map[trackerKey]*reactionTracker{}, - clock: time.Now, - } -} - -// WithSessionLister injects the function the LCM uses to enumerate all -// persisted sessions for RunningSessions. The daemon wires this against the -// store at startup; it must be called BEFORE any reaper attached to this -// Manager starts running, since concurrent calls would race the bare-field -// read in RunningSessions. Calling it more than once replaces the previous -// lister. -func (m *Manager) WithSessionLister(fn func(ctx context.Context) ([]domain.SessionRecord, error)) { - m.sessionLister = fn -} - -// ---- per-session serialisation ---- - -// keyedMutex hands out one lock per session id so the load->decide->persist -// read-modify-write is serial within a session but parallel across sessions. 
-// -// Entries are reference-counted and evicted when the last holder releases, so -// the map stays bounded to sessions with in-flight operations rather than -// growing unbounded over the lifetime of a long-running daemon. -type keyedMutex struct { - mu sync.Mutex - locks map[domain.SessionID]*lockEntry -} - -type lockEntry struct { - mu sync.Mutex - refs int -} - -func (k *keyedMutex) lock(id domain.SessionID) func() { - k.mu.Lock() - if k.locks == nil { - k.locks = make(map[domain.SessionID]*lockEntry) - } - e, ok := k.locks[id] - if !ok { - e = &lockEntry{} - k.locks[id] = e - } - e.refs++ - k.mu.Unlock() - - e.mu.Lock() - return func() { - e.mu.Unlock() - k.mu.Lock() - e.refs-- - if e.refs == 0 { - delete(k.locks, id) - } - k.mu.Unlock() + store: store, + pr: pr, + notifier: notifier, + messenger: messenger, + window: defaultRecentActivityWindow, + clock: time.Now, + react: newReactionState(), } } -func (m *Manager) withLock(id domain.SessionID, fn func() error) error { - unlock := m.locks.lock(id) - defer unlock() - return fn() -} - -// transition is what a persisted write produced: the canonical before and after -// the full-row upsert. The ACT layer (react) derives the reaction from these. It -// is nil when the pipeline made no write. -// -// projectID is captured so reaction events fired downstream (Notifier.Notify in -// executeReaction and escalate) can populate OrchestratorEvent.ProjectID — the -// human-facing event router groups events by project. Empty when the record has -// no ProjectID (e.g. test-only seeded records that omit identity). -type transition struct { - beforeLC domain.CanonicalSessionLifecycle - afterLC domain.CanonicalSessionLifecycle - projectID domain.ProjectID -} - -// mutate runs the shared pipeline: load full row -> build next canonical -> -// Upsert full row (only if changed). 
decideFn returns the full next lifecycle -// and whether it changed anything; false is a clean no-op (no write), which is -// how failed-probe / unknown-fact inputs are dropped. -// -// On a write it returns the transition (before/after canonical) so the caller — -// which still holds the originating facts — can fire the mapped reaction. +// mutate runs the shared pipeline: load -> decideFn -> persist (only if changed). +// It returns whether a write happened. A stray observation for an unknown session +// is a clean no-op. func (m *Manager) mutate( ctx context.Context, id domain.SessionID, - decideFn func(cur domain.CanonicalSessionLifecycle, exists bool) (domain.CanonicalSessionLifecycle, bool, error), -) (*transition, error) { - var tr *transition - err := m.withLock(id, func() error { - rec, exists, err := m.store.Get(ctx, id) - if err != nil { - return err - } - cur := rec.Lifecycle - next, changed, err := decideFn(cur, exists) - if err != nil { - return err - } - if !changed { - return nil - } - rec.Lifecycle = m.prepareLifecycleWrite(next) - rec.UpdatedAt = m.clock() - if err := m.store.Upsert(ctx, rec, classifyEventType(cur, rec.Lifecycle, false)); err != nil { - return err - } - // ProjectID is captured straight from the record we already loaded at the - // top of this closure — identity is set once at OnSpawnInitiated and never - // mutated, so no second store roundtrip is needed for reaction events. 
- tr = &transition{beforeLC: cur, afterLC: rec.Lifecycle, projectID: rec.ProjectID} - return nil - }) - return tr, err -} - -func (m *Manager) prepareLifecycleWrite(next domain.CanonicalSessionLifecycle) domain.CanonicalSessionLifecycle { + fn func(cur domain.CanonicalSessionLifecycle) (domain.CanonicalSessionLifecycle, bool), +) (bool, error) { + m.mu.Lock() + defer m.mu.Unlock() + + rec, ok, err := m.store.GetSession(ctx, id) + if err != nil || !ok { + return false, err + } + next, changed := fn(rec.Lifecycle) + if !changed { + return false, nil + } next.Version = domain.LifecycleVersion - return next + rec.Lifecycle = next + rec.UpdatedAt = m.clock() + if err := m.store.UpdateSession(ctx, rec); err != nil { + return false, err + } + return true, nil } // ---- OBSERVE entrypoints ---- -// ApplyRuntimeObservation feeds the probe decider. Liveness always writes the -// runtime axis; the session axis follows the #1 composition rule; and a -// non-detecting verdict clears any stale detecting memory (#3) so the next -// probe doesn't read a phantom prior. +// ApplyRuntimeObservation feeds the probe decider. is_alive always tracks the +// verdict; the session state follows the runtime-write rule; a non-detecting +// verdict clears stale detecting memory. 
func (m *Manager) ApplyRuntimeObservation(ctx context.Context, id domain.SessionID, f ports.RuntimeFacts) error { - tr, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle, exists bool) (domain.CanonicalSessionLifecycle, bool, error) { - if !exists { - return cur, false, nil // nothing seeded; ignore stray probe - } - - d := decide.ResolveProbeDecision(runtimeFactsToProbeInput(f, cur, m.recentActivityWindow)) - + changed, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle) (domain.CanonicalSessionLifecycle, bool) { + d := decide.ResolveProbeDecision(probeInput(f, cur, m.window)) next := cur - changed := false - - if rt := runtimeSubstateFromFacts(f); cur.Runtime != rt { - next.Runtime = rt - changed = true + ch := false + if next.IsAlive != d.IsAlive { + next.IsAlive, ch = d.IsAlive, true } - // A terminal session is reopened only by an explicit Restore: an - // observation may refresh the runtime axis above but must touch neither - // the session axis nor the detecting memory. if !isTerminal(cur.Session.State) { - if shouldWriteSessionRuntime(d, cur) { - changed = setSessionIfChanged(&next, d.SessionState, d.SessionReason) || changed + if writeRuntimeSession(d, cur) { + ch = setSessionState(&next, d.SessionState, d.TerminationReason) || ch } - changed = setDetecting(&next, d.Detecting) || changed + ch = setDetecting(&next, d.Detecting) || ch } - - return next, changed, nil + return next, ch }) - if err != nil { + if err != nil || !changed { return err } - return m.react(ctx, id, tr, reactionContext{}) + return m.runReactions(ctx, id, reactionContent{}) } -// ApplySCMObservation maps PR facts onto the PR axis. A failed fetch is dropped -// (failed probe != "no PR"). An open or draft PR writes only the PR sub-state — -// the session axis stays owned by activity, and DeriveLegacyStatus surfaces the -// PR reason for display. A terminal PR (merged/closed) also parks the session. 
-func (m *Manager) ApplySCMObservation(ctx context.Context, id domain.SessionID, f ports.SCMFacts) error { - tr, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle, exists bool) (domain.CanonicalSessionLifecycle, bool, error) { - if !exists || !f.Fetched { - return cur, false, nil - } - - switch f.PRState { - case domain.PRDraft, domain.PROpen: - d := decide.ResolveOpenPRDecision(openPRInput(f)) - next := cur - changed := setPRIfChanged(&next, d, f) - return next, changed, nil - - case domain.PRMerged, domain.PRClosed: - d := decide.ResolveTerminalPRStateDecision(f.PRState) - next := cur - changed := setPRIfChanged(&next, d, f) - // A merge/close is a milestone that ends the work, so it parks the - // session axis (idle / merged_waiting_decision) even over an - // activity-owned needs_input/blocked — unlike the open-PR path, - // which leaves the session axis to activity. A terminal session is - // still never reopened. - if !isTerminal(cur.Session.State) { - changed = setSessionIfChanged(&next, d.SessionState, d.SessionReason) || changed - } - return next, changed, nil - - default: // none / unset: no PR-driven transition in split A - return cur, false, nil - } - }) - if err != nil { - return err - } - return m.react(ctx, id, tr, reactionContext{ciFailureLogTail: f.CIFailureLogTail}) -} - -// ApplyActivitySignal updates the activity axis. Only a valid-confidence signal -// is authoritative (stale/unavailable/probe_failure != idleness). It refreshes -// the persisted activity sub-state (the probe decider's RecentActivity input) -// and maps the classification onto the session axis. A valid signal is proof of -// life, so it may resolve a detecting session — clearing the quarantine memory -// so a later probe doesn't resume counting from a stale prior. +// ApplyActivitySignal updates the activity axis. 
Only a valid signal is +// authoritative, and it is proof of life: it may resolve a detecting session and +// move the session out of any non-terminal state. func (m *Manager) ApplyActivitySignal(ctx context.Context, id domain.SessionID, s ports.ActivitySignal) error { - tr, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle, exists bool) (domain.CanonicalSessionLifecycle, bool, error) { - if !exists || s.State != ports.SignalValid { - return cur, false, nil + if !s.Valid { + return nil + } + changed, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle) (domain.CanonicalSessionLifecycle, bool) { + if isTerminal(cur.Session.State) { + return cur, false } - next := cur - changed := false - - act := domain.ActivitySubstate{State: s.Activity, LastActivityAt: nowOr(s.Timestamp), Source: s.Source} + ch := false + act := domain.ActivitySubstate{State: s.State, LastActivityAt: nowOr(s.Timestamp), Source: s.Source} if !sameActivity(cur.Activity, act) { - next.Activity = act - changed = true + next.Activity, ch = act, true } - if st, rs, ok := activityToSession(s.Activity); ok && shouldWriteSessionActivity(cur) { - changed = setSessionIfChanged(&next, st, rs) || changed - // Proof of life that pulls the session out of detecting must also - // drop the quarantine memory (detecting memory only exists while - // detecting, so this is a no-op otherwise). 
- if cur.Detecting != nil { - next.Detecting = nil - changed = true + if st, ok := activityToSession(s.State); ok { + ch = setSessionState(&next, st, domain.TermNone) || ch + if next.Detecting != nil { + next.Detecting, ch = nil, true } } - - return next, changed, nil + if s.State != domain.ActivityExited && !next.IsAlive { + next.IsAlive, ch = true, true + } + return next, ch }) - if err != nil { + if err != nil || !changed { return err } - return m.react(ctx, id, tr, reactionContext{}) + return m.runReactions(ctx, id, reactionContent{}) } -// ---- mutation commands/outcomes reported by the Session Manager ---- +// ApplyPRObservation records the observed PR facts in the pr tables, terminates +// the session on a merge, and fires the PR-driven reactions. A failed fetch is +// dropped (failed probe != "PR closed"). +func (m *Manager) ApplyPRObservation(ctx context.Context, id domain.SessionID, o ports.PRObservation) error { + if !o.Fetched { + return nil + } + rec, ok, err := m.store.GetSession(ctx, id) + if err != nil || !ok { + return err + } + if err := m.writePR(ctx, id, o); err != nil { + return err + } -// OnSpawnInitiated seeds or reopens the full session record for a spawn-like -// mutation. It is the Session Manager's create/reopen command under the Writer -// contract: the SM builds the identity + initial canonical row, but only the LCM -// writes it. Fresh rows emit session_created; reopening a terminal row reuses -// the current row as the before-image and lets the classifier emit the schema -// event for the reopen. 
-func (m *Manager) OnSpawnInitiated(ctx context.Context, rec domain.SessionRecord) error { - return m.withLock(rec.ID, func() error { - cur := rec.Lifecycle - isInsert := true - if current, ok, err := m.store.Get(ctx, rec.ID); err != nil { - return err - } else if ok { - currentLC := current.Lifecycle - if !isTerminal(currentLC.Session.State) && !isTerminal(rec.Lifecycle.Session.State) { - return fmt.Errorf("lifecycle: OnSpawnInitiated for active session %q", rec.ID) + if o.Merged { + changed, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle) (domain.CanonicalSessionLifecycle, bool) { + if isTerminal(cur.Session.State) { + return cur, false } - cur = currentLC - isInsert = false - } - rec.Lifecycle = m.prepareLifecycleWrite(rec.Lifecycle) - if isInsert { - rec.Lifecycle.Revision = 0 - } else { - rec.Lifecycle.Revision = cur.Revision - } - now := m.clock() - if rec.CreatedAt.IsZero() { - rec.CreatedAt = now - } - rec.UpdatedAt = now - return m.store.Upsert(ctx, rec, classifyEventType(cur, rec.Lifecycle, isInsert)) - }) -} - -// OnSpawnCompleted records that a spawn finished: the runtime is up and the -// handles are known. Per the agreed rule it flips the runtime axis to alive and -// stores the handles in metadata, but leaves the session at not_started -// (display: spawning) — the agent "acknowledges" via the first activity signal. -func (m *Manager) OnSpawnCompleted(ctx context.Context, id domain.SessionID, o ports.SpawnOutcome) error { - return m.withLock(id, func() error { - rec, exists, err := m.store.Get(ctx, id) + next := cur + next.Session.State = domain.SessionTerminated + next.TerminationReason = domain.TermPRMerged + next.IsAlive = false + next.Detecting = nil + return next, true + }) if err != nil { return err } - if !exists { - // The SM seeds the initial lifecycle before spawning; a completion - // for an unseeded session is a contract violation, not a stray - // observation, so surface it rather than fabricating a record. 
- return fmt.Errorf("lifecycle: OnSpawnCompleted for unseeded session %q", id) - } - rt := domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning} - if rec.Lifecycle.Runtime != rt { - cur := rec.Lifecycle - next := cur - next.Runtime = rt - rec.Lifecycle = m.prepareLifecycleWrite(next) - rec.UpdatedAt = m.clock() - if err := m.store.Upsert(ctx, rec, classifyEventType(cur, rec.Lifecycle, false)); err != nil { - return err - } - } - if meta := spawnMetadata(o); len(meta) > 0 { - if err := m.store.PatchMetadata(ctx, id, meta); err != nil { - return err - } + if changed { + m.clearReactions(id) + return m.fireNotify(ctx, id, rec.ProjectID, reactions[rxMerged]) } return nil - }) + } + + return m.runReactions(ctx, id, prContent(o)) } -// OnKillRequested is the SM's explicit terminal-write authority (the one -// terminal path that does not go through the inferred-death decider). It writes -// the terminal session/runtime sub-states for the kill kind and clears any -// in-flight detecting memory. -func (m *Manager) OnKillRequested(ctx context.Context, id domain.SessionID, r ports.KillReason) error { - // An explicit user kill is a human action, not an inferred event, so it - // fires no reaction — the transition is discarded. - _, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle, exists bool) (domain.CanonicalSessionLifecycle, bool, error) { - if !exists { - // Killing an unknown/already-gone session is a benign race; no-op - // rather than fabricating a terminal record for a session we never - // knew about. - return cur, false, nil +// writePR persists the observation's scalar facts, check runs, and comment set +// in one atomic store call. PR-table CDC is emitted by the DB triggers. 
+func (m *Manager) writePR(ctx context.Context, id domain.SessionID, o ports.PRObservation) error { + now := m.clock() + row := ports.PRRow{ + URL: o.URL, SessionID: string(id), Number: o.Number, + Draft: o.Draft, Merged: o.Merged, Closed: o.Closed, + CI: o.CI, Review: o.Review, Mergeability: o.Mergeability, UpdatedAt: now, + } + checks := make([]ports.PRCheckRow, len(o.Checks)) + for i, c := range o.Checks { + c.PRURL = o.URL + if c.CreatedAt.IsZero() { + c.CreatedAt = now } + checks[i] = c + } + comments := make([]ports.PRComment, len(o.Comments)) + for i, c := range o.Comments { + if c.CreatedAt.IsZero() { + c.CreatedAt = now + } + comments[i] = c + } + return m.pr.WritePR(ctx, row, checks, comments) +} - next := cur - changed := false +// ---- mutation commands from the Session Manager ---- - if sess := killSession(r.Kind); cur.Session != sess { - next.Session = sess - changed = true - } - if rt := killRuntime(r.Kind); cur.Runtime != rt { - next.Runtime = rt - changed = true - } - if cur.Detecting != nil { - next.Detecting = nil - changed = true - } - return next, changed, nil - }) +// OnSpawnCompleted marks a session live and folds in its handles. It serves a +// fresh spawn (not_started -> live) and a restore (terminal -> reopened): both +// land at not_started + is_alive, with the agent acknowledging via first activity. +func (m *Manager) OnSpawnCompleted(ctx context.Context, id domain.SessionID, o ports.SpawnOutcome) error { + m.mu.Lock() + defer m.mu.Unlock() + rec, ok, err := m.store.GetSession(ctx, id) if err != nil { return err } - // A kill is terminal but bypasses react()'s incident-over cleanup (it fires - // no reaction). Drop any escalation trackers here so a later duration-based - // TickEscalations can't emit reaction.escalated for a dead session. 
- m.clearSessionTrackers(id) - return nil + if !ok { + return fmt.Errorf("lifecycle: OnSpawnCompleted for unknown session %q", id) + } + rec.Lifecycle.Version = domain.LifecycleVersion + rec.Lifecycle.Session.State = domain.SessionNotStarted + rec.Lifecycle.TerminationReason = domain.TermNone + rec.Lifecycle.IsAlive = true + rec.Lifecycle.Detecting = nil + rec.Metadata = mergeMetadata(rec.Metadata, spawnMetadata(o)) + rec.UpdatedAt = m.clock() + return m.store.UpdateSession(ctx, rec) } -// ---- read-snapshot helpers ---- +// OnKillRequested is the explicit terminal-write path (the one terminal that does +// not go through the inferred-death decider). It fires no reaction — an explicit +// kill is a human action — but drops the session's ACT state. +func (m *Manager) OnKillRequested(ctx context.Context, id domain.SessionID, reason domain.TerminationReason) error { + _, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle) (domain.CanonicalSessionLifecycle, bool) { + if isTerminal(cur.Session.State) { + return cur, false + } + if reason == domain.TermNone { + reason = domain.TermManuallyKilled + } + next := cur + next.Session.State = domain.SessionTerminated + next.TerminationReason = reason + next.IsAlive = false + next.Detecting = nil + return next, true + }) + m.clearReactions(id) + return err +} -// RunningSessions returns a snapshot of every persisted session worth probing -// in the next reaper tick. "Worth probing" is wider than "runtime axis alive": -// it includes sessions in the Detecting quarantine, because a fresh probe is -// the only fact that can recover them (back to working) or escalate them -// (terminal killed). Filtering to runtime-axis-alive would silently park every -// Detecting session — a single failed probe would never get a second chance -// and recovery via runtime probe would be unreachable. -// -// The predicate is "not a final session state". 
Terminal session states (done, -// terminated) are excluded because Restore is the only path back; observations -// must not reopen them (#1 invariant). Sessions in earlier states — not_started, -// working, idle, needs_input, stuck, detecting — are all included. Those that -// lack runtime handle metadata (e.g. not_started before OnSpawnCompleted) are -// returned and harmlessly skipped by the reaper's per-session handle guard. -// -// The call only reads and copies, so it does not break the single-writer -// invariant; concurrent Apply* calls may move sessions in or out of the probe -// set between snapshots, which is correct — the next tick re-reads. -// -// When no lister has been wired (e.g. tests construct a bare Manager), the -// method returns nil so a goroutine attached to such a Manager degrades to a -// no-op rather than panicking. +// RunningSessions snapshots every non-terminal session for the reaper to probe. +// Detecting sessions are included — a fresh probe is the only fact that recovers +// or escalates them. func (m *Manager) RunningSessions(ctx context.Context) ([]domain.SessionRecord, error) { - if m.sessionLister == nil { - return nil, nil - } - all, err := m.sessionLister(ctx) + all, err := m.store.ListAllSessions(ctx) if err != nil { return nil, err } @@ -468,37 +267,28 @@ func (m *Manager) RunningSessions(ctx context.Context) ([]domain.SessionRecord, return out, nil } -// ---- diff helpers ---- +// ---- diff + metadata helpers ---- -// setSessionIfChanged sets next.Session only when the decided sub-state differs -// from the current next value; an empty decided state means "decider does not -// address the session axis" and is left untouched. -func setSessionIfChanged(next *domain.CanonicalSessionLifecycle, st domain.SessionState, rs domain.SessionReason) bool { +// setSessionState sets the state (and, for a terminal state, the reason) when it +// differs. An empty state means "decider doesn't address the session axis". 
+func setSessionState(next *domain.CanonicalSessionLifecycle, st domain.SessionState, reason domain.TerminationReason) bool { if st == "" { return false } - want := domain.SessionSubstate{State: st, Reason: rs} - if next.Session == want { - return false + changed := false + if next.Session.State != st { + next.Session.State, changed = st, true } - next.Session = want - return true -} - -// setPRIfChanged folds the decided PR sub-state plus the fact-borne PR identity -// (number/url) into next when it differs from the current next value. -func setPRIfChanged(next *domain.CanonicalSessionLifecycle, d decide.LifecycleDecision, f ports.SCMFacts) bool { - want := domain.PRSubstate{State: d.PRState, Reason: d.PRReason, Number: f.PRNumber, URL: f.PRURL} - if next.PR == want { - return false + want := domain.TermNone + if st == domain.SessionTerminated { + want = reason } - next.PR = want - return true + if next.TerminationReason != want { + next.TerminationReason, changed = want, true + } + return changed } -// setDetecting implements the detecting semantics on the full canonical row: -// set/replace when the decision carries memory, clear (#3) when it doesn't but -// canonical still holds stale memory, else leave untouched. 
func setDetecting(next *domain.CanonicalSessionLifecycle, d *domain.DetectingState) bool { if d != nil { if next.Detecting != nil && *next.Detecting == *d { @@ -515,50 +305,37 @@ func setDetecting(next *domain.CanonicalSessionLifecycle, d *domain.DetectingSta return false } -func classifyEventType(before, after domain.CanonicalSessionLifecycle, isInsert bool) ports.EventType { - switch { - case isInsert: - return ports.EventSessionCreated - case before.Session.State != after.Session.State && after.Session.State == domain.SessionTerminated: - return ports.EventSessionTerminated - case before.Session != after.Session: - return ports.EventSessionStateChanged - case before.PR != after.PR: - return ports.EventSessionPRUpdated - case before.Runtime != after.Runtime: - return ports.EventSessionRuntimeUpdated - case before.Activity != after.Activity: - return ports.EventSessionActivityUpdated - default: - return ports.EventSessionUpdated - } -} - -// sameActivity compares activity sub-states with time-aware equality (== on -// time.Time is monotonic-clock sensitive and would spuriously report changes). +// sameActivity compares with time-aware equality (== on time.Time is +// monotonic-clock sensitive and would spuriously report changes). 
func sameActivity(a, b domain.ActivitySubstate) bool { return a.State == b.State && a.Source == b.Source && a.LastActivityAt.Equal(b.LastActivityAt) } -func spawnMetadata(o ports.SpawnOutcome) map[string]string { - meta := map[string]string{} - if o.Branch != "" { - meta[MetaBranch] = o.Branch +func spawnMetadata(o ports.SpawnOutcome) domain.SessionMetadata { + return domain.SessionMetadata{ + Branch: o.Branch, + WorkspacePath: o.WorkspacePath, + RuntimeHandleID: o.RuntimeHandle.ID, + RuntimeName: o.RuntimeHandle.RuntimeName, + AgentSessionID: o.AgentSessionID, + Prompt: o.Prompt, } - if o.WorkspacePath != "" { - meta[MetaWorkspacePath] = o.WorkspacePath - } - if o.RuntimeHandle.ID != "" { - meta[MetaRuntimeHandleID] = o.RuntimeHandle.ID - } - if o.RuntimeHandle.RuntimeName != "" { - meta[MetaRuntimeName] = o.RuntimeHandle.RuntimeName - } - if o.AgentSessionID != "" { - meta[MetaAgentSessionID] = o.AgentSessionID - } - if o.Prompt != "" { - meta[MetaPrompt] = o.Prompt +} + +// mergeMetadata overlays set fields of in onto base without clobbering an +// existing value with an empty one (a partial spawn write keeps the branch set +// at creation). 
+func mergeMetadata(base, in domain.SessionMetadata) domain.SessionMetadata { + set := func(dst *string, v string) { + if v != "" { + *dst = v + } } - return meta + set(&base.Branch, in.Branch) + set(&base.WorkspacePath, in.WorkspacePath) + set(&base.RuntimeHandleID, in.RuntimeHandleID) + set(&base.RuntimeName, in.RuntimeName) + set(&base.AgentSessionID, in.AgentSessionID) + set(&base.Prompt, in.Prompt) + return base } diff --git a/backend/internal/lifecycle/manager_test.go b/backend/internal/lifecycle/manager_test.go index 6a2cc1d1..4ae9aaaf 100644 --- a/backend/internal/lifecycle/manager_test.go +++ b/backend/internal/lifecycle/manager_test.go @@ -2,8 +2,8 @@ package lifecycle import ( "context" - "errors" - "sync" + "fmt" + "strings" "testing" "time" @@ -11,605 +11,355 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) -var t0 = time.Date(2026, 5, 26, 12, 0, 0, 0, time.UTC) +var ctx = context.Background() -const sid domain.SessionID = "s1" +// ---- fakes ---- -func newManager() (*Manager, *fakeStore) { - store := newFakeStore() - return New(store, &recordingNotifier{}, &recordingMessenger{}), store +// fakeStore is a mini SessionStore + PRWriter: it derives PRFacts and recent +// check statuses from what the engine writes, so PR-reaction tests exercise the +// write path and the read-back together. 
+type fakeStore struct { + sessions map[domain.SessionID]domain.SessionRecord + pr map[domain.SessionID]ports.PRRow + comments map[string][]ports.PRComment + checks []ports.PRCheckRow + num int } -func mustLoad(t *testing.T, store *fakeStore) domain.CanonicalSessionLifecycle { - t.Helper() - l, ok, err := store.Load(context.Background(), sid) - if err != nil || !ok { - t.Fatalf("load: ok=%v err=%v", ok, err) +func newFakeStore() *fakeStore { + return &fakeStore{ + sessions: map[domain.SessionID]domain.SessionRecord{}, + pr: map[domain.SessionID]ports.PRRow{}, + comments: map[string][]ports.PRComment{}, } - return l } -// ---- ApplyRuntimeObservation + #1 composition + #3 detecting clear ---- - -func TestApplyRuntimeObservation(t *testing.T) { - aliveProbe := ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeAlive, ProcessState: ports.ProcessProbeAlive, ObservedAt: t0} - failedProbe := ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeFailed, ProcessState: ports.ProcessProbeAlive, ObservedAt: t0} - deadProbe := ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeDead, ProcessState: ports.ProcessProbeDead, ObservedAt: t0} - - tests := []struct { - name string - seed domain.CanonicalSessionLifecycle - facts ports.RuntimeFacts - wantSession domain.SessionState - wantReason domain.SessionReason - wantRuntime domain.RuntimeState - wantDisplay domain.SessionStatus - wantDetecting bool // expect non-nil detecting memory persisted - }{ - { - name: "healthy probe must not clobber an activity-owned needs_input (#1)", - seed: lc(domain.SessionNeedsInput, domain.ReasonAwaitingUserInput, domain.RuntimeAlive), - facts: aliveProbe, - wantSession: domain.SessionNeedsInput, - wantReason: domain.ReasonAwaitingUserInput, - wantRuntime: domain.RuntimeAlive, - wantDisplay: domain.StatusNeedsInput, - wantDetecting: false, - }, - { - name: "healthy probe recovers a liveness-owned detecting -> working and clears memory (#1 + #3)", - seed: detectingLC(), - facts: aliveProbe, - wantSession: 
domain.SessionWorking, - wantReason: domain.ReasonTaskInProgress, - wantRuntime: domain.RuntimeAlive, - wantDisplay: domain.StatusWorking, - wantDetecting: false, - }, - { - name: "failed probe routes to detecting and records memory", - seed: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive), - facts: failedProbe, - wantSession: domain.SessionDetecting, - wantReason: domain.ReasonProbeFailure, - wantRuntime: domain.RuntimeProbeFailed, - wantDisplay: domain.StatusDetecting, - wantDetecting: true, - }, - { - name: "dead+dead with no recent activity concludes killed and clears detecting (#3)", - seed: detectingLC(), - facts: deadProbe, - wantSession: domain.SessionTerminated, - wantReason: domain.ReasonRuntimeLost, - wantRuntime: domain.RuntimeExited, - wantDisplay: domain.StatusKilled, - wantDetecting: false, - }, +func (f *fakeStore) CreateSession(_ context.Context, rec domain.SessionRecord) (domain.SessionRecord, error) { + f.num++ + rec.ID = domain.SessionID(fmt.Sprintf("%s-%d", rec.ProjectID, f.num)) + f.sessions[rec.ID] = rec + return rec, nil +} +func (f *fakeStore) UpdateSession(_ context.Context, rec domain.SessionRecord) error { + f.sessions[rec.ID] = rec + return nil +} +func (f *fakeStore) GetSession(_ context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) { + r, ok := f.sessions[id] + return r, ok, nil +} +func (f *fakeStore) ListSessions(_ context.Context, p domain.ProjectID) ([]domain.SessionRecord, error) { + var out []domain.SessionRecord + for _, r := range f.sessions { + if r.ProjectID == p { + out = append(out, r) + } } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - mgr, store := newManager() - store.seed(sid, tt.seed) - - if err := mgr.ApplyRuntimeObservation(context.Background(), sid, tt.facts); err != nil { - t.Fatalf("apply: %v", err) - } - - l := mustLoad(t, store) - if l.Session.State != tt.wantSession || l.Session.Reason != tt.wantReason { - t.Errorf("session = %v/%v, want 
%v/%v", l.Session.State, l.Session.Reason, tt.wantSession, tt.wantReason) - } - if l.Runtime.State != tt.wantRuntime { - t.Errorf("runtime = %v, want %v", l.Runtime.State, tt.wantRuntime) - } - if got := domain.DeriveLegacyStatus(l); got != tt.wantDisplay { - t.Errorf("display = %v, want %v", got, tt.wantDisplay) - } - if (l.Detecting != nil) != tt.wantDetecting { - t.Errorf("detecting present = %v, want %v (%+v)", l.Detecting != nil, tt.wantDetecting, l.Detecting) - } - }) + return out, nil +} +func (f *fakeStore) ListAllSessions(_ context.Context) ([]domain.SessionRecord, error) { + out := make([]domain.SessionRecord, 0, len(f.sessions)) + for _, r := range f.sessions { + out = append(out, r) } + return out, nil } - -func TestApplyRuntimeObservation_NoRecordIsNoOp(t *testing.T) { - mgr, store := newManager() - if err := mgr.ApplyRuntimeObservation(context.Background(), sid, ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeAlive, ProcessState: ports.ProcessProbeAlive, ObservedAt: t0}); err != nil { - t.Fatalf("apply: %v", err) +func (f *fakeStore) PRFactsForSession(_ context.Context, id domain.SessionID) (domain.PRFacts, error) { + r, ok := f.pr[id] + if !ok { + return domain.PRFacts{}, nil + } + facts := domain.PRFacts{ + URL: r.URL, Number: r.Number, Exists: true, + Draft: r.Draft, Merged: r.Merged, Closed: r.Closed, + CI: r.CI, Review: r.Review, Mergeability: r.Mergeability, + } + for _, c := range f.comments[r.URL] { + if !c.Resolved { + facts.ReviewComments = true + break + } } - if _, ok, _ := store.Load(context.Background(), sid); ok { - t.Error("a probe for an unseeded session must not fabricate a record") + return facts, nil +} +func (f *fakeStore) WritePR(_ context.Context, pr ports.PRRow, checks []ports.PRCheckRow, comments []ports.PRComment) error { + f.pr[domain.SessionID(pr.SessionID)] = pr + f.checks = append(f.checks, checks...) 
+ f.comments[pr.URL] = comments + return nil +} +func (f *fakeStore) RecentCheckStatuses(_ context.Context, url, name string, limit int) ([]string, error) { + var out []string + for i := len(f.checks) - 1; i >= 0 && len(out) < limit; i-- { + if f.checks[i].PRURL == url && f.checks[i].Name == name { + out = append(out, f.checks[i].Status) + } } + return out, nil } -func TestApplyRuntimeObservation_DoesNotResurrectTerminal(t *testing.T) { - mgr, store := newManager() - store.seed(sid, lc(domain.SessionTerminated, domain.ReasonManuallyKilled, domain.RuntimeExited)) +type fakeNotifier struct{ events []ports.Event } - // A failed probe would normally route to detecting, but a terminal session - // must not be reopened by an observation (only an explicit Restore does). - if err := mgr.ApplyRuntimeObservation(context.Background(), sid, ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeFailed, ProcessState: ports.ProcessProbeAlive, ObservedAt: t0}); err != nil { - t.Fatalf("apply: %v", err) +func (f *fakeNotifier) Notify(_ context.Context, e ports.Event) error { + f.events = append(f.events, e) + return nil +} +func (f *fakeNotifier) last() string { + if len(f.events) == 0 { + return "" } + return f.events[len(f.events)-1].Type +} - l := mustLoad(t, store) - if l.Session.State != domain.SessionTerminated || l.Session.Reason != domain.ReasonManuallyKilled { - t.Errorf("session = %v/%v, want terminated/manually_killed (no resurrection)", l.Session.State, l.Session.Reason) - } - if l.Detecting != nil { - t.Errorf("terminal session must not gain detecting memory, got %+v", l.Detecting) - } +type fakeMessenger struct{ msgs []string } + +func (f *fakeMessenger) Send(_ context.Context, _ domain.SessionID, m string) error { + f.msgs = append(f.msgs, m) + return nil } -// ---- ApplyActivitySignal ---- - -func TestApplyActivitySignal(t *testing.T) { - tests := []struct { - name string - seed domain.CanonicalSessionLifecycle - signal ports.ActivitySignal - wantSession 
domain.SessionState - wantReason domain.SessionReason - checkReason bool - wantActivity domain.ActivityState - wantChanged bool - }{ - { - name: "valid waiting_input maps to needs_input", - seed: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive), - signal: ports.ActivitySignal{State: ports.SignalValid, Activity: domain.ActivityWaitingInput, Timestamp: t0, Source: domain.SourceHook}, - wantSession: domain.SessionNeedsInput, - wantActivity: domain.ActivityWaitingInput, - wantChanged: true, - }, - { - name: "valid active recovers needs_input -> working", - seed: lc(domain.SessionNeedsInput, domain.ReasonAwaitingUserInput, domain.RuntimeAlive), - signal: ports.ActivitySignal{State: ports.SignalValid, Activity: domain.ActivityActive, Timestamp: t0, Source: domain.SourceHook}, - wantSession: domain.SessionWorking, - wantActivity: domain.ActivityActive, - wantChanged: true, - }, - { - name: "valid idle maps to idle with a neutral reason", - seed: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive), - signal: ports.ActivitySignal{State: ports.SignalValid, Activity: domain.ActivityIdle, Timestamp: t0, Source: domain.SourceHook}, - wantSession: domain.SessionIdle, - wantReason: "", - checkReason: true, - wantActivity: domain.ActivityIdle, - wantChanged: true, - }, - { - name: "low-confidence signal is dropped (no idleness inferred)", - seed: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive), - signal: ports.ActivitySignal{State: ports.SignalProbeFailure, Activity: domain.ActivityIdle, Timestamp: t0, Source: domain.SourceHook}, - wantSession: domain.SessionWorking, - wantChanged: false, - }, - { - name: "valid activity resolves a detecting session (proof of life)", - seed: detectingLC(), - signal: ports.ActivitySignal{State: ports.SignalValid, Activity: domain.ActivityActive, Timestamp: t0, Source: domain.SourceHook}, - wantSession: domain.SessionWorking, - wantActivity: domain.ActivityActive, - 
wantChanged: true, +func newManager() (*Manager, *fakeStore, *fakeNotifier, *fakeMessenger) { + st, n, msg := newFakeStore(), &fakeNotifier{}, &fakeMessenger{} + return New(st, st, n, msg), st, n, msg +} + +func working(id domain.SessionID) domain.SessionRecord { + return domain.SessionRecord{ + ID: id, ProjectID: "mer", + Lifecycle: domain.CanonicalSessionLifecycle{ + Version: domain.LifecycleVersion, + Session: domain.SessionSubstate{State: domain.SessionWorking}, + IsAlive: true, }, } +} - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - mgr, store := newManager() - store.seed(sid, tt.seed) - - if err := mgr.ApplyActivitySignal(context.Background(), sid, tt.signal); err != nil { - t.Fatalf("apply: %v", err) - } - - l := mustLoad(t, store) - if l.Session.State != tt.wantSession { - t.Errorf("session = %v, want %v", l.Session.State, tt.wantSession) - } - if tt.checkReason && l.Session.Reason != tt.wantReason { - t.Errorf("session reason = %q, want %q", l.Session.Reason, tt.wantReason) - } - if tt.wantChanged && l.Revision != 1 { - t.Errorf("revision = %d, want 1 (expected a write)", l.Revision) - } - if !tt.wantChanged && l.Revision != 0 { - t.Errorf("revision = %d, want 0 (expected a no-op)", l.Revision) - } - if tt.wantChanged && tt.wantActivity != "" && l.Activity.State != tt.wantActivity { - t.Errorf("activity = %v, want %v", l.Activity.State, tt.wantActivity) - } - if tt.name == "valid activity resolves a detecting session (proof of life)" && l.Detecting != nil { - t.Errorf("resolving detecting must clear the quarantine memory, got %+v", l.Detecting) - } - }) - } +func openPR(o ports.PRObservation) ports.PRObservation { + o.Fetched, o.URL, o.Number = true, "https://example/pr/1", 1 + return o } -// ---- ApplySCMObservation ---- +// ---- runtime observations ---- -func TestApplySCMObservation(t *testing.T) { - t.Run("failed fetch is a no-op (failed probe != no PR)", func(t *testing.T) { - mgr, store := newManager() - store.seed(sid, 
lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) - if err := mgr.ApplySCMObservation(context.Background(), sid, ports.SCMFacts{Fetched: false, PRState: domain.PROpen}); err != nil { - t.Fatalf("apply: %v", err) - } - if l := mustLoad(t, store); l.Revision != 0 || l.PR.State != "" { - t.Errorf("expected no-op, got revision=%d pr=%v", l.Revision, l.PR.State) - } - }) +func TestRuntimeObservation_InferredDeath(t *testing.T) { + m, st, n, _ := newManager() + st.sessions["mer-1"] = working("mer-1") - t.Run("open PR writes only the PR axis; session stays activity-owned", func(t *testing.T) { - mgr, store := newManager() - store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) - f := ports.SCMFacts{Fetched: true, PRState: domain.PROpen, CISummary: ports.CIFailing, PRNumber: 12, PRURL: "https://x/12"} - if err := mgr.ApplySCMObservation(context.Background(), sid, f); err != nil { - t.Fatalf("apply: %v", err) - } - l := mustLoad(t, store) - if l.PR.State != domain.PROpen || l.PR.Reason != domain.PRReasonCIFailing || l.PR.Number != 12 { - t.Errorf("pr = %+v, want open/ci_failing/#12", l.PR) - } - if l.Session.State != domain.SessionWorking { - t.Errorf("session = %v, want working (untouched)", l.Session.State) - } - if got := domain.DeriveLegacyStatus(l); got != domain.StatusCIFailed { - t.Errorf("display = %v, want ci_failed", got) - } - }) - - t.Run("draft PR writes draft or ci_failed without review states", func(t *testing.T) { - cases := []struct { - name string - facts ports.SCMFacts - wantReason domain.PRReason - wantStatus domain.SessionStatus - }{ - {"draft with failing CI", ports.SCMFacts{Fetched: true, PRState: domain.PRDraft, CISummary: ports.CIFailing}, domain.PRReasonCIFailing, domain.StatusCIFailed}, - {"draft via bool with open state", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, Draft: true}, domain.PRReasonInProgress, domain.StatusDraft}, - {"draft via bool with failing CI", 
ports.SCMFacts{Fetched: true, PRState: domain.PROpen, Draft: true, CISummary: ports.CIFailing}, domain.PRReasonCIFailing, domain.StatusCIFailed}, - {"draft ignores review and merge facts", ports.SCMFacts{Fetched: true, PRState: domain.PRDraft, ReviewDecision: ports.ReviewApproved, Mergeability: ports.Mergeability{Mergeable: true}}, domain.PRReasonInProgress, domain.StatusDraft}, - } - for _, c := range cases { - t.Run(c.name, func(t *testing.T) { - mgr, store := newManager() - wantSession := domain.SessionSubstate{State: domain.SessionWorking, Reason: domain.ReasonTaskInProgress} - store.seed(sid, lc(wantSession.State, wantSession.Reason, domain.RuntimeAlive)) - if err := mgr.ApplySCMObservation(context.Background(), sid, c.facts); err != nil { - t.Fatalf("apply: %v", err) - } - l := mustLoad(t, store) - if l.PR.State != domain.PRDraft || l.PR.Reason != c.wantReason { - t.Errorf("pr = %v/%v, want draft/%v", l.PR.State, l.PR.Reason, c.wantReason) - } - if l.Session != wantSession { - t.Errorf("session = %+v, want untouched %+v", l.Session, wantSession) - } - if got := domain.DeriveLegacyStatus(l); got != c.wantStatus { - t.Errorf("display = %v, want %v", got, c.wantStatus) - } - }) - } - }) - - t.Run("merged PR parks the session and displays merged", func(t *testing.T) { - mgr, store := newManager() - seed := lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive) - seed.PR = domain.PRSubstate{State: domain.PROpen, Reason: domain.PRReasonInProgress, Number: 12} - store.seed(sid, seed) - f := ports.SCMFacts{Fetched: true, PRState: domain.PRMerged, PRNumber: 12} - if err := mgr.ApplySCMObservation(context.Background(), sid, f); err != nil { - t.Fatalf("apply: %v", err) - } - l := mustLoad(t, store) - if l.PR.State != domain.PRMerged || l.Session.Reason != domain.ReasonMergedWaitingDecision { - t.Errorf("got pr=%v session=%v, want merged + merged_waiting_decision", l.PR.State, l.Session.Reason) - } - if got := domain.DeriveLegacyStatus(l); got != 
domain.StatusMerged { - t.Errorf("display = %v, want merged", got) - } - }) + if err := m.ApplyRuntimeObservation(ctx, "mer-1", ports.RuntimeFacts{Runtime: ports.ProbeDead, Process: ports.ProbeDead}); err != nil { + t.Fatal(err) + } + got := st.sessions["mer-1"].Lifecycle + if got.Session.State != domain.SessionTerminated || got.TerminationReason != domain.TermRuntimeLost || got.IsAlive { + t.Fatalf("want terminated/runtime_lost/dead, got %+v", got) + } + if n.last() != "reaction.agent-exited" { + t.Fatalf("want agent-exited notify, got %q", n.last()) + } +} - t.Run("open-PR review branches map to the PR axis", func(t *testing.T) { - cases := []struct { - name string - facts ports.SCMFacts - wantReason domain.PRReason - wantStatus domain.SessionStatus - }{ - {"changes requested", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewChangesRequested}, domain.PRReasonChangesRequested, domain.StatusChangesRequested}, - {"pending human comments", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, PendingComments: []ports.ReviewComment{{Author: "human", Body: "fix"}}}, domain.PRReasonChangesRequested, domain.StatusChangesRequested}, - {"pending bot comments", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, PendingComments: []ports.ReviewComment{{Author: "bot", Body: "fix", IsBot: true}}}, domain.PRReasonBotComments, domain.StatusChangesRequested}, - {"merge conflicts", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, Mergeability: ports.Mergeability{CIPassing: true, Approved: true, NoConflicts: false, Blockers: []string{"merge conflicts"}}}, domain.PRReasonMergeConflicts, domain.StatusPROpen}, - {"approved + mergeable", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewApproved, Mergeability: ports.Mergeability{Mergeable: true}}, domain.PRReasonMergeReady, domain.StatusMergeable}, - {"review pending", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewPending}, 
domain.PRReasonReviewPending, domain.StatusReviewPending}, - } - for _, c := range cases { - t.Run(c.name, func(t *testing.T) { - mgr, store := newManager() - wantSession := domain.SessionSubstate{State: domain.SessionWorking, Reason: domain.ReasonTaskInProgress} - store.seed(sid, lc(wantSession.State, wantSession.Reason, domain.RuntimeAlive)) - if err := mgr.ApplySCMObservation(context.Background(), sid, c.facts); err != nil { - t.Fatalf("apply: %v", err) - } - l := mustLoad(t, store) - if l.PR.State != domain.PROpen || l.PR.Reason != c.wantReason { - t.Errorf("pr = %v/%v, want open/%v", l.PR.State, l.PR.Reason, c.wantReason) - } - if got := domain.DeriveLegacyStatus(l); got != c.wantStatus { - t.Errorf("display = %v, want %v", got, c.wantStatus) - } - }) - } - }) +func TestRuntimeObservation_FailedProbeQuarantines(t *testing.T) { + m, st, _, _ := newManager() + st.sessions["mer-1"] = working("mer-1") - t.Run("no PR is a no-op in split A", func(t *testing.T) { - mgr, store := newManager() - store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) - if err := mgr.ApplySCMObservation(context.Background(), sid, ports.SCMFacts{Fetched: true, PRState: domain.PRNone}); err != nil { - t.Fatalf("apply: %v", err) - } - if l := mustLoad(t, store); l.Revision != 0 { - t.Errorf("expected no-op, got revision=%d", l.Revision) - } - }) + if err := m.ApplyRuntimeObservation(ctx, "mer-1", ports.RuntimeFacts{Runtime: ports.ProbeFailed, Process: ports.ProbeFailed}); err != nil { + t.Fatal(err) + } + got := st.sessions["mer-1"].Lifecycle + if got.Session.State != domain.SessionDetecting || !got.IsAlive || got.Detecting == nil { + t.Fatalf("failed probe should quarantine alive, got %+v", got) + } } -// ---- mutation outcomes ---- - -func TestOnSpawnCompleted(t *testing.T) { - mgr, store := newManager() - store.seed(sid, lc(domain.SessionNotStarted, domain.ReasonSpawnRequested, domain.RuntimeUnknown)) +func TestRuntimeObservation_RecoversDetecting(t 
*testing.T) { + m, st, _, _ := newManager() + rec := working("mer-1") + rec.Lifecycle.Session.State = domain.SessionDetecting + rec.Lifecycle.Detecting = &domain.DetectingState{Attempts: 1} + st.sessions["mer-1"] = rec - out := ports.SpawnOutcome{ - Branch: "feat/x", - WorkspacePath: "/w/x", - RuntimeHandle: ports.RuntimeHandle{ID: "tmux:1", RuntimeName: "tmux"}, - AgentSessionID: "agent-1", + if err := m.ApplyRuntimeObservation(ctx, "mer-1", ports.RuntimeFacts{Runtime: ports.ProbeAlive, Process: ports.ProbeAlive}); err != nil { + t.Fatal(err) } - if err := mgr.OnSpawnCompleted(context.Background(), sid, out); err != nil { - t.Fatalf("apply: %v", err) + got := st.sessions["mer-1"].Lifecycle + if got.Session.State != domain.SessionWorking || got.Detecting != nil { + t.Fatalf("healthy probe should recover to working, got %+v", got) } +} - l := mustLoad(t, store) - if l.Runtime.State != domain.RuntimeAlive { - t.Errorf("runtime = %v, want alive", l.Runtime.State) - } - if l.Session.State != domain.SessionNotStarted { - t.Errorf("session = %v, want not_started (spawn does not assert acknowledgement)", l.Session.State) +// ---- activity signals ---- + +func TestActivity_WaitingInputPagesHuman(t *testing.T) { + m, st, n, _ := newManager() + st.sessions["mer-1"] = working("mer-1") + + if err := m.ApplyActivitySignal(ctx, "mer-1", ports.ActivitySignal{Valid: true, State: domain.ActivityWaitingInput, Timestamp: time.Now()}); err != nil { + t.Fatal(err) } - if got := domain.DeriveLegacyStatus(l); got != domain.StatusSpawning { - t.Errorf("display = %v, want spawning", got) + if st.sessions["mer-1"].Lifecycle.Session.State != domain.SessionNeedsInput { + t.Fatalf("want needs_input, got %v", st.sessions["mer-1"].Lifecycle.Session.State) } - meta, _ := store.GetMetadata(context.Background(), sid) - if meta[MetaBranch] != "feat/x" || meta[MetaAgentSessionID] != "agent-1" || meta[MetaRuntimeName] != "tmux" { - t.Errorf("metadata not recorded: %+v", meta) + if n.last() != 
"reaction.agent-needs-input" { + t.Fatalf("want needs-input notify, got %q", n.last()) } } -func TestOnSpawnInitiated_ActiveSessionRejected(t *testing.T) { - mgr, store := newManager() - store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) +func TestActivity_InvalidIsIgnored(t *testing.T) { + m, st, _, _ := newManager() + st.sessions["mer-1"] = working("mer-1") + before := st.sessions["mer-1"] - err := mgr.OnSpawnInitiated(context.Background(), domain.SessionRecord{ - ID: sid, - ProjectID: domain.ProjectID("proj"), - Lifecycle: lc(domain.SessionNotStarted, domain.ReasonSpawnRequested, domain.RuntimeUnknown), - }) - if err == nil { - t.Fatal("OnSpawnInitiated should reject a non-terminal row on top of an active session") + if err := m.ApplyActivitySignal(ctx, "mer-1", ports.ActivitySignal{Valid: false, State: domain.ActivityIdle}); err != nil { + t.Fatal(err) } - - got := mustLoad(t, store) - if got.Session.State != domain.SessionWorking || got.Revision != 0 { - t.Fatalf("active row should be unchanged, got %+v", got) + if st.sessions["mer-1"] != before { + t.Fatal("invalid signal must not mutate the session") } } -func TestOnKillRequested(t *testing.T) { - tests := []struct { - name string - kind ports.LifecycleKillReason - wantReason domain.SessionReason - wantRuntime domain.RuntimeReason - wantDisplay domain.SessionStatus - }{ - {"manual", ports.KillManual, domain.ReasonManuallyKilled, domain.RuntimeReasonManualKillRequested, domain.StatusKilled}, - {"cleanup", ports.KillCleanup, domain.ReasonAutoCleanup, domain.RuntimeReasonAutoCleanup, domain.StatusCleanup}, - {"error", ports.KillError, domain.ReasonErrorInProcess, domain.RuntimeReasonProbeError, domain.StatusErrored}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - mgr, store := newManager() - store.seed(sid, detectingLC()) - - if err := mgr.OnKillRequested(context.Background(), sid, ports.KillReason{Kind: tt.kind, Detail: "x"}); err != nil { - 
t.Fatalf("apply: %v", err) - } - - l := mustLoad(t, store) - if l.Session.State != domain.SessionTerminated || l.Session.Reason != tt.wantReason { - t.Errorf("session = %v/%v, want terminated/%v", l.Session.State, l.Session.Reason, tt.wantReason) - } - if l.Runtime.Reason != tt.wantRuntime { - t.Errorf("runtime reason = %v, want %v", l.Runtime.Reason, tt.wantRuntime) - } - if l.Detecting != nil { - t.Errorf("kill must clear detecting memory, got %+v", l.Detecting) - } - if got := domain.DeriveLegacyStatus(l); got != tt.wantDisplay { - t.Errorf("display = %v, want %v", got, tt.wantDisplay) - } - }) - } -} +// ---- PR observations ---- + +func TestPR_CIFailingNudgesAgentWithLogs(t *testing.T) { + m, st, _, msg := newManager() + st.sessions["mer-1"] = working("mer-1") -func TestOnSpawnCompleted_UnseededErrors(t *testing.T) { - mgr, store := newManager() - err := mgr.OnSpawnCompleted(context.Background(), sid, ports.SpawnOutcome{Branch: "x"}) - if err == nil { - t.Error("OnSpawnCompleted for an unseeded session must error, not fabricate a record") + o := openPR(ports.PRObservation{CI: domain.CIFailing, Checks: []ports.PRCheckRow{{Name: "build", CommitHash: "c1", Status: "failed", LogTail: "boom"}}}) + if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { + t.Fatal(err) } - if _, ok, _ := store.Load(context.Background(), sid); ok { - t.Error("no record should have been created") + if len(msg.msgs) != 1 || !strings.Contains(msg.msgs[0], "boom") { + t.Fatalf("want one CI nudge with log tail, got %v", msg.msgs) } } -func TestOnKillRequested_UnseededIsNoOp(t *testing.T) { - mgr, store := newManager() - if err := mgr.OnKillRequested(context.Background(), sid, ports.KillReason{Kind: ports.KillManual}); err != nil { - t.Fatalf("kill of unknown session should be a benign no-op, got %v", err) +func TestPR_CIBrakeEscalatesAfterThreeFails(t *testing.T) { + m, st, n, msg := newManager() + st.sessions["mer-1"] = working("mer-1") + + for _, commit := range []string{"c1", 
"c2", "c3"} { + o := openPR(ports.PRObservation{CI: domain.CIFailing, Checks: []ports.PRCheckRow{{Name: "build", CommitHash: commit, Status: "failed", LogTail: "boom"}}}) + if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { + t.Fatal(err) + } + } + if len(msg.msgs) != 2 { + t.Fatalf("want 2 nudges then escalate, got %d nudges", len(msg.msgs)) } - if _, ok, _ := store.Load(context.Background(), sid); ok { - t.Error("killing an unknown session must not fabricate a terminal record") + if n.last() != "reaction.escalated" { + t.Fatalf("3rd failure should escalate, got %q", n.last()) } } -// ---- fake store contract ---- +func TestPR_ReviewCommentsInjectedRegardlessOfAuthor(t *testing.T) { + m, st, _, msg := newManager() + st.sessions["mer-1"] = working("mer-1") -func TestFakeStoreUpsertFullRow(t *testing.T) { - store := newFakeStore() - store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) - - rec, ok, err := store.Get(context.Background(), sid) - if err != nil || !ok { - t.Fatalf("seeded record missing: ok=%v err=%v", ok, err) + o := openPR(ports.PRObservation{ + Review: domain.ReviewChangesRequest, + Comments: []ports.PRComment{{ID: "1", Author: "greptileai", Body: "use a constant here"}}, + }) + if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { + t.Fatal(err) } - rec.Lifecycle.Session = domain.SessionSubstate{State: domain.SessionIdle, Reason: domain.ReasonResearchComplete} - rec.Lifecycle.Runtime = domain.RuntimeSubstate{State: domain.RuntimeExited} - if err := store.Upsert(context.Background(), rec, ports.EventSessionStateChanged); err != nil { - t.Fatalf("upsert: %v", err) + if len(msg.msgs) != 1 || !strings.Contains(msg.msgs[0], "use a constant here") { + t.Fatalf("review feedback should be injected verbatim, got %v", msg.msgs) } +} - got, _, _ := store.Get(context.Background(), sid) - if got.Lifecycle.Session.State != domain.SessionIdle || got.Lifecycle.Runtime.State != domain.RuntimeExited { - 
t.Fatalf("upsert should replace the full canonical row, got %+v", got.Lifecycle) +func TestPR_ApprovedAndGreenNotifies(t *testing.T) { + m, st, n, _ := newManager() + st.sessions["mer-1"] = working("mer-1") + + o := openPR(ports.PRObservation{Review: domain.ReviewApproved, Mergeability: domain.MergeMergeable}) + if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { + t.Fatal(err) } - if got.Lifecycle.Revision != 1 { - t.Fatalf("upsert should bump revision inside the store, got %d want 1", got.Lifecycle.Revision) + if n.last() != "reaction.approved-and-green" { + t.Fatalf("want approved-and-green, got %q", n.last()) } } -// ---- per-session serialisation under the race detector ---- - -func TestPerSessionSerialization(t *testing.T) { - mgr, store := newManager() - store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) - - const n = 50 - var wg sync.WaitGroup - wg.Add(n) - for i := 0; i < n; i++ { - go func(i int) { - defer wg.Done() - _ = mgr.ApplyActivitySignal(context.Background(), sid, ports.ActivitySignal{ - State: ports.SignalValid, - Activity: domain.ActivityActive, - Timestamp: t0.Add(time.Duration(i) * time.Second), - Source: domain.SourceHook, - }) - }(i) - } - wg.Wait() - - // Each goroutine writes a distinct LastActivityAt, so every call is a real - // change; with correct serialisation all n land without a lost update. 
- if l := mustLoad(t, store); l.Revision != n { - t.Errorf("revision = %d, want %d (lost update under concurrency)", l.Revision, n) +func TestPR_MergeTerminatesSession(t *testing.T) { + m, st, n, _ := newManager() + st.sessions["mer-1"] = working("mer-1") + + o := openPR(ports.PRObservation{Merged: true}) + if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { + t.Fatal(err) + } + got := st.sessions["mer-1"].Lifecycle + if got.Session.State != domain.SessionTerminated || got.TerminationReason != domain.TermPRMerged { + t.Fatalf("merge should terminate with pr_merged, got %+v", got) + } + if n.last() != "reaction.pr-merged" { + t.Fatalf("want pr-merged notify, got %q", n.last()) } } -// ---- RunningSessions (reaper poll-set) ---- +func TestPR_FailedFetchIsDropped(t *testing.T) { + m, st, _, msg := newManager() + st.sessions["mer-1"] = working("mer-1") -func TestRunningSessions_NoListerWired_ReturnsEmpty(t *testing.T) { - m, _ := newManager() - got, err := m.RunningSessions(context.Background()) - if err != nil { - t.Fatalf("RunningSessions: %v", err) + if err := m.ApplyPRObservation(ctx, "mer-1", ports.PRObservation{Fetched: false, CI: domain.CIFailing}); err != nil { + t.Fatal(err) } - if len(got) != 0 { - t.Fatalf("expected empty slice when no lister wired, got %d records", len(got)) + if len(msg.msgs) != 0 || len(st.pr) != 0 { + t.Fatal("a failed fetch must write nothing and fire nothing") } } -func TestRunningSessions_ListerErrorPropagates(t *testing.T) { - m, _ := newManager() - wantErr := errors.New("boom") - m.WithSessionLister(func(_ context.Context) ([]domain.SessionRecord, error) { - return nil, wantErr - }) - _, err := m.RunningSessions(context.Background()) - if !errors.Is(err, wantErr) { - t.Fatalf("expected lister error to propagate, got %v", err) +// ---- explicit kill ---- + +func TestKill_TerminatesWithoutReacting(t *testing.T) { + m, st, n, _ := newManager() + st.sessions["mer-1"] = working("mer-1") + + if err := m.OnKillRequested(ctx, 
"mer-1", domain.TermManuallyKilled); err != nil { + t.Fatal(err) + } + got := st.sessions["mer-1"].Lifecycle + if got.Session.State != domain.SessionTerminated || got.TerminationReason != domain.TermManuallyKilled || got.IsAlive { + t.Fatalf("want terminated/manually_killed/dead, got %+v", got) + } + if len(n.events) != 0 { + t.Fatal("an explicit kill must not fire a reaction") } } -// TestRunningSessions_FilterIncludesProbableExcludesTerminal locks in the -// reaper poll-set predicate. The bug we are guarding against is filtering to -// "runtime.State == RuntimeAlive": detecting sessions (RuntimeMissing / -// RuntimeProbeFailed) would be silently parked, breaking the probe-driven -// recovery path proved by manager_test.go:59 and the dead+dead -> killed path -// proved by manager_test.go:79. -func TestRunningSessions_FilterIncludesProbableExcludesTerminal(t *testing.T) { - m, _ := newManager() - records := []domain.SessionRecord{ - {ID: "working-alive", Lifecycle: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)}, - {ID: "detecting-probefailed", Lifecycle: lc(domain.SessionDetecting, domain.ReasonProbeFailure, domain.RuntimeProbeFailed)}, - {ID: "detecting-missing", Lifecycle: lc(domain.SessionDetecting, domain.ReasonRuntimeLost, domain.RuntimeMissing)}, - {ID: "idle-alive", Lifecycle: lc(domain.SessionIdle, domain.ReasonResearchComplete, domain.RuntimeAlive)}, - {ID: "needs-input-alive", Lifecycle: lc(domain.SessionNeedsInput, domain.ReasonAwaitingUserInput, domain.RuntimeAlive)}, - {ID: "not-started", Lifecycle: lc(domain.SessionNotStarted, domain.ReasonSpawnRequested, domain.RuntimeUnknown)}, - {ID: "terminated", Lifecycle: lc(domain.SessionTerminated, domain.ReasonManuallyKilled, domain.RuntimeExited)}, - {ID: "done", Lifecycle: lc(domain.SessionDone, domain.ReasonPRMerged, domain.RuntimeExited)}, - } - m.WithSessionLister(func(_ context.Context) ([]domain.SessionRecord, error) { - return records, nil - }) +// ---- duration 
escalation ---- - got, err := m.RunningSessions(context.Background()) - if err != nil { - t.Fatalf("RunningSessions: %v", err) +func TestTickEscalations_DurationPagesHuman(t *testing.T) { + m, st, n, msg := newManager() + now := time.Now() + m.clock = func() time.Time { return now } + st.sessions["mer-1"] = working("mer-1") + + o := openPR(ports.PRObservation{Mergeability: domain.MergeConflicting}) + if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { + t.Fatal(err) } - gotIDs := map[domain.SessionID]bool{} - for _, r := range got { - gotIDs[r.ID] = true + if len(msg.msgs) != 1 { + t.Fatalf("merge-conflict should nudge once, got %d", len(msg.msgs)) } - wantIncluded := []domain.SessionID{ - "working-alive", "detecting-probefailed", "detecting-missing", - "idle-alive", "needs-input-alive", "not-started", + if err := m.TickEscalations(ctx, now.Add(16*time.Minute)); err != nil { + t.Fatal(err) } - for _, id := range wantIncluded { - if !gotIDs[id] { - t.Errorf("expected %q in poll set, missing", id) - } - } - wantExcluded := []domain.SessionID{"terminated", "done"} - for _, id := range wantExcluded { - if gotIDs[id] { - t.Errorf("expected %q NOT in poll set, found", id) - } + if n.last() != "reaction.escalated" { + t.Fatalf("unaddressed conflict should escalate after 15m, got %q", n.last()) } } -// ---- helpers ---- +func TestRunningSessions_ExcludesTerminal(t *testing.T) { + m, st, _, _ := newManager() + st.sessions["mer-1"] = working("mer-1") + dead := working("mer-2") + dead.Lifecycle.Session.State = domain.SessionTerminated + st.sessions["mer-2"] = dead -func lc(state domain.SessionState, reason domain.SessionReason, rt domain.RuntimeState) domain.CanonicalSessionLifecycle { - return domain.CanonicalSessionLifecycle{ - Version: domain.LifecycleVersion, - Session: domain.SessionSubstate{State: state, Reason: reason}, - Runtime: domain.RuntimeSubstate{State: rt}, + got, err := m.RunningSessions(ctx) + if err != nil { + t.Fatal(err) + } + if len(got) != 1 || 
got[0].ID != "mer-1" { + t.Fatalf("want only the live session, got %+v", got) } -} - -func detectingLC() domain.CanonicalSessionLifecycle { - l := lc(domain.SessionDetecting, domain.ReasonRuntimeLost, domain.RuntimeMissing) - l.Detecting = &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: "abc"} - return l } diff --git a/backend/internal/lifecycle/reactions.go b/backend/internal/lifecycle/reactions.go index 26dea562..94f149f4 100644 --- a/backend/internal/lifecycle/reactions.go +++ b/backend/internal/lifecycle/reactions.go @@ -1,446 +1,397 @@ package lifecycle -// reactions.go is the ACT layer: the reaction table, the per-(session,reaction) -// escalation engine, and the duration-driven TickEscalations the synchronous -// LCM can't wake itself for. Reactions fire from react() after a transition is -// persisted by the Apply* pipeline (see manager.go). +// reactions.go is the ACT layer: after a persisted transition the engine maps +// the session's (state, PR facts) to at most one reaction and dispatches it — +// nudging the agent or paging the human. Two reactions inject live content (CI +// logs, review comments) and re-fire when that content changes; the rest fire +// once on entry, with duration escalation driven by TickEscalations. // -// Dispatch is synchronous: react() runs Send/Notify inline. It is the single -// dispatch chokepoint, so moving it onto a worker goroutine later (once a daemon -// owns that goroutine's lifecycle) is a change confined to this one function. +// Budgets are in-memory: a restart re-arms them, which costs a few extra nudges, +// never a missed page. import ( "context" "fmt" + "strings" + "sync" "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) -// reactionKey names a row in the reaction table and a tracker bucket. 
type reactionKey string const ( - reactionCIFailed reactionKey = "ci-failed" - reactionChangesRequested reactionKey = "changes-requested" - reactionBugbotComments reactionKey = "bugbot-comments" - reactionMergeConflicts reactionKey = "merge-conflicts" - reactionAgentIdle reactionKey = "agent-idle" - reactionApprovedAndGreen reactionKey = "approved-and-green" - reactionAgentStuck reactionKey = "agent-stuck" - reactionNeedsInput reactionKey = "agent-needs-input" - reactionAgentExited reactionKey = "agent-exited" - reactionPRClosed reactionKey = "pr-closed" - reactionAllComplete reactionKey = "all-complete" + rxCIFailed reactionKey = "ci-failed" + rxReviewComments reactionKey = "review-comments" + rxMergeConflicts reactionKey = "merge-conflicts" + rxIdle reactionKey = "agent-idle" + rxApprovedGreen reactionKey = "approved-and-green" + rxStuck reactionKey = "agent-stuck" + rxNeedsInput reactionKey = "agent-needs-input" + rxExited reactionKey = "agent-exited" + rxPRClosed reactionKey = "pr-closed" + rxMerged reactionKey = "pr-merged" ) -type actionKind string - +// Brakes: stop auto-handling and page a human after this many failed attempts. const ( - actionSendToAgent actionKind = "send-to-agent" - actionNotify actionKind = "notify" - actionAutoMerge actionKind = "auto-merge" + ciBrakeRuns = 3 // last N runs of a failing check all failed + reviewMaxNudge = 3 // re-nudged the agent N times over new review feedback ) -// reactionConfig is one row of the reaction table (distillation §4.1/§4.2). -// -// - retries numeric escalation cap: escalate once attempts exceed it. -// - escalateAfter duration escalation: escalate once this elapses since the -// first attempt (fired by TickEscalations, since the LCM never polls). -// - persistent the tracker survives the status leaving the triggering -// state; it only resets when the incident is truly over (PR no longer open -// or the session terminal). 
Only ci-failed is persistent, so a flapping -// CI (fail→pending→fail) keeps draining one shared retry budget. +// reactionConfig is one row of the reaction table. toAgent reactions nudge the +// agent; the rest notify the human. escalateAfter (when set) drives a +// duration-based escalation via TickEscalations. type reactionConfig struct { - action actionKind + toAgent bool message string - priority ports.EventPriority eventType string - retries int + priority ports.Priority escalateAfter time.Duration - persistent bool } -// defaultReactions is the product's default behaviour (distillation §4.2). -// auto-merge is intentionally absent: approved-and-green is a notify, so the -// human decides to merge. The auto-merge action kind exists for opt-in configs, -// but no default row uses it. -var defaultReactions = map[reactionKey]reactionConfig{ - reactionCIFailed: { - action: actionSendToAgent, persistent: true, retries: 2, - message: "CI is failing on your PR. Review the failing output below and push a fix.", - eventType: "reaction.ci-failed", priority: ports.PriorityAction, - }, - reactionChangesRequested: { - action: actionSendToAgent, escalateAfter: 30 * time.Minute, - message: "A reviewer requested changes on your PR. Address the comments and push.", - eventType: "reaction.changes-requested", priority: ports.PriorityAction, - }, - reactionBugbotComments: { - action: actionSendToAgent, escalateAfter: 30 * time.Minute, - message: "An automated reviewer left comments on your PR. Address them and push.", - eventType: "reaction.bugbot-comments", priority: ports.PriorityAction, - }, - reactionMergeConflicts: { - action: actionSendToAgent, escalateAfter: 15 * time.Minute, - message: "Your PR has merge conflicts. Rebase onto the base branch and resolve them.", - eventType: "reaction.merge-conflicts", priority: ports.PriorityAction, - }, - reactionAgentIdle: { - action: actionSendToAgent, retries: 2, escalateAfter: 15 * time.Minute, - message: "You appear idle. 
Continue the task or explain what is blocking you.", - eventType: "reaction.agent-idle", priority: ports.PriorityWarning, - }, - reactionApprovedAndGreen: { - // notify-only: a green, approved PR is the human-decision path — the human - // decides to merge (no auto-merge by default). - action: actionNotify, priority: ports.PriorityAction, - message: "PR is approved and green — ready to merge.", - eventType: "reaction.approved-and-green", - }, - reactionAgentStuck: { - // §4.2 lists a threshold: 10m here; it is intentionally not gated — entry - // into stuck is already debounced upstream by the detecting->stuck - // quarantine (DETECTING_MAX_ATTEMPTS/DURATION), so a second timer would be - // redundant. - action: actionNotify, priority: ports.PriorityUrgent, - message: "Agent is stuck and needs attention.", - eventType: "reaction.agent-stuck", - }, - reactionNeedsInput: { - action: actionNotify, priority: ports.PriorityUrgent, - message: "Agent needs input to continue.", - eventType: "reaction.agent-needs-input", - }, - reactionAgentExited: { - action: actionNotify, priority: ports.PriorityUrgent, - message: "Agent process exited unexpectedly.", - eventType: "reaction.agent-exited", - }, - reactionPRClosed: { - action: actionNotify, priority: ports.PriorityAction, - message: "PR was closed without merging — decide: resume, learn, or terminate.", - eventType: "reaction.pr-closed", - }, - reactionAllComplete: { - action: actionNotify, priority: ports.PriorityInfo, - message: "PR merged — work complete.", - eventType: "reaction.all-complete", - }, +var reactions = map[reactionKey]reactionConfig{ + rxCIFailed: {toAgent: true, eventType: "reaction.ci-failed", priority: ports.PriorityAction, message: "CI is failing on your PR. Review the output below and push a fix."}, + rxReviewComments: {toAgent: true, eventType: "reaction.review-comments", priority: ports.PriorityAction, message: "A reviewer left feedback on your PR. 
Address it and push."}, + rxMergeConflicts: {toAgent: true, eventType: "reaction.merge-conflicts", priority: ports.PriorityAction, escalateAfter: 15 * time.Minute, message: "Your PR has merge conflicts. Rebase onto the base branch and resolve them."}, + rxIdle: {toAgent: true, eventType: "reaction.agent-idle", priority: ports.PriorityInfo, escalateAfter: 15 * time.Minute, message: "You appear idle. Continue the task or say what is blocking you."}, + rxApprovedGreen: {eventType: "reaction.approved-and-green", priority: ports.PriorityAction, message: "PR is approved and green — ready to merge."}, + rxStuck: {eventType: "reaction.agent-stuck", priority: ports.PriorityUrgent, message: "Agent is stuck and needs attention."}, + rxNeedsInput: {eventType: "reaction.agent-needs-input", priority: ports.PriorityUrgent, message: "Agent needs input to continue."}, + rxExited: {eventType: "reaction.agent-exited", priority: ports.PriorityUrgent, message: "Agent process exited unexpectedly."}, + rxPRClosed: {eventType: "reaction.pr-closed", priority: ports.PriorityAction, message: "PR was closed without merging."}, + rxMerged: {eventType: "reaction.pr-merged", priority: ports.PriorityInfo, message: "PR merged — work complete."}, } -// reactionEventFor maps a canonical record to the reaction it should drive, -// mirroring DeriveLegacyStatus but for the ACT layer. ok is false when the -// current state has no reaction. -// -// A closed PR derives to the idle display status, so it is detected from the PR -// axis directly before falling through to the status mapping. Bot review -// comments and merge conflicts are represented as PR reasons so the ACT layer -// can distinguish them from human-requested changes and plain open PRs. 
-func reactionEventFor(l domain.CanonicalSessionLifecycle) (reactionKey, bool) { - if l.PR.State == domain.PRClosed { - return reactionPRClosed, true - } - if isActivePRState(l.PR.State) { - switch l.PR.Reason { - case domain.PRReasonBotComments: - return reactionBugbotComments, true - case domain.PRReasonMergeConflicts: - return reactionMergeConflicts, true +// reactionContent carries the live material the feedback reactions inject. Empty +// for runtime/activity transitions; populated from a PR observation. +type reactionContent struct { + ciCheck string + ciCommit string + ciURL string + ciLogTail string + comments []string + reviewSig string +} + +// prContent extracts the CI failure + review feedback from a PR observation. +func prContent(o ports.PRObservation) reactionContent { + c := reactionContent{} + for _, ch := range o.Checks { + if ch.Status == "failed" { + c.ciCheck, c.ciCommit, c.ciLogTail, c.ciURL = ch.Name, ch.CommitHash, ch.LogTail, o.URL + break } } - switch domain.DeriveLegacyStatus(l) { - case domain.StatusCIFailed: - return reactionCIFailed, true - case domain.StatusChangesRequested: - return reactionChangesRequested, true - case domain.StatusApproved, domain.StatusMergeable: - return reactionApprovedAndGreen, true - case domain.StatusIdle: - return reactionAgentIdle, true - case domain.StatusStuck: - return reactionAgentStuck, true - case domain.StatusNeedsInput: - return reactionNeedsInput, true - case domain.StatusKilled: - // Inferred death only — an explicit user kill goes through - // OnKillRequested, which does not react. - return reactionAgentExited, true - case domain.StatusMerged: - return reactionAllComplete, true + var ids []string + for _, cm := range o.Comments { + if cm.Resolved { + continue + } + c.comments = append(c.comments, cm.Body) + ids = append(ids, cm.ID) } - return "", false + c.reviewSig = strings.Join(ids, ",") + return c } -// reactionContext carries fact-derived material the message templates need. 
The -// SCM path populates it (CI failure log tail); other paths pass the zero value. -type reactionContext struct { - ciFailureLogTail *string -} +// ---- in-memory escalation state ---- -// trackerKey buckets an escalation tracker by session and reaction. type trackerKey struct { id domain.SessionID key reactionKey } -// reactionTracker is the per-(session,reaction) escalation budget. It lives in -// memory on the Manager: a daemon restart resets budgets, which only ever costs -// a few extra agent retries before re-escalating — never a missed human -// notification. Keeping it out of the canonical store preserves the -// truth-vs-policy split (the store holds session truth; this is ACT policy). -// -// projectID is captured at first attempt so TickEscalations — which fires from -// the reaper and has no transition on hand — can still populate ProjectID on -// the escalation event. It is set once and never overwritten; reaction-bearing -// transitions for a given session id always carry the same projectID. -type reactionTracker struct { - attempts int - escalated bool - firstAttemptAt time.Time - projectID domain.ProjectID +type tracker struct { + attempts int + firstAt time.Time + escalated bool + seenSig bool + lastSig string + projectID domain.ProjectID } -// react fires the ACT layer after a persisted transition: clear the tracker for -// the reaction we left, then dispatch the reaction for the one we entered. It -// fires only on a genuine reaction change, so re-persisting the same state does -// not re-dispatch. Synchronous by design (see file header). -// -// Integration-time caveat: react runs AFTER withLock releases (deliberately, so -// a busy-waiting send-to-agent never holds the per-session mutex). Under a live -// daemon with concurrent observers (SCM poller + reaper + activity ingest) the -// afterLC snapshot can be stale by dispatch time — e.g. a ci-failed send firing -// after the session already moved to approved. 
Tests are single-threaded so it -// is not observable yet; when the daemon lands, give react a per-session -// ordering (a small react queue) or re-check the triggering state before -// dispatching. -func (m *Manager) react(ctx context.Context, id domain.SessionID, tr *transition, rc reactionContext) error { - if tr == nil { - return nil - } - beforeKey, hadBefore := reactionEventFor(tr.beforeLC) - afterKey, hasAfter := reactionEventFor(tr.afterLC) - - changed := beforeKey != afterKey - - switch { - case incidentOver(tr.afterLC) || recovered(tr.afterLC): - // The PR-pipeline incident has ended — the PR resolved (merged/closed), - // the session went terminal, or it reached an approved/green state. Every - // tracker for this session is now stale, including a persistent ci-failed - // one. This is keyed on the state REACHED, not the one left: the recovery - // transition is typically review_pending->approved (beforeKey empty), so - // clearing only beforeKey would leak the ci-failed tracker and leave its - // escalated=true to silence a future regression. Clear them all. - m.clearSessionTrackers(id) - case hadBefore && (!hasAfter || changed): - // Within an unresolved open PR: a normal tracker resets when its state is - // left. A persistent one (ci-failed) is NOT cleared here — it must survive - // the ambiguous review_pending limbo (the fail->pending->fail flap, §4.2); - // it only resets via the recovery/incident-over branch above. 
- if !defaultReactions[beforeKey].persistent { - m.clearTracker(id, beforeKey) - } - } +type reactionState struct { + mu sync.Mutex + trackers map[trackerKey]*tracker + lastKey map[domain.SessionID]reactionKey +} - if hasAfter && (!hadBefore || changed) { - return m.executeReaction(ctx, id, tr.projectID, afterKey, rc) - } - return nil +func newReactionState() reactionState { + return reactionState{trackers: map[trackerKey]*tracker{}, lastKey: map[domain.SessionID]reactionKey{}} } -// incidentOver reports that a PR-pipeline incident has truly ended (PR no longer -// active, or the session terminal), so all trackers for the session may reset. -func incidentOver(l domain.CanonicalSessionLifecycle) bool { - return !isActivePRState(l.PR.State) || isTerminal(l.Session.State) +// trackerFor returns the (id,key) tracker, creating it on first use. Caller holds mu. +func (rs *reactionState) trackerFor(id domain.SessionID, key reactionKey) *tracker { + k := trackerKey{id, key} + t := rs.trackers[k] + if t == nil { + t = &tracker{} + rs.trackers[k] = t + } + return t } -func isActivePRState(s domain.PRState) bool { - return s == domain.PROpen || s == domain.PRDraft +func (m *Manager) clearReactions(id domain.SessionID) { + m.react.mu.Lock() + defer m.react.mu.Unlock() + for k := range m.react.trackers { + if k.id == id { + delete(m.react.trackers, k) + } + } + delete(m.react.lastKey, id) } -// recovered reports a genuinely-green open PR: an approved/mergeable state, which -// unambiguously means CI is no longer failing (the open-PR ladder ranks ci_failing -// above approved, so an approved display cannot coexist with failing CI). Unlike -// the ambiguous review_pending state — which may just be CI re-running — reaching -// this ends a ci-failed incident and re-arms its budget. Draft PRs are active, -// but not recoverable via review/merge state. 
-func recovered(l domain.CanonicalSessionLifecycle) bool { - if !isActivePRState(l.PR.State) || l.PR.State == domain.PRDraft { - return false +// ---- dispatch ---- + +// runReactions is the chokepoint called after every persisted transition. It +// runs unlocked (the write lock is already released) so a busy agent send never +// blocks the write path. +func (m *Manager) runReactions(ctx context.Context, id domain.SessionID, content reactionContent) error { + rec, ok, err := m.store.GetSession(ctx, id) + if err != nil || !ok { + return err } - switch l.PR.Reason { - case domain.PRReasonApproved, domain.PRReasonMergeReady: - return true - default: - return false + lc := rec.Lifecycle + project := rec.ProjectID + + if isTerminal(lc.Session.State) { + err := m.dispatch(ctx, id, project, terminalReaction(lc.TerminationReason)) + m.clearReactions(id) // incident over: drop budgets after the final notify + return err } -} -func (m *Manager) executeReaction(ctx context.Context, id domain.SessionID, projectID domain.ProjectID, key reactionKey, rc reactionContext) error { - cfg := defaultReactions[key] - switch cfg.action { - case actionNotify: - // notify reactions are human-attention terminals: fire once on the - // triggering transition, no retry/escalation budget. - return m.notifier.Notify(ctx, ports.OrchestratorEvent{ - Type: cfg.eventType, - Priority: cfg.priority, - SessionID: id, - ProjectID: projectID, - Message: cfg.message, - }) - case actionAutoMerge: - // Off by default: no default row maps here, and wiring a merge port is a - // later PR. An opt-in config could route a reaction here. - return nil - case actionSendToAgent: - return m.sendToAgent(ctx, id, projectID, key, cfg, rc) + pr, err := m.store.PRFactsForSession(ctx, id) + if err != nil { + return err } - return nil + + // Feedback reactions inject live content and re-fire as it changes — only + // while the agent can actually act on it. 
+ if pr.Exists && !pr.Closed && !needsHuman(lc.Session.State) { + if pr.CI == domain.CIFailing && content.ciCheck != "" { + if err := m.handleCIFailure(ctx, id, project, content); err != nil { + return err + } + } + if hasReviewFeedback(pr) { + if err := m.handleReviewFeedback(ctx, id, project, content); err != nil { + return err + } + } + } + + return m.dispatch(ctx, id, project, reactionFor(lc, pr)) } -// sendToAgent runs the escalation engine for an auto send-to-agent reaction: -// count the attempt, escalate when the numeric cap or duration is exceeded -// (silencing further auto-dispatch), else inject the message via the messenger. -func (m *Manager) sendToAgent(ctx context.Context, id domain.SessionID, projectID domain.ProjectID, key reactionKey, cfg reactionConfig, rc reactionContext) error { - m.trackerMu.Lock() - tk := m.trackerFor(id, key) - // Capture projectID once so the duration-based TickEscalations path — which - // has no transition on hand — can still populate ProjectID on the escalation - // event. A non-empty incoming projectID always wins, in case the tracker was - // first created from an observation that lacked one. - if projectID != "" { - tk.projectID = projectID +// dispatch fires the entry reaction for key, deduped so a steady state does not +// re-fire. Leaving a reaction drops its budget. 
+func (m *Manager) dispatch(ctx context.Context, id domain.SessionID, project domain.ProjectID, key reactionKey) error { + m.react.mu.Lock() + if m.react.lastKey[id] == key { + m.react.mu.Unlock() + return nil } - if tk.escalated { - m.trackerMu.Unlock() - return nil // silenced until the condition clears the tracker + if prev := m.react.lastKey[id]; prev != "" { + delete(m.react.trackers, trackerKey{id, prev}) } - now := m.clock() - freshFirst := tk.firstAttemptAt.IsZero() - if freshFirst { - tk.firstAttemptAt = now + m.react.lastKey[id] = key + m.react.mu.Unlock() + + if key == "" { + return nil } - tk.attempts++ - if shouldEscalate(tk, cfg, now) { - tk.escalated = true - m.trackerMu.Unlock() - return m.escalate(ctx, id, tk.projectID, key) + cfg := reactions[key] + if cfg.toAgent { + return m.fireAgentEntry(ctx, id, project, key, cfg) } - m.trackerMu.Unlock() - - if err := m.messenger.Send(ctx, id, composeMessage(cfg, rc)); err != nil { - // A delivery failure must not consume escalation budget: roll this - // attempt back so the next relevant transition retries from the same - // point rather than marching toward escalation on undelivered messages - // (distillation §4.3). - m.trackerMu.Lock() - tk.attempts-- - if freshFirst { - tk.firstAttemptAt = time.Time{} + return m.fireNotify(ctx, id, project, cfg) +} + +// reactionFor maps (session state, PR facts) to the reaction to enter. CI failure +// and review feedback return "" here — they are handled by the feedback path. 
+func reactionFor(lc domain.CanonicalSessionLifecycle, pr domain.PRFacts) reactionKey { + switch lc.Session.State { + case domain.SessionStuck: + return rxStuck + case domain.SessionNeedsInput: + return rxNeedsInput + } + if pr.Exists { + if pr.Closed { + if !pr.Merged { + return rxPRClosed + } + return "" + } + switch { + case pr.CI == domain.CIFailing, hasReviewFeedback(pr): + return "" // feedback path + case pr.Mergeability == domain.MergeConflicting: + return rxMergeConflicts + case pr.Mergeability == domain.MergeMergeable, pr.Review == domain.ReviewApproved: + return rxApprovedGreen } - m.trackerMu.Unlock() - return err } - return nil + if lc.Session.State == domain.SessionIdle { + return rxIdle + } + return "" } -// shouldEscalate uses inclusive boundaries: escalate once the numeric cap is -// exceeded or once exactly escalateAfter has elapsed (don't wait for the next -// tick to cross a strict threshold). -func shouldEscalate(tk *reactionTracker, cfg reactionConfig, now time.Time) bool { - if cfg.retries > 0 && tk.attempts > cfg.retries { - return true - } - if cfg.escalateAfter > 0 && !tk.firstAttemptAt.IsZero() && now.Sub(tk.firstAttemptAt) >= cfg.escalateAfter { - return true +func hasReviewFeedback(pr domain.PRFacts) bool { + return pr.Review == domain.ReviewChangesRequest || pr.ReviewComments +} + +func needsHuman(s domain.SessionState) bool { + return s == domain.SessionStuck || s == domain.SessionNeedsInput +} + +// terminalReaction is the notify fired when a session reaches a terminal state by +// inferred death. An explicit kill goes through OnKillRequested (no reaction); +// auto_cleanup / pr_merged are notified elsewhere. +func terminalReaction(r domain.TerminationReason) reactionKey { + switch r { + case domain.TermRuntimeLost, domain.TermAgentProcessExited, domain.TermProbeFailure, domain.TermErrorInProcess: + return rxExited + default: + return "" } - return false } -// escalate emits reaction.escalated and notifies the human. 
The caller has -// already set tracker.escalated under the lock, which silences further -// auto-dispatch for this reaction until the tracker clears. -func (m *Manager) escalate(ctx context.Context, id domain.SessionID, projectID domain.ProjectID, key reactionKey) error { - return m.notifier.Notify(ctx, ports.OrchestratorEvent{ - Type: "reaction.escalated", - Priority: ports.PriorityUrgent, - SessionID: id, - ProjectID: projectID, - Message: fmt.Sprintf("auto-handling of %q is exhausted and needs a human.", key), - Data: map[string]any{"reaction": string(key)}, +// ---- feedback reactions (content-driven re-fire + brake) ---- + +func (m *Manager) handleCIFailure(ctx context.Context, id domain.SessionID, project domain.ProjectID, c reactionContent) error { + msg := reactions[rxCIFailed].message + "\n\nFailing output:\n" + c.ciLogTail + return m.fireFeedback(ctx, id, project, rxCIFailed, c.ciCommit, msg, func(int) (bool, error) { + st, err := m.pr.RecentCheckStatuses(ctx, c.ciURL, c.ciCheck, ciBrakeRuns) + if err != nil { + return false, err + } + return allFailed(st, ciBrakeRuns), nil + }) +} + +func (m *Manager) handleReviewFeedback(ctx context.Context, id domain.SessionID, project domain.ProjectID, c reactionContent) error { + msg := reactions[rxReviewComments].message + if len(c.comments) > 0 { + msg += "\n\n" + strings.Join(c.comments, "\n\n") + } + return m.fireFeedback(ctx, id, project, rxReviewComments, c.reviewSig, msg, func(attempts int) (bool, error) { + return attempts > reviewMaxNudge, nil }) } -func composeMessage(cfg reactionConfig, rc reactionContext) string { - if rc.ciFailureLogTail != nil && *rc.ciFailureLogTail != "" { - return cfg.message + "\n\nFailing output:\n" + *rc.ciFailureLogTail +// fireFeedback nudges the agent with fresh content, deduped by signature so the +// same content is not re-sent each poll. braked decides whether to escalate to a +// human instead (CI: history; review: attempt count). 
+func (m *Manager) fireFeedback(ctx context.Context, id domain.SessionID, project domain.ProjectID, key reactionKey, sig, message string, braked func(attempts int) (bool, error)) error { + m.react.mu.Lock() + t := m.react.trackerFor(id, key) + if project != "" { + t.projectID = project + } + if t.escalated || (t.seenSig && t.lastSig == sig) { + m.react.mu.Unlock() + return nil + } + t.seenSig, t.lastSig = true, sig + t.attempts++ + attempts, pid := t.attempts, t.projectID + m.react.lastKey[id] = key // feedback owns the slot so a later dispatch("") clears it + m.react.mu.Unlock() + + brake, err := braked(attempts) + if err != nil { + return err } - return cfg.message + if brake { + m.react.mu.Lock() + t.escalated = true + m.react.mu.Unlock() + return m.escalate(ctx, id, pid, key) + } + return m.messenger.Send(ctx, id, message) } -// trackerFor returns the tracker for (id,key), creating it on first use. The -// caller must hold trackerMu. -func (m *Manager) trackerFor(id domain.SessionID, key reactionKey) *reactionTracker { - k := trackerKey{id: id, key: key} - tk := m.trackers[k] - if tk == nil { - tk = &reactionTracker{} - m.trackers[k] = tk +// ---- entry reactions ---- + +// fireAgentEntry nudges the agent once on entry into a static reaction +// (idle/merge-conflicts); escalation is duration-based via TickEscalations. 
+func (m *Manager) fireAgentEntry(ctx context.Context, id domain.SessionID, project domain.ProjectID, key reactionKey, cfg reactionConfig) error { + m.react.mu.Lock() + t := m.react.trackerFor(id, key) + if project != "" { + t.projectID = project } - return tk + if t.escalated { + m.react.mu.Unlock() + return nil + } + if t.firstAt.IsZero() { + t.firstAt = m.clock() + } + t.attempts++ + m.react.mu.Unlock() + return m.messenger.Send(ctx, id, cfg.message) } -func (m *Manager) clearTracker(id domain.SessionID, key reactionKey) { - m.trackerMu.Lock() - delete(m.trackers, trackerKey{id: id, key: key}) - m.trackerMu.Unlock() +func (m *Manager) fireNotify(ctx context.Context, id domain.SessionID, project domain.ProjectID, cfg reactionConfig) error { + return m.notifier.Notify(ctx, ports.Event{ + Type: cfg.eventType, Priority: cfg.priority, + SessionID: id, ProjectID: project, Message: cfg.message, + }) } -// clearSessionTrackers drops every tracker for a session — used when its -// incident is over, so no budget (and no stale escalated=true) survives into a -// later unrelated incident. -func (m *Manager) clearSessionTrackers(id domain.SessionID) { - m.trackerMu.Lock() - for k := range m.trackers { - if k.id == id { - delete(m.trackers, k) - } - } - m.trackerMu.Unlock() +func (m *Manager) escalate(ctx context.Context, id domain.SessionID, project domain.ProjectID, key reactionKey) error { + return m.notifier.Notify(ctx, ports.Event{ + Type: "reaction.escalated", Priority: ports.PriorityUrgent, + SessionID: id, ProjectID: project, + Message: fmt.Sprintf("Automatic handling of %q is exhausted — needs a human.", key), + }) } -// TickEscalations fires the duration-based escalations the synchronous LCM -// cannot wake itself for. The reaper calls it on a timer; it escalates any -// not-yet-escalated tracker whose escalateAfter has elapsed. Notifications are -// sent outside the lock so agent/notifier latency never blocks tracker access. 
+// TickEscalations fires the duration-based escalations the synchronous engine +// cannot wake itself for. The reaper calls it on a timer. func (m *Manager) TickEscalations(ctx context.Context, now time.Time) error { type due struct { - id domain.SessionID - projectID domain.ProjectID - key reactionKey + id domain.SessionID + project domain.ProjectID + key reactionKey } var fire []due - - m.trackerMu.Lock() - for k, tk := range m.trackers { - if tk.escalated { + m.react.mu.Lock() + for k, t := range m.react.trackers { + if t.escalated { continue } - cfg := defaultReactions[k.key] - if cfg.escalateAfter > 0 && !tk.firstAttemptAt.IsZero() && now.Sub(tk.firstAttemptAt) >= cfg.escalateAfter { - tk.escalated = true - fire = append(fire, due{id: k.id, projectID: tk.projectID, key: k.key}) + cfg := reactions[k.key] + if cfg.escalateAfter > 0 && !t.firstAt.IsZero() && now.Sub(t.firstAt) >= cfg.escalateAfter { + t.escalated = true + fire = append(fire, due{k.id, t.projectID, k.key}) } } - m.trackerMu.Unlock() + m.react.mu.Unlock() for _, d := range fire { - if err := m.escalate(ctx, d.id, d.projectID, d.key); err != nil { + if err := m.escalate(ctx, d.id, d.project, d.key); err != nil { return err } } return nil } + +func allFailed(statuses []string, n int) bool { + if len(statuses) < n { + return false + } + for i := 0; i < n; i++ { + if statuses[i] != "failed" { + return false + } + } + return true +} diff --git a/backend/internal/lifecycle/reactions_test.go b/backend/internal/lifecycle/reactions_test.go deleted file mode 100644 index 637b1e5b..00000000 --- a/backend/internal/lifecycle/reactions_test.go +++ /dev/null @@ -1,616 +0,0 @@ -package lifecycle - -import ( - "context" - "fmt" - "strings" - "testing" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" -) - -// failingMessenger always fails delivery, counting attempts — used to assert a -// send failure does not consume 
escalation budget. -type failingMessenger struct{ attempts int } - -func (f *failingMessenger) Send(_ context.Context, _ domain.SessionID, _ string) error { - f.attempts++ - return fmt.Errorf("messenger unavailable") -} - -// newReactive wires a Manager with handles on the recording fakes so reaction -// tests can assert what was sent/notified. clock is pinned to t0 for -// deterministic escalation stamping. -func newReactive() (*Manager, *fakeStore, *recordingNotifier, *recordingMessenger) { - store := newFakeStore() - notf := &recordingNotifier{} - msgr := &recordingMessenger{} - m := New(store, notf, msgr) - m.clock = func() time.Time { return t0 } - return m, store, notf, msgr -} - -func lcOpenPR(reason domain.PRReason) domain.CanonicalSessionLifecycle { - l := lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive) - l.PR = domain.PRSubstate{State: domain.PROpen, Reason: reason, Number: 7} - return l -} - -func notifyCount(n *recordingNotifier, eventType string) int { - n.mu.Lock() - defer n.mu.Unlock() - c := 0 - for _, e := range n.events { - if e.Type == eventType { - c++ - } - } - return c -} - -func ctx() context.Context { return context.Background() } - -// ---- right reaction per transition ---- - -func TestReaction_CIFailedSendsToAgentWithLogTail(t *testing.T) { - m, store, notf, msgr := newReactive() - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - - tail := "build failed\nundefined: foo" - err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, CISummary: ports.CIFailing, - PRNumber: 7, CIFailureLogTail: &tail, - }) - if err != nil { - t.Fatalf("apply: %v", err) - } - - if len(msgr.sent) != 1 { - t.Fatalf("want 1 send, got %d", len(msgr.sent)) - } - if got := msgr.sent[0].Message; !strings.Contains(got, "CI is failing") || !strings.Contains(got, tail) { - t.Errorf("message missing base text or log tail: %q", got) - } - if notifyCount(notf, "reaction.escalated") != 0 { - 
t.Error("a first failure must not escalate") - } -} - -func TestReaction_BotAndHumanCommentsRouteSeparately(t *testing.T) { - tests := []struct { - name string - comments []ports.ReviewComment - wantMessage string - }{ - { - name: "bot comments -> bugbot-comments", - comments: []ports.ReviewComment{{Author: "bugbot", Body: "fix", IsBot: true}}, - wantMessage: "automated reviewer", - }, - { - name: "human comments -> changes-requested", - comments: []ports.ReviewComment{{Author: "reviewer", Body: "fix"}}, - wantMessage: "reviewer requested changes", - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - m, store, _, msgr := newReactive() - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - - if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, PendingComments: tt.comments, PRNumber: 7, - }); err != nil { - t.Fatalf("apply: %v", err) - } - - if len(msgr.sent) != 1 { - t.Fatalf("want one send, got %d", len(msgr.sent)) - } - if !strings.Contains(msgr.sent[0].Message, tt.wantMessage) { - t.Errorf("message %q does not contain %q", msgr.sent[0].Message, tt.wantMessage) - } - }) - } -} - -func TestReaction_MergeConflictsSendsToAgent(t *testing.T) { - m, store, _, msgr := newReactive() - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - - if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, PRNumber: 7, - Mergeability: ports.Mergeability{CIPassing: true, Approved: true, NoConflicts: false, Blockers: []string{"merge conflicts"}}, - }); err != nil { - t.Fatalf("apply: %v", err) - } - - if len(msgr.sent) != 1 { - t.Fatalf("want one send, got %d", len(msgr.sent)) - } - if !strings.Contains(msgr.sent[0].Message, "merge conflicts") { - t.Errorf("message = %q, want merge conflict nudge", msgr.sent[0].Message) - } -} - -func TestReaction_ApprovedAndGreenNotifiesNeverAutoMerges(t *testing.T) { - m, store, notf, msgr := newReactive() - store.seed(sid, 
lcOpenPR(domain.PRReasonReviewPending)) - - err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewApproved, - Mergeability: ports.Mergeability{Mergeable: true}, PRNumber: 7, - }) - if err != nil { - t.Fatalf("apply: %v", err) - } - - // approved-and-green is notify (human decides to merge); the agent is never - // messaged and no auto-merge fires. - if len(msgr.sent) != 0 { - t.Errorf("approved-and-green must not message the agent, got %d sends", len(msgr.sent)) - } - if notifyCount(notf, "reaction.approved-and-green") != 1 { - t.Errorf("want one approved-and-green notify, got events %+v", notf.events) - } -} - -func TestReaction_NotifyEventsForHardStates(t *testing.T) { - tests := []struct { - name string - apply func(m *Manager) - eventType string - }{ - { - name: "waiting_input -> agent-needs-input", - apply: func(m *Manager) { applyActivity(m, domain.ActivityWaitingInput) }, - eventType: "reaction.agent-needs-input", - }, - { - name: "blocked -> agent-stuck", - apply: func(m *Manager) { applyActivity(m, domain.ActivityBlocked) }, - eventType: "reaction.agent-stuck", - }, - } - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - m, store, notf, msgr := newReactive() - store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) - tc.apply(m) - if notifyCount(notf, tc.eventType) != 1 { - t.Errorf("want one %s, got events %+v", tc.eventType, notf.events) - } - if len(msgr.sent) != 0 { - t.Errorf("notify reaction must not message the agent, got %d", len(msgr.sent)) - } - }) - } -} - -func TestReaction_InferredDeathNotifiesAgentExited(t *testing.T) { - m, store, notf, _ := newReactive() - store.seed(sid, detectingLC()) - - err := m.ApplyRuntimeObservation(ctx(), sid, ports.RuntimeFacts{ - RuntimeState: ports.RuntimeProbeDead, ProcessState: ports.ProcessProbeDead, ObservedAt: t0, - }) - if err != nil { - t.Fatalf("apply: %v", err) - } - if l := 
mustLoad(t, store); domain.DeriveLegacyStatus(l) != domain.StatusKilled { - t.Fatalf("precondition: want killed, got %s", domain.DeriveLegacyStatus(l)) - } - if notifyCount(notf, "reaction.agent-exited") != 1 { - t.Errorf("want one agent-exited, got events %+v", notf.events) - } -} - -func TestReaction_PRClosedAndMerged(t *testing.T) { - tests := []struct { - name string - prState domain.PRState - eventType string - }{ - {"closed -> pr-closed", domain.PRClosed, "reaction.pr-closed"}, - {"merged -> all-complete", domain.PRMerged, "reaction.all-complete"}, - } - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - m, store, notf, _ := newReactive() - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: tc.prState, PRNumber: 7, - }) - if err != nil { - t.Fatalf("apply: %v", err) - } - if notifyCount(notf, tc.eventType) != 1 { - t.Errorf("want one %s, got events %+v", tc.eventType, notf.events) - } - }) - } -} - -func TestReaction_OnKillRequestedDoesNotReact(t *testing.T) { - m, store, notf, msgr := newReactive() - store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) - - if err := m.OnKillRequested(ctx(), sid, ports.KillReason{Kind: ports.KillManual}); err != nil { - t.Fatalf("kill: %v", err) - } - // An explicit human kill is not an inferred event: no agent-exited, no send. - if len(notf.events) != 0 || len(msgr.sent) != 0 { - t.Errorf("explicit kill must fire no reaction: notifies=%+v sends=%+v", notf.events, msgr.sent) - } -} - -// ---- escalation engine ---- - -func TestReaction_CIFailedNumericEscalation(t *testing.T) { - m, store, notf, msgr := newReactive() - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - - // ci-failed has retries 2 and is persistent, so the budget is shared across - // fail->pending->fail oscillations and escalates on the third failure. 
- failN := 4 - for i := 0; i < failN; i++ { - failCI(t, m) - pendingCI(t, m) // oscillate out (persistent tracker must NOT reset) - } - - if len(msgr.sent) != 2 { - t.Errorf("want 2 auto-sends before escalation, got %d", len(msgr.sent)) - } - if c := notifyCount(notf, "reaction.escalated"); c != 1 { - t.Errorf("want exactly one escalation, got %d", c) - } -} - -func TestReaction_DraftPRDoesNotEndCIFailedIncident(t *testing.T) { - m, store, _, _ := newReactive() - seed := lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive) - seed.PR = domain.PRSubstate{State: domain.PRDraft, Reason: domain.PRReasonInProgress, Number: 7} - store.seed(sid, seed) - - tail := "fail" - if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PRDraft, CISummary: ports.CIFailing, PRNumber: 7, CIFailureLogTail: &tail, - }); err != nil { - t.Fatalf("draft fail: %v", err) - } - if sessionTrackerCount(m, sid) == 0 { - t.Fatalf("precondition: expected a ci-failed tracker") - } - - if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PRDraft, CISummary: ports.CIPending, PRNumber: 7, - }); err != nil { - t.Fatalf("draft pending: %v", err) - } - if n := sessionTrackerCount(m, sid); n == 0 { - t.Errorf("draft PR is still active; ci-failed tracker should survive, got %d", n) - } -} - -func TestReaction_DurationEscalationFiresOnTick(t *testing.T) { - m, store, notf, msgr := newReactive() - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - - // changes-requested: send once now, then escalate by duration (30m) — which - // only the reaper's TickEscalations can fire (the LCM never polls). 
- err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewChangesRequested, PRNumber: 7, - }) - if err != nil { - t.Fatalf("apply: %v", err) - } - if len(msgr.sent) != 1 { - t.Fatalf("want one send on transition, got %d", len(msgr.sent)) - } - - if err := m.TickEscalations(ctx(), t0.Add(10*time.Minute)); err != nil { - t.Fatalf("tick: %v", err) - } - if notifyCount(notf, "reaction.escalated") != 0 { - t.Error("must not escalate before escalateAfter elapses") - } - - // Inclusive boundary: escalate at exactly escalateAfter (30m), not only past it. - if err := m.TickEscalations(ctx(), t0.Add(30*time.Minute)); err != nil { - t.Fatalf("tick: %v", err) - } - if notifyCount(notf, "reaction.escalated") != 1 { - t.Errorf("want one duration escalation at exactly 30m, got events %+v", notf.events) - } -} - -func TestReaction_KillClearsEscalationTrackers(t *testing.T) { - m, store, notf, _ := newReactive() - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - - // changes-requested creates a duration-based tracker. - if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewChangesRequested, PRNumber: 7, - }); err != nil { - t.Fatalf("apply: %v", err) - } - if sessionTrackerCount(m, sid) == 0 { - t.Fatalf("precondition: expected a tracker") - } - - if err := m.OnKillRequested(ctx(), sid, ports.KillReason{Kind: ports.KillManual}); err != nil { - t.Fatalf("kill: %v", err) - } - if n := sessionTrackerCount(m, sid); n != 0 { - t.Errorf("kill must clear trackers, %d left", n) - } - // A later duration tick must not escalate a dead session. 
- if err := m.TickEscalations(ctx(), t0.Add(time.Hour)); err != nil { - t.Fatalf("tick: %v", err) - } - if c := notifyCount(notf, "reaction.escalated"); c != 0 { - t.Errorf("killed session must not escalate, got %d", c) - } -} - -func TestReaction_SendFailureDoesNotBurnBudget(t *testing.T) { - store := newFakeStore() - notf := &recordingNotifier{} - fm := &failingMessenger{} - m := New(store, notf, fm) - m.clock = func() time.Time { return t0 } - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - - tail := "fail" - failing := ports.SCMFacts{Fetched: true, PRState: domain.PROpen, CISummary: ports.CIFailing, PRNumber: 7, CIFailureLogTail: &tail} - pending := ports.SCMFacts{Fetched: true, PRState: domain.PROpen, CISummary: ports.CIPending, ReviewDecision: ports.ReviewPending, PRNumber: 7} - - // ci-failed has retries 2; with every delivery failing, the budget is rolled - // back each time, so even 5 failures never escalate. - for i := 0; i < 5; i++ { - _ = m.ApplySCMObservation(ctx(), sid, failing) // returns the delivery error - _ = m.ApplySCMObservation(ctx(), sid, pending) - } - if fm.attempts < 5 { - t.Errorf("expected at least 5 send attempts, got %d", fm.attempts) - } - if c := notifyCount(notf, "reaction.escalated"); c != 0 { - t.Errorf("undelivered messages must not escalate, got %d", c) - } -} - -func TestReaction_NonPersistentTrackerClearsOnLeave(t *testing.T) { - m, store, _, msgr := newReactive() - store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) - - // agent-idle has retries 2 but is NOT persistent: leaving idle clears the - // tracker, so three idle incidents each send fresh and none escalate. 
- for i := 0; i < 3; i++ { - applyActivity(m, domain.ActivityIdle) - applyActivity(m, domain.ActivityActive) - } - if len(msgr.sent) != 3 { - t.Errorf("want 3 idle sends (budget reset each incident), got %d", len(msgr.sent)) - } -} - -func TestReaction_CIFailedRearmsOnGenuineRecovery(t *testing.T) { - m, store, notf, msgr := newReactive() - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - - // Drain the ci-failed budget to escalation (silenced thereafter). - for i := 0; i < 4; i++ { - failCI(t, m) - pendingCI(t, m) - } - if notifyCount(notf, "reaction.escalated") != 1 { - t.Fatalf("precondition: want one escalation, got %d", notifyCount(notf, "reaction.escalated")) - } - sentBefore := len(msgr.sent) - - // A genuine recovery (approved + green) ends the incident and re-arms the - // budget; a later regression must re-nudge the agent, not stay silenced. - if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewApproved, - Mergeability: ports.Mergeability{Mergeable: true}, PRNumber: 7, - }); err != nil { - t.Fatalf("recover: %v", err) - } - failCI(t, m) - - if len(msgr.sent) != sentBefore+1 { - t.Errorf("regression after recovery must re-nudge the agent: sends %d -> %d", sentBefore, len(msgr.sent)) - } -} - -func TestReaction_IncidentOverClearsAllSessionTrackers(t *testing.T) { - m, store, _, _ := newReactive() - store.seed(sid, lcOpenPR(domain.PRReasonReviewPending)) - - failCI(t, m) // creates a persistent ci-failed tracker - if sessionTrackerCount(m, sid) == 0 { - t.Fatalf("precondition: expected a ci-failed tracker") - } - - // Merging ends the incident; no tracker (and no stale escalated=true) may - // survive for the session. 
- if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PRMerged, PRNumber: 7, - }); err != nil { - t.Fatalf("merge: %v", err) - } - if n := sessionTrackerCount(m, sid); n != 0 { - t.Errorf("incident over must clear all trackers, %d left", n) - } -} - -// ---- ProjectID propagation (review R11) ---- - -// TestReaction_ProjectIDOnNotifyAndEscalateEvents asserts that both Notify call -// sites in reactions.go (executeReaction's notify and escalate) carry the -// record's ProjectID. The human-facing event router groups by project, so a -// missing id would land events in the wrong bucket. -func TestReaction_ProjectIDOnNotifyAndEscalateEvents(t *testing.T) { - const proj domain.ProjectID = "acme" - - t.Run("notify path -> ProjectID populated", func(t *testing.T) { - m, store, notf, _ := newReactive() - // Seed via Upsert (not the lifecycle-only seed helper) so the record carries - // the ProjectID that mutate's transition then propagates to react. - if err := store.Upsert(ctx(), domain.SessionRecord{ - ID: sid, ProjectID: proj, Lifecycle: lcOpenPR(domain.PRReasonReviewPending), - }, ports.EventSessionCreated); err != nil { - t.Fatalf("upsert: %v", err) - } - - // approved-and-green is a notify reaction; it fires once via executeReaction. 
- err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewApproved, - Mergeability: ports.Mergeability{Mergeable: true}, PRNumber: 7, - }) - if err != nil { - t.Fatalf("apply: %v", err) - } - - notf.mu.Lock() - defer notf.mu.Unlock() - var got *ports.OrchestratorEvent - for i := range notf.events { - if notf.events[i].Type == "reaction.approved-and-green" { - got = &notf.events[i] - break - } - } - if got == nil { - t.Fatalf("expected approved-and-green notify, got events: %+v", notf.events) - } - if got.ProjectID != proj { - t.Errorf("notify ProjectID = %q, want %q", got.ProjectID, proj) - } - if got.SessionID != sid { - t.Errorf("notify SessionID = %q, want %q", got.SessionID, sid) - } - }) - - t.Run("escalate path -> ProjectID populated (numeric cap)", func(t *testing.T) { - m, store, notf, _ := newReactive() - if err := store.Upsert(ctx(), domain.SessionRecord{ - ID: sid, ProjectID: proj, Lifecycle: lcOpenPR(domain.PRReasonReviewPending), - }, ports.EventSessionCreated); err != nil { - t.Fatalf("upsert: %v", err) - } - - // Drain the ci-failed budget to numeric escalation (sendToAgent -> escalate). 
- for i := 0; i < 4; i++ { - failCI(t, m) - pendingCI(t, m) - } - - notf.mu.Lock() - defer notf.mu.Unlock() - var got *ports.OrchestratorEvent - for i := range notf.events { - if notf.events[i].Type == "reaction.escalated" { - got = &notf.events[i] - break - } - } - if got == nil { - t.Fatalf("expected reaction.escalated event, got events: %+v", notf.events) - } - if got.ProjectID != proj { - t.Errorf("escalate ProjectID = %q, want %q", got.ProjectID, proj) - } - }) - - t.Run("escalate path -> ProjectID populated (TickEscalations duration)", func(t *testing.T) { - m, store, notf, _ := newReactive() - if err := store.Upsert(ctx(), domain.SessionRecord{ - ID: sid, ProjectID: proj, Lifecycle: lcOpenPR(domain.PRReasonReviewPending), - }, ports.EventSessionCreated); err != nil { - t.Fatalf("upsert: %v", err) - } - - // changes-requested creates a duration-based tracker on the first send; - // TickEscalations fires escalate from a path with no transition on hand, - // so the tracker's captured ProjectID is what must surface on the event. 
- if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewChangesRequested, PRNumber: 7, - }); err != nil { - t.Fatalf("apply: %v", err) - } - if err := m.TickEscalations(ctx(), t0.Add(30*time.Minute)); err != nil { - t.Fatalf("tick: %v", err) - } - - notf.mu.Lock() - defer notf.mu.Unlock() - var got *ports.OrchestratorEvent - for i := range notf.events { - if notf.events[i].Type == "reaction.escalated" { - got = &notf.events[i] - break - } - } - if got == nil { - t.Fatalf("expected duration-escalated event, got events: %+v", notf.events) - } - if got.ProjectID != proj { - t.Errorf("tick-escalate ProjectID = %q, want %q", got.ProjectID, proj) - } - }) -} - -func sessionTrackerCount(m *Manager, id domain.SessionID) int { - m.trackerMu.Lock() - defer m.trackerMu.Unlock() - c := 0 - for k := range m.trackers { - if k.id == id { - c++ - } - } - return c -} - -// ---- TickEscalations never writes canonical state ---- - -func TestTickEscalations_DoesNotPersist(t *testing.T) { - m, store, _, _ := newReactive() - store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) - if err := m.TickEscalations(ctx(), t0); err != nil { - t.Fatalf("tick: %v", err) - } - if l := mustLoad(t, store); l.Revision != 0 { - t.Errorf("TickEscalations must not write canonical state, got revision=%d", l.Revision) - } -} - -// ---- helpers ---- - -func applyActivity(m *Manager, a domain.ActivityState) { - _ = m.ApplyActivitySignal(ctx(), sid, ports.ActivitySignal{ - State: ports.SignalValid, Activity: a, Timestamp: t0, Source: domain.SourceHook, - }) -} - -func failCI(t *testing.T, m *Manager) { - t.Helper() - tail := "fail" - if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, CISummary: ports.CIFailing, PRNumber: 7, CIFailureLogTail: &tail, - }); err != nil { - t.Fatalf("failCI: %v", err) - } -} - -func pendingCI(t *testing.T, m *Manager) { - 
t.Helper() - if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{ - Fetched: true, PRState: domain.PROpen, CISummary: ports.CIPending, ReviewDecision: ports.ReviewPending, PRNumber: 7, - }); err != nil { - t.Fatalf("pendingCI: %v", err) - } -} diff --git a/backend/internal/observe/reaper/reaper.go b/backend/internal/observe/reaper/reaper.go index 66456ea6..7edee2b1 100644 --- a/backend/internal/observe/reaper/reaper.go +++ b/backend/internal/observe/reaper/reaper.go @@ -16,7 +16,6 @@ import ( "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) @@ -184,16 +183,16 @@ func (r *Reaper) probeOne(ctx context.Context, sess domain.SessionRecord, now ti // transient tmux/zellij outage hide a really-dead session, and a // transient adapter bug terminate a really-alive one. Report failed // and let the LCM's detecting quarantine arbitrate. - facts.RuntimeState = ports.RuntimeProbeFailed - facts.ProcessState = ports.ProcessProbeFailed + facts.Runtime = ports.ProbeFailed + facts.Process = ports.ProbeFailed r.logger.Debug("reaper: probe error reported as failed fact", "session", sess.ID, "runtime", handle.RuntimeName, "err", probeErr) case alive: - facts.RuntimeState = ports.RuntimeProbeAlive - facts.ProcessState = ports.ProcessProbeAlive + facts.Runtime = ports.ProbeAlive + facts.Process = ports.ProbeAlive default: - facts.RuntimeState = ports.RuntimeProbeDead - facts.ProcessState = ports.ProcessProbeDead + facts.Runtime = ports.ProbeDead + facts.Process = ports.ProbeDead } if err := r.lcm.ApplyRuntimeObservation(ctx, sess.ID, facts); err != nil { @@ -203,11 +202,11 @@ func (r *Reaper) probeOne(ctx context.Context, sess domain.SessionRecord, now ti } // handleFromRecord reconstructs the RuntimeHandle stored on the session by -// OnSpawnCompleted. Both keys are required; either being empty is the +// OnSpawnCompleted. 
Both fields are required; either being empty is the // "session lacks a probable handle" signal that probeOne uses to skip. func handleFromRecord(rec domain.SessionRecord) (ports.RuntimeHandle, bool) { - id := rec.Metadata[lifecycle.MetaRuntimeHandleID] - name := rec.Metadata[lifecycle.MetaRuntimeName] + id := rec.Metadata.RuntimeHandleID + name := rec.Metadata.RuntimeName if id == "" || name == "" { return ports.RuntimeHandle{}, false } diff --git a/backend/internal/observe/reaper/reaper_test.go b/backend/internal/observe/reaper/reaper_test.go index d6b88efd..ffb3eed4 100644 --- a/backend/internal/observe/reaper/reaper_test.go +++ b/backend/internal/observe/reaper/reaper_test.go @@ -1,386 +1,115 @@ -package reaper_test +package reaper import ( "context" "errors" - "reflect" - "sync" + "io" + "log/slog" "testing" "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" - "github.com/aoagents/agent-orchestrator/backend/internal/observe/reaper" "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) -// ---- fakes ---- +var ctx = context.Background() -type aliveResult struct { - alive bool - err error -} - -// fakeRuntime is a programmable ports.Runtime. The reaper only calls IsAlive, -// but the interface requires the other methods so we stub them. 
-type fakeRuntime struct { - mu sync.Mutex - results map[string]aliveResult - probed []string -} - -var _ ports.Runtime = (*fakeRuntime)(nil) - -func (f *fakeRuntime) IsAlive(_ context.Context, h ports.RuntimeHandle) (bool, error) { - f.mu.Lock() - f.probed = append(f.probed, h.ID) - f.mu.Unlock() - r, ok := f.results[h.ID] - if !ok { - return false, errors.New("fakeRuntime: no programmed response for " + h.ID) - } - return r.alive, r.err -} - -func (f *fakeRuntime) Create(context.Context, ports.RuntimeConfig) (ports.RuntimeHandle, error) { - return ports.RuntimeHandle{}, nil -} -func (f *fakeRuntime) Destroy(context.Context, ports.RuntimeHandle) error { return nil } -func (f *fakeRuntime) SendMessage(context.Context, ports.RuntimeHandle, string) error { - return nil -} -func (f *fakeRuntime) GetOutput(context.Context, ports.RuntimeHandle, int) (string, error) { - return "", nil -} - -// fakeLCM records every reaper-facing call in order so tests can assert the -// exact sequence (TickEscalations -> RunningSessions -> ApplyRuntimeObservation). 
type fakeLCM struct { - mu sync.Mutex - sessions []domain.SessionRecord - calls []call - - runErr error - tickErr error - obsErr error -} - -type call struct { - Kind string - Now time.Time - Session domain.SessionID - Facts ports.RuntimeFacts + running []domain.SessionRecord + observed map[domain.SessionID]ports.RuntimeFacts + escalated int } -var _ ports.LifecycleManager = (*fakeLCM)(nil) - -func (l *fakeLCM) RunningSessions(_ context.Context) ([]domain.SessionRecord, error) { - l.mu.Lock() - defer l.mu.Unlock() - l.calls = append(l.calls, call{Kind: "RunningSessions"}) - if l.runErr != nil { - return nil, l.runErr - } - out := make([]domain.SessionRecord, len(l.sessions)) - copy(out, l.sessions) - return out, nil +func (l *fakeLCM) RunningSessions(context.Context) ([]domain.SessionRecord, error) { + return l.running, nil } - -func (l *fakeLCM) TickEscalations(_ context.Context, now time.Time) error { - l.mu.Lock() - defer l.mu.Unlock() - l.calls = append(l.calls, call{Kind: "TickEscalations", Now: now}) - return l.tickErr -} - func (l *fakeLCM) ApplyRuntimeObservation(_ context.Context, id domain.SessionID, f ports.RuntimeFacts) error { - l.mu.Lock() - defer l.mu.Unlock() - l.calls = append(l.calls, call{Kind: "ApplyRuntimeObservation", Session: id, Facts: f}) - return l.obsErr -} - -// unused methods on the LCM port — the reaper never invokes them. 
-func (l *fakeLCM) ApplySCMObservation(context.Context, domain.SessionID, ports.SCMFacts) error { + if l.observed == nil { + l.observed = map[domain.SessionID]ports.RuntimeFacts{} + } + l.observed[id] = f return nil } +func (l *fakeLCM) TickEscalations(context.Context, time.Time) error { l.escalated++; return nil } func (l *fakeLCM) ApplyActivitySignal(context.Context, domain.SessionID, ports.ActivitySignal) error { return nil } -func (l *fakeLCM) OnSpawnInitiated(context.Context, domain.SessionRecord) error { return nil } +func (l *fakeLCM) ApplyPRObservation(context.Context, domain.SessionID, ports.PRObservation) error { + return nil +} func (l *fakeLCM) OnSpawnCompleted(context.Context, domain.SessionID, ports.SpawnOutcome) error { return nil } -func (l *fakeLCM) OnKillRequested(context.Context, domain.SessionID, ports.KillReason) error { +func (l *fakeLCM) OnKillRequested(context.Context, domain.SessionID, domain.TerminationReason) error { return nil } -// ---- helpers ---- +type fakeRuntime struct { + alive bool + err error +} -func aliveSessionWith(id domain.SessionID, runtimeName, handleID string) domain.SessionRecord { - return domain.SessionRecord{ - ID: id, - Lifecycle: domain.CanonicalSessionLifecycle{ - Session: domain.SessionSubstate{State: domain.SessionWorking, Reason: domain.ReasonTaskInProgress}, - Runtime: domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning}, - }, - Metadata: map[string]string{ - lifecycle.MetaRuntimeHandleID: handleID, - lifecycle.MetaRuntimeName: runtimeName, - }, - } +func (r fakeRuntime) Create(context.Context, ports.RuntimeConfig) (ports.RuntimeHandle, error) { + return ports.RuntimeHandle{}, nil +} +func (r fakeRuntime) Destroy(context.Context, ports.RuntimeHandle) error { return nil } +func (r fakeRuntime) IsAlive(context.Context, ports.RuntimeHandle) (bool, error) { + return r.alive, r.err } -// detectingSessionWith returns a session in the Detecting quarantine, the -// shape 
`Manager.RunningSessions` MUST include so a probe-alive can recover it -// (otherwise the reaper traps every session that hiccups once in detecting). -func detectingSessionWith(id domain.SessionID, runtimeName, handleID string) domain.SessionRecord { +func probableSession(id domain.SessionID) domain.SessionRecord { return domain.SessionRecord{ - ID: id, + ID: id, + Metadata: domain.SessionMetadata{RuntimeHandleID: "h1", RuntimeName: "tmux"}, Lifecycle: domain.CanonicalSessionLifecycle{ - Session: domain.SessionSubstate{State: domain.SessionDetecting, Reason: domain.ReasonProbeFailure}, - Runtime: domain.RuntimeSubstate{State: domain.RuntimeProbeFailed, Reason: domain.RuntimeReasonProbeError}, - }, - Metadata: map[string]string{ - lifecycle.MetaRuntimeHandleID: handleID, - lifecycle.MetaRuntimeName: runtimeName, + Session: domain.SessionSubstate{State: domain.SessionWorking}, }, } } -// ---- tests ---- +func quietLogger() *slog.Logger { return slog.New(slog.NewTextHandler(io.Discard, nil)) } -func TestReaper_Tick(t *testing.T) { - now := time.Date(2026, 5, 28, 12, 0, 0, 0, time.UTC) - clock := func() time.Time { return now } - - type runtimeProbes struct { - name string - results map[string]aliveResult - } - - tests := []struct { - name string - sessions []domain.SessionRecord - runtimes []runtimeProbes - wantCalls []call - wantProbe map[string][]string // runtime name -> handle IDs probed, in order - }{ - { - // "No death applied" per the spec: the LCM does not receive a - // death-causing fact. It still receives the alive fact, because - // the reaper reports what it probed and the LCM is the one that - // diffs against canonical (a no-op when runtime is already alive, - // a recovery when the session was in Detecting). 
- name: "alive session: alive fact reported, no death applied, tick still fires", - sessions: []domain.SessionRecord{aliveSessionWith("s1", "tmux", "h1")}, - runtimes: []runtimeProbes{{name: "tmux", results: map[string]aliveResult{"h1": {alive: true}}}}, - wantCalls: []call{ - {Kind: "TickEscalations", Now: now}, - {Kind: "RunningSessions"}, - { - Kind: "ApplyRuntimeObservation", - Session: "s1", - Facts: ports.RuntimeFacts{ObservedAt: now, RuntimeState: ports.RuntimeProbeAlive, ProcessState: ports.ProcessProbeAlive}, - }, - }, - wantProbe: map[string][]string{"tmux": {"h1"}}, - }, - { - // Recovery path: a session in Detecting+probe_failed must be in - // the poll set so an alive probe can flow through and recover it. - // If the reaper filtered to runtime-axis-alive only, this session - // would be trapped in Detecting forever. - name: "detecting session: alive probe reported so LCM can recover from quarantine", - sessions: []domain.SessionRecord{detectingSessionWith("s1", "tmux", "h1")}, - runtimes: []runtimeProbes{{name: "tmux", results: map[string]aliveResult{"h1": {alive: true}}}}, - wantCalls: []call{ - {Kind: "TickEscalations", Now: now}, - {Kind: "RunningSessions"}, - { - Kind: "ApplyRuntimeObservation", - Session: "s1", - Facts: ports.RuntimeFacts{ObservedAt: now, RuntimeState: ports.RuntimeProbeAlive, ProcessState: ports.ProcessProbeAlive}, - }, - }, - wantProbe: map[string][]string{"tmux": {"h1"}}, - }, - { - name: "dead session: exactly one ApplyRuntimeObservation with Dead facts", - sessions: []domain.SessionRecord{aliveSessionWith("s1", "tmux", "h1")}, - runtimes: []runtimeProbes{{name: "tmux", results: map[string]aliveResult{"h1": {alive: false}}}}, - wantCalls: []call{ - {Kind: "TickEscalations", Now: now}, - {Kind: "RunningSessions"}, - { - Kind: "ApplyRuntimeObservation", - Session: "s1", - Facts: ports.RuntimeFacts{ObservedAt: now, RuntimeState: ports.RuntimeProbeDead, ProcessState: ports.ProcessProbeDead}, - }, - }, - wantProbe: 
map[string][]string{"tmux": {"h1"}}, - }, - { - name: "probe error: reported as failed fact, NOT collapsed to alive", - sessions: []domain.SessionRecord{aliveSessionWith("s1", "tmux", "h1")}, - runtimes: []runtimeProbes{{name: "tmux", results: map[string]aliveResult{"h1": {err: errors.New("boom")}}}}, - wantCalls: []call{ - {Kind: "TickEscalations", Now: now}, - {Kind: "RunningSessions"}, - { - Kind: "ApplyRuntimeObservation", - Session: "s1", - Facts: ports.RuntimeFacts{ObservedAt: now, RuntimeState: ports.RuntimeProbeFailed, ProcessState: ports.ProcessProbeFailed}, - }, - }, - wantProbe: map[string][]string{"tmux": {"h1"}}, - }, - { - name: "multi-runtime dispatch: tmux + zellij in same tick", - sessions: []domain.SessionRecord{ - aliveSessionWith("s1", "tmux", "ht"), - aliveSessionWith("s2", "zellij", "hz"), - }, - runtimes: []runtimeProbes{ - {name: "tmux", results: map[string]aliveResult{"ht": {alive: false}}}, - {name: "zellij", results: map[string]aliveResult{"hz": {alive: true}}}, - }, - wantCalls: []call{ - {Kind: "TickEscalations", Now: now}, - {Kind: "RunningSessions"}, - { - Kind: "ApplyRuntimeObservation", - Session: "s1", - Facts: ports.RuntimeFacts{ObservedAt: now, RuntimeState: ports.RuntimeProbeDead, ProcessState: ports.ProcessProbeDead}, - }, - { - Kind: "ApplyRuntimeObservation", - Session: "s2", - Facts: ports.RuntimeFacts{ObservedAt: now, RuntimeState: ports.RuntimeProbeAlive, ProcessState: ports.ProcessProbeAlive}, - }, - }, - wantProbe: map[string][]string{"tmux": {"ht"}, "zellij": {"hz"}}, - }, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - lcm := &fakeLCM{sessions: tc.sessions} - registry := reaper.MapRegistry{} - byName := map[string]*fakeRuntime{} - for _, r := range tc.runtimes { - rt := &fakeRuntime{results: r.results} - registry[r.name] = rt - byName[r.name] = rt - } - rp := reaper.New(lcm, registry, reaper.Config{Clock: clock, Tick: time.Hour}) - - if err := rp.Tick(context.Background()); err != nil { - 
t.Fatalf("Tick error: %v", err) - } - - if !reflect.DeepEqual(lcm.calls, tc.wantCalls) { - t.Errorf("LCM call log mismatch:\n got %#v\n want %#v", lcm.calls, tc.wantCalls) - } - - for name, want := range tc.wantProbe { - got := byName[name].probed - if !reflect.DeepEqual(got, want) { - t.Errorf("runtime %q probed handles mismatch: got %v want %v", name, got, want) - } - } - }) - } +func newReaper(lcm *fakeLCM, rt fakeRuntime) *Reaper { + return New(lcm, MapRegistry{"tmux": rt}, Config{Logger: quietLogger()}) } -// TestReaper_Loop verifies the background goroutine actually drives ticks and -// exits on context cancel without leaking. -func TestReaper_Loop(t *testing.T) { - now := time.Date(2026, 5, 28, 12, 0, 0, 0, time.UTC) - clock := func() time.Time { return now } - lcm := &fakeLCM{} - rp := reaper.New(lcm, reaper.MapRegistry{}, reaper.Config{Clock: clock, Tick: 5 * time.Millisecond}) - - ctx, cancel := context.WithCancel(context.Background()) - done := rp.Start(ctx) - - // Wait for at least two ticks so we know the loop is actually firing. 
- deadline := time.Now().Add(500 * time.Millisecond) - for time.Now().Before(deadline) { - lcm.mu.Lock() - n := countKind(lcm.calls, "TickEscalations") - lcm.mu.Unlock() - if n >= 2 { - break - } - time.Sleep(2 * time.Millisecond) +func TestTick_ReportsAliveProbe(t *testing.T) { + lcm := &fakeLCM{running: []domain.SessionRecord{probableSession("mer-1")}} + if err := newReaper(lcm, fakeRuntime{alive: true}).Tick(ctx); err != nil { + t.Fatal(err) } - cancel() - - select { - case <-done: - case <-time.After(time.Second): - t.Fatal("reaper goroutine did not exit within 1s of ctx cancel") - } - - lcm.mu.Lock() - defer lcm.mu.Unlock() - if got := countKind(lcm.calls, "TickEscalations"); got < 2 { - t.Errorf("expected at least 2 TickEscalations calls during loop, got %d", got) + if lcm.observed["mer-1"].Runtime != ports.ProbeAlive { + t.Fatalf("want alive probe, got %q", lcm.observed["mer-1"].Runtime) } } -func countKind(calls []call, kind string) int { - n := 0 - for _, c := range calls { - if c.Kind == kind { - n++ - } +func TestTick_ReportsProbeErrorAsFailed(t *testing.T) { + lcm := &fakeLCM{running: []domain.SessionRecord{probableSession("mer-1")}} + if err := newReaper(lcm, fakeRuntime{err: errors.New("tmux gone")}).Tick(ctx); err != nil { + t.Fatal(err) + } + if lcm.observed["mer-1"].Runtime != ports.ProbeFailed { + t.Fatalf("probe error must be reported as failed, got %q", lcm.observed["mer-1"].Runtime) } - return n } -// TestReaper_SkipsUnknownRuntime verifies the reaper does not panic and does not -// report a fact when a session references an unregistered runtime — the reaper -// only reports what it actually probed. 
-func TestReaper_SkipsUnknownRuntime(t *testing.T) { - now := time.Date(2026, 5, 28, 12, 0, 0, 0, time.UTC) - clock := func() time.Time { return now } - lcm := &fakeLCM{sessions: []domain.SessionRecord{aliveSessionWith("s1", "ghost", "h1")}} - rp := reaper.New(lcm, reaper.MapRegistry{}, reaper.Config{Clock: clock, Tick: time.Hour}) - - if err := rp.Tick(context.Background()); err != nil { - t.Fatalf("Tick error: %v", err) +func TestTick_FiresEscalationHeartbeat(t *testing.T) { + lcm := &fakeLCM{} + if err := newReaper(lcm, fakeRuntime{}).Tick(ctx); err != nil { + t.Fatal(err) } - - for _, c := range lcm.calls { - if c.Kind == "ApplyRuntimeObservation" { - t.Fatalf("unexpected ApplyRuntimeObservation for unknown-runtime session: %+v", c) - } + if lcm.escalated != 1 { + t.Fatalf("tick must drive TickEscalations once, got %d", lcm.escalated) } } -// TestReaper_SkipsMissingHandle verifies the reaper does not probe (and does not -// report) for sessions whose runtime handle metadata is missing — probing -// nothing returns no fact. 
-func TestReaper_SkipsMissingHandle(t *testing.T) { - now := time.Date(2026, 5, 28, 12, 0, 0, 0, time.UTC) - clock := func() time.Time { return now } - sess := aliveSessionWith("s1", "tmux", "h1") - delete(sess.Metadata, lifecycle.MetaRuntimeHandleID) - lcm := &fakeLCM{sessions: []domain.SessionRecord{sess}} - rt := &fakeRuntime{results: map[string]aliveResult{}} - rp := reaper.New(lcm, reaper.MapRegistry{"tmux": rt}, reaper.Config{Clock: clock, Tick: time.Hour}) - - if err := rp.Tick(context.Background()); err != nil { - t.Fatalf("Tick error: %v", err) - } - if len(rt.probed) != 0 { - t.Errorf("expected no probes for session without handle id, got %v", rt.probed) +func TestTick_SkipsSessionWithoutHandle(t *testing.T) { + noHandle := domain.SessionRecord{ID: "mer-1"} // no runtime metadata + lcm := &fakeLCM{running: []domain.SessionRecord{noHandle}} + if err := newReaper(lcm, fakeRuntime{alive: true}).Tick(ctx); err != nil { + t.Fatal(err) } - for _, c := range lcm.calls { - if c.Kind == "ApplyRuntimeObservation" { - t.Fatalf("unexpected ApplyRuntimeObservation: %+v", c) - } + if _, probed := lcm.observed["mer-1"]; probed { + t.Fatal("a session without a runtime handle must be skipped") } } diff --git a/backend/internal/ports/facts.go b/backend/internal/ports/facts.go index e1854fac..a3b3b397 100644 --- a/backend/internal/ports/facts.go +++ b/backend/internal/ports/facts.go @@ -1,9 +1,6 @@ -// Package ports declares the boundary contracts for the LCM + Session Manager -// lane: the inbound interfaces we implement, the outbound interfaces others -// implement for us, and the fact DTOs that cross those boundaries. -// -// These are the types the SCM poller, persistence adapter, and API layer build -// against, so they are committed and stabilised before the LCM/SM logic. 
+// Package ports declares the boundary contracts for the lifecycle lane: the +// inbound interfaces the engine implements, the outbound interfaces its adapters +// implement, and the plain DTOs that cross those edges. It holds no logic. package ports import ( @@ -12,122 +9,55 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/domain" ) -// SCMFacts is produced by the SCM poller and handed to ApplySCMObservation. -// -// Fetched is the failed-probe guard: when false, the GitHub query timed out or -// errored and the rest of the struct is meaningless — the LCM must NOT read it -// as "no PR / PR closed" (the SCM analogue of "failed probe != dead"). -// -// CIFailureLogTail is a pointer because it is only populated when CI is failing; -// it carries ~120 lines and we don't want it on the hot poll path otherwise. -type SCMFacts struct { - Fetched bool - ObservedAt time.Time - PRState domain.PRState - Draft bool - PRNumber int - PRURL string - CISummary CISummary - ReviewDecision ReviewDecision - Mergeability Mergeability - PendingComments []ReviewComment - CIFailureLogTail *string -} - -type CISummary string +// ProbeResult is a single liveness reading. "failed" (the probe errored/timed +// out) and "unknown" (ran but couldn't tell) are kept distinct from dead — both +// route to the detecting quarantine, never to a death conclusion. +type ProbeResult string const ( - CIPending CISummary = "pending" - CIPassing CISummary = "passing" - CIFailing CISummary = "failing" - CINone CISummary = "none" + ProbeAlive ProbeResult = "alive" + ProbeDead ProbeResult = "dead" + ProbeFailed ProbeResult = "failed" + ProbeUnknown ProbeResult = "unknown" ) -type ReviewDecision string - -const ( - ReviewApproved ReviewDecision = "approved" - ReviewChangesRequested ReviewDecision = "changes_requested" - ReviewPending ReviewDecision = "pending" - ReviewNone ReviewDecision = "none" -) - -// Mergeability is the structured "can this merge?" answer. 
CIPassing/Approved -// here overlap CISummary/ReviewDecision by design (different granularity); -// Mergeability is authoritative for the merge gate, the others for display. -type Mergeability struct { - Mergeable bool - CIPassing bool - Approved bool - NoConflicts bool - Blockers []string -} - -// ReviewComment carries IsBot so the decider can route bot review comments -// (bugbot-comments reaction) differently from human ones (changes-requested). -type ReviewComment struct { - Author string - Body string - IsBot bool - URL string -} - -// RuntimeFacts is produced by the reaper and handed to ApplyRuntimeObservation. +// RuntimeFacts is what the reaper reports each probe: is the runtime container +// up, and is the agent process inside it up. type RuntimeFacts struct { - ObservedAt time.Time - RuntimeState RuntimeProbe - ProcessState ProcessProbe + ObservedAt time.Time + Runtime ProbeResult + Process ProbeResult } -// RuntimeProbe / ProcessProbe keep "failed" (the probe call itself errored or -// timed out) distinct from "indeterminate" (the probe ran but couldn't tell) — -// they route differently in the decider. -type RuntimeProbe string - -const ( - RuntimeProbeAlive RuntimeProbe = "alive" - RuntimeProbeDead RuntimeProbe = "dead" - RuntimeProbeIndeterminate RuntimeProbe = "indeterminate" - RuntimeProbeFailed RuntimeProbe = "failed" -) - -type ProcessProbe string - -const ( - ProcessProbeAlive ProcessProbe = "alive" - ProcessProbeDead ProcessProbe = "dead" - ProcessProbeIndeterminate ProcessProbe = "indeterminate" - ProcessProbeFailed ProcessProbe = "failed" -) - -// ActivitySignal is pushed by agent hooks / the FS watcher. State is the -// confidence wrapper (so unavailable/probe_failure != idleness); Activity is -// the actual classification. +// ActivitySignal is pushed by the agent hooks. Only a Valid signal is +// authoritative; a stale/absent one is ignored rather than read as idleness. 
type ActivitySignal struct { - State SignalConfidence - Activity domain.ActivityState + Valid bool + State domain.ActivityState Timestamp time.Time Source domain.ActivitySource } -type SignalConfidence string - -const ( - SignalValid SignalConfidence = "valid" - SignalStale SignalConfidence = "stale" - SignalNull SignalConfidence = "null" - SignalUnavailable SignalConfidence = "unavailable" - SignalProbeFailure SignalConfidence = "probe_failure" -) +// PRObservation is what the SCM poller reports for one PR. Fetched is the +// failed-fetch guard: when false the rest is meaningless and the engine must not +// read it as "PR closed". Checks/Comments are the current full sets (the engine +// records the checks and replaces the comment set). +type PRObservation struct { + Fetched bool + URL string + Number int + Draft bool + Merged bool + Closed bool + CI domain.CIState + Review domain.ReviewDecision + Mergeability domain.Mergeability + Checks []PRCheckRow + Comments []PRComment +} -// SpawnOutcome is what the Session Manager reports to the LCM after a spawn. -// RuntimeHandle is the same structured handle the Runtime port returns, so no -// ad-hoc string encoding is needed for later Destroy/SendMessage calls. -// -// Prompt is the assembled launch prompt persisted as metadata so Restore can -// fall back to a fresh launch (Agent.GetLaunchCommand) when the agent's native -// session id was never captured — without it Restore would have nothing to -// resume and nothing to re-seed a fresh run with. +// SpawnOutcome is what the Session Manager reports once a spawn is live: the +// handles needed for later teardown/restore. type SpawnOutcome struct { Branch string WorkspacePath string @@ -136,17 +66,41 @@ type SpawnOutcome struct { Prompt string } -// KillReason is what the Session Manager reports to the LCM when a kill is -// requested. Kind drives whether the terminal state is killed/cleanup/errored. 
-type KillReason struct { - Kind LifecycleKillReason - Detail string +// ---- store row DTOs (shared by the PRWriter port and its sqlite adapter) ---- + +// PRRow is the scalar PR facts row. +type PRRow struct { + URL string + SessionID string + Number int + Draft bool + Merged bool + Closed bool + CI domain.CIState + Review domain.ReviewDecision + Mergeability domain.Mergeability + UpdatedAt time.Time } -type LifecycleKillReason string +// PRCheckRow is one CI check run (one row per check name per commit). +type PRCheckRow struct { + PRURL string + Name string + CommitHash string + Status string + URL string + LogTail string + CreatedAt time.Time +} -const ( - KillManual LifecycleKillReason = "manual" - KillCleanup LifecycleKillReason = "cleanup" - KillError LifecycleKillReason = "error" -) +// PRComment is one review comment. Review feedback is injected into the agent +// regardless of author, so there is no bot/human distinction. +type PRComment struct { + ID string + Author string + File string + Line int + Body string + Resolved bool + CreatedAt time.Time +} diff --git a/backend/internal/ports/inbound.go b/backend/internal/ports/inbound.go index 58ec2015..00223ae9 100644 --- a/backend/internal/ports/inbound.go +++ b/backend/internal/ports/inbound.go @@ -7,73 +7,45 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/domain" ) -// LifecycleManager is the inbound contract we implement. Every Apply* method -// runs the same synchronous pipeline: load canonical -> pure decide -> diff -> -// persist (full-row Upsert) -> if the status transitioned, fire reactions. The LCM -// never polls; observers (SCM poller, reaper, activity ingest) call in. -// -// Concurrency: the LCM serialises per session, so concurrent Apply* calls for -// the same session do not race the load/decide/persist read-modify-write. +// LifecycleManager is the inbound contract the engine implements. 
Observers +// (reaper, SCM poller, activity hooks) and the Session Manager call in; the LCM +// is the sole writer of canonical transitions and the only place reactions fire. type LifecycleManager interface { - // Raw-fact entrypoints (each runs decide internally). - ApplySCMObservation(ctx context.Context, id domain.SessionID, f SCMFacts) error ApplyRuntimeObservation(ctx context.Context, id domain.SessionID, f RuntimeFacts) error ApplyActivitySignal(ctx context.Context, id domain.SessionID, s ActivitySignal) error + ApplyPRObservation(ctx context.Context, id domain.SessionID, o PRObservation) error - // Mutation commands/outcomes reported by the Session Manager. - OnSpawnInitiated(ctx context.Context, rec domain.SessionRecord) error + // OnSpawnCompleted marks a session live and records its handles. It works for + // a fresh spawn (not_started -> live) and a restore (terminal -> reopened). OnSpawnCompleted(ctx context.Context, id domain.SessionID, o SpawnOutcome) error - OnKillRequested(ctx context.Context, id domain.SessionID, r KillReason) error + OnKillRequested(ctx context.Context, id domain.SessionID, reason domain.TerminationReason) error - // Reaper heartbeat that drives duration-based escalation (a non-polling - // LCM can't wake itself to fire a "30m elapsed" escalation). + // TickEscalations fires the duration-based escalations the synchronous LCM + // can't wake itself for; the reaper calls it on a timer. TickEscalations(ctx context.Context, now time.Time) error - - // RunningSessions returns a snapshot of every session whose runtime axis is - // alive. The reaper calls it once per tick to decide whom to probe. It is a - // read snapshot — the slice and its elements are safe for the caller to - // iterate without holding any LCM lock — and does not violate the - // single-writer invariant (the reaper never writes; it reports facts back - // through ApplyRuntimeObservation). 
+ // RunningSessions snapshots every non-terminal session for the reaper to probe. RunningSessions(ctx context.Context) ([]domain.SessionRecord, error) } -// SessionManager is the inbound contract called by the API layer and CLI. It -// owns explicit mutations (spawn/kill/restore/cleanup) and never writes -// sessions directly — it routes mutation commands/outcomes to the LCM. +// SessionManager is the inbound contract the API/CLI call for explicit +// mutations. It drives the runtime/agent/workspace plugins and routes canonical +// writes to the LCM. type SessionManager interface { Spawn(ctx context.Context, cfg SpawnConfig) (domain.Session, error) - Kill(ctx context.Context, id domain.SessionID, opts KillOptions) (KillResult, error) + Kill(ctx context.Context, id domain.SessionID, reason domain.TerminationReason) (freed bool, err error) + Restore(ctx context.Context, id domain.SessionID) (domain.Session, error) List(ctx context.Context, project domain.ProjectID) ([]domain.Session, error) Get(ctx context.Context, id domain.SessionID) (domain.Session, error) Send(ctx context.Context, id domain.SessionID, message string) error - Restore(ctx context.Context, id domain.SessionID) (domain.Session, error) - Cleanup(ctx context.Context, project domain.ProjectID) (CleanupResult, error) + Cleanup(ctx context.Context, project domain.ProjectID) ([]domain.SessionID, error) } type SpawnConfig struct { ProjectID domain.ProjectID IssueID domain.IssueID Kind domain.SessionKind + Harness domain.AgentHarness Branch string Prompt string AgentRules string - // OpenTerminal is reserved for a later lane (open a terminal tab on spawn). - // Spawn does NOT honor it yet — setting it has no effect. - OpenTerminal bool -} - -type KillOptions struct { - Reason LifecycleKillReason - Detail string -} - -type KillResult struct { - ID domain.SessionID - WorkspaceFreed bool -} - -type CleanupResult struct { - Cleaned []domain.SessionID - Skipped []domain.SessionID // e.g. 
paths that still held uncommitted work } diff --git a/backend/internal/ports/outbound.go b/backend/internal/ports/outbound.go index ba08d9b9..75a24bf0 100644 --- a/backend/internal/ports/outbound.go +++ b/backend/internal/ports/outbound.go @@ -6,86 +6,65 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/domain" ) -// LifecycleStore is Tom's persistence adapter for session records. -// -// Writer contract: the Lifecycle Manager (LCM) is the sole logical writer of -// sessions. Controllers, the Session Manager, observers, and other goroutines -// must route mutations to the LCM; no other goroutine writes sessions directly. -// The LCM serializes mutations and calls Upsert with the full SessionRecord and -// the classified event_type. The storage layer owns Revision++ and performs the -// full-row insert-or-update; the older sparse merge-patch model is gone. -// -// List/Get return persistence records (no derived status); the Session Manager -// hydrates them into domain.Session by attaching DeriveLegacyStatus on read. -type LifecycleStore interface { - // Upsert inserts or replaces the full session row and bumps Revision inside - // the storage layer. Only the LCM may call it. - Upsert(ctx context.Context, rec domain.SessionRecord, eventType EventType) error - Load(ctx context.Context, id domain.SessionID) (domain.CanonicalSessionLifecycle, bool, error) - List(ctx context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) - GetMetadata(ctx context.Context, id domain.SessionID) (map[string]string, error) - PatchMetadata(ctx context.Context, id domain.SessionID, kv map[string]string) error - - // Get returns a single full record (with identity) by id. Load is - // lifecycle-only, so readers use this to build the read-model and reconstruct - // teardown handles for Kill/Restore on one id. 
- Get(ctx context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) +// SessionStore persists session records and serves the derived read-model's PR +// facts. The Session Manager creates rows; the Lifecycle Manager is the sole +// writer of canonical transitions thereafter. +type SessionStore interface { + CreateSession(ctx context.Context, rec domain.SessionRecord) (domain.SessionRecord, error) + UpdateSession(ctx context.Context, rec domain.SessionRecord) error + GetSession(ctx context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) + ListSessions(ctx context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) + ListAllSessions(ctx context.Context) ([]domain.SessionRecord, error) + // PRFactsForSession returns the PR facts that drive a session's display + // status: the most-recently-updated non-closed PR, else the most recent. + // Zero value (Exists=false) means the session has no PR. + PRFactsForSession(ctx context.Context, id domain.SessionID) (domain.PRFacts, error) } -// EventType is the schema-level event label attached to each Upsert. -type EventType string - -const ( - EventSessionCreated EventType = "session_created" - EventSessionTerminated EventType = "session_terminated" - EventSessionStateChanged EventType = "session_state_changed" - EventSessionPRUpdated EventType = "session_pr_updated" - EventSessionRuntimeUpdated EventType = "session_runtime_updated" - EventSessionAttentionUpdated EventType = "session_attention_updated" - EventSessionActivityUpdated EventType = "session_activity_updated" - EventSessionDisplayUpdated EventType = "session_display_updated" - EventSessionUpdated EventType = "session_updated" -) +// PRWriter records the PR facts a PR observation carries. The pr table's own DB +// triggers emit the CDC; this just writes the rows. 
+type PRWriter interface { + // WritePR persists a full PR observation — scalar facts, check runs, and the + // replacement comment set — in one transaction, so the rows and the CDC + // events they emit are all-or-nothing. + WritePR(ctx context.Context, pr PRRow, checks []PRCheckRow, comments []PRComment) error + // RecentCheckStatuses reads the last `limit` runs of a check (the CI brake). + RecentCheckStatuses(ctx context.Context, prURL, name string, limit int) ([]string, error) +} -// Notifier delivers events to the human (desktop/Slack later). Push, never pull. +// Notifier delivers an event to the human (desktop/Slack later). Push, never poll. type Notifier interface { - Notify(ctx context.Context, event OrchestratorEvent) error + Notify(ctx context.Context, event Event) error +} + +// AgentMessenger injects a message into a running agent (busy-detecting until the +// agent is ready). Used by the auto-nudge reactions. +type AgentMessenger interface { + Send(ctx context.Context, id domain.SessionID, message string) error } -type EventPriority string +type Priority string const ( - PriorityUrgent EventPriority = "urgent" - PriorityAction EventPriority = "action" - PriorityWarning EventPriority = "warning" - PriorityInfo EventPriority = "info" + PriorityUrgent Priority = "urgent" + PriorityAction Priority = "action" + PriorityInfo Priority = "info" ) -type OrchestratorEvent struct { +// Event is a human-facing notification produced by a reaction. +type Event struct { Type string - Priority EventPriority + Priority Priority SessionID domain.SessionID ProjectID domain.ProjectID Message string - Data map[string]any -} - -// AgentMessenger injects a message into a running agent. The implementation -// busy-detects (waits for the agent to be idle/ready) and verifies delivery, -// which is why activity-detection accuracy matters. 
-type AgentMessenger interface { - Send(ctx context.Context, id domain.SessionID, message string) error } -// The runtime/agent/workspace plugin ports are co-owned with the coding-agents -// lane; the method sets below are the minimum the Session Manager spawn/kill -// pipelines call. They will be fleshed out alongside the tmux/claude-code impls. +// ---- runtime / agent / workspace plugin ports (used by the Session Manager) ---- type Runtime interface { Create(ctx context.Context, cfg RuntimeConfig) (RuntimeHandle, error) Destroy(ctx context.Context, handle RuntimeHandle) error - SendMessage(ctx context.Context, handle RuntimeHandle, message string) error - GetOutput(ctx context.Context, handle RuntimeHandle, lines int) (string, error) IsAlive(ctx context.Context, handle RuntimeHandle) (bool, error) } @@ -104,10 +83,6 @@ type RuntimeHandle struct { type Agent interface { GetLaunchCommand(cfg AgentConfig) string GetEnvironment(cfg AgentConfig) map[string]string - // ProbeProcess returns the agent process liveness classification - // (alive/dead/indeterminate/failed) — not a boolean and not an activity - // state. Activity classification arrives separately via ActivitySignal. 
- ProbeProcess(ctx context.Context, handle RuntimeHandle) (ProcessProbe, error) GetRestoreCommand(agentSessionID string) string } @@ -120,7 +95,6 @@ type AgentConfig struct { type Workspace interface { Create(ctx context.Context, cfg WorkspaceConfig) (WorkspaceInfo, error) Destroy(ctx context.Context, info WorkspaceInfo) error - List(ctx context.Context, project domain.ProjectID) ([]WorkspaceInfo, error) Restore(ctx context.Context, cfg WorkspaceConfig) (WorkspaceInfo, error) } diff --git a/backend/internal/session/fakes_test.go b/backend/internal/session/fakes_test.go deleted file mode 100644 index 71eaa4af..00000000 --- a/backend/internal/session/fakes_test.go +++ /dev/null @@ -1,383 +0,0 @@ -package session - -import ( - "context" - "fmt" - "sync" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" -) - -// callLog records the cross-fake call order so tests can assert pipeline -// sequencing (e.g. OnKillRequested before Runtime.Destroy before Workspace.Destroy). -type callLog struct { - mu sync.Mutex - calls []string -} - -func (c *callLog) add(s string) { - c.mu.Lock() - defer c.mu.Unlock() - c.calls = append(c.calls, s) -} - -func (c *callLog) snapshot() []string { - c.mu.Lock() - defer c.mu.Unlock() - out := make([]string, len(c.calls)) - copy(out, c.calls) - return out -} - -// indexOf returns the position of the first call equal to name, or -1. 
-func (c *callLog) indexOf(name string) int { - for i, s := range c.snapshot() { - if s == name { - return i - } - } - return -1 -} - -// ---- fakeStore: in-memory LifecycleStore with full-row Upsert + Get ---- - -type fakeStore struct { - mu sync.Mutex - records map[domain.SessionID]*domain.SessionRecord - metadata map[domain.SessionID]map[string]string -} - -var _ ports.LifecycleStore = (*fakeStore)(nil) - -func newFakeStore() *fakeStore { - return &fakeStore{ - records: map[domain.SessionID]*domain.SessionRecord{}, - metadata: map[domain.SessionID]map[string]string{}, - } -} - -func (s *fakeStore) Upsert(_ context.Context, rec domain.SessionRecord, _ ports.EventType) error { - s.mu.Lock() - defer s.mu.Unlock() - if existing, ok := s.records[rec.ID]; ok { - if rec.Lifecycle.Revision != existing.Lifecycle.Revision { - return fmt.Errorf("revision mismatch for %s: have %d, want %d", rec.ID, rec.Lifecycle.Revision, existing.Lifecycle.Revision) - } - rec.Lifecycle.Revision = existing.Lifecycle.Revision + 1 - } else { - if rec.Lifecycle.Revision != 0 { - return fmt.Errorf("revision mismatch for insert %s: have %d, want 0", rec.ID, rec.Lifecycle.Revision) - } - rec.Lifecycle.Revision = 1 - } - if rec.Lifecycle.Version == 0 { - rec.Lifecycle.Version = domain.LifecycleVersion - } - r := rec - s.records[rec.ID] = &r - return nil -} - -func (s *fakeStore) Get(_ context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) { - s.mu.Lock() - defer s.mu.Unlock() - rec, ok := s.records[id] - if !ok { - return domain.SessionRecord{}, false, nil - } - return s.withMetadata(*rec), true, nil -} - -func (s *fakeStore) Load(_ context.Context, id domain.SessionID) (domain.CanonicalSessionLifecycle, bool, error) { - s.mu.Lock() - defer s.mu.Unlock() - rec, ok := s.records[id] - if !ok { - return domain.CanonicalSessionLifecycle{}, false, nil - } - return rec.Lifecycle, true, nil -} - -func (s *fakeStore) List(_ context.Context, project domain.ProjectID) 
([]domain.SessionRecord, error) { - s.mu.Lock() - defer s.mu.Unlock() - var out []domain.SessionRecord - for _, rec := range s.records { - if rec.ProjectID == project { - out = append(out, s.withMetadata(*rec)) - } - } - return out, nil -} - -func (s *fakeStore) GetMetadata(_ context.Context, id domain.SessionID) (map[string]string, error) { - s.mu.Lock() - defer s.mu.Unlock() - return cloneMap(s.metadata[id]), nil -} - -func (s *fakeStore) PatchMetadata(_ context.Context, id domain.SessionID, kv map[string]string) error { - s.mu.Lock() - defer s.mu.Unlock() - if s.metadata[id] == nil { - s.metadata[id] = map[string]string{} - } - for k, v := range kv { - s.metadata[id][k] = v - } - return nil -} - -// withMetadata attaches the separately-stored metadata to a record copy (a real -// store would return them together). Caller holds s.mu. -func (s *fakeStore) withMetadata(rec domain.SessionRecord) domain.SessionRecord { - if md := s.metadata[rec.ID]; len(md) > 0 { - rec.Metadata = cloneMap(md) - } - return rec -} - -// ---- fakeRuntime ---- - -type fakeRuntime struct { - log *callLog - createErr error - alive bool - - created []ports.RuntimeConfig - destroyed []ports.RuntimeHandle - sent []string -} - -var _ ports.Runtime = (*fakeRuntime)(nil) - -func (r *fakeRuntime) Create(_ context.Context, cfg ports.RuntimeConfig) (ports.RuntimeHandle, error) { - r.log.add("Runtime.Create") - if r.createErr != nil { - return ports.RuntimeHandle{}, r.createErr - } - r.created = append(r.created, cfg) - return ports.RuntimeHandle{ID: "rt-" + string(cfg.SessionID), RuntimeName: "tmux"}, nil -} - -func (r *fakeRuntime) Destroy(_ context.Context, h ports.RuntimeHandle) error { - r.log.add("Runtime.Destroy") - r.destroyed = append(r.destroyed, h) - return nil -} - -func (r *fakeRuntime) SendMessage(_ context.Context, _ ports.RuntimeHandle, message string) error { - r.sent = append(r.sent, message) - return nil -} - -func (r *fakeRuntime) GetOutput(_ context.Context, _ 
ports.RuntimeHandle, _ int) (string, error) { - return "", nil -} - -func (r *fakeRuntime) IsAlive(_ context.Context, _ ports.RuntimeHandle) (bool, error) { - return r.alive, nil -} - -// ---- fakeAgent ---- - -type fakeAgent struct { - env map[string]string -} - -var _ ports.Agent = (*fakeAgent)(nil) - -func (a *fakeAgent) GetLaunchCommand(_ ports.AgentConfig) string { return "claude" } - -func (a *fakeAgent) GetEnvironment(_ ports.AgentConfig) map[string]string { return cloneMap(a.env) } - -func (a *fakeAgent) ProbeProcess(_ context.Context, _ ports.RuntimeHandle) (ports.ProcessProbe, error) { - return ports.ProcessProbeAlive, nil -} - -func (a *fakeAgent) GetRestoreCommand(agentSessionID string) string { - return "claude --resume " + agentSessionID -} - -// ---- fakeWorkspace (with worktree-remove refusal mode) ---- - -type fakeWorkspace struct { - log *callLog - createErr error - refuse map[string]bool // path -> still registered after prune (uncommitted work) - created []ports.WorkspaceConfig - destroyed []ports.WorkspaceInfo - restoredID []domain.SessionID -} - -var _ ports.Workspace = (*fakeWorkspace)(nil) - -func (w *fakeWorkspace) Create(_ context.Context, cfg ports.WorkspaceConfig) (ports.WorkspaceInfo, error) { - w.log.add("Workspace.Create") - if w.createErr != nil { - return ports.WorkspaceInfo{}, w.createErr - } - w.created = append(w.created, cfg) - return workspaceFor(cfg), nil -} - -func (w *fakeWorkspace) Destroy(_ context.Context, info ports.WorkspaceInfo) error { - w.log.add("Workspace.Destroy") - if w.refuse[info.Path] { - // Worktree-remove safety: after `git worktree prune` the path is still - // registered, so it may hold the agent's uncommitted work — refuse. 
- return fmt.Errorf("workspace: refusing to rm -rf %s: still registered after prune", info.Path) - } - w.destroyed = append(w.destroyed, info) - return nil -} - -func (w *fakeWorkspace) List(_ context.Context, _ domain.ProjectID) ([]ports.WorkspaceInfo, error) { - return nil, nil -} - -func (w *fakeWorkspace) Restore(_ context.Context, cfg ports.WorkspaceConfig) (ports.WorkspaceInfo, error) { - w.log.add("Workspace.Restore") - w.restoredID = append(w.restoredID, cfg.SessionID) - return workspaceFor(cfg), nil -} - -func workspaceFor(cfg ports.WorkspaceConfig) ports.WorkspaceInfo { - return ports.WorkspaceInfo{ - Path: "/tmp/ws/" + string(cfg.SessionID), - Branch: cfg.Branch, - SessionID: cfg.SessionID, - ProjectID: cfg.ProjectID, - } -} - -// ---- recordingMessenger ---- - -type recordingMessenger struct { - sent []struct { - ID domain.SessionID - Message string - } -} - -var _ ports.AgentMessenger = (*recordingMessenger)(nil) - -func (m *recordingMessenger) Send(_ context.Context, id domain.SessionID, message string) error { - m.sent = append(m.sent, struct { - ID domain.SessionID - Message string - }{id, message}) - return nil -} - -// ---- noopNotifier ---- - -type noopNotifier struct{} - -var _ ports.Notifier = (*noopNotifier)(nil) - -func (noopNotifier) Notify(_ context.Context, _ ports.OrchestratorEvent) error { return nil } - -// ---- recordingLCM: wraps the REAL lifecycle.Manager and logs SM-facing calls ---- - -type recordingLCM struct { - log *callLog - inner ports.LifecycleManager - - // onSpawnErr, when set, makes OnSpawnCompleted fail (without touching the - // inner manager) so tests can exercise the SM's post-spawn failure paths. 
- onSpawnErr error -} - -var _ ports.LifecycleManager = (*recordingLCM)(nil) - -func (l *recordingLCM) OnSpawnInitiated(ctx context.Context, rec domain.SessionRecord) error { - l.log.add("OnSpawnInitiated") - return l.inner.OnSpawnInitiated(ctx, rec) -} - -func (l *recordingLCM) OnSpawnCompleted(ctx context.Context, id domain.SessionID, o ports.SpawnOutcome) error { - l.log.add("OnSpawnCompleted") - if l.onSpawnErr != nil { - return l.onSpawnErr - } - return l.inner.OnSpawnCompleted(ctx, id, o) -} - -func (l *recordingLCM) OnKillRequested(ctx context.Context, id domain.SessionID, r ports.KillReason) error { - l.log.add("OnKillRequested") - return l.inner.OnKillRequested(ctx, id, r) -} - -func (l *recordingLCM) ApplySCMObservation(ctx context.Context, id domain.SessionID, f ports.SCMFacts) error { - return l.inner.ApplySCMObservation(ctx, id, f) -} - -func (l *recordingLCM) ApplyRuntimeObservation(ctx context.Context, id domain.SessionID, f ports.RuntimeFacts) error { - return l.inner.ApplyRuntimeObservation(ctx, id, f) -} - -func (l *recordingLCM) ApplyActivitySignal(ctx context.Context, id domain.SessionID, s ports.ActivitySignal) error { - return l.inner.ApplyActivitySignal(ctx, id, s) -} - -func (l *recordingLCM) TickEscalations(ctx context.Context, now time.Time) error { - return l.inner.TickEscalations(ctx, now) -} - -func (l *recordingLCM) RunningSessions(ctx context.Context) ([]domain.SessionRecord, error) { - return l.inner.RunningSessions(ctx) -} - -// ---- harness: wires the SM against the fakes + the real LCM ---- - -type harness struct { - sm *Manager - store *fakeStore - runtime *fakeRuntime - agent *fakeAgent - workspace *fakeWorkspace - messenger *recordingMessenger - lcm *recordingLCM - log *callLog -} - -var fixedTime = time.Date(2026, 5, 27, 12, 0, 0, 0, time.UTC) - -func newHarness(id domain.SessionID) *harness { - log := &callLog{} - store := newFakeStore() - rt := &fakeRuntime{log: log, alive: true} - ag := &fakeAgent{env: 
map[string]string{"BASE": "1"}} - ws := &fakeWorkspace{log: log, refuse: map[string]bool{}} - msg := &recordingMessenger{} - - lcm := &recordingLCM{log: log, inner: lifecycle.New(store, noopNotifier{}, msg)} - - sm := New(Deps{ - Runtime: rt, - Agent: ag, - Workspace: ws, - Store: store, - Messenger: msg, - Lifecycle: lcm, - Clock: func() time.Time { return fixedTime }, - NewID: func(ports.SpawnConfig) domain.SessionID { return id }, - }) - - return &harness{sm: sm, store: store, runtime: rt, agent: ag, workspace: ws, messenger: msg, lcm: lcm, log: log} -} - -func cloneMap(in map[string]string) map[string]string { - if in == nil { - return nil - } - out := make(map[string]string, len(in)) - for k, v := range in { - out[k] = v - } - return out -} diff --git a/backend/internal/session/manager.go b/backend/internal/session/manager.go index e764f6a3..d7350f5f 100644 --- a/backend/internal/session/manager.go +++ b/backend/internal/session/manager.go @@ -1,76 +1,53 @@ -// Package session implements ports.SessionManager: the explicit-mutation half -// of the lane. The SM is impure plumbing — it drives the Runtime/Agent/Workspace -// plugins to create and tear down sessions, and routes mutation commands and -// outcomes to the LCM (OnSpawnInitiated / OnSpawnCompleted / OnKillRequested). -// -// It NEVER writes sessions directly: observed transitions and explicit -// canonical mutations are the LCM's job under the Writer contract. The SM is the -// single producer of the derived display status, attached on read in List/Get -// and never persisted. +// Package session implements ports.SessionManager: the explicit-mutation half of +// the lane. It drives the runtime/agent/workspace plugins to create and tear +// down sessions, routes canonical writes to the LCM, and is the single producer +// of the derived display status (attached on read in List/Get). 
package session import ( "context" - "crypto/rand" - "encoding/hex" "errors" "fmt" - "strconv" "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) -// ErrNotFound is returned by Get/Restore when no record exists for the id. -var ErrNotFound = errors.New("session: not found") - -// ErrNotRestorable is returned by Restore when the session is not torn down. -// Restoring a live session would spin up a second runtime/workspace for the same -// id, duplicating the agent and risking data loss. -var ErrNotRestorable = errors.New("session: not restorable (not terminal)") - -// ErrIncompleteTeardownMetadata is returned when a record's teardown handles are -// missing (empty workspace path or runtime handle), so calling a real adapter's -// Destroy could act on empty args — an unsafe delete. The teardown is skipped. -var ErrIncompleteTeardownMetadata = errors.New("session: incomplete teardown metadata") +var ( + ErrNotFound = errors.New("session: not found") + ErrNotRestorable = errors.New("session: not restorable (not terminal)") + ErrIncompleteHandle = errors.New("session: incomplete teardown handle") +) -// Env vars a spawned process reads to learn who it is (distillation §5.4). +// Env vars a spawned process reads to learn who it is. const ( EnvSessionID = "AO_SESSION_ID" EnvProjectID = "AO_PROJECT_ID" EnvIssueID = "AO_ISSUE_ID" ) -// Manager implements ports.SessionManager against the outbound ports. Every -// dependency is an interface so the SM runs entirely against fakes in tests. +// Manager implements ports.SessionManager over the outbound ports. 
type Manager struct { runtime ports.Runtime agent ports.Agent workspace ports.Workspace - store ports.LifecycleStore + store ports.SessionStore messenger ports.AgentMessenger lcm ports.LifecycleManager - - clock func() time.Time - newID func(ports.SpawnConfig) domain.SessionID + clock func() time.Time } var _ ports.SessionManager = (*Manager)(nil) -// Deps groups the SM's collaborators. Clock and NewID are optional (defaulted) -// so production wiring only supplies the ports. type Deps struct { Runtime ports.Runtime Agent ports.Agent Workspace ports.Workspace - Store ports.LifecycleStore + Store ports.SessionStore Messenger ports.AgentMessenger Lifecycle ports.LifecycleManager - - Clock func() time.Time - NewID func(ports.SpawnConfig) domain.SessionID + Clock func() time.Time } func New(d Deps) *Manager { @@ -82,38 +59,27 @@ func New(d Deps) *Manager { messenger: d.Messenger, lcm: d.Lifecycle, clock: d.Clock, - newID: d.NewID, } if m.clock == nil { m.clock = time.Now } - if m.newID == nil { - m.newID = defaultNewID - } return m } -// ---- Spawn ---- - -// Spawn runs the create pipeline in spec order: workspace -> runtime -> route -// seed command to the LCM -> report completion to the LCM. The record is seeded LATE (after the runtime is up), so a -// failure before the seed leaves no record for Cleanup to reclaim — hence each -// step eagerly rolls back the steps that already succeeded. +// Spawn creates the session row (which assigns the "{project}-{n}" id), then the +// workspace and runtime, then reports completion to the LCM. A failure after the +// row exists routes it to a terminal errored state and rolls back what was built. 
func (m *Manager) Spawn(ctx context.Context, cfg ports.SpawnConfig) (domain.Session, error) { - id := m.newID(cfg) - if _, ok, err := m.store.Get(ctx, id); err != nil { - return domain.Session{}, fmt.Errorf("spawn %s: check existing: %w", id, err) - } else if ok { - return domain.Session{}, fmt.Errorf("spawn %s: already exists", id) + rec, err := m.store.CreateSession(ctx, seedRecord(cfg, m.clock())) + if err != nil { + return domain.Session{}, fmt.Errorf("spawn: create: %w", err) } + id := rec.ID - ws, err := m.workspace.Create(ctx, ports.WorkspaceConfig{ - ProjectID: cfg.ProjectID, - SessionID: id, - Branch: cfg.Branch, - }) + ws, err := m.workspace.Create(ctx, ports.WorkspaceConfig{ProjectID: cfg.ProjectID, SessionID: id, Branch: cfg.Branch}) if err != nil { - return domain.Session{}, fmt.Errorf("spawn %s: workspace create: %w", id, err) + m.markErrored(ctx, id) + return domain.Session{}, fmt.Errorf("spawn %s: workspace: %w", id, err) } agentCfg := ports.AgentConfig{SessionID: id, WorkspacePath: ws.Path, Prompt: buildPrompt(cfg)} @@ -124,121 +90,127 @@ func (m *Manager) Spawn(ctx context.Context, cfg ports.SpawnConfig) (domain.Sess Env: spawnEnv(m.agent.GetEnvironment(agentCfg), id, cfg.ProjectID, cfg.IssueID), }) if err != nil { - m.rollbackWorkspace(ctx, ws) // nothing seeded yet - return domain.Session{}, fmt.Errorf("spawn %s: runtime create: %w", id, err) - } - - if err := m.lcm.OnSpawnInitiated(ctx, seedRecord(id, cfg, m.clock())); err != nil { - m.rollbackRuntime(ctx, handle) - m.rollbackWorkspace(ctx, ws) - return domain.Session{}, fmt.Errorf("spawn %s: on spawn initiated: %w", id, err) + _ = m.workspace.Destroy(ctx, ws) + m.markErrored(ctx, id) + return domain.Session{}, fmt.Errorf("spawn %s: runtime: %w", id, err) } - // Prompt is persisted via OnSpawnCompleted -> spawnMetadata so a later Restore - // can fall back to a fresh launch if the agent's native session id was never - // captured (the capture path is a separate hook that may never have run). 
outcome := ports.SpawnOutcome{Branch: ws.Branch, WorkspacePath: ws.Path, RuntimeHandle: handle, Prompt: agentCfg.Prompt} if err := m.lcm.OnSpawnCompleted(ctx, id, outcome); err != nil { - // The record is seeded but the runtime/workspace are about to be torn - // down. The store has no delete, so route the orphan to a terminal - // errored state (best effort) rather than strand a phantom "spawning". - _ = m.lcm.OnKillRequested(ctx, id, ports.KillReason{Kind: ports.KillError, Detail: "spawn completion failed"}) - m.rollbackRuntime(ctx, handle) - m.rollbackWorkspace(ctx, ws) - return domain.Session{}, fmt.Errorf("spawn %s: on spawn completed: %w", id, err) + _ = m.runtime.Destroy(ctx, handle) + _ = m.workspace.Destroy(ctx, ws) + m.markErrored(ctx, id) + return domain.Session{}, fmt.Errorf("spawn %s: completed: %w", id, err) } - return m.Get(ctx, id) } -// rollback* are best-effort: the caller already has the originating failure, and -// there is no logger at this layer, so a secondary teardown error is dropped -// rather than masking the real cause. -func (m *Manager) rollbackWorkspace(ctx context.Context, ws ports.WorkspaceInfo) { - _ = m.workspace.Destroy(ctx, ws) -} - -func (m *Manager) rollbackRuntime(ctx context.Context, h ports.RuntimeHandle) { - _ = m.runtime.Destroy(ctx, h) +// markErrored best-effort parks an orphaned spawn in a terminal errored state +// (the store has no delete; a phantom "spawning" row is worse than a terminal one). +func (m *Manager) markErrored(ctx context.Context, id domain.SessionID) { + _ = m.lcm.OnKillRequested(ctx, id, domain.TermErrorInProcess) } -// ---- Kill ---- - -// Kill records terminal intent with the LCM FIRST, then tears down the runtime -// and workspace. There is no separate Agent stop: the agent runs inside the -// runtime, so Runtime.Destroy stops it. 
The workspace teardown honors the -// worktree-remove safety — a refusal (path still registered after prune, so it -// may hold uncommitted work) surfaces as an error with WorkspaceFreed=false and -// is never forced. -func (m *Manager) Kill(ctx context.Context, id domain.SessionID, opts ports.KillOptions) (ports.KillResult, error) { - rec, ok, err := m.store.Get(ctx, id) +// Kill records terminal intent with the LCM, then tears down the runtime and +// workspace. A workspace teardown refused by the worktree-remove safety +// (uncommitted work) surfaces as an error with freed=false and is never forced. +func (m *Manager) Kill(ctx context.Context, id domain.SessionID, reason domain.TerminationReason) (bool, error) { + rec, ok, err := m.store.GetSession(ctx, id) if err != nil { - return ports.KillResult{ID: id}, fmt.Errorf("kill %s: %w", id, err) + return false, fmt.Errorf("kill %s: %w", id, err) } if !ok { - // Already gone: benign race, mirrors LCM.OnKillRequested's no-op. - return ports.KillResult{ID: id}, nil + return false, nil // already gone: benign race } - meta, err := m.store.GetMetadata(ctx, id) - if err != nil { - return ports.KillResult{ID: id}, fmt.Errorf("kill %s: metadata: %w", id, err) + handle := runtimeHandle(rec.Metadata) + ws := workspaceInfo(rec) + if handle.ID == "" || ws.Path == "" { + return false, fmt.Errorf("kill %s: %w", id, ErrIncompleteHandle) + } + if err := m.lcm.OnKillRequested(ctx, id, reason); err != nil { + return false, fmt.Errorf("kill %s: %w", id, err) + } + if err := m.runtime.Destroy(ctx, handle); err != nil { + return false, fmt.Errorf("kill %s: runtime: %w", id, err) + } + if err := m.workspace.Destroy(ctx, ws); err != nil { + return false, fmt.Errorf("kill %s: workspace: %w", id, err) } + return true, nil +} - // Validate the teardown handles BEFORE recording intent or touching an - // adapter: a corrupted/partially-seeded record with empty handles must never - // reach Destroy (empty path / handle could be an unsafe 
delete). - rtHandle := runtimeHandle(meta) - wsInfo := workspaceInfo(rec, meta) - if !validRuntimeHandle(rtHandle) { - return ports.KillResult{ID: id}, fmt.Errorf("kill %s: %w: runtime handle", id, ErrIncompleteTeardownMetadata) +// Restore relaunches a torn-down session in its workspace. The fallible I/O runs +// before any canonical write, so a failure never resurrects the row or destroys +// the worktree (it may hold the agent's prior work). +func (m *Manager) Restore(ctx context.Context, id domain.SessionID) (domain.Session, error) { + rec, ok, err := m.store.GetSession(ctx, id) + if err != nil { + return domain.Session{}, fmt.Errorf("restore %s: %w", id, err) } - if !validWorkspaceInfo(wsInfo) { - return ports.KillResult{ID: id}, fmt.Errorf("kill %s: %w: workspace path", id, ErrIncompleteTeardownMetadata) + if !ok { + return domain.Session{}, fmt.Errorf("restore %s: %w", id, ErrNotFound) + } + if !isTerminal(rec.Lifecycle.Session.State) { + return domain.Session{}, fmt.Errorf("restore %s: %w", id, ErrNotRestorable) + } + meta := rec.Metadata + if meta.AgentSessionID == "" && meta.Prompt == "" { + return domain.Session{}, fmt.Errorf("restore %s: nothing to resume from", id) } - if err := m.lcm.OnKillRequested(ctx, id, ports.KillReason{Kind: opts.Reason, Detail: opts.Detail}); err != nil { - return ports.KillResult{ID: id}, fmt.Errorf("kill %s: on kill requested: %w", id, err) + ws, err := m.workspace.Restore(ctx, ports.WorkspaceConfig{ProjectID: rec.ProjectID, SessionID: id, Branch: meta.Branch}) + if err != nil { + return domain.Session{}, fmt.Errorf("restore %s: workspace: %w", id, err) + } + agentCfg := ports.AgentConfig{SessionID: id, WorkspacePath: ws.Path, Prompt: meta.Prompt} + launch := m.agent.GetRestoreCommand(meta.AgentSessionID) + if meta.AgentSessionID == "" { + launch = m.agent.GetLaunchCommand(agentCfg) } - if err := m.runtime.Destroy(ctx, rtHandle); err != nil { - return ports.KillResult{ID: id}, fmt.Errorf("kill %s: runtime destroy: %w", id, 
err) + handle, err := m.runtime.Create(ctx, ports.RuntimeConfig{ + SessionID: id, + WorkspacePath: ws.Path, + LaunchCommand: launch, + Env: spawnEnv(m.agent.GetEnvironment(agentCfg), id, rec.ProjectID, rec.IssueID), + }) + if err != nil { + return domain.Session{}, fmt.Errorf("restore %s: runtime: %w", id, err) } - if err := m.workspace.Destroy(ctx, wsInfo); err != nil { - return ports.KillResult{ID: id, WorkspaceFreed: false}, fmt.Errorf("kill %s: workspace destroy: %w", id, err) + outcome := ports.SpawnOutcome{Branch: ws.Branch, WorkspacePath: ws.Path, RuntimeHandle: handle, AgentSessionID: meta.AgentSessionID, Prompt: meta.Prompt} + if err := m.lcm.OnSpawnCompleted(ctx, id, outcome); err != nil { + _ = m.runtime.Destroy(ctx, handle) + return domain.Session{}, fmt.Errorf("restore %s: completed: %w", id, err) } - return ports.KillResult{ID: id, WorkspaceFreed: true}, nil + return m.Get(ctx, id) } -// ---- read-model ---- - -// List builds the read-model for a project: stored records with the display -// status derived on read. The SM is the single producer of that status. 
func (m *Manager) List(ctx context.Context, project domain.ProjectID) ([]domain.Session, error) { - recs, err := m.store.List(ctx, project) + recs, err := m.store.ListSessions(ctx, project) if err != nil { return nil, fmt.Errorf("list %s: %w", project, err) } out := make([]domain.Session, 0, len(recs)) for _, rec := range recs { - out = append(out, toSession(rec)) + s, err := m.toSession(ctx, rec) + if err != nil { + return nil, err + } + out = append(out, s) } return out, nil } func (m *Manager) Get(ctx context.Context, id domain.SessionID) (domain.Session, error) { - rec, ok, err := m.store.Get(ctx, id) + rec, ok, err := m.store.GetSession(ctx, id) if err != nil { return domain.Session{}, fmt.Errorf("get %s: %w", id, err) } if !ok { return domain.Session{}, fmt.Errorf("get %s: %w", id, ErrNotFound) } - return toSession(rec), nil + return m.toSession(ctx, rec) } -// ---- Send ---- - -// Send routes a message to the running agent through the AgentMessenger, which -// busy-detects and verifies delivery. func (m *Manager) Send(ctx context.Context, id domain.SessionID, message string) error { if err := m.messenger.Send(ctx, id, message); err != nil { return fmt.Errorf("send %s: %w", id, err) @@ -246,156 +218,64 @@ func (m *Manager) Send(ctx context.Context, id domain.SessionID, message string) return nil } -// ---- Restore ---- - -// Restore relaunches a previously torn-down session in its workspace. The -// fallible I/O (workspace restore + runtime create) runs first so a failure -// touches no canonical state and never destroys the worktree (it may hold the -// agent's prior work). Only once the runtime is up do we reopen the lifecycle: -// resetting a terminal session is an explicit mutation routed to the LCM (the -// LCM's observe path would never resurrect a terminal session), and the PR axis -// is cleared. OnSpawnCompleted then flips the runtime to alive. 
-func (m *Manager) Restore(ctx context.Context, id domain.SessionID) (domain.Session, error) { - rec, ok, err := m.store.Get(ctx, id) - if err != nil { - return domain.Session{}, fmt.Errorf("restore %s: %w", id, err) - } - if !ok { - return domain.Session{}, fmt.Errorf("restore %s: %w", id, ErrNotFound) - } - // Only a torn-down session may be restored. Reopening a live one would spawn a - // duplicate runtime/workspace for the same id and reset its lifecycle. - if !isTerminalSession(rec.Lifecycle.Session.State) { - return domain.Session{}, fmt.Errorf("restore %s: %w", id, ErrNotRestorable) - } - meta, err := m.store.GetMetadata(ctx, id) - if err != nil { - return domain.Session{}, fmt.Errorf("restore %s: metadata: %w", id, err) - } - - // Resume is only possible with the agent's captured session id; without it we - // fall back to a fresh launch using the seeded prompt persisted at spawn time - // (the agent's id-capture path is a separate hook that may never have run, so - // "no id" is the common case rather than an error). If neither is available - // there is nothing to relaunch from — fail early, before any I/O. 
- agentSessionID := meta[lifecycle.MetaAgentSessionID] - seededPrompt := meta[lifecycle.MetaPrompt] - if agentSessionID == "" && seededPrompt == "" { - return domain.Session{}, fmt.Errorf("restore %s: no agent session id or seeded prompt (cannot resume or relaunch)", id) - } - - ws, err := m.workspace.Restore(ctx, ports.WorkspaceConfig{ - ProjectID: rec.ProjectID, - SessionID: id, - Branch: meta[lifecycle.MetaBranch], - }) - if err != nil { - return domain.Session{}, fmt.Errorf("restore %s: workspace restore: %w", id, err) - } - - agentCfg := ports.AgentConfig{SessionID: id, WorkspacePath: ws.Path, Prompt: seededPrompt} - launchCommand := m.agent.GetRestoreCommand(agentSessionID) - if agentSessionID == "" { - launchCommand = m.agent.GetLaunchCommand(agentCfg) - } - handle, err := m.runtime.Create(ctx, ports.RuntimeConfig{ - SessionID: id, - WorkspacePath: ws.Path, - LaunchCommand: launchCommand, - Env: spawnEnv(m.agent.GetEnvironment(agentCfg), id, rec.ProjectID, rec.IssueID), - }) - if err != nil { - return domain.Session{}, fmt.Errorf("restore %s: runtime create: %w", id, err) - } - - // Past this point the runtime is live: a failure must tear it back down (but - // never the workspace, which holds the agent's prior work) so we don't strand - // a process while parking the session in a terminal lifecycle. 
- reopen := rec - reopen.Lifecycle.Session = domain.SessionSubstate{State: domain.SessionNotStarted, Reason: domain.ReasonSpawnRequested} - reopen.Lifecycle.PR = domain.PRSubstate{State: domain.PRNone, Reason: domain.PRReasonClearedOnRestore} - reopen.Lifecycle.Runtime = domain.RuntimeSubstate{State: domain.RuntimeUnknown, Reason: domain.RuntimeReasonSpawnIncomplete} - reopen.Lifecycle.Detecting = nil - if err := m.lcm.OnSpawnInitiated(ctx, reopen); err != nil { - m.rollbackRuntime(ctx, handle) - return domain.Session{}, fmt.Errorf("restore %s: on spawn initiated: %w", id, err) - } - - outcome := ports.SpawnOutcome{ - Branch: ws.Branch, - WorkspacePath: ws.Path, - RuntimeHandle: handle, - AgentSessionID: agentSessionID, - Prompt: seededPrompt, - } - if err := m.lcm.OnSpawnCompleted(ctx, id, outcome); err != nil { - m.rollbackRuntime(ctx, handle) - // Re-upsert the original record to undo the reopen; the store will - // assign the next revision. - if revertErr := m.lcm.OnSpawnInitiated(ctx, rec); revertErr != nil { - return domain.Session{}, fmt.Errorf("restore %s: revert after spawn completed failure: %w (original error: %v)", id, revertErr, err) - } - if len(rec.Metadata) > 0 { - if revertErr := m.store.PatchMetadata(ctx, id, rec.Metadata); revertErr != nil { - return domain.Session{}, fmt.Errorf("restore %s: revert metadata after spawn completed failure: %w (original error: %v)", id, revertErr, err) - } - } - return domain.Session{}, fmt.Errorf("restore %s: on spawn completed: %w", id, err) - } - return m.Get(ctx, id) -} - -// ---- Cleanup ---- - // Cleanup reclaims the workspaces of terminal sessions in a project. A workspace -// whose teardown is refused by the worktree-remove safety (uncommitted work) is -// skipped, never forced. Runtime teardown is best-effort (a terminal session's -// runtime is usually already gone); the workspace result decides cleaned/skipped. 
-func (m *Manager) Cleanup(ctx context.Context, project domain.ProjectID) (ports.CleanupResult, error) { - recs, err := m.store.List(ctx, project) +// whose teardown is refused (uncommitted work) is skipped, never forced. +func (m *Manager) Cleanup(ctx context.Context, project domain.ProjectID) ([]domain.SessionID, error) { + recs, err := m.store.ListSessions(ctx, project) if err != nil { - return ports.CleanupResult{}, fmt.Errorf("cleanup %s: %w", project, err) + return nil, fmt.Errorf("cleanup %s: %w", project, err) } - var res ports.CleanupResult + var cleaned []domain.SessionID for _, rec := range recs { - if !isTerminalSession(rec.Lifecycle.Session.State) { + if !isTerminal(rec.Lifecycle.Session.State) { continue } - meta, err := m.store.GetMetadata(ctx, rec.ID) - if err != nil { - return res, fmt.Errorf("cleanup %s: metadata %s: %w", project, rec.ID, err) - } - wsInfo := workspaceInfo(rec, meta) - if !validWorkspaceInfo(wsInfo) { - // No workspace path to reclaim — skip rather than hand empty args to a - // real adapter's Destroy (an unsafe delete). 
- res.Skipped = append(res.Skipped, rec.ID) + ws := workspaceInfo(rec) + if ws.Path == "" { continue } - if rtHandle := runtimeHandle(meta); validRuntimeHandle(rtHandle) { - _ = m.runtime.Destroy(ctx, rtHandle) // best effort; usually already gone + if h := runtimeHandle(rec.Metadata); h.ID != "" { + _ = m.runtime.Destroy(ctx, h) // best effort; usually already gone } - if err := m.workspace.Destroy(ctx, wsInfo); err != nil { - res.Skipped = append(res.Skipped, rec.ID) - continue + if err := m.workspace.Destroy(ctx, ws); err != nil { + continue // skipped: uncommitted work } - res.Cleaned = append(res.Cleaned, rec.ID) + cleaned = append(cleaned, rec.ID) } - return res, nil + return cleaned, nil } // ---- helpers ---- -func toSession(rec domain.SessionRecord) domain.Session { - return domain.Session{SessionRecord: rec, Status: domain.DeriveLegacyStatus(rec.Lifecycle)} +func (m *Manager) toSession(ctx context.Context, rec domain.SessionRecord) (domain.Session, error) { + pr, err := m.store.PRFactsForSession(ctx, rec.ID) + if err != nil { + return domain.Session{}, fmt.Errorf("pr facts %s: %w", rec.ID, err) + } + return domain.Session{SessionRecord: rec, Status: domain.DeriveStatus(rec.Lifecycle, pr)}, nil } -func isTerminalSession(s domain.SessionState) bool { +func isTerminal(s domain.SessionState) bool { return s == domain.SessionDone || s == domain.SessionTerminated } -// buildPrompt assembles the spawn prompt from the explicit config only; the full -// 3-layer assembly (base protocol + config-derived + user rules) lands later. 
+func seedRecord(cfg ports.SpawnConfig, now time.Time) domain.SessionRecord { + return domain.SessionRecord{ + ProjectID: cfg.ProjectID, + IssueID: cfg.IssueID, + Kind: cfg.Kind, + CreatedAt: now, + UpdatedAt: now, + Lifecycle: domain.CanonicalSessionLifecycle{ + Version: domain.LifecycleVersion, + Session: domain.SessionSubstate{State: domain.SessionNotStarted}, + Harness: cfg.Harness, + }, + } +} + +// buildPrompt assembles the spawn prompt from the explicit config (the full +// 3-layer assembly lands later). func buildPrompt(cfg ports.SpawnConfig) string { switch { case cfg.AgentRules == "": @@ -407,8 +287,6 @@ func buildPrompt(cfg ports.SpawnConfig) string { } } -// spawnEnv overlays the AO_* identity vars onto the agent's environment without -// mutating the map the agent returned. func spawnEnv(base map[string]string, id domain.SessionID, project domain.ProjectID, issue domain.IssueID) map[string]string { env := make(map[string]string, len(base)+3) for k, v := range base { @@ -420,70 +298,15 @@ func spawnEnv(base map[string]string, id domain.SessionID, project domain.Projec return env } -func seedRecord(id domain.SessionID, cfg ports.SpawnConfig, now time.Time) domain.SessionRecord { - return domain.SessionRecord{ - ID: id, - ProjectID: cfg.ProjectID, - IssueID: cfg.IssueID, - Kind: cfg.Kind, - CreatedAt: now, - UpdatedAt: now, - Lifecycle: domain.CanonicalSessionLifecycle{ - Version: domain.LifecycleVersion, - Session: domain.SessionSubstate{State: domain.SessionNotStarted, Reason: domain.ReasonSpawnRequested}, - Runtime: domain.RuntimeSubstate{State: domain.RuntimeUnknown, Reason: domain.RuntimeReasonSpawnIncomplete}, - PR: domain.PRSubstate{State: domain.PRNone, Reason: domain.PRReasonNotCreated}, - }, - } -} - -// runtimeHandle / workspaceInfo reconstruct teardown handles from the metadata -// the LCM persisted in OnSpawnCompleted (the metadata-key contract is shared -// with the lifecycle package). 
-func runtimeHandle(meta map[string]string) ports.RuntimeHandle { - return ports.RuntimeHandle{ - ID: meta[lifecycle.MetaRuntimeHandleID], - RuntimeName: meta[lifecycle.MetaRuntimeName], - } +func runtimeHandle(meta domain.SessionMetadata) ports.RuntimeHandle { + return ports.RuntimeHandle{ID: meta.RuntimeHandleID, RuntimeName: meta.RuntimeName} } -func workspaceInfo(rec domain.SessionRecord, meta map[string]string) ports.WorkspaceInfo { +func workspaceInfo(rec domain.SessionRecord) ports.WorkspaceInfo { return ports.WorkspaceInfo{ - Path: meta[lifecycle.MetaWorkspacePath], - Branch: meta[lifecycle.MetaBranch], + Path: rec.Metadata.WorkspacePath, + Branch: rec.Metadata.Branch, SessionID: rec.ID, ProjectID: rec.ProjectID, } } - -// validRuntimeHandle reports whether the handle identifies a runtime to destroy. -// An adapter needs the handle id to target the right process; an empty handle -// would be ambiguous, so we refuse to call Destroy with one. -func validRuntimeHandle(h ports.RuntimeHandle) bool { - return h.ID != "" -} - -// validWorkspaceInfo reports whether there is a concrete path to reclaim. An -// empty path handed to a worktree-remove could resolve to an unsafe target. 
-func validWorkspaceInfo(w ports.WorkspaceInfo) bool { - return w.Path != "" -} - -func defaultNewID(cfg ports.SpawnConfig) domain.SessionID { - base := string(cfg.IssueID) - if base == "" { - base = string(cfg.Kind) - } - if base == "" { - base = "session" - } - return domain.SessionID(base + "-" + randHex(4)) -} - -func randHex(n int) string { - b := make([]byte, n) - if _, err := rand.Read(b); err != nil { - return strconv.FormatInt(time.Now().UnixNano(), 16) - } - return hex.EncodeToString(b) -} diff --git a/backend/internal/session/manager_test.go b/backend/internal/session/manager_test.go index 5bb20d07..669e0c25 100644 --- a/backend/internal/session/manager_test.go +++ b/backend/internal/session/manager_test.go @@ -3,644 +3,294 @@ package session import ( "context" "errors" + "fmt" "testing" + "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) -const ( - testProject = domain.ProjectID("proj") - testIssue = domain.IssueID("42") -) - -func spawnCfg() ports.SpawnConfig { - return ports.SpawnConfig{ - ProjectID: testProject, - IssueID: testIssue, - Kind: domain.KindWorker, - Branch: "feat/42", - Prompt: "do the thing", - AgentRules: "be careful", - } -} - -func TestSpawn_HappyPath(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - - sess, err := h.sm.Spawn(ctx, spawnCfg()) - if err != nil { - t.Fatalf("spawn: %v", err) - } - - // Display status is derived (single producer) — a freshly spawned, not_started - // session shows as spawning. - if sess.Status != domain.StatusSpawning { - t.Errorf("status = %q, want %q", sess.Status, domain.StatusSpawning) - } +var ctx = context.Background() - // Record seeded by the LCM with identity + initial lifecycle, then OnSpawnCompleted flipped - // the runtime axis to alive. 
- rec, ok, err := h.store.Get(ctx, "sess-1") - if err != nil || !ok { - t.Fatalf("get seeded record: ok=%v err=%v", ok, err) - } - if rec.ProjectID != testProject || rec.IssueID != testIssue || rec.Kind != domain.KindWorker { - t.Errorf("identity = %+v, want proj/42/worker", rec) - } - if !rec.CreatedAt.Equal(fixedTime) { - t.Errorf("createdAt = %v, want %v", rec.CreatedAt, fixedTime) - } - if got := rec.Lifecycle.Session; got.State != domain.SessionNotStarted || got.Reason != domain.ReasonSpawnRequested { - t.Errorf("session substate = %+v, want not_started/spawn_requested", got) - } - if got := rec.Lifecycle.Runtime; got.State != domain.RuntimeAlive || got.Reason != domain.RuntimeReasonProcessRunning { - t.Errorf("runtime substate = %+v, want alive/process_running", got) - } +// ---- fakes ---- - // Pipeline order: workspace -> runtime -> LCM seed command -> LCM completion. - wantOrder := []string{"Workspace.Create", "Runtime.Create", "OnSpawnInitiated", "OnSpawnCompleted"} - if got := h.log.snapshot(); !equalStrings(got, wantOrder) { - t.Errorf("call order = %v, want %v", got, wantOrder) - } +type fakeStore struct { + sessions map[domain.SessionID]domain.SessionRecord + pr map[domain.SessionID]domain.PRFacts + num int +} - // Identity env wired onto the runtime config, layered over the agent's env. - if len(h.runtime.created) != 1 { - t.Fatalf("runtime.created = %d, want 1", len(h.runtime.created)) - } - env := h.runtime.created[0].Env - for k, want := range map[string]string{ - EnvSessionID: "sess-1", - EnvProjectID: "proj", - EnvIssueID: "42", - "BASE": "1", - } { - if env[k] != want { - t.Errorf("env[%q] = %q, want %q", k, env[k], want) - } - } +func newFakeStore() *fakeStore { + return &fakeStore{sessions: map[domain.SessionID]domain.SessionRecord{}, pr: map[domain.SessionID]domain.PRFacts{}} +} - // Handles persisted to metadata for later teardown/restore. 
The prompt is - // persisted too so a later Restore that finds no captured agent session id - // can still fall back to a fresh launch using the same prompt. - meta, _ := h.store.GetMetadata(ctx, "sess-1") - for k, want := range map[string]string{ - lifecycle.MetaBranch: "feat/42", - lifecycle.MetaWorkspacePath: "/tmp/ws/sess-1", - lifecycle.MetaRuntimeHandleID: "rt-sess-1", - lifecycle.MetaRuntimeName: "tmux", - lifecycle.MetaPrompt: "do the thing\n\nbe careful", - } { - if meta[k] != want { - t.Errorf("meta[%q] = %q, want %q", k, meta[k], want) +func (f *fakeStore) CreateSession(_ context.Context, rec domain.SessionRecord) (domain.SessionRecord, error) { + f.num++ + rec.ID = domain.SessionID(fmt.Sprintf("%s-%d", rec.ProjectID, f.num)) + f.sessions[rec.ID] = rec + return rec, nil +} +func (f *fakeStore) UpdateSession(_ context.Context, rec domain.SessionRecord) error { + f.sessions[rec.ID] = rec + return nil +} +func (f *fakeStore) GetSession(_ context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) { + r, ok := f.sessions[id] + return r, ok, nil +} +func (f *fakeStore) ListSessions(_ context.Context, p domain.ProjectID) ([]domain.SessionRecord, error) { + var out []domain.SessionRecord + for _, r := range f.sessions { + if r.ProjectID == p { + out = append(out, r) } } + return out, nil } - -func TestSpawn_RuntimeCreateFailure_RollsBack(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - h.runtime.createErr = errors.New("boom") - - _, err := h.sm.Spawn(ctx, spawnCfg()) - if err == nil { - t.Fatal("spawn: want error, got nil") - } - - // No record seeded for a spawn that never completed. - if _, ok, _ := h.store.Get(ctx, "sess-1"); ok { - t.Error("record was seeded despite runtime-create failure") - } - // The already-created workspace was rolled back (eager rollback), since a - // late-seeded record means Cleanup could never find this orphan. 
- if len(h.workspace.destroyed) != 1 || h.workspace.destroyed[0].Path != "/tmp/ws/sess-1" { - t.Errorf("workspace.destroyed = %+v, want the created worktree", h.workspace.destroyed) - } - // LCM never told a spawn completed. - if h.log.indexOf("OnSpawnCompleted") != -1 { - t.Error("OnSpawnCompleted should not fire on a failed spawn") +func (f *fakeStore) ListAllSessions(_ context.Context) ([]domain.SessionRecord, error) { + out := make([]domain.SessionRecord, 0, len(f.sessions)) + for _, r := range f.sessions { + out = append(out, r) } + return out, nil } - -func TestSpawn_ExistingSessionIDRejectedBeforeWork(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - if err := h.store.Upsert(ctx, domain.SessionRecord{ - ID: "sess-1", - ProjectID: testProject, - Lifecycle: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.PRNone, ""), - }, ports.EventSessionCreated); err != nil { - t.Fatalf("seed existing row: %v", err) - } - - _, err := h.sm.Spawn(ctx, spawnCfg()) - if err == nil { - t.Fatal("spawn: want error for existing session id, got nil") - } - if len(h.workspace.created) != 0 { - t.Error("workspace should not be created when session id already exists") - } - if len(h.runtime.created) != 0 { - t.Error("runtime should not be created when session id already exists") - } - if h.log.indexOf("OnSpawnInitiated") != -1 || h.log.indexOf("OnSpawnCompleted") != -1 { - t.Error("LCM should not be called when session id already exists") - } +func (f *fakeStore) PRFactsForSession(_ context.Context, id domain.SessionID) (domain.PRFacts, error) { + return f.pr[id], nil } -func TestSpawn_OnSpawnCompletedFailure_RoutesOrphanToErrored(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - h.lcm.onSpawnErr = errors.New("lcm boom") - - _, err := h.sm.Spawn(ctx, spawnCfg()) - if err == nil { - t.Fatal("spawn: want error, got nil") - } - - // Runtime + workspace are torn down on the failure path. 
- if len(h.runtime.destroyed) != 1 { - t.Errorf("runtime.destroyed = %d, want 1", len(h.runtime.destroyed)) - } - if len(h.workspace.destroyed) != 1 { - t.Errorf("workspace.destroyed = %d, want 1", len(h.workspace.destroyed)) - } - // The record was already seeded and the store has no delete, so the orphan is - // routed to a terminal errored state (via OnKillRequested(KillError)) rather - // than stranded forever as "spawning". - rec, ok, _ := h.store.Get(ctx, "sess-1") - if !ok { - t.Fatal("seeded record vanished; expected it parked as errored") - } - if got := rec.Lifecycle.Session; got.State != domain.SessionTerminated || got.Reason != domain.ReasonErrorInProcess { - t.Errorf("session substate = %+v, want terminated/error_in_process", got) - } - if status := domain.DeriveLegacyStatus(rec.Lifecycle); status != domain.StatusErrored { - t.Errorf("status = %q, want errored", status) - } +// fakeLCM is the minimal lifecycle the Session Manager drives: it persists the +// spawn/kill canonical writes into the store so Get reflects them. +type fakeLCM struct { + store *fakeStore + completed int } -func TestKill_OrderingAndTerminalState(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil { - t.Fatalf("spawn: %v", err) - } - - res, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}) - if err != nil { - t.Fatalf("kill: %v", err) - } - if !res.WorkspaceFreed { - t.Error("WorkspaceFreed = false, want true") - } - - // Intent recorded with the LCM BEFORE any teardown, runtime before workspace. - iKill := h.log.indexOf("OnKillRequested") - iRT := h.log.indexOf("Runtime.Destroy") - iWS := h.log.indexOf("Workspace.Destroy") - if !(iKill >= 0 && iKill < iRT && iRT < iWS) { - t.Errorf("kill order indices: OnKillRequested=%d Runtime.Destroy=%d Workspace.Destroy=%d (want ascending)", iKill, iRT, iWS) - } - - // Terminal canonical written by the LCM; display derives to killed. 
- rec, _, _ := h.store.Get(ctx, "sess-1") - if got := rec.Lifecycle.Session; got.State != domain.SessionTerminated || got.Reason != domain.ReasonManuallyKilled { - t.Errorf("session substate = %+v, want terminated/manually_killed", got) - } - if status := domain.DeriveLegacyStatus(rec.Lifecycle); status != domain.StatusKilled { - t.Errorf("status = %q, want killed", status) - } +func (l *fakeLCM) OnSpawnCompleted(_ context.Context, id domain.SessionID, o ports.SpawnOutcome) error { + l.completed++ + rec := l.store.sessions[id] + rec.Lifecycle.Session.State = domain.SessionNotStarted + rec.Lifecycle.IsAlive = true + rec.Lifecycle.TerminationReason = domain.TermNone + rec.Metadata = domain.SessionMetadata{ + Branch: o.Branch, WorkspacePath: o.WorkspacePath, + RuntimeHandleID: o.RuntimeHandle.ID, RuntimeName: o.RuntimeHandle.RuntimeName, + AgentSessionID: o.AgentSessionID, Prompt: o.Prompt, + } + l.store.sessions[id] = rec + return nil } - -func TestKill_WorktreeRemoveRefusalSurfaced(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil { - t.Fatalf("spawn: %v", err) - } - // The worktree path is still registered after prune (uncommitted work). - h.workspace.refuse["/tmp/ws/sess-1"] = true - - res, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}) - if err == nil { - t.Fatal("kill: want refusal error, got nil") - } - if res.WorkspaceFreed { - t.Error("WorkspaceFreed = true, want false on refusal") - } - // The refusal must be honored — the path is never force-deleted. - if len(h.workspace.destroyed) != 0 { - t.Errorf("workspace.destroyed = %+v, want none (refused)", h.workspace.destroyed) - } - // Runtime still torn down and intent still recorded — only the worktree is spared. 
- if h.log.indexOf("Runtime.Destroy") == -1 || h.log.indexOf("OnKillRequested") == -1 { - t.Error("runtime teardown / kill intent should still happen on a workspace refusal") - } +func (l *fakeLCM) OnKillRequested(_ context.Context, id domain.SessionID, reason domain.TerminationReason) error { + rec := l.store.sessions[id] + rec.Lifecycle.Session.State = domain.SessionTerminated + rec.Lifecycle.TerminationReason = reason + rec.Lifecycle.IsAlive = false + l.store.sessions[id] = rec + return nil } - -func TestKill_IncompleteMetadata_RefusesTeardown(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - // A record with no teardown metadata (empty runtime handle + workspace path), - // e.g. a partially-seeded or corrupted record. - if err := h.store.Upsert(ctx, domain.SessionRecord{ - ID: "sess-1", ProjectID: testProject, - Lifecycle: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.PRNone, ""), - }, ports.EventSessionCreated); err != nil { - t.Fatalf("upsert: %v", err) - } - - if _, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}); !errors.Is(err, ErrIncompleteTeardownMetadata) { - t.Fatalf("kill: err = %v, want ErrIncompleteTeardownMetadata", err) - } - // Nothing destroyed with empty args, and no intent recorded. 
- if len(h.runtime.destroyed) != 0 || len(h.workspace.destroyed) != 0 { - t.Errorf("teardown ran despite incomplete metadata: rt=%v ws=%v", h.runtime.destroyed, h.workspace.destroyed) - } - if h.log.indexOf("OnKillRequested") != -1 { - t.Error("kill intent recorded despite incomplete metadata") - } +func (l *fakeLCM) ApplyRuntimeObservation(context.Context, domain.SessionID, ports.RuntimeFacts) error { + return nil +} +func (l *fakeLCM) ApplyActivitySignal(context.Context, domain.SessionID, ports.ActivitySignal) error { + return nil +} +func (l *fakeLCM) ApplyPRObservation(context.Context, domain.SessionID, ports.PRObservation) error { + return nil +} +func (l *fakeLCM) TickEscalations(context.Context, time.Time) error { return nil } +func (l *fakeLCM) RunningSessions(context.Context) ([]domain.SessionRecord, error) { + return nil, nil } -func TestCleanup_IncompleteMetadata_Skipped(t *testing.T) { - h := newHarness("unused") - ctx := context.Background() - // Terminal session but no workspace path persisted — must be skipped, never - // handed to Destroy with an empty path. 
- if err := h.store.Upsert(ctx, domain.SessionRecord{ - ID: "orphan-1", ProjectID: testProject, - Lifecycle: lc(domain.SessionTerminated, domain.ReasonManuallyKilled, domain.PRNone, ""), - }, ports.EventSessionCreated); err != nil { - t.Fatalf("upsert: %v", err) - } +type fakeRuntime struct { + createErr error + created, destroyed int +} - res, err := h.sm.Cleanup(ctx, testProject) - if err != nil { - t.Fatalf("cleanup: %v", err) - } - if !equalIDSet(res.Skipped, []domain.SessionID{"orphan-1"}) { - t.Errorf("skipped = %v, want [orphan-1]", res.Skipped) - } - if len(res.Cleaned) != 0 { - t.Errorf("cleaned = %v, want none", res.Cleaned) - } - if len(h.workspace.destroyed) != 0 { - t.Errorf("workspace.destroyed = %v, want none (empty path must not reach Destroy)", h.workspace.destroyed) +func (r *fakeRuntime) Create(context.Context, ports.RuntimeConfig) (ports.RuntimeHandle, error) { + if r.createErr != nil { + return ports.RuntimeHandle{}, r.createErr } + r.created++ + return ports.RuntimeHandle{ID: "h1", RuntimeName: "tmux"}, nil +} +func (r *fakeRuntime) Destroy(context.Context, ports.RuntimeHandle) error { r.destroyed++; return nil } +func (r *fakeRuntime) IsAlive(context.Context, ports.RuntimeHandle) (bool, error) { + return true, nil } -func TestRestore_LiveSession_Rejected(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil { - t.Fatalf("spawn: %v", err) - } - // The session is live (never torn down). Capture an agent id so the only thing - // blocking restore is the non-terminal lifecycle, not missing metadata. 
- if err := h.store.PatchMetadata(ctx, "sess-1", map[string]string{lifecycle.MetaAgentSessionID: "agent-xyz"}); err != nil { - t.Fatalf("patch metadata: %v", err) - } - createdBefore := len(h.runtime.created) - restoresBefore := len(h.workspace.restoredID) +type fakeAgent struct{} - if _, err := h.sm.Restore(ctx, "sess-1"); !errors.Is(err, ErrNotRestorable) { - t.Fatalf("restore: err = %v, want ErrNotRestorable", err) - } - // No second runtime/workspace spun up for the still-live session. - if len(h.runtime.created) != createdBefore { - t.Error("runtime created for a live-session restore") - } - if len(h.workspace.restoredID) != restoresBefore { - t.Error("workspace restored for a live-session restore") - } +func (fakeAgent) GetLaunchCommand(ports.AgentConfig) string { return "launch" } +func (fakeAgent) GetEnvironment(ports.AgentConfig) map[string]string { + return map[string]string{"X": "1"} } +func (fakeAgent) GetRestoreCommand(id string) string { return "resume " + id } -func TestListAndGet_DeriveStatus(t *testing.T) { - cases := []struct { - name string - lc domain.CanonicalSessionLifecycle - want domain.SessionStatus - }{ - {"not_started", lc(domain.SessionNotStarted, domain.ReasonSpawnRequested, domain.PRNone, ""), domain.StatusSpawning}, - {"working", lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.PRNone, ""), domain.StatusWorking}, - {"idle", lc(domain.SessionIdle, domain.ReasonResearchComplete, domain.PRNone, ""), domain.StatusIdle}, - {"needs_input", lc(domain.SessionNeedsInput, domain.ReasonAwaitingUserInput, domain.PRNone, ""), domain.StatusNeedsInput}, - {"pr_ci_failed", lc(domain.SessionWorking, domain.ReasonFixingCI, domain.PROpen, domain.PRReasonCIFailing), domain.StatusCIFailed}, - {"pr_merged", lc(domain.SessionIdle, domain.ReasonMergedWaitingDecision, domain.PRMerged, domain.PRReasonMerged), domain.StatusMerged}, - {"killed", lc(domain.SessionTerminated, domain.ReasonManuallyKilled, domain.PRNone, ""), domain.StatusKilled}, - } 
+type fakeWorkspace struct { + destroyErr error + destroyed int +} - h := newHarness("unused") - ctx := context.Background() - for _, c := range cases { - if err := h.store.Upsert(ctx, domain.SessionRecord{ID: domain.SessionID(c.name), ProjectID: testProject, Lifecycle: c.lc}, ports.EventSessionCreated); err != nil { - t.Fatalf("upsert %s: %v", c.name, err) - } - } +func (w *fakeWorkspace) Create(_ context.Context, cfg ports.WorkspaceConfig) (ports.WorkspaceInfo, error) { + return ports.WorkspaceInfo{Path: "/ws/" + string(cfg.SessionID), Branch: cfg.Branch, SessionID: cfg.SessionID, ProjectID: cfg.ProjectID}, nil +} +func (w *fakeWorkspace) Destroy(context.Context, ports.WorkspaceInfo) error { + w.destroyed++ + return w.destroyErr +} +func (w *fakeWorkspace) Restore(ctx context.Context, cfg ports.WorkspaceConfig) (ports.WorkspaceInfo, error) { + return w.Create(ctx, cfg) +} - // Get derives per-record. - for _, c := range cases { - got, err := h.sm.Get(ctx, domain.SessionID(c.name)) - if err != nil { - t.Fatalf("get %s: %v", c.name, err) - } - if got.Status != c.want { - t.Errorf("get %s: status = %q, want %q", c.name, got.Status, c.want) - } - } +type fakeMessenger struct{ msgs []string } - // List derives for every record in the project. 
- got, err := h.sm.List(ctx, testProject) - if err != nil { - t.Fatalf("list: %v", err) - } - if len(got) != len(cases) { - t.Fatalf("list len = %d, want %d", len(got), len(cases)) - } - byID := map[domain.SessionID]domain.SessionStatus{} - for _, s := range got { - byID[s.ID] = s.Status - } - for _, c := range cases { - if byID[domain.SessionID(c.name)] != c.want { - t.Errorf("list %s: status = %q, want %q", c.name, byID[domain.SessionID(c.name)], c.want) - } - } +func (m *fakeMessenger) Send(_ context.Context, _ domain.SessionID, msg string) error { + m.msgs = append(m.msgs, msg) + return nil } -func TestGet_NotFound(t *testing.T) { - h := newHarness("sess-1") - if _, err := h.sm.Get(context.Background(), "missing"); !errors.Is(err, ErrNotFound) { - t.Errorf("get missing: err = %v, want ErrNotFound", err) - } +func newManager() (*Manager, *fakeStore, *fakeRuntime, *fakeWorkspace) { + st := newFakeStore() + rt := &fakeRuntime{} + ws := &fakeWorkspace{} + m := New(Deps{ + Runtime: rt, Agent: fakeAgent{}, Workspace: ws, + Store: st, Messenger: &fakeMessenger{}, Lifecycle: &fakeLCM{store: st}, + }) + return m, st, rt, ws } -func TestSend_RoutesToMessenger(t *testing.T) { - h := newHarness("sess-1") - if err := h.sm.Send(context.Background(), "sess-1", "hello"); err != nil { - t.Fatalf("send: %v", err) - } - if len(h.messenger.sent) != 1 || h.messenger.sent[0].ID != "sess-1" || h.messenger.sent[0].Message != "hello" { - t.Errorf("messenger.sent = %+v, want one {sess-1, hello}", h.messenger.sent) +func seedTerminal(st *fakeStore, id domain.SessionID, meta domain.SessionMetadata) { + st.sessions[id] = domain.SessionRecord{ + ID: id, ProjectID: "mer", Metadata: meta, + Lifecycle: domain.CanonicalSessionLifecycle{Session: domain.SessionSubstate{State: domain.SessionTerminated}}, } } -func TestRestore_RelaunchesWithResumeCommand(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil { - 
t.Fatalf("spawn: %v", err) - } - if _, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}); err != nil { - t.Fatalf("kill: %v", err) - } - // The agent's resume id is captured in metadata (here set explicitly). - if err := h.store.PatchMetadata(ctx, "sess-1", map[string]string{lifecycle.MetaAgentSessionID: "agent-xyz"}); err != nil { - t.Fatalf("patch metadata: %v", err) - } +// ---- tests ---- - sess, err := h.sm.Restore(ctx, "sess-1") - if err != nil { - t.Fatalf("restore: %v", err) - } +func TestSpawn_AssignsIDAndGoesLive(t *testing.T) { + m, st, rt, _ := newManager() - // Reopened: terminal session reset to a fresh spawn, PR cleared, runtime alive. - if sess.Status != domain.StatusSpawning { - t.Errorf("status = %q, want spawning", sess.Status) - } - rec, _, _ := h.store.Get(ctx, "sess-1") - if got := rec.Lifecycle.Session; got.State != domain.SessionNotStarted || got.Reason != domain.ReasonSpawnRequested { - t.Errorf("session substate = %+v, want not_started/spawn_requested", got) - } - if got := rec.Lifecycle.PR; got.State != domain.PRNone || got.Reason != domain.PRReasonClearedOnRestore { - t.Errorf("pr substate = %+v, want none/cleared_on_restore", got) + s, err := m.Spawn(ctx, ports.SpawnConfig{ProjectID: "mer", Kind: domain.KindWorker, Prompt: "do it"}) + if err != nil { + t.Fatal(err) } - if rec.Lifecycle.Runtime.State != domain.RuntimeAlive { - t.Errorf("runtime state = %q, want alive", rec.Lifecycle.Runtime.State) + if s.ID != "mer-1" { + t.Fatalf("store should assign mer-1, got %q", s.ID) } - - // Relaunched via the agent's resume command (created[0] is the original spawn). 
- if len(h.runtime.created) != 2 { - t.Fatalf("runtime.created = %d, want 2 (spawn + restore)", len(h.runtime.created)) + if s.Status != domain.StatusSpawning { + t.Fatalf("fresh session displays spawning, got %q", s.Status) } - if got := h.runtime.created[1].LaunchCommand; got != "claude --resume agent-xyz" { - t.Errorf("restore launch command = %q, want resume", got) + if rt.created != 1 { + t.Fatalf("runtime not created") } - if h.log.indexOf("Workspace.Restore") == -1 { - t.Error("Workspace.Restore was not called") + if st.sessions["mer-1"].Metadata.RuntimeHandleID != "h1" { + t.Fatal("spawn handle not folded into the row") } } -func TestRestore_NoAgentSessionID_FreshLaunchFallback(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil { - t.Fatalf("spawn: %v", err) - } - if _, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}); err != nil { - t.Fatalf("kill: %v", err) - } - // No agent session id was ever captured (the capture hook is a separate - // path that may never have run), but Spawn persisted the prompt, so Restore - // must fall back to a fresh launch instead of failing. 
- createdBefore := len(h.runtime.created) +func TestSpawn_RollsBackOnRuntimeFailure(t *testing.T) { + m, st, _, ws := newManager() + m.runtime = &fakeRuntime{createErr: errors.New("boom")} - sess, err := h.sm.Restore(ctx, "sess-1") - if err != nil { - t.Fatalf("restore: %v", err) + if _, err := m.Spawn(ctx, ports.SpawnConfig{ProjectID: "mer"}); err == nil { + t.Fatal("expected spawn to fail") } - if sess.Status != domain.StatusSpawning { - t.Errorf("status = %q, want spawning", sess.Status) + if ws.destroyed != 1 { + t.Fatal("workspace should be rolled back") } - if len(h.runtime.created) != createdBefore+1 { - t.Fatalf("runtime.created grew by %d, want 1 (fresh-launch fallback)", len(h.runtime.created)-createdBefore) - } - // Fresh launch uses GetLaunchCommand (returns "claude" in the fake) — not - // the resume command, which would have read "claude --resume ". - if got := h.runtime.created[createdBefore].LaunchCommand; got != "claude" { - t.Errorf("restore launch command = %q, want fresh-launch %q", got, "claude") + if st.sessions["mer-1"].Lifecycle.Session.State != domain.SessionTerminated { + t.Fatal("orphaned spawn should be parked terminal") } } -func TestRestore_NoIDAndNoPrompt_Errors(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - // Seed a terminal record directly without any metadata — no agent session id, - // no prompt. Restore has nothing to resume and nothing to relaunch from, so - // it must fail early without touching workspace/runtime. 
- if err := h.store.Upsert(ctx, domain.SessionRecord{ - ID: "sess-1", ProjectID: testProject, - Lifecycle: lc(domain.SessionTerminated, domain.ReasonManuallyKilled, domain.PRNone, ""), - }, ports.EventSessionCreated); err != nil { - t.Fatalf("upsert: %v", err) - } - beforeRestores := len(h.workspace.restoredID) - beforeCreated := len(h.runtime.created) +func TestKill_TearsDownRuntimeAndWorkspace(t *testing.T) { + m, st, rt, ws := newManager() + st.sessions["mer-1"] = mkLive("mer-1") - if _, err := h.sm.Restore(ctx, "sess-1"); err == nil { - t.Fatal("restore: want error for missing agent session id and prompt, got nil") - } - if len(h.workspace.restoredID) != beforeRestores { - t.Error("workspace was touched despite a doomed restore") - } - if len(h.runtime.created) != beforeCreated { - t.Error("runtime was created despite a doomed restore") + freed, err := m.Kill(ctx, "mer-1", domain.TermManuallyKilled) + if err != nil || !freed { + t.Fatalf("kill should free the workspace: freed=%v err=%v", freed, err) } - // The session stays terminal — a failed restore does not reopen it. 
- rec, _, _ := h.store.Get(ctx, "sess-1") - if rec.Lifecycle.Session.State != domain.SessionTerminated { - t.Errorf("session state = %q, want terminated (unchanged)", rec.Lifecycle.Session.State) + if rt.destroyed != 1 || ws.destroyed != 1 { + t.Fatal("kill should destroy runtime and workspace") } } -func TestRestore_OnSpawnCompletedFailure_RollsBackRuntime(t *testing.T) { - h := newHarness("sess-1") - ctx := context.Background() - if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil { - t.Fatalf("spawn: %v", err) - } - if _, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}); err != nil { - t.Fatalf("kill: %v", err) - } - if err := h.store.PatchMetadata(ctx, "sess-1", map[string]string{lifecycle.MetaAgentSessionID: "agent-xyz"}); err != nil { - t.Fatalf("patch metadata: %v", err) - } - beforeMeta, _ := h.store.GetMetadata(ctx, "sess-1") - - // Fail the post-create LCM call; capture teardown counts just before restore. - h.lcm.onSpawnErr = errors.New("lcm boom") - before, _, _ := h.store.Get(ctx, "sess-1") - destroyedBefore := len(h.runtime.destroyed) - wsDestroyedBefore := len(h.workspace.destroyed) - - if _, err := h.sm.Restore(ctx, "sess-1"); err == nil { - t.Fatal("restore: want error, got nil") - } - - rec, _, _ := h.store.Get(ctx, "sess-1") - if got := rec.Lifecycle.Session; got.State != domain.SessionTerminated || got.Reason != domain.ReasonManuallyKilled { - t.Fatalf("restore failure should restore terminal lifecycle, got %+v", got) - } - if rec.Lifecycle.Revision != before.Lifecycle.Revision+2 { - t.Fatalf("restore failure should advance revision twice, got %d want %d", rec.Lifecycle.Revision, before.Lifecycle.Revision+2) - } - afterMeta, _ := h.store.GetMetadata(ctx, "sess-1") - if !equalStringMap(afterMeta, beforeMeta) { - t.Fatalf("restore failure should restore metadata, got %+v want %+v", afterMeta, beforeMeta) +func TestKill_RefusesIncompleteHandle(t *testing.T) { + m, st, _, _ := newManager() + st.sessions["mer-1"] = 
domain.SessionRecord{ // live, but no teardown handles + ID: "mer-1", ProjectID: "mer", + Lifecycle: domain.CanonicalSessionLifecycle{Session: domain.SessionSubstate{State: domain.SessionWorking}, IsAlive: true}, } - // The runtime created during restore is torn back down so no process is - // stranded; the workspace is left intact (it holds the agent's prior work). - if len(h.runtime.destroyed) != destroyedBefore+1 { - t.Errorf("runtime.destroyed grew by %d, want 1 (restore rollback)", len(h.runtime.destroyed)-destroyedBefore) - } - if len(h.workspace.destroyed) != wsDestroyedBefore { - t.Errorf("workspace was destroyed on restore rollback; it must be preserved") + if _, err := m.Kill(ctx, "mer-1", domain.TermManuallyKilled); !errors.Is(err, ErrIncompleteHandle) { + t.Fatalf("want ErrIncompleteHandle, got %v", err) } } -func TestCleanup_SkipsUncommittedWork(t *testing.T) { - h := newHarness("unused") - ctx := context.Background() - - // Two terminal sessions (reclaimable) + one working session (must be ignored). - seedTerminal(t, h, "done-1", "/tmp/ws/done-1") - seedTerminal(t, h, "dirty-1", "/tmp/ws/dirty-1") - if err := h.store.Upsert(ctx, domain.SessionRecord{ - ID: "live-1", ProjectID: testProject, - Lifecycle: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.PRNone, ""), - }, ports.EventSessionCreated); err != nil { - t.Fatalf("upsert live: %v", err) - } - // dirty-1's worktree still holds uncommitted work — Destroy refuses it. 
- h.workspace.refuse["/tmp/ws/dirty-1"] = true +func TestRestore_ReopensTerminal(t *testing.T) { + m, st, rt, _ := newManager() + seedTerminal(st, "mer-1", domain.SessionMetadata{WorkspacePath: "/ws/mer-1", Branch: "b", AgentSessionID: "agent-x"}) - res, err := h.sm.Cleanup(ctx, testProject) + s, err := m.Restore(ctx, "mer-1") if err != nil { - t.Fatalf("cleanup: %v", err) - } - - if !equalIDSet(res.Cleaned, []domain.SessionID{"done-1"}) { - t.Errorf("cleaned = %v, want [done-1]", res.Cleaned) + t.Fatal(err) } - if !equalIDSet(res.Skipped, []domain.SessionID{"dirty-1"}) { - t.Errorf("skipped = %v, want [dirty-1]", res.Skipped) + if s.Status != domain.StatusSpawning { + t.Fatalf("restored session displays spawning, got %q", s.Status) } - // The live session was never a candidate. - if contains(res.Cleaned, "live-1") || contains(res.Skipped, "live-1") { - t.Error("non-terminal session must not be cleaned or skipped") + if rt.created != 1 { + t.Fatal("restore should relaunch the runtime") } } -// ---- test helpers ---- +func TestRestore_RefusesLiveSession(t *testing.T) { + m, st, _, _ := newManager() + st.sessions["mer-1"] = mkLive("mer-1") -func lc(s domain.SessionState, r domain.SessionReason, prs domain.PRState, prr domain.PRReason) domain.CanonicalSessionLifecycle { - return domain.CanonicalSessionLifecycle{ - Version: domain.LifecycleVersion, - Session: domain.SessionSubstate{State: s, Reason: r}, - PR: domain.PRSubstate{State: prs, Reason: prr}, - Runtime: domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning}, + if _, err := m.Restore(ctx, "mer-1"); !errors.Is(err, ErrNotRestorable) { + t.Fatalf("want ErrNotRestorable, got %v", err) } } -func seedTerminal(t *testing.T, h *harness, id domain.SessionID, wsPath string) { - t.Helper() - ctx := context.Background() - if err := h.store.Upsert(ctx, domain.SessionRecord{ - ID: id, ProjectID: testProject, - Lifecycle: lc(domain.SessionTerminated, domain.ReasonManuallyKilled, 
domain.PRNone, ""), - }, ports.EventSessionCreated); err != nil { - t.Fatalf("upsert %s: %v", id, err) - } - if err := h.store.PatchMetadata(ctx, id, map[string]string{lifecycle.MetaWorkspacePath: wsPath}); err != nil { - t.Fatalf("patch metadata %s: %v", id, err) - } -} +func TestList_DerivesStatusFromPRFacts(t *testing.T) { + m, st, _, _ := newManager() + st.sessions["mer-1"] = mkLive("mer-1") + st.pr["mer-1"] = domain.PRFacts{Exists: true, CI: domain.CIFailing} -func equalStrings(a, b []string) bool { - if len(a) != len(b) { - return false + list, err := m.List(ctx, "mer") + if err != nil { + t.Fatal(err) } - for i := range a { - if a[i] != b[i] { - return false - } + if len(list) != 1 || list[0].Status != domain.StatusCIFailed { + t.Fatalf("status should reflect PR facts, got %+v", list) } - return true } -func equalStringMap(a, b map[string]string) bool { - if len(a) != len(b) { - return false +func TestCleanup_ReclaimsTerminalWorkspaces(t *testing.T) { + m, st, _, ws := newManager() + seedTerminal(st, "mer-1", domain.SessionMetadata{WorkspacePath: "/ws/mer-1"}) + st.sessions["mer-2"] = mkLive("mer-2") // live: must be skipped + + cleaned, err := m.Cleanup(ctx, "mer") + if err != nil { + t.Fatal(err) } - for k, v := range a { - if b[k] != v { - return false - } + if len(cleaned) != 1 || cleaned[0] != "mer-1" { + t.Fatalf("only the terminal session should be reclaimed, got %v", cleaned) } - return true -} - -func contains(ids []domain.SessionID, id domain.SessionID) bool { - for _, x := range ids { - if x == id { - return true - } + if ws.destroyed != 1 { + t.Fatal("the live session's workspace must not be destroyed") } - return false } -func equalIDSet(got, want []domain.SessionID) bool { - if len(got) != len(want) { - return false - } - for _, w := range want { - if !contains(got, w) { - return false - } +func mkLive(id domain.SessionID) domain.SessionRecord { + return domain.SessionRecord{ + ID: id, ProjectID: "mer", + Metadata: 
domain.SessionMetadata{WorkspacePath: "/ws/" + string(id), RuntimeHandleID: "h1", RuntimeName: "tmux"}, + Lifecycle: domain.CanonicalSessionLifecycle{Session: domain.SessionSubstate{State: domain.SessionWorking}, IsAlive: true}, } - return true } diff --git a/backend/internal/storage/sqlite/changelog_store.go b/backend/internal/storage/sqlite/changelog_store.go new file mode 100644 index 00000000..927d7968 --- /dev/null +++ b/backend/internal/storage/sqlite/changelog_store.go @@ -0,0 +1,89 @@ +package sqlite + +import ( + "context" + "fmt" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" +) + +// ChangeLogRow is one durable CDC event. These rows are written by the DB +// triggers (migration 0001), never by application code; the store only reads +// them, for the CDC poller. +type ChangeLogRow struct { + Seq int64 + ProjectID string + SessionID string // empty when the event is project-level (NULL in the DB) + EventType string + Payload string + CreatedAt time.Time +} + +// ReadChangeLogAfter returns up to limit events with seq > after, in seq order +// — the CDC poller's read. The frontend's offset is `after`. +func (s *Store) ReadChangeLogAfter(ctx context.Context, after int64, limit int) ([]ChangeLogRow, error) { + rows, err := s.qr.ReadChangeLogAfter(ctx, gen.ReadChangeLogAfterParams{Seq: after, Limit: int64(limit)}) + if err != nil { + return nil, fmt.Errorf("read change_log after %d: %w", after, err) + } + out := make([]ChangeLogRow, 0, len(rows)) + for _, r := range rows { + out = append(out, changeLogRowFromGen(r)) + } + return out, nil +} + +// ReadChangeLogAfterForProject is the project-scoped variant — a client +// subscribed to one project reads only its events. 
+func (s *Store) ReadChangeLogAfterForProject(ctx context.Context, project string, after int64, limit int) ([]ChangeLogRow, error) { + rows, err := s.qr.ReadChangeLogAfterForProject(ctx, gen.ReadChangeLogAfterForProjectParams{ + ProjectID: project, Seq: after, Limit: int64(limit), + }) + if err != nil { + return nil, fmt.Errorf("read change_log for %s after %d: %w", project, after, err) + } + out := make([]ChangeLogRow, 0, len(rows)) + for _, r := range rows { + out = append(out, changeLogRowFromGen(r)) + } + return out, nil +} + +// MaxChangeLogSeq returns the highest seq (0 if empty) — a fresh consumer's +// starting offset. +func (s *Store) MaxChangeLogSeq(ctx context.Context) (int64, error) { + v, err := s.qr.MaxChangeLogSeq(ctx) + if err != nil { + return 0, fmt.Errorf("max change_log seq: %w", err) + } + return asInt64(v), nil +} + +func changeLogRowFromGen(r gen.ChangeLog) ChangeLogRow { + row := ChangeLogRow{ + Seq: r.Seq, + ProjectID: r.ProjectID, + EventType: r.EventType, + Payload: r.Payload, + CreatedAt: r.CreatedAt, + } + if r.SessionID.Valid { + row.SessionID = r.SessionID.String + } + return row +} + +// asInt64 coerces sqlc's interface{} result for COALESCE(MAX(...)) — sqlc's +// SQLite type inference can't narrow the aggregate, so the generated signature +// is interface{}. modernc returns int64 for an integer aggregate. +func asInt64(v interface{}) int64 { + switch n := v.(type) { + case int64: + return n + case int: + return int64(n) + default: + return 0 + } +} diff --git a/backend/internal/storage/sqlite/db.go b/backend/internal/storage/sqlite/db.go new file mode 100644 index 00000000..8b001d11 --- /dev/null +++ b/backend/internal/storage/sqlite/db.go @@ -0,0 +1,83 @@ +// Package sqlite is the durable persistence adapter: the 6-table schema (goose +// migrations), typed CRUD over sqlc-generated queries, and the read side of the +// trigger-driven CDC (it reads change_log; the DB triggers write it). 
+package sqlite + +import ( + "database/sql" + "embed" + "fmt" + "os" + "path/filepath" + + "github.com/pressly/goose/v3" + // modernc.org/sqlite is the pure-Go (CGO-free) SQLite driver — chosen so the + // daemon cross-compiles and ships as a static binary with no libsqlite/CGO + // toolchain dependency, at the cost of some raw throughput vs a C-backed driver. + _ "modernc.org/sqlite" +) + +//go:embed migrations/*.sql +var migrationsFS embed.FS + +// pragmas are applied on every connection open. WAL + NORMAL lets readers run +// concurrently with the writer; busy_timeout absorbs brief writer contention; +// foreign_keys enforces the cascades and the CDC triggers' lookups. +const pragmas = "?_pragma=journal_mode(WAL)" + + "&_pragma=busy_timeout(5000)" + + "&_pragma=foreign_keys(ON)" + + "&_pragma=synchronous(NORMAL)" + +// maxReaders caps the reader pool. WAL allows many concurrent readers. +const maxReaders = 8 + +// Open opens (creating if absent) the SQLite database under dataDir and returns +// a Store. It uses TWO pools against the same file: +// +// - a single WRITER connection (writeDB, MaxOpenConns=1): every write goes +// here, so a write and the CDC triggers' subqueries it fires always see the +// prior writes on the same connection (read-your-writes). This is required +// because the pr/pr_checks triggers SELECT from sessions/pr to fill in the +// event's project_id; a pooled writer could land that read on a connection +// that hasn't caught up to the commit and read NULL. +// - a READER pool (readDB, MaxOpenConns=maxReaders): all reads scale across +// it; WAL readers see the latest committed snapshot. 
+func Open(dataDir string) (*Store, error) { + if err := os.MkdirAll(dataDir, 0o755); err != nil { + return nil, fmt.Errorf("create data dir: %w", err) + } + dsn := "file:" + filepath.Join(dataDir, "ao.db") + pragmas + + writeDB, err := sql.Open("sqlite", dsn) + if err != nil { + return nil, fmt.Errorf("open sqlite writer: %w", err) + } + writeDB.SetMaxOpenConns(1) + writeDB.SetMaxIdleConns(1) + if err := migrate(writeDB); err != nil { + writeDB.Close() + return nil, err + } + + readDB, err := sql.Open("sqlite", dsn) + if err != nil { + writeDB.Close() + return nil, fmt.Errorf("open sqlite reader: %w", err) + } + readDB.SetMaxOpenConns(maxReaders) + readDB.SetMaxIdleConns(maxReaders) + + return NewStore(writeDB, readDB), nil +} + +func migrate(db *sql.DB) error { + goose.SetBaseFS(migrationsFS) + goose.SetLogger(goose.NopLogger()) + if err := goose.SetDialect("sqlite3"); err != nil { + return fmt.Errorf("set goose dialect: %w", err) + } + if err := goose.Up(db, "migrations"); err != nil { + return fmt.Errorf("run migrations: %w", err) + } + return nil +} diff --git a/backend/internal/storage/sqlite/gen/changelog.sql.go b/backend/internal/storage/sqlite/gen/changelog.sql.go new file mode 100644 index 00000000..6568fdcc --- /dev/null +++ b/backend/internal/storage/sqlite/gen/changelog.sql.go @@ -0,0 +1,102 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.31.1 +// source: changelog.sql + +package gen + +import ( + "context" +) + +const maxChangeLogSeq = `-- name: MaxChangeLogSeq :one +SELECT COALESCE(MAX(seq), 0) AS seq FROM change_log +` + +func (q *Queries) MaxChangeLogSeq(ctx context.Context) (interface{}, error) { + row := q.db.QueryRowContext(ctx, maxChangeLogSeq) + var seq interface{} + err := row.Scan(&seq) + return seq, err +} + +const readChangeLogAfter = `-- name: ReadChangeLogAfter :many +SELECT seq, project_id, session_id, event_type, payload, created_at +FROM change_log WHERE seq > ? ORDER BY seq LIMIT ? 
+` + +type ReadChangeLogAfterParams struct { + Seq int64 + Limit int64 +} + +func (q *Queries) ReadChangeLogAfter(ctx context.Context, arg ReadChangeLogAfterParams) ([]ChangeLog, error) { + rows, err := q.db.QueryContext(ctx, readChangeLogAfter, arg.Seq, arg.Limit) + if err != nil { + return nil, err + } + defer rows.Close() + items := []ChangeLog{} + for rows.Next() { + var i ChangeLog + if err := rows.Scan( + &i.Seq, + &i.ProjectID, + &i.SessionID, + &i.EventType, + &i.Payload, + &i.CreatedAt, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const readChangeLogAfterForProject = `-- name: ReadChangeLogAfterForProject :many +SELECT seq, project_id, session_id, event_type, payload, created_at +FROM change_log WHERE project_id = ? AND seq > ? ORDER BY seq LIMIT ? +` + +type ReadChangeLogAfterForProjectParams struct { + ProjectID string + Seq int64 + Limit int64 +} + +func (q *Queries) ReadChangeLogAfterForProject(ctx context.Context, arg ReadChangeLogAfterForProjectParams) ([]ChangeLog, error) { + rows, err := q.db.QueryContext(ctx, readChangeLogAfterForProject, arg.ProjectID, arg.Seq, arg.Limit) + if err != nil { + return nil, err + } + defer rows.Close() + items := []ChangeLog{} + for rows.Next() { + var i ChangeLog + if err := rows.Scan( + &i.Seq, + &i.ProjectID, + &i.SessionID, + &i.EventType, + &i.Payload, + &i.CreatedAt, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} diff --git a/backend/internal/storage/sqlite/gen/db.go b/backend/internal/storage/sqlite/gen/db.go new file mode 100644 index 00000000..b6fcf6be --- /dev/null +++ b/backend/internal/storage/sqlite/gen/db.go @@ -0,0 +1,31 @@ +// Code generated by sqlc. DO NOT EDIT. 
+// versions: +// sqlc v1.31.1 + +package gen + +import ( + "context" + "database/sql" +) + +type DBTX interface { + ExecContext(context.Context, string, ...interface{}) (sql.Result, error) + PrepareContext(context.Context, string) (*sql.Stmt, error) + QueryContext(context.Context, string, ...interface{}) (*sql.Rows, error) + QueryRowContext(context.Context, string, ...interface{}) *sql.Row +} + +func New(db DBTX) *Queries { + return &Queries{db: db} +} + +type Queries struct { + db DBTX +} + +func (q *Queries) WithTx(tx *sql.Tx) *Queries { + return &Queries{ + db: tx, + } +} diff --git a/backend/internal/storage/sqlite/gen/models.go b/backend/internal/storage/sqlite/gen/models.go new file mode 100644 index 00000000..0c5b5c91 --- /dev/null +++ b/backend/internal/storage/sqlite/gen/models.go @@ -0,0 +1,86 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.31.1 + +package gen + +import ( + "database/sql" + "time" +) + +type ChangeLog struct { + Seq int64 + ProjectID string + SessionID sql.NullString + EventType string + Payload string + CreatedAt time.Time +} + +type Pr struct { + Url string + SessionID string + Number int64 + PrState string + ReviewDecision string + CiState string + Mergeability string + UpdatedAt time.Time +} + +type PrCheck struct { + PrUrl string + Name string + CommitHash string + Status string + Url string + LogTail string + CreatedAt time.Time +} + +type PrComment struct { + PrUrl string + CommentID string + Author string + File string + Line int64 + Body string + Resolved int64 + CreatedAt time.Time +} + +type Project struct { + ID string + Path string + RepoOriginUrl string + DisplayName string + RegisteredAt time.Time + ArchivedAt sql.NullTime +} + +type Session struct { + ID string + ProjectID string + Num int64 + IssueID string + Kind string + Harness string + SessionState string + TerminationReason string + IsAlive int64 + ActivityState string + ActivityLastAt time.Time + ActivitySource string + DetectingAttempts 
sql.NullInt64 + DetectingStartedAt sql.NullTime + DetectingEvidenceHash sql.NullString + Branch string + WorkspacePath string + RuntimeHandleID string + RuntimeName string + AgentSessionID string + Prompt string + CreatedAt time.Time + UpdatedAt time.Time +} diff --git a/backend/internal/storage/sqlite/gen/pr.sql.go b/backend/internal/storage/sqlite/gen/pr.sql.go new file mode 100644 index 00000000..f9fa3620 --- /dev/null +++ b/backend/internal/storage/sqlite/gen/pr.sql.go @@ -0,0 +1,114 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.31.1 +// source: pr.sql + +package gen + +import ( + "context" + "time" +) + +const deletePR = `-- name: DeletePR :exec +DELETE FROM pr WHERE url = ? +` + +func (q *Queries) DeletePR(ctx context.Context, url string) error { + _, err := q.db.ExecContext(ctx, deletePR, url) + return err +} + +const getPR = `-- name: GetPR :one +SELECT url, session_id, number, pr_state, review_decision, ci_state, mergeability, updated_at FROM pr WHERE url = ? +` + +func (q *Queries) GetPR(ctx context.Context, url string) (Pr, error) { + row := q.db.QueryRowContext(ctx, getPR, url) + var i Pr + err := row.Scan( + &i.Url, + &i.SessionID, + &i.Number, + &i.PrState, + &i.ReviewDecision, + &i.CiState, + &i.Mergeability, + &i.UpdatedAt, + ) + return i, err +} + +const listPRsBySession = `-- name: ListPRsBySession :many +SELECT url, session_id, number, pr_state, review_decision, ci_state, mergeability, updated_at FROM pr WHERE session_id = ? 
ORDER BY updated_at DESC +` + +func (q *Queries) ListPRsBySession(ctx context.Context, sessionID string) ([]Pr, error) { + rows, err := q.db.QueryContext(ctx, listPRsBySession, sessionID) + if err != nil { + return nil, err + } + defer rows.Close() + items := []Pr{} + for rows.Next() { + var i Pr + if err := rows.Scan( + &i.Url, + &i.SessionID, + &i.Number, + &i.PrState, + &i.ReviewDecision, + &i.CiState, + &i.Mergeability, + &i.UpdatedAt, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const upsertPR = `-- name: UpsertPR :exec +INSERT INTO pr (url, session_id, number, pr_state, review_decision, ci_state, mergeability, updated_at) +VALUES (?, ?, ?, ?, ?, ?, ?, ?) +ON CONFLICT (url) DO UPDATE SET + session_id = excluded.session_id, + number = excluded.number, + pr_state = excluded.pr_state, + review_decision = excluded.review_decision, + ci_state = excluded.ci_state, + mergeability = excluded.mergeability, + updated_at = excluded.updated_at +` + +type UpsertPRParams struct { + Url string + SessionID string + Number int64 + PrState string + ReviewDecision string + CiState string + Mergeability string + UpdatedAt time.Time +} + +func (q *Queries) UpsertPR(ctx context.Context, arg UpsertPRParams) error { + _, err := q.db.ExecContext(ctx, upsertPR, + arg.Url, + arg.SessionID, + arg.Number, + arg.PrState, + arg.ReviewDecision, + arg.CiState, + arg.Mergeability, + arg.UpdatedAt, + ) + return err +} diff --git a/backend/internal/storage/sqlite/gen/pr_checks.sql.go b/backend/internal/storage/sqlite/gen/pr_checks.sql.go new file mode 100644 index 00000000..58668ab1 --- /dev/null +++ b/backend/internal/storage/sqlite/gen/pr_checks.sql.go @@ -0,0 +1,119 @@ +// Code generated by sqlc. DO NOT EDIT. 
+// versions: +// sqlc v1.31.1 +// source: pr_checks.sql + +package gen + +import ( + "context" + "time" +) + +const listChecksByPR = `-- name: ListChecksByPR :many +SELECT pr_url, name, commit_hash, status, url, log_tail, created_at FROM pr_checks WHERE pr_url = ? ORDER BY name, created_at +` + +func (q *Queries) ListChecksByPR(ctx context.Context, prUrl string) ([]PrCheck, error) { + rows, err := q.db.QueryContext(ctx, listChecksByPR, prUrl) + if err != nil { + return nil, err + } + defer rows.Close() + items := []PrCheck{} + for rows.Next() { + var i PrCheck + if err := rows.Scan( + &i.PrUrl, + &i.Name, + &i.CommitHash, + &i.Status, + &i.Url, + &i.LogTail, + &i.CreatedAt, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const listRecentChecks = `-- name: ListRecentChecks :many +SELECT status, commit_hash, created_at FROM pr_checks +WHERE pr_url = ? AND name = ? +ORDER BY created_at DESC LIMIT ? 
+` + +type ListRecentChecksParams struct { + PrUrl string + Name string + Limit int64 +} + +type ListRecentChecksRow struct { + Status string + CommitHash string + CreatedAt time.Time +} + +func (q *Queries) ListRecentChecks(ctx context.Context, arg ListRecentChecksParams) ([]ListRecentChecksRow, error) { + rows, err := q.db.QueryContext(ctx, listRecentChecks, arg.PrUrl, arg.Name, arg.Limit) + if err != nil { + return nil, err + } + defer rows.Close() + items := []ListRecentChecksRow{} + for rows.Next() { + var i ListRecentChecksRow + if err := rows.Scan(&i.Status, &i.CommitHash, &i.CreatedAt); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const upsertPRCheck = `-- name: UpsertPRCheck :exec +INSERT INTO pr_checks (pr_url, name, commit_hash, status, url, log_tail, created_at) +VALUES (?, ?, ?, ?, ?, ?, ?) +ON CONFLICT (pr_url, name, commit_hash) DO UPDATE SET + status = excluded.status, + url = excluded.url, + log_tail = excluded.log_tail +` + +type UpsertPRCheckParams struct { + PrUrl string + Name string + CommitHash string + Status string + Url string + LogTail string + CreatedAt time.Time +} + +func (q *Queries) UpsertPRCheck(ctx context.Context, arg UpsertPRCheckParams) error { + _, err := q.db.ExecContext(ctx, upsertPRCheck, + arg.PrUrl, + arg.Name, + arg.CommitHash, + arg.Status, + arg.Url, + arg.LogTail, + arg.CreatedAt, + ) + return err +} diff --git a/backend/internal/storage/sqlite/gen/pr_comment.sql.go b/backend/internal/storage/sqlite/gen/pr_comment.sql.go new file mode 100644 index 00000000..a2f09f34 --- /dev/null +++ b/backend/internal/storage/sqlite/gen/pr_comment.sql.go @@ -0,0 +1,89 @@ +// Code generated by sqlc. DO NOT EDIT. 
+// versions: +// sqlc v1.31.1 +// source: pr_comment.sql + +package gen + +import ( + "context" + "time" +) + +const deletePRComments = `-- name: DeletePRComments :exec +DELETE FROM pr_comment WHERE pr_url = ? +` + +func (q *Queries) DeletePRComments(ctx context.Context, prUrl string) error { + _, err := q.db.ExecContext(ctx, deletePRComments, prUrl) + return err +} + +const listPRComments = `-- name: ListPRComments :many +SELECT pr_url, comment_id, author, file, line, body, resolved, created_at FROM pr_comment WHERE pr_url = ? ORDER BY created_at, comment_id +` + +func (q *Queries) ListPRComments(ctx context.Context, prUrl string) ([]PrComment, error) { + rows, err := q.db.QueryContext(ctx, listPRComments, prUrl) + if err != nil { + return nil, err + } + defer rows.Close() + items := []PrComment{} + for rows.Next() { + var i PrComment + if err := rows.Scan( + &i.PrUrl, + &i.CommentID, + &i.Author, + &i.File, + &i.Line, + &i.Body, + &i.Resolved, + &i.CreatedAt, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const upsertPRComment = `-- name: UpsertPRComment :exec +INSERT INTO pr_comment (pr_url, comment_id, author, file, line, body, resolved, created_at) +VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
+ON CONFLICT (pr_url, comment_id) DO UPDATE SET + author = excluded.author, file = excluded.file, line = excluded.line, + body = excluded.body, resolved = excluded.resolved +` + +type UpsertPRCommentParams struct { + PrUrl string + CommentID string + Author string + File string + Line int64 + Body string + Resolved int64 + CreatedAt time.Time +} + +func (q *Queries) UpsertPRComment(ctx context.Context, arg UpsertPRCommentParams) error { + _, err := q.db.ExecContext(ctx, upsertPRComment, + arg.PrUrl, + arg.CommentID, + arg.Author, + arg.File, + arg.Line, + arg.Body, + arg.Resolved, + arg.CreatedAt, + ) + return err +} diff --git a/backend/internal/storage/sqlite/gen/projects.sql.go b/backend/internal/storage/sqlite/gen/projects.sql.go new file mode 100644 index 00000000..a7c953cd --- /dev/null +++ b/backend/internal/storage/sqlite/gen/projects.sql.go @@ -0,0 +1,111 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.31.1 +// source: projects.sql + +package gen + +import ( + "context" + "database/sql" + "time" +) + +const archiveProject = `-- name: ArchiveProject :exec +UPDATE projects SET archived_at = ? WHERE id = ? +` + +type ArchiveProjectParams struct { + ArchivedAt sql.NullTime + ID string +} + +func (q *Queries) ArchiveProject(ctx context.Context, arg ArchiveProjectParams) error { + _, err := q.db.ExecContext(ctx, archiveProject, arg.ArchivedAt, arg.ID) + return err +} + +const getProject = `-- name: GetProject :one +SELECT id, path, repo_origin_url, display_name, registered_at, archived_at +FROM projects WHERE id = ? 
+` + +func (q *Queries) GetProject(ctx context.Context, id string) (Project, error) { + row := q.db.QueryRowContext(ctx, getProject, id) + var i Project + err := row.Scan( + &i.ID, + &i.Path, + &i.RepoOriginUrl, + &i.DisplayName, + &i.RegisteredAt, + &i.ArchivedAt, + ) + return i, err +} + +const listProjects = `-- name: ListProjects :many +SELECT id, path, repo_origin_url, display_name, registered_at, archived_at +FROM projects WHERE archived_at IS NULL ORDER BY id +` + +func (q *Queries) ListProjects(ctx context.Context) ([]Project, error) { + rows, err := q.db.QueryContext(ctx, listProjects) + if err != nil { + return nil, err + } + defer rows.Close() + items := []Project{} + for rows.Next() { + var i Project + if err := rows.Scan( + &i.ID, + &i.Path, + &i.RepoOriginUrl, + &i.DisplayName, + &i.RegisteredAt, + &i.ArchivedAt, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const upsertProject = `-- name: UpsertProject :exec +INSERT INTO projects (id, path, repo_origin_url, display_name, registered_at, archived_at) +VALUES (?, ?, ?, ?, ?, ?) 
+ON CONFLICT (id) DO UPDATE SET + path = excluded.path, + repo_origin_url = excluded.repo_origin_url, + display_name = excluded.display_name, + archived_at = excluded.archived_at +` + +type UpsertProjectParams struct { + ID string + Path string + RepoOriginUrl string + DisplayName string + RegisteredAt time.Time + ArchivedAt sql.NullTime +} + +func (q *Queries) UpsertProject(ctx context.Context, arg UpsertProjectParams) error { + _, err := q.db.ExecContext(ctx, upsertProject, + arg.ID, + arg.Path, + arg.RepoOriginUrl, + arg.DisplayName, + arg.RegisteredAt, + arg.ArchivedAt, + ) + return err +} diff --git a/backend/internal/storage/sqlite/gen/querier.go b/backend/internal/storage/sqlite/gen/querier.go new file mode 100644 index 00000000..365113b1 --- /dev/null +++ b/backend/internal/storage/sqlite/gen/querier.go @@ -0,0 +1,38 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.31.1 + +package gen + +import ( + "context" +) + +type Querier interface { + ArchiveProject(ctx context.Context, arg ArchiveProjectParams) error + DeletePR(ctx context.Context, url string) error + DeletePRComments(ctx context.Context, prUrl string) error + DeleteSession(ctx context.Context, id string) error + GetPR(ctx context.Context, url string) (Pr, error) + GetProject(ctx context.Context, id string) (Project, error) + GetSession(ctx context.Context, id string) (Session, error) + InsertSession(ctx context.Context, arg InsertSessionParams) error + ListAllSessions(ctx context.Context) ([]Session, error) + ListChecksByPR(ctx context.Context, prUrl string) ([]PrCheck, error) + ListPRComments(ctx context.Context, prUrl string) ([]PrComment, error) + ListPRsBySession(ctx context.Context, sessionID string) ([]Pr, error) + ListProjects(ctx context.Context) ([]Project, error) + ListRecentChecks(ctx context.Context, arg ListRecentChecksParams) ([]ListRecentChecksRow, error) + ListSessionsByProject(ctx context.Context, projectID string) ([]Session, error) + MaxChangeLogSeq(ctx 
context.Context) (interface{}, error) + NextSessionNum(ctx context.Context, projectID string) (int64, error) + ReadChangeLogAfter(ctx context.Context, arg ReadChangeLogAfterParams) ([]ChangeLog, error) + ReadChangeLogAfterForProject(ctx context.Context, arg ReadChangeLogAfterForProjectParams) ([]ChangeLog, error) + UpdateSession(ctx context.Context, arg UpdateSessionParams) error + UpsertPR(ctx context.Context, arg UpsertPRParams) error + UpsertPRCheck(ctx context.Context, arg UpsertPRCheckParams) error + UpsertPRComment(ctx context.Context, arg UpsertPRCommentParams) error + UpsertProject(ctx context.Context, arg UpsertProjectParams) error +} + +var _ Querier = (*Queries)(nil) diff --git a/backend/internal/storage/sqlite/gen/sessions.sql.go b/backend/internal/storage/sqlite/gen/sessions.sql.go new file mode 100644 index 00000000..5365a22c --- /dev/null +++ b/backend/internal/storage/sqlite/gen/sessions.sql.go @@ -0,0 +1,295 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.31.1 +// source: sessions.sql + +package gen + +import ( + "context" + "database/sql" + "time" +) + +const deleteSession = `-- name: DeleteSession :exec +DELETE FROM sessions WHERE id = ? +` + +func (q *Queries) DeleteSession(ctx context.Context, id string) error { + _, err := q.db.ExecContext(ctx, deleteSession, id) + return err +} + +const getSession = `-- name: GetSession :one +SELECT id, project_id, num, issue_id, kind, harness, session_state, termination_reason, is_alive, activity_state, activity_last_at, activity_source, detecting_attempts, detecting_started_at, detecting_evidence_hash, branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt, created_at, updated_at FROM sessions WHERE id = ? 
+` + +func (q *Queries) GetSession(ctx context.Context, id string) (Session, error) { + row := q.db.QueryRowContext(ctx, getSession, id) + var i Session + err := row.Scan( + &i.ID, + &i.ProjectID, + &i.Num, + &i.IssueID, + &i.Kind, + &i.Harness, + &i.SessionState, + &i.TerminationReason, + &i.IsAlive, + &i.ActivityState, + &i.ActivityLastAt, + &i.ActivitySource, + &i.DetectingAttempts, + &i.DetectingStartedAt, + &i.DetectingEvidenceHash, + &i.Branch, + &i.WorkspacePath, + &i.RuntimeHandleID, + &i.RuntimeName, + &i.AgentSessionID, + &i.Prompt, + &i.CreatedAt, + &i.UpdatedAt, + ) + return i, err +} + +const insertSession = `-- name: InsertSession :exec +INSERT INTO sessions ( + id, project_id, num, issue_id, kind, harness, + session_state, termination_reason, is_alive, + activity_state, activity_last_at, activity_source, + detecting_attempts, detecting_started_at, detecting_evidence_hash, + branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt, + created_at, updated_at +) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+` + +type InsertSessionParams struct { + ID string + ProjectID string + Num int64 + IssueID string + Kind string + Harness string + SessionState string + TerminationReason string + IsAlive int64 + ActivityState string + ActivityLastAt time.Time + ActivitySource string + DetectingAttempts sql.NullInt64 + DetectingStartedAt sql.NullTime + DetectingEvidenceHash sql.NullString + Branch string + WorkspacePath string + RuntimeHandleID string + RuntimeName string + AgentSessionID string + Prompt string + CreatedAt time.Time + UpdatedAt time.Time +} + +func (q *Queries) InsertSession(ctx context.Context, arg InsertSessionParams) error { + _, err := q.db.ExecContext(ctx, insertSession, + arg.ID, + arg.ProjectID, + arg.Num, + arg.IssueID, + arg.Kind, + arg.Harness, + arg.SessionState, + arg.TerminationReason, + arg.IsAlive, + arg.ActivityState, + arg.ActivityLastAt, + arg.ActivitySource, + arg.DetectingAttempts, + arg.DetectingStartedAt, + arg.DetectingEvidenceHash, + arg.Branch, + arg.WorkspacePath, + arg.RuntimeHandleID, + arg.RuntimeName, + arg.AgentSessionID, + arg.Prompt, + arg.CreatedAt, + arg.UpdatedAt, + ) + return err +} + +const listAllSessions = `-- name: ListAllSessions :many +SELECT id, project_id, num, issue_id, kind, harness, session_state, termination_reason, is_alive, activity_state, activity_last_at, activity_source, detecting_attempts, detecting_started_at, detecting_evidence_hash, branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt, created_at, updated_at FROM sessions ORDER BY project_id, num +` + +func (q *Queries) ListAllSessions(ctx context.Context) ([]Session, error) { + rows, err := q.db.QueryContext(ctx, listAllSessions) + if err != nil { + return nil, err + } + defer rows.Close() + items := []Session{} + for rows.Next() { + var i Session + if err := rows.Scan( + &i.ID, + &i.ProjectID, + &i.Num, + &i.IssueID, + &i.Kind, + &i.Harness, + &i.SessionState, + &i.TerminationReason, + &i.IsAlive, + &i.ActivityState, + 
&i.ActivityLastAt, + &i.ActivitySource, + &i.DetectingAttempts, + &i.DetectingStartedAt, + &i.DetectingEvidenceHash, + &i.Branch, + &i.WorkspacePath, + &i.RuntimeHandleID, + &i.RuntimeName, + &i.AgentSessionID, + &i.Prompt, + &i.CreatedAt, + &i.UpdatedAt, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const listSessionsByProject = `-- name: ListSessionsByProject :many +SELECT id, project_id, num, issue_id, kind, harness, session_state, termination_reason, is_alive, activity_state, activity_last_at, activity_source, detecting_attempts, detecting_started_at, detecting_evidence_hash, branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt, created_at, updated_at FROM sessions WHERE project_id = ? ORDER BY num +` + +func (q *Queries) ListSessionsByProject(ctx context.Context, projectID string) ([]Session, error) { + rows, err := q.db.QueryContext(ctx, listSessionsByProject, projectID) + if err != nil { + return nil, err + } + defer rows.Close() + items := []Session{} + for rows.Next() { + var i Session + if err := rows.Scan( + &i.ID, + &i.ProjectID, + &i.Num, + &i.IssueID, + &i.Kind, + &i.Harness, + &i.SessionState, + &i.TerminationReason, + &i.IsAlive, + &i.ActivityState, + &i.ActivityLastAt, + &i.ActivitySource, + &i.DetectingAttempts, + &i.DetectingStartedAt, + &i.DetectingEvidenceHash, + &i.Branch, + &i.WorkspacePath, + &i.RuntimeHandleID, + &i.RuntimeName, + &i.AgentSessionID, + &i.Prompt, + &i.CreatedAt, + &i.UpdatedAt, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const nextSessionNum = `-- name: NextSessionNum :one +SELECT COALESCE(MAX(num), 0) + 1 AS next FROM sessions WHERE project_id = ? 
+` + +func (q *Queries) NextSessionNum(ctx context.Context, projectID string) (int64, error) { + row := q.db.QueryRowContext(ctx, nextSessionNum, projectID) + var next int64 + err := row.Scan(&next) + return next, err +} + +const updateSession = `-- name: UpdateSession :exec +UPDATE sessions SET + issue_id = ?, kind = ?, harness = ?, + session_state = ?, termination_reason = ?, is_alive = ?, + activity_state = ?, activity_last_at = ?, activity_source = ?, + detecting_attempts = ?, detecting_started_at = ?, detecting_evidence_hash = ?, + branch = ?, workspace_path = ?, runtime_handle_id = ?, runtime_name = ?, agent_session_id = ?, prompt = ?, + updated_at = ? +WHERE id = ? +` + +type UpdateSessionParams struct { + IssueID string + Kind string + Harness string + SessionState string + TerminationReason string + IsAlive int64 + ActivityState string + ActivityLastAt time.Time + ActivitySource string + DetectingAttempts sql.NullInt64 + DetectingStartedAt sql.NullTime + DetectingEvidenceHash sql.NullString + Branch string + WorkspacePath string + RuntimeHandleID string + RuntimeName string + AgentSessionID string + Prompt string + UpdatedAt time.Time + ID string +} + +func (q *Queries) UpdateSession(ctx context.Context, arg UpdateSessionParams) error { + _, err := q.db.ExecContext(ctx, updateSession, + arg.IssueID, + arg.Kind, + arg.Harness, + arg.SessionState, + arg.TerminationReason, + arg.IsAlive, + arg.ActivityState, + arg.ActivityLastAt, + arg.ActivitySource, + arg.DetectingAttempts, + arg.DetectingStartedAt, + arg.DetectingEvidenceHash, + arg.Branch, + arg.WorkspacePath, + arg.RuntimeHandleID, + arg.RuntimeName, + arg.AgentSessionID, + arg.Prompt, + arg.UpdatedAt, + arg.ID, + ) + return err +} diff --git a/backend/internal/storage/sqlite/mapping.go b/backend/internal/storage/sqlite/mapping.go new file mode 100644 index 00000000..792854cf --- /dev/null +++ b/backend/internal/storage/sqlite/mapping.go @@ -0,0 +1,125 @@ +package sqlite + +import ( + "database/sql" + + 
// boolToInt encodes a Go bool as the 0/1 integer stored in INTEGER flag columns
// (true -> 1, false -> 0).
func boolToInt(b bool) int64 {
	var n int64
	if b {
		n = 1
	}
	return n
}
boolToInt(rec.Lifecycle.IsAlive), + ActivityState: string(rec.Lifecycle.Activity.State), + ActivityLastAt: rec.Lifecycle.Activity.LastActivityAt, + ActivitySource: string(rec.Lifecycle.Activity.Source), + DetectingAttempts: da, + DetectingStartedAt: ds, + DetectingEvidenceHash: dh, + Branch: rec.Metadata.Branch, + WorkspacePath: rec.Metadata.WorkspacePath, + RuntimeHandleID: rec.Metadata.RuntimeHandleID, + RuntimeName: rec.Metadata.RuntimeName, + AgentSessionID: rec.Metadata.AgentSessionID, + Prompt: rec.Metadata.Prompt, + CreatedAt: rec.CreatedAt, + UpdatedAt: rec.UpdatedAt, + } +} + +func recordToUpdate(rec domain.SessionRecord) gen.UpdateSessionParams { + da, ds, dh := detectingToNull(rec.Lifecycle.Detecting) + return gen.UpdateSessionParams{ + IssueID: string(rec.IssueID), + Kind: string(rec.Kind), + Harness: string(rec.Lifecycle.Harness), + SessionState: string(rec.Lifecycle.Session.State), + TerminationReason: string(rec.Lifecycle.TerminationReason), + IsAlive: boolToInt(rec.Lifecycle.IsAlive), + ActivityState: string(rec.Lifecycle.Activity.State), + ActivityLastAt: rec.Lifecycle.Activity.LastActivityAt, + ActivitySource: string(rec.Lifecycle.Activity.Source), + DetectingAttempts: da, + DetectingStartedAt: ds, + DetectingEvidenceHash: dh, + Branch: rec.Metadata.Branch, + WorkspacePath: rec.Metadata.WorkspacePath, + RuntimeHandleID: rec.Metadata.RuntimeHandleID, + RuntimeName: rec.Metadata.RuntimeName, + AgentSessionID: rec.Metadata.AgentSessionID, + Prompt: rec.Metadata.Prompt, + UpdatedAt: rec.UpdatedAt, + ID: string(rec.ID), + } +} + +func detectingToNull(d *domain.DetectingState) (sql.NullInt64, sql.NullTime, sql.NullString) { + if d == nil { + return sql.NullInt64{}, sql.NullTime{}, sql.NullString{} + } + return sql.NullInt64{Int64: int64(d.Attempts), Valid: true}, + sql.NullTime{Time: d.StartedAt, Valid: true}, + sql.NullString{String: d.EvidenceHash, Valid: true} +} + +func nullToDetecting(row gen.Session) *domain.DetectingState { + if 
!row.DetectingAttempts.Valid { + return nil + } + return &domain.DetectingState{ + Attempts: int(row.DetectingAttempts.Int64), + StartedAt: row.DetectingStartedAt.Time, + EvidenceHash: row.DetectingEvidenceHash.String, + } +} diff --git a/backend/internal/storage/sqlite/migrations/0001_init.sql b/backend/internal/storage/sqlite/migrations/0001_init.sql new file mode 100644 index 00000000..9d5a6a22 --- /dev/null +++ b/backend/internal/storage/sqlite/migrations/0001_init.sql @@ -0,0 +1,232 @@ +-- +goose Up +-- +goose StatementBegin + +-- projects is the durable registry of repos AO manages (the SQLite twin of the +-- YAML config). id is a short human/LLM-friendly slug (mer, ao) with a numeric +-- suffix on collision (ao, ao1, ao2). Soft-delete via archived_at keeps the row +-- so a session's project_id always resolves. +CREATE TABLE projects ( + id TEXT PRIMARY KEY, + path TEXT NOT NULL, + repo_origin_url TEXT NOT NULL DEFAULT '', + display_name TEXT NOT NULL DEFAULT '', + registered_at TIMESTAMP NOT NULL, + archived_at TIMESTAMP +); + +-- sessions is the canonical record. id is "{project_id}-{num}" (e.g. mer-1) — a +-- single string key, so every inbound FK is single-column. num is the per-project +-- counter (computed at insert under the write mutex). Operational metadata is +-- folded in (no separate table). is_alive replaces the old runtime axis; there is +-- no revision column — the per-session write mutex serializes and change_log.seq +-- orders. The display status is derived on read (from this + the pr row), never +-- stored. 
CREATE TABLE sessions (
    id          TEXT PRIMARY KEY,
    project_id  TEXT NOT NULL REFERENCES projects (id),
    num         INTEGER NOT NULL,
    issue_id    TEXT NOT NULL DEFAULT '',
    kind        TEXT NOT NULL DEFAULT 'worker',
    harness     TEXT NOT NULL DEFAULT ''
        CHECK (harness IN ('', 'claude-code', 'codex', 'aider', 'opencode')),

    session_state      TEXT NOT NULL
        CHECK (session_state IN ('not_started', 'working', 'idle', 'needs_input', 'stuck', 'detecting', 'done', 'terminated')),
    -- only terminal sessions carry a reason; '' otherwise.
    termination_reason TEXT NOT NULL DEFAULT ''
        CHECK (termination_reason IN ('', 'manually_killed', 'runtime_lost', 'agent_process_exited', 'probe_failure', 'error_in_process', 'auto_cleanup', 'pr_merged')),
    is_alive           INTEGER NOT NULL DEFAULT 0,

    activity_state   TEXT NOT NULL DEFAULT 'idle',
    activity_last_at TIMESTAMP NOT NULL,
    activity_source  TEXT NOT NULL DEFAULT 'none',

    -- detecting quarantine memory; NULL when the session is not in detecting.
    detecting_attempts      INTEGER,
    detecting_started_at    TIMESTAMP,
    detecting_evidence_hash TEXT,

    -- folded-in operational handles (was the session_metadata table)
    branch            TEXT NOT NULL DEFAULT '',
    workspace_path    TEXT NOT NULL DEFAULT '',
    runtime_handle_id TEXT NOT NULL DEFAULT '',
    runtime_name      TEXT NOT NULL DEFAULT '',
    agent_session_id  TEXT NOT NULL DEFAULT '',
    prompt            TEXT NOT NULL DEFAULT '',

    created_at TIMESTAMP NOT NULL,
    updated_at TIMESTAMP NOT NULL,

    -- Also the lookup index for per-project queries: SQLite backs this
    -- constraint with an automatic index on (project_id, num), which serves
    -- "WHERE project_id = ?" with or without "ORDER BY num". A separate
    -- single-column index on project_id would only add write cost, so none is
    -- created.
    UNIQUE (project_id, num)
);
+CREATE TABLE pr ( + url TEXT PRIMARY KEY, + session_id TEXT NOT NULL REFERENCES sessions (id) ON DELETE CASCADE, + number INTEGER NOT NULL DEFAULT 0, + pr_state TEXT NOT NULL DEFAULT 'open' + CHECK (pr_state IN ('draft', 'open', 'merged', 'closed')), + review_decision TEXT NOT NULL DEFAULT 'none' + CHECK (review_decision IN ('none', 'approved', 'changes_requested', 'review_required')), + ci_state TEXT NOT NULL DEFAULT 'unknown' + CHECK (ci_state IN ('unknown', 'pending', 'passing', 'failing')), + mergeability TEXT NOT NULL DEFAULT 'unknown' + CHECK (mergeability IN ('unknown', 'mergeable', 'conflicting', 'blocked', 'unstable')), + updated_at TIMESTAMP NOT NULL +); +CREATE INDEX idx_pr_session ON pr (session_id); + +-- pr_checks is CI run history: one row per (PR, check, commit). The CI-fix-loop +-- brake is a LIMIT 3 query over it ("last 3 runs of this check all failed?") — no +-- counter is stored. Re-polling the same commit upserts the same row. +CREATE TABLE pr_checks ( + pr_url TEXT NOT NULL REFERENCES pr (url) ON DELETE CASCADE, + name TEXT NOT NULL, + commit_hash TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'unknown' + CHECK (status IN ('unknown', 'queued', 'in_progress', 'passed', 'failed', 'skipped', 'cancelled')), + url TEXT NOT NULL DEFAULT '', + log_tail TEXT NOT NULL DEFAULT '', + created_at TIMESTAMP NOT NULL, + PRIMARY KEY (pr_url, name, commit_hash) +); +CREATE INDEX idx_pr_checks_lookup ON pr_checks (pr_url, name, created_at); + +-- pr_comment holds review comments, persisted so a session page does not wait on +-- GitHub. Cascades from pr. 
CREATE TABLE pr_comment (
    pr_url     TEXT NOT NULL REFERENCES pr (url) ON DELETE CASCADE,
    comment_id TEXT NOT NULL,
    author     TEXT NOT NULL DEFAULT '',
    file       TEXT NOT NULL DEFAULT '',
    line       INTEGER NOT NULL DEFAULT 0,
    body       TEXT NOT NULL DEFAULT '',
    resolved   INTEGER NOT NULL DEFAULT 0,
    created_at TIMESTAMP NOT NULL,
    PRIMARY KEY (pr_url, comment_id)
);

-- change_log is the durable, append-only CDC event log. seq is the monotonic
-- ordering + idempotency key. Rows are written by TRIGGERS on the user-visible
-- tables (DB-native capture, atomic with the change) — never by application
-- emit-code. project_id is required, session_id is nullable (project-level events
-- have no session). The log is immutable (no published flag); consumers track
-- their own offset (SSE Last-Event-ID).
--
-- session_id deliberately carries NO foreign key. Every session insert writes a
-- change_log row (sessions_cdc_insert), so a plain REFERENCES sessions (id)
-- would make any later "DELETE FROM sessions" fail with a foreign-key violation
-- whenever foreign keys are enforced. CASCADE or SET NULL would instead rewrite
-- or drop history the log promises to keep. The id is kept as a plain value so
-- log rows outlive the session they describe.
-- NOTE(review): assumes the store does not purge change_log rows before
-- deleting a session — confirm against the session delete path.
CREATE TABLE change_log (
    seq        INTEGER PRIMARY KEY AUTOINCREMENT,
    project_id TEXT NOT NULL REFERENCES projects (id),
    session_id TEXT,
    event_type TEXT NOT NULL,
    payload    TEXT NOT NULL,
    created_at TIMESTAMP NOT NULL DEFAULT (datetime('now'))
);
CREATE INDEX idx_change_log_project ON change_log (project_id, seq);
+ +-- +goose StatementBegin +CREATE TRIGGER sessions_cdc_insert +AFTER INSERT ON sessions +BEGIN + INSERT INTO change_log (project_id, session_id, event_type, payload, created_at) + VALUES (NEW.project_id, NEW.id, 'session_created', + json_object('id', NEW.id, 'state', NEW.session_state, 'terminationReason', NEW.termination_reason, + 'isAlive', NEW.is_alive, 'activity', NEW.activity_state), + NEW.updated_at); +END; +-- +goose StatementEnd + +-- +goose StatementBegin +CREATE TRIGGER sessions_cdc_update +AFTER UPDATE ON sessions +WHEN OLD.session_state <> NEW.session_state + OR OLD.termination_reason <> NEW.termination_reason + OR OLD.is_alive <> NEW.is_alive + OR OLD.activity_state <> NEW.activity_state +BEGIN + INSERT INTO change_log (project_id, session_id, event_type, payload, created_at) + VALUES (NEW.project_id, NEW.id, 'session_updated', + json_object('id', NEW.id, 'state', NEW.session_state, 'terminationReason', NEW.termination_reason, + 'isAlive', NEW.is_alive, 'activity', NEW.activity_state), + NEW.updated_at); +END; +-- +goose StatementEnd + +-- +goose StatementBegin +CREATE TRIGGER pr_cdc_insert +AFTER INSERT ON pr +BEGIN + INSERT INTO change_log (project_id, session_id, event_type, payload, created_at) + VALUES ((SELECT project_id FROM sessions WHERE id = NEW.session_id), NEW.session_id, 'pr_created', + json_object('url', NEW.url, 'session', NEW.session_id, 'state', NEW.pr_state, + 'ci', NEW.ci_state, 'review', NEW.review_decision, 'mergeability', NEW.mergeability), + NEW.updated_at); +END; +-- +goose StatementEnd + +-- +goose StatementBegin +CREATE TRIGGER pr_cdc_update +AFTER UPDATE ON pr +WHEN OLD.pr_state <> NEW.pr_state + OR OLD.ci_state <> NEW.ci_state + OR OLD.review_decision <> NEW.review_decision + OR OLD.mergeability <> NEW.mergeability +BEGIN + INSERT INTO change_log (project_id, session_id, event_type, payload, created_at) + VALUES ((SELECT project_id FROM sessions WHERE id = NEW.session_id), NEW.session_id, 'pr_updated', + json_object('url', 
NEW.url, 'session', NEW.session_id, 'state', NEW.pr_state, + 'ci', NEW.ci_state, 'review', NEW.review_decision, 'mergeability', NEW.mergeability), + NEW.updated_at); +END; +-- +goose StatementEnd + +-- +goose StatementBegin +CREATE TRIGGER pr_checks_cdc_insert +AFTER INSERT ON pr_checks +BEGIN + INSERT INTO change_log (project_id, session_id, event_type, payload, created_at) + VALUES ( + (SELECT s.project_id FROM pr p JOIN sessions s ON s.id = p.session_id WHERE p.url = NEW.pr_url), + (SELECT session_id FROM pr WHERE url = NEW.pr_url), + 'pr_check_recorded', + json_object('pr', NEW.pr_url, 'name', NEW.name, 'commit', NEW.commit_hash, 'status', NEW.status), + NEW.created_at); +END; +-- +goose StatementEnd + +-- A re-polled check can change status on the same commit (in_progress -> failed) +-- via UpsertPRCheck's ON CONFLICT DO UPDATE. Without this trigger that status +-- transition would update the row silently, so CDC consumers would never see it. +-- Guarded on the status so a no-op re-poll emits nothing. 
+-- +goose StatementBegin +CREATE TRIGGER pr_checks_cdc_update +AFTER UPDATE ON pr_checks +WHEN OLD.status <> NEW.status +BEGIN + INSERT INTO change_log (project_id, session_id, event_type, payload, created_at) + VALUES ( + (SELECT s.project_id FROM pr p JOIN sessions s ON s.id = p.session_id WHERE p.url = NEW.pr_url), + (SELECT session_id FROM pr WHERE url = NEW.pr_url), + 'pr_check_recorded', + json_object('pr', NEW.pr_url, 'name', NEW.name, 'commit', NEW.commit_hash, 'status', NEW.status), + NEW.created_at); +END; +-- +goose StatementEnd + +-- +goose Down +-- +goose StatementBegin +DROP TABLE change_log; +DROP TABLE pr_comment; +DROP TABLE pr_checks; +DROP TABLE pr; +DROP TABLE sessions; +DROP TABLE projects; +-- +goose StatementEnd diff --git a/backend/internal/storage/sqlite/pr_cdc_test.go b/backend/internal/storage/sqlite/pr_cdc_test.go new file mode 100644 index 00000000..8c8f7ea2 --- /dev/null +++ b/backend/internal/storage/sqlite/pr_cdc_test.go @@ -0,0 +1,86 @@ +package sqlite + +import ( + "context" + "strings" + "testing" + "time" +) + +// A check can change status on the same commit (in_progress -> failed) via +// UpsertPRCheck's ON CONFLICT DO UPDATE. CDC must emit on that transition, not +// only on the first insert — otherwise live clients never see the status change. 
+func TestPRChecksCDC_EmitsOnInsertAndStatusUpdate(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + seedProject(t, s, "mer") + rec, err := s.CreateSession(ctx, sampleRecord("mer")) + if err != nil { + t.Fatal(err) + } + url := "https://example/pr/1" + if err := s.UpsertPR(ctx, PRRow{URL: url, SessionID: string(rec.ID), Number: 1}); err != nil { + t.Fatal(err) + } + + now := time.Now() + mustCheck := func(status string) { + if err := s.RecordCheck(ctx, PRCheckRow{PRURL: url, Name: "build", CommitHash: "c1", Status: status, CreatedAt: now}); err != nil { + t.Fatal(err) + } + } + mustCheck("in_progress") // insert -> event + mustCheck("failed") // status change on same commit (update) -> event + mustCheck("failed") // no-op re-poll (status unchanged) -> NO event + + rows, err := s.ReadChangeLogAfter(ctx, 0, 100) + if err != nil { + t.Fatal(err) + } + var checkEvents []ChangeLogRow + for _, r := range rows { + if r.EventType == "pr_check_recorded" { + checkEvents = append(checkEvents, r) + } + } + if len(checkEvents) != 2 { + t.Fatalf("want 2 check CDC events (insert + status change, no-op suppressed), got %d", len(checkEvents)) + } + if !strings.Contains(checkEvents[1].Payload, `"status":"failed"`) { + t.Fatalf("the update event should carry the new status, got %q", checkEvents[1].Payload) + } +} + +// WritePRObservation persists scalar facts, checks, and comments in one tx; all +// three should be queryable afterward. 
+func TestWritePRObservation_PersistsScalarsChecksAndComments(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + seedProject(t, s, "mer") + rec, err := s.CreateSession(ctx, sampleRecord("mer")) + if err != nil { + t.Fatal(err) + } + url := "https://example/pr/7" + now := time.Now() + + err = s.WritePRObservation(ctx, + PRRow{URL: url, SessionID: string(rec.ID), Number: 7, CIState: "failing", UpdatedAt: now}, + []PRCheckRow{{PRURL: url, Name: "build", CommitHash: "c1", Status: "failed", CreatedAt: now}}, + []PRCommentRow{{PRURL: url, CommentID: "1", Author: "reviewer", Body: "use a const", CreatedAt: now}}, + ) + if err != nil { + t.Fatal(err) + } + + pr, ok, err := s.GetPR(ctx, url) + if err != nil || !ok || pr.CIState != "failing" { + t.Fatalf("scalar facts not persisted: ok=%v ci=%q err=%v", ok, pr.CIState, err) + } + if checks, _ := s.ListChecks(ctx, url); len(checks) != 1 || checks[0].Status != "failed" { + t.Fatalf("check not persisted: %+v", checks) + } + if comments, _ := s.ListPRComments(ctx, url); len(comments) != 1 || comments[0].Body != "use a const" { + t.Fatalf("comment not persisted: %+v", comments) + } +} diff --git a/backend/internal/storage/sqlite/pr_store.go b/backend/internal/storage/sqlite/pr_store.go new file mode 100644 index 00000000..8b41396c --- /dev/null +++ b/backend/internal/storage/sqlite/pr_store.go @@ -0,0 +1,270 @@ +package sqlite + +import ( + "context" + "database/sql" + "errors" + "fmt" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" +) + +// PRRow is the scalar PR facts row (the pr table), keyed by normalized URL. One +// session can own many PRs; a PR belongs to one session (session_id FK). 
+type PRRow struct { + URL string + SessionID string + Number int64 + State string // draft | open | merged | closed + ReviewDecision string // none | approved | changes_requested | review_required + CIState string // unknown | pending | passing | failing + Mergeability string // unknown | mergeable | conflicting | blocked | unstable + UpdatedAt time.Time +} + +// UpsertPR inserts or replaces the scalar PR facts for a PR URL. Empty enum +// fields default to their "nothing known yet" value so a partial row is valid +// against the CHECK constraints (matches the domain zero values none/unknown). +func (s *Store) UpsertPR(ctx context.Context, r PRRow) error { + r = r.withDefaults() + s.writeMu.Lock() + defer s.writeMu.Unlock() + return s.qw.UpsertPR(ctx, gen.UpsertPRParams{ + Url: r.URL, + SessionID: r.SessionID, + Number: r.Number, + PrState: r.State, + ReviewDecision: r.ReviewDecision, + CiState: r.CIState, + Mergeability: r.Mergeability, + UpdatedAt: r.UpdatedAt, + }) +} + +// WritePRObservation persists a full PR observation — scalar facts, check runs, +// and the replacement comment set — in one write transaction, so the rows and +// the change_log events their triggers emit are committed all-or-nothing. The +// scalar PR upsert runs first so the checks'/comments' CDC triggers can resolve +// the session id from the pr row within the same transaction. 
+func (s *Store) WritePRObservation(ctx context.Context, pr PRRow, checks []PRCheckRow, comments []PRCommentRow) error { + pr = pr.withDefaults() + s.writeMu.Lock() + defer s.writeMu.Unlock() + return s.inTx(ctx, "write pr observation", func(q *gen.Queries) error { + if err := q.UpsertPR(ctx, gen.UpsertPRParams{ + Url: pr.URL, SessionID: pr.SessionID, Number: pr.Number, + PrState: pr.State, ReviewDecision: pr.ReviewDecision, + CiState: pr.CIState, Mergeability: pr.Mergeability, UpdatedAt: pr.UpdatedAt, + }); err != nil { + return err + } + for _, c := range checks { + if c.Status == "" { + c.Status = "unknown" + } + if err := q.UpsertPRCheck(ctx, gen.UpsertPRCheckParams{ + PrUrl: c.PRURL, Name: c.Name, CommitHash: c.CommitHash, + Status: c.Status, Url: c.URL, LogTail: c.LogTail, CreatedAt: c.CreatedAt, + }); err != nil { + return err + } + } + if err := q.DeletePRComments(ctx, pr.URL); err != nil { + return err + } + for _, cm := range comments { + if err := q.UpsertPRComment(ctx, gen.UpsertPRCommentParams{ + PrUrl: pr.URL, CommentID: cm.CommentID, Author: cm.Author, File: cm.File, + Line: cm.Line, Body: cm.Body, Resolved: boolToInt(cm.Resolved), CreatedAt: cm.CreatedAt, + }); err != nil { + return fmt.Errorf("comment %q: %w", cm.CommentID, err) + } + } + return nil + }) +} + +// withDefaults fills empty enum fields with their "nothing known yet" value so a +// partial row satisfies the CHECK constraints (matches UpsertPR). +func (r PRRow) withDefaults() PRRow { + if r.State == "" { + r.State = "open" + } + if r.ReviewDecision == "" { + r.ReviewDecision = "none" + } + if r.CIState == "" { + r.CIState = "unknown" + } + if r.Mergeability == "" { + r.Mergeability = "unknown" + } + return r +} + +// GetPR returns the PR facts for a URL, or ok=false if absent. 
+func (s *Store) GetPR(ctx context.Context, url string) (PRRow, bool, error) { + p, err := s.qr.GetPR(ctx, url) + if errors.Is(err, sql.ErrNoRows) { + return PRRow{}, false, nil + } + if err != nil { + return PRRow{}, false, fmt.Errorf("get pr %s: %w", url, err) + } + return prRowFromGen(p), true, nil +} + +// ListPRsBySession returns every PR owned by a session, newest first. +func (s *Store) ListPRsBySession(ctx context.Context, sessionID string) ([]PRRow, error) { + rows, err := s.qr.ListPRsBySession(ctx, sessionID) + if err != nil { + return nil, fmt.Errorf("list prs for %s: %w", sessionID, err) + } + out := make([]PRRow, 0, len(rows)) + for _, p := range rows { + out = append(out, prRowFromGen(p)) + } + return out, nil +} + +// DeletePR removes a PR (cascades to its checks + comments). +func (s *Store) DeletePR(ctx context.Context, url string) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + return s.qw.DeletePR(ctx, url) +} + +func prRowFromGen(p gen.Pr) PRRow { + return PRRow{ + URL: p.Url, + SessionID: p.SessionID, + Number: p.Number, + State: p.PrState, + ReviewDecision: p.ReviewDecision, + CIState: p.CiState, + Mergeability: p.Mergeability, + UpdatedAt: p.UpdatedAt, + } +} + +// ---- pr_checks: CI run history ---- + +// PRCheckRow is one CI check run for a PR (one row per check name per commit). +type PRCheckRow struct { + PRURL string + Name string + CommitHash string + Status string // unknown | queued | in_progress | passed | failed | skipped | cancelled + URL string + LogTail string + CreatedAt time.Time +} + +// RecordCheck upserts a CI check run. Re-polling the same (pr, name, commit) +// updates the same row; a new commit creates a new row (a fresh agent attempt). 
+func (s *Store) RecordCheck(ctx context.Context, r PRCheckRow) error { + if r.Status == "" { + r.Status = "unknown" + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + return s.qw.UpsertPRCheck(ctx, gen.UpsertPRCheckParams{ + PrUrl: r.PRURL, + Name: r.Name, + CommitHash: r.CommitHash, + Status: r.Status, + Url: r.URL, + LogTail: r.LogTail, + CreatedAt: r.CreatedAt, + }) +} + +// RecentCheckStatuses returns the statuses of the last `limit` runs of a check, +// most-recent first. The CI-fix-loop brake reads this: "last 3 all failed?". +func (s *Store) RecentCheckStatuses(ctx context.Context, prURL, name string, limit int) ([]string, error) { + rows, err := s.qr.ListRecentChecks(ctx, gen.ListRecentChecksParams{ + PrUrl: prURL, Name: name, Limit: int64(limit), + }) + if err != nil { + return nil, fmt.Errorf("recent checks %s/%s: %w", prURL, name, err) + } + out := make([]string, 0, len(rows)) + for _, r := range rows { + out = append(out, r.Status) + } + return out, nil +} + +// ListChecks returns every recorded check run for a PR. +func (s *Store) ListChecks(ctx context.Context, prURL string) ([]PRCheckRow, error) { + rows, err := s.qr.ListChecksByPR(ctx, prURL) + if err != nil { + return nil, fmt.Errorf("list checks %s: %w", prURL, err) + } + out := make([]PRCheckRow, 0, len(rows)) + for _, c := range rows { + out = append(out, PRCheckRow{ + PRURL: c.PrUrl, Name: c.Name, CommitHash: c.CommitHash, + Status: c.Status, URL: c.Url, LogTail: c.LogTail, CreatedAt: c.CreatedAt, + }) + } + return out, nil +} + +// ---- pr_comment ---- + +// PRCommentRow is one review comment on a PR. +type PRCommentRow struct { + PRURL string + CommentID string + Author string + File string + Line int64 + Body string + Resolved bool + CreatedAt time.Time +} + +// ReplacePRComments atomically replaces the full comment set for a PR (each SCM +// fetch reports the current set, so a replace keeps it in sync). 
+func (s *Store) ReplacePRComments(ctx context.Context, prURL string, comments []PRCommentRow) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + return s.inTx(ctx, "replace pr comments", func(q *gen.Queries) error { + if err := q.DeletePRComments(ctx, prURL); err != nil { + return err + } + for _, c := range comments { + if err := q.UpsertPRComment(ctx, gen.UpsertPRCommentParams{ + PrUrl: prURL, + CommentID: c.CommentID, + Author: c.Author, + File: c.File, + Line: c.Line, + Body: c.Body, + Resolved: boolToInt(c.Resolved), + CreatedAt: c.CreatedAt, + }); err != nil { + return fmt.Errorf("comment %q: %w", c.CommentID, err) + } + } + return nil + }) +} + +// ListPRComments returns a PR's review comments, oldest first. +func (s *Store) ListPRComments(ctx context.Context, prURL string) ([]PRCommentRow, error) { + rows, err := s.qr.ListPRComments(ctx, prURL) + if err != nil { + return nil, fmt.Errorf("list pr comments %s: %w", prURL, err) + } + out := make([]PRCommentRow, 0, len(rows)) + for _, c := range rows { + out = append(out, PRCommentRow{ + PRURL: c.PrUrl, CommentID: c.CommentID, Author: c.Author, File: c.File, + Line: c.Line, Body: c.Body, Resolved: c.Resolved != 0, CreatedAt: c.CreatedAt, + }) + } + return out, nil +} diff --git a/backend/internal/storage/sqlite/project_store.go b/backend/internal/storage/sqlite/project_store.go new file mode 100644 index 00000000..d81943c3 --- /dev/null +++ b/backend/internal/storage/sqlite/project_store.go @@ -0,0 +1,93 @@ +package sqlite + +import ( + "context" + "database/sql" + "errors" + "fmt" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" +) + +// ProjectRow is one registered repo (the projects table). id is a short slug +// (mer, ao). ArchivedAt zero means active. +type ProjectRow struct { + ID string + Path string + RepoOriginURL string + DisplayName string + RegisteredAt time.Time + ArchivedAt time.Time +} + +// UpsertProject inserts or updates a registered project. 
+func (s *Store) UpsertProject(ctx context.Context, r ProjectRow) error {
+	s.writeMu.Lock()
+	defer s.writeMu.Unlock()
+	// On an id conflict the query rewrites path, repo_origin_url, display_name
+	// and archived_at; registered_at keeps the value from the first insert. A
+	// zero ArchivedAt is written as NULL, i.e. the project is (re)activated.
+	err := s.qw.UpsertProject(ctx, gen.UpsertProjectParams{
+		ID:            r.ID,
+		Path:          r.Path,
+		RepoOriginUrl: r.RepoOriginURL,
+		DisplayName:   r.DisplayName,
+		RegisteredAt:  r.RegisteredAt,
+		ArchivedAt:    nullTime(r.ArchivedAt),
+	})
+	if err != nil {
+		// Wrapped like GetProject/ListProjects so the failing project is named.
+		return fmt.Errorf("upsert project %s: %w", r.ID, err)
+	}
+	return nil
+}
+
+// GetProject returns a project by id (active or archived), or ok=false. A
+// missing row is not an error: it is reported as (zero row, false, nil).
+func (s *Store) GetProject(ctx context.Context, id string) (ProjectRow, bool, error) {
+	p, err := s.qr.GetProject(ctx, id)
+	if errors.Is(err, sql.ErrNoRows) {
+		return ProjectRow{}, false, nil
+	}
+	if err != nil {
+		return ProjectRow{}, false, fmt.Errorf("get project %s: %w", id, err)
+	}
+	return projectRowFromGen(p), true, nil
+}
+
+// ListProjects returns active (non-archived) projects, ordered by id. The
+// result is empty but non-nil when there are none.
+func (s *Store) ListProjects(ctx context.Context) ([]ProjectRow, error) {
+	rows, err := s.qr.ListProjects(ctx)
+	if err != nil {
+		return nil, fmt.Errorf("list projects: %w", err)
+	}
+	out := make([]ProjectRow, 0, len(rows))
+	for _, p := range rows {
+		out = append(out, projectRowFromGen(p))
+	}
+	return out, nil
+}
+
+// ArchiveProject soft-deletes a project (the row stays so session.project_id
+// still resolves).
+func (s *Store) ArchiveProject(ctx context.Context, id string, at time.Time) error {
+	// nullTime turns the zero time into NULL, and a NULL archived_at is what the
+	// active-project listing selects on. A zero `at` would therefore un-archive
+	// the project while reporting success, so it is rejected up front.
+	if at.IsZero() {
+		return fmt.Errorf("archive project %s: archive time must be non-zero", id)
+	}
+	s.writeMu.Lock()
+	defer s.writeMu.Unlock()
+	if err := s.qw.ArchiveProject(ctx, gen.ArchiveProjectParams{
+		ArchivedAt: nullTime(at),
+		ID:         id,
+	}); err != nil {
+		return fmt.Errorf("archive project %s: %w", id, err)
+	}
+	return nil
+}
+
+// projectRowFromGen converts a generated projects row into a ProjectRow. A NULL
+// archived_at becomes the zero time (active).
+func projectRowFromGen(p gen.Project) ProjectRow {
+	r := ProjectRow{
+		ID:            p.ID,
+		Path:          p.Path,
+		RepoOriginURL: p.RepoOriginUrl,
+		DisplayName:   p.DisplayName,
+		RegisteredAt:  p.RegisteredAt,
+	}
+	if p.ArchivedAt.Valid {
+		r.ArchivedAt = p.ArchivedAt.Time
+	}
+	return r
+}
+
+// nullTime maps the zero time to SQL NULL and any other time to a valid
+// sql.NullTime carrying it.
+func nullTime(t time.Time) sql.NullTime {
+	if t.IsZero() {
+		return sql.NullTime{}
+	}
+	return sql.NullTime{Time: t, Valid: true}
+}
diff --git a/backend/internal/storage/sqlite/queries/changelog.sql b/backend/internal/storage/sqlite/queries/changelog.sql
new file mode 100644
index 00000000..0e11899c
--- /dev/null
+++ b/backend/internal/storage/sqlite/queries/changelog.sql
@@ -0,0 +1,10 @@
+-- name: ReadChangeLogAfter :many
+SELECT seq, project_id, session_id, event_type, payload, created_at
+FROM change_log WHERE seq > ? ORDER BY seq LIMIT ?;
+
+-- name: ReadChangeLogAfterForProject :many
+SELECT seq, project_id, session_id, event_type, payload, created_at
+FROM change_log WHERE project_id = ? AND seq > ? ORDER BY seq LIMIT ?;
+
+-- name: MaxChangeLogSeq :one
+SELECT COALESCE(MAX(seq), 0) AS seq FROM change_log;
diff --git a/backend/internal/storage/sqlite/queries/pr.sql b/backend/internal/storage/sqlite/queries/pr.sql
new file mode 100644
index 00000000..e6b41cf1
--- /dev/null
+++ b/backend/internal/storage/sqlite/queries/pr.sql
@@ -0,0 +1,20 @@
+-- name: UpsertPR :exec
+INSERT INTO pr (url, session_id, number, pr_state, review_decision, ci_state, mergeability, updated_at)
+VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+ON CONFLICT (url) DO UPDATE SET
+    session_id = excluded.session_id,
+    number = excluded.number,
+    pr_state = excluded.pr_state,
+    review_decision = excluded.review_decision,
+    ci_state = excluded.ci_state,
+    mergeability = excluded.mergeability,
+    updated_at = excluded.updated_at;
+
+-- name: GetPR :one
+SELECT * FROM pr WHERE url = ?;
+
+-- name: ListPRsBySession :many
+SELECT * FROM pr WHERE session_id = ? ORDER BY updated_at DESC;
+
+-- name: DeletePR :exec
+DELETE FROM pr WHERE url = ?;
diff --git a/backend/internal/storage/sqlite/queries/pr_checks.sql b/backend/internal/storage/sqlite/queries/pr_checks.sql
new file mode 100644
index 00000000..2e3e3c15
--- /dev/null
+++ b/backend/internal/storage/sqlite/queries/pr_checks.sql
@@ -0,0 +1,15 @@
+-- name: UpsertPRCheck :exec
+INSERT INTO pr_checks (pr_url, name, commit_hash, status, url, log_tail, created_at)
+VALUES (?, ?, ?, ?, ?, ?, ?)
+ON CONFLICT (pr_url, name, commit_hash) DO UPDATE SET
+    status = excluded.status,
+    url = excluded.url,
+    log_tail = excluded.log_tail;
+
+-- name: ListRecentChecks :many
+SELECT status, commit_hash, created_at FROM pr_checks
+WHERE pr_url = ? AND name = ?
+ORDER BY created_at DESC LIMIT ?;
+
+-- name: ListChecksByPR :many
+SELECT * FROM pr_checks WHERE pr_url = ? ORDER BY name, created_at;
diff --git a/backend/internal/storage/sqlite/queries/pr_comment.sql b/backend/internal/storage/sqlite/queries/pr_comment.sql
new file mode 100644
index 00000000..df4f99d0
--- /dev/null
+++ b/backend/internal/storage/sqlite/queries/pr_comment.sql
@@ -0,0 +1,12 @@
+-- name: UpsertPRComment :exec
+INSERT INTO pr_comment (pr_url, comment_id, author, file, line, body, resolved, created_at)
+VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+ON CONFLICT (pr_url, comment_id) DO UPDATE SET
+    author = excluded.author, file = excluded.file, line = excluded.line,
+    body = excluded.body, resolved = excluded.resolved;
+
+-- name: DeletePRComments :exec
+DELETE FROM pr_comment WHERE pr_url = ?;
+
+-- name: ListPRComments :many
+SELECT * FROM pr_comment WHERE pr_url = ? ORDER BY created_at, comment_id;
diff --git a/backend/internal/storage/sqlite/queries/projects.sql b/backend/internal/storage/sqlite/queries/projects.sql
new file mode 100644
index 00000000..3dc28950
--- /dev/null
+++ b/backend/internal/storage/sqlite/queries/projects.sql
@@ -0,0 +1,19 @@
+-- name: UpsertProject :exec
+INSERT INTO projects (id, path, repo_origin_url, display_name, registered_at, archived_at)
+VALUES (?, ?, ?, ?, ?, ?)
+ON CONFLICT (id) DO UPDATE SET
+    path = excluded.path,
+    repo_origin_url = excluded.repo_origin_url,
+    display_name = excluded.display_name,
+    archived_at = excluded.archived_at;
+
+-- name: GetProject :one
+SELECT id, path, repo_origin_url, display_name, registered_at, archived_at
+FROM projects WHERE id = ?;
+
+-- name: ListProjects :many
+SELECT id, path, repo_origin_url, display_name, registered_at, archived_at
+FROM projects WHERE archived_at IS NULL ORDER BY id;
+
+-- name: ArchiveProject :exec
+UPDATE projects SET archived_at = ? WHERE id = ?;
diff --git a/backend/internal/storage/sqlite/queries/sessions.sql b/backend/internal/storage/sqlite/queries/sessions.sql
new file mode 100644
index 00000000..9b294de3
--- /dev/null
+++ b/backend/internal/storage/sqlite/queries/sessions.sql
@@ -0,0 +1,34 @@
+-- name: NextSessionNum :one
+SELECT COALESCE(MAX(num), 0) + 1 AS next FROM sessions WHERE project_id = ?;
+
+-- name: InsertSession :exec
+INSERT INTO sessions (
+    id, project_id, num, issue_id, kind, harness,
+    session_state, termination_reason, is_alive,
+    activity_state, activity_last_at, activity_source,
+    detecting_attempts, detecting_started_at, detecting_evidence_hash,
+    branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt,
+    created_at, updated_at
+) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
+
+-- name: UpdateSession :exec
+UPDATE sessions SET
+    issue_id = ?, kind = ?, harness = ?,
+    session_state = ?, termination_reason = ?, is_alive = ?,
+    activity_state = ?, activity_last_at = ?, activity_source = ?,
+    detecting_attempts = ?, detecting_started_at = ?, detecting_evidence_hash = ?,
+    branch = ?, workspace_path = ?, runtime_handle_id = ?, runtime_name = ?, agent_session_id = ?, prompt = ?,
+    updated_at = ?
+WHERE id = ?;
+
+-- name: GetSession :one
+SELECT * FROM sessions WHERE id = ?;
+
+-- name: ListSessionsByProject :many
+SELECT * FROM sessions WHERE project_id = ? ORDER BY num;
+
+-- name: ListAllSessions :many
+SELECT * FROM sessions ORDER BY project_id, num;
+
+-- name: DeleteSession :exec
+DELETE FROM sessions WHERE id = ?;
diff --git a/backend/internal/storage/sqlite/store.go b/backend/internal/storage/sqlite/store.go
new file mode 100644
index 00000000..800c1824
--- /dev/null
+++ b/backend/internal/storage/sqlite/store.go
@@ -0,0 +1,134 @@
+package sqlite
+
+import (
+	"context"
+	"database/sql"
+	"errors"
+	"fmt"
+	"sync"
+
+	"github.com/aoagents/agent-orchestrator/backend/internal/domain"
+	"github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen"
+)
+
+// Store is the SQLite-backed persistence layer. It routes writes to a single
+// writer connection (qw) and reads to a reader pool (qr) — see Open. writeMu
+// guards the read-modify-write write methods (e.g. CreateSession's
+// next-num-then-insert) so concurrent writes can't interleave them.
+//
+// CDC is captured by DB triggers (migration 0001), NOT by this layer: the store
+// never writes change_log, it only reads it for the CDC poller.
+type Store struct {
+	writeDB *sql.DB
+	readDB  *sql.DB
+	qw      *gen.Queries // bound to the single writer connection
+	qr      *gen.Queries // bound to the reader pool
+	writeMu sync.Mutex
+}
+
+// NewStore wraps an opened writer + reader *sql.DB (see Open) as a Store.
+func NewStore(writeDB, readDB *sql.DB) *Store {
+	return &Store{
+		writeDB: writeDB,
+		readDB:  readDB,
+		qw:      gen.New(writeDB),
+		qr:      gen.New(readDB),
+	}
+}
+
+// Close closes both pools, writer first. Both closes are always attempted; if
+// either fails the returned error reports every failure, not just the first.
+func (s *Store) Close() error {
+	return errors.Join(s.writeDB.Close(), s.readDB.Close())
+}
+
+// ---- sessions ----
+
+// CreateSession assigns the per-project identity ("{project}-{num}") and inserts
+// the record, returning it with ID populated. The next-num read and the insert
+// run on the writer connection under writeMu, so two concurrent creates in the
+// same project can't collide on num.
+func (s *Store) CreateSession(ctx context.Context, rec domain.SessionRecord) (domain.SessionRecord, error) {
+	// writeMu is held across both statements: the num read and the insert must
+	// not interleave with another create for the same project.
+	s.writeMu.Lock()
+	defer s.writeMu.Unlock()
+
+	num, err := s.qw.NextSessionNum(ctx, string(rec.ProjectID))
+	if err != nil {
+		return domain.SessionRecord{}, fmt.Errorf("next session num for %s: %w", rec.ProjectID, err)
+	}
+	rec.ID = domain.SessionID(fmt.Sprintf("%s-%d", rec.ProjectID, num))
+	if err := s.qw.InsertSession(ctx, recordToInsert(rec, num)); err != nil {
+		return domain.SessionRecord{}, fmt.Errorf("insert session %s: %w", rec.ID, err)
+	}
+	return rec, nil
+}
+
+// UpdateSession writes the full mutable state of an existing session. The
+// id/project/num/created_at are immutable and not touched here. A failed write
+// is returned wrapped with the session id.
+//
+// NOTE(review): the underlying query is declared :exec, so an id that matches no
+// row presumably returns nil rather than a not-found error — confirm whether
+// callers need that signal.
+func (s *Store) UpdateSession(ctx context.Context, rec domain.SessionRecord) error {
+	s.writeMu.Lock()
+	defer s.writeMu.Unlock()
+	if err := s.qw.UpdateSession(ctx, recordToUpdate(rec)); err != nil {
+		return fmt.Errorf("update session %s: %w", rec.ID, err)
+	}
+	return nil
+}
+
+// GetSession returns the full record for a session, or ok=false if absent. A
+// missing row is not an error: it is reported as (zero record, false, nil).
+func (s *Store) GetSession(ctx context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) {
+	row, err := s.qr.GetSession(ctx, string(id))
+	if errors.Is(err, sql.ErrNoRows) {
+		return domain.SessionRecord{}, false, nil
+	}
+	if err != nil {
+		return domain.SessionRecord{}, false, fmt.Errorf("get session %s: %w", id, err)
+	}
+	return rowToRecord(row), true, nil
+}
+
+// ListSessions returns every session in a project, ordered by num.
+func (s *Store) ListSessions(ctx context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) {
+	rows, err := s.qr.ListSessionsByProject(ctx, string(project))
+	if err != nil {
+		return nil, fmt.Errorf("list sessions for %s: %w", project, err)
+	}
+	return mapSessionRows(rows), nil
+}
+
+// ListAllSessions returns every session across all projects.
+func (s *Store) ListAllSessions(ctx context.Context) ([]domain.SessionRecord, error) { + rows, err := s.qr.ListAllSessions(ctx) + if err != nil { + return nil, fmt.Errorf("list all sessions: %w", err) + } + return mapSessionRows(rows), nil +} + +// DeleteSession removes a session (cascades to its pr/checks/comments). +func (s *Store) DeleteSession(ctx context.Context, id domain.SessionID) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + return s.qw.DeleteSession(ctx, string(id)) +} + +func mapSessionRows(rows []gen.Session) []domain.SessionRecord { + out := make([]domain.SessionRecord, 0, len(rows)) + for _, r := range rows { + out = append(out, rowToRecord(r)) + } + return out +} + +// inTx runs fn inside a single write transaction on the writer connection, +// rolling back on error. The caller must already hold writeMu. +func (s *Store) inTx(ctx context.Context, what string, fn func(*gen.Queries) error) error { + tx, err := s.writeDB.BeginTx(ctx, nil) + if err != nil { + return fmt.Errorf("begin %s: %w", what, err) + } + defer tx.Rollback() + if err := fn(s.qw.WithTx(tx)); err != nil { + return fmt.Errorf("%s: %w", what, err) + } + return tx.Commit() +} diff --git a/backend/internal/storage/sqlite/store_test.go b/backend/internal/storage/sqlite/store_test.go new file mode 100644 index 00000000..55165c41 --- /dev/null +++ b/backend/internal/storage/sqlite/store_test.go @@ -0,0 +1,316 @@ +package sqlite + +import ( + "context" + "fmt" + "sync" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" +) + +func newTestStore(t *testing.T) *Store { + t.Helper() + s, err := Open(t.TempDir()) + if err != nil { + t.Fatalf("open: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + return s +} + +func seedProject(t *testing.T, s *Store, id string) { + t.Helper() + if err := s.UpsertProject(context.Background(), ProjectRow{ + ID: id, Path: "/tmp/" + id, RegisteredAt: time.Now().UTC().Truncate(time.Second), + }); err != nil { + 
t.Fatalf("seed project %s: %v", id, err) + } +} + +func sampleRecord(project string) domain.SessionRecord { + now := time.Now().UTC().Truncate(time.Second) + return domain.SessionRecord{ + ProjectID: domain.ProjectID(project), + Kind: domain.KindWorker, + Lifecycle: domain.CanonicalSessionLifecycle{ + Version: domain.LifecycleVersion, + Harness: domain.HarnessClaudeCode, + IsAlive: true, + Session: domain.SessionSubstate{State: domain.SessionWorking}, + Activity: domain.ActivitySubstate{ + State: domain.ActivityActive, LastActivityAt: now, Source: domain.SourceNative, + }, + }, + Metadata: domain.SessionMetadata{Branch: "feat/x", WorkspacePath: "/ws"}, + CreatedAt: now, + UpdatedAt: now, + } +} + +func TestProjectCRUDAndArchive(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + seedProject(t, s, "mer") + + got, ok, err := s.GetProject(ctx, "mer") + if err != nil || !ok { + t.Fatalf("get: ok=%v err=%v", ok, err) + } + if got.ID != "mer" || got.Path != "/tmp/mer" { + t.Fatalf("project = %+v", got) + } + if list, _ := s.ListProjects(ctx); len(list) != 1 { + t.Fatalf("active list = %d, want 1", len(list)) + } + // archive hides from the active list but still resolves by id. 
+ if err := s.ArchiveProject(ctx, "mer", time.Now().UTC()); err != nil { + t.Fatal(err) + } + if list, _ := s.ListProjects(ctx); len(list) != 0 { + t.Fatalf("after archive, active list = %d, want 0", len(list)) + } + if _, ok, _ := s.GetProject(ctx, "mer"); !ok { + t.Fatal("archived project must still resolve by id") + } +} + +func TestSessionCreateAssignsPerProjectID(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + seedProject(t, s, "mer") + seedProject(t, s, "ao") + + r1, err := s.CreateSession(ctx, sampleRecord("mer")) + if err != nil { + t.Fatal(err) + } + r2, _ := s.CreateSession(ctx, sampleRecord("mer")) + r3, _ := s.CreateSession(ctx, sampleRecord("ao")) + if r1.ID != "mer-1" || r2.ID != "mer-2" || r3.ID != "ao-1" { + t.Fatalf("ids = %s, %s, %s; want mer-1, mer-2, ao-1", r1.ID, r2.ID, r3.ID) + } + got, ok, err := s.GetSession(ctx, "mer-1") + if err != nil || !ok { + t.Fatalf("get: ok=%v err=%v", ok, err) + } + if got.Lifecycle.Session.State != domain.SessionWorking || !got.Lifecycle.IsAlive || + got.Lifecycle.Harness != domain.HarnessClaudeCode || got.Metadata.Branch != "feat/x" { + t.Fatalf("round-trip mismatch: %+v", got) + } + if list, _ := s.ListSessions(ctx, "mer"); len(list) != 2 { + t.Fatalf("list mer = %d, want 2", len(list)) + } + if all, _ := s.ListAllSessions(ctx); len(all) != 3 { + t.Fatalf("list all = %d, want 3", len(all)) + } +} + +func TestSessionUpdateAndDetecting(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + seedProject(t, s, "mer") + r, _ := s.CreateSession(ctx, sampleRecord("mer")) + + r.Lifecycle.Session = domain.SessionSubstate{State: domain.SessionDetecting} + r.Lifecycle.IsAlive = false + r.Lifecycle.Detecting = &domain.DetectingState{Attempts: 2, StartedAt: r.CreatedAt, EvidenceHash: "abc"} + if err := s.UpdateSession(ctx, r); err != nil { + t.Fatal(err) + } + got, _, _ := s.GetSession(ctx, r.ID) + if got.Lifecycle.Session.State != domain.SessionDetecting || got.Lifecycle.IsAlive { + 
t.Fatalf("update not persisted: %+v", got.Lifecycle.Session) + } + if got.Lifecycle.Detecting == nil || got.Lifecycle.Detecting.Attempts != 2 || got.Lifecycle.Detecting.EvidenceHash != "abc" { + t.Fatalf("detecting not round-tripped: %+v", got.Lifecycle.Detecting) + } + // clearing detecting persists as nil. + got.Lifecycle.Detecting = nil + got.Lifecycle.Session = domain.SessionSubstate{State: domain.SessionWorking} + _ = s.UpdateSession(ctx, got) + again, _, _ := s.GetSession(ctx, r.ID) + if again.Lifecycle.Detecting != nil { + t.Fatalf("detecting should clear to nil, got %+v", again.Lifecycle.Detecting) + } +} + +func TestPRCRUD(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + seedProject(t, s, "mer") + r, _ := s.CreateSession(ctx, sampleRecord("mer")) + now := time.Now().UTC().Truncate(time.Second) + + pr := PRRow{ + URL: "https://gh/pr/1", SessionID: string(r.ID), Number: 1, State: "open", + ReviewDecision: "review_required", CIState: "failing", Mergeability: "blocked", UpdatedAt: now, + } + if err := s.UpsertPR(ctx, pr); err != nil { + t.Fatal(err) + } + got, ok, err := s.GetPR(ctx, pr.URL) + if err != nil || !ok || got != pr { + t.Fatalf("get pr: ok=%v err=%v got=%+v", ok, err, got) + } + if list, _ := s.ListPRsBySession(ctx, string(r.ID)); len(list) != 1 { + t.Fatalf("list prs = %d, want 1", len(list)) + } + if err := s.DeletePR(ctx, pr.URL); err != nil { + t.Fatal(err) + } + if _, ok, _ := s.GetPR(ctx, pr.URL); ok { + t.Fatal("pr should be gone") + } +} + +func TestPRChecksLoopBrakeQuery(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + seedProject(t, s, "mer") + r, _ := s.CreateSession(ctx, sampleRecord("mer")) + now := time.Now().UTC().Truncate(time.Second) + _ = s.UpsertPR(ctx, PRRow{URL: "pr1", SessionID: string(r.ID), State: "open", UpdatedAt: now}) + + // three consecutive failing runs of "build" (one per commit). 
+ for i := 1; i <= 3; i++ { + if err := s.RecordCheck(ctx, PRCheckRow{ + PRURL: "pr1", Name: "build", CommitHash: fmt.Sprintf("c%d", i), + Status: "failed", CreatedAt: now.Add(time.Duration(i) * time.Second), + }); err != nil { + t.Fatal(err) + } + } + last3, err := s.RecentCheckStatuses(ctx, "pr1", "build", 3) + if err != nil { + t.Fatal(err) + } + if len(last3) != 3 || last3[0] != "failed" || last3[1] != "failed" || last3[2] != "failed" { + t.Fatalf("recent statuses = %v, want 3x failed (loop brake would trip)", last3) + } + // a pass on a newer commit breaks the streak. + _ = s.RecordCheck(ctx, PRCheckRow{PRURL: "pr1", Name: "build", CommitHash: "c4", Status: "passed", CreatedAt: now.Add(4 * time.Second)}) + last3, _ = s.RecentCheckStatuses(ctx, "pr1", "build", 3) + if last3[0] != "passed" { + t.Fatalf("most recent should be passed, got %v", last3) + } +} + +func TestPRCommentsReplace(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + seedProject(t, s, "mer") + r, _ := s.CreateSession(ctx, sampleRecord("mer")) + now := time.Now().UTC().Truncate(time.Second) + _ = s.UpsertPR(ctx, PRRow{URL: "pr1", SessionID: string(r.ID), State: "open", UpdatedAt: now}) + + _ = s.ReplacePRComments(ctx, "pr1", []PRCommentRow{ + {PRURL: "pr1", CommentID: "c1", Author: "a", File: "a.go", Line: 1, Body: "nit", CreatedAt: now}, + {PRURL: "pr1", CommentID: "c2", Author: "b", File: "b.go", Line: 2, Body: "bug", Resolved: true, CreatedAt: now.Add(time.Second)}, + }) + if list, _ := s.ListPRComments(ctx, "pr1"); len(list) != 2 { + t.Fatalf("comments = %d, want 2", len(list)) + } + // replace with a smaller set drops the rest. 
+ _ = s.ReplacePRComments(ctx, "pr1", []PRCommentRow{{PRURL: "pr1", CommentID: "c1", Body: "x", CreatedAt: now}}) + if list, _ := s.ListPRComments(ctx, "pr1"); len(list) != 1 { + t.Fatalf("after replace, comments = %d, want 1", len(list)) + } +} + +func TestCDCTriggersPopulateChangeLog(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + seedProject(t, s, "mer") + + r, _ := s.CreateSession(ctx, sampleRecord("mer")) + // a real state change logs; a metadata-only change does not (WHEN guard). + r.Lifecycle.Session = domain.SessionSubstate{State: domain.SessionIdle} + _ = s.UpdateSession(ctx, r) + r.Metadata.Prompt = "only metadata changed" + _ = s.UpdateSession(ctx, r) + // a PR insert logs too. + _ = s.UpsertPR(ctx, PRRow{URL: "pr1", SessionID: string(r.ID), State: "open", UpdatedAt: r.UpdatedAt}) + + evs, err := s.ReadChangeLogAfter(ctx, 0, 100) + if err != nil { + t.Fatal(err) + } + var types []string + for _, e := range evs { + if e.ProjectID != "mer" { + t.Fatalf("event project = %s, want mer", e.ProjectID) + } + types = append(types, e.EventType) + } + want := []string{"session_created", "session_updated", "pr_created"} + if len(types) != 3 || types[0] != want[0] || types[1] != want[1] || types[2] != want[2] { + t.Fatalf("change_log event types = %v, want %v (metadata-only update suppressed)", types, want) + } + max, _ := s.MaxChangeLogSeq(ctx) + if max != int64(len(evs)) { + t.Fatalf("max seq = %d, want %d", max, len(evs)) + } +} + +func TestConcurrentSessionCreateAssignsUniqueNums(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + seedProject(t, s, "mer") + + const n = 20 + var wg sync.WaitGroup + ids := make([]string, n) + for i := 0; i < n; i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + r, err := s.CreateSession(ctx, sampleRecord("mer")) + if err != nil { + t.Errorf("create: %v", err) + return + } + ids[i] = string(r.ID) + }(i) + } + wg.Wait() + + seen := map[string]bool{} + for _, id := range ids { + if id == 
"" || seen[id] { + t.Fatalf("duplicate or empty id: %q in %v", id, ids) + } + seen[id] = true + } + if all, _ := s.ListAllSessions(ctx); len(all) != n { + t.Fatalf("created %d sessions, want %d", len(all), n) + } +} + +func TestTerminationReasonRoundTripAndCheck(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + seedProject(t, s, "mer") + r, _ := s.CreateSession(ctx, sampleRecord("mer")) + + // terminate with a valid reason -> round-trips. + r.Lifecycle.Session = domain.SessionSubstate{State: domain.SessionTerminated} + r.Lifecycle.TerminationReason = domain.TermManuallyKilled + if err := s.UpdateSession(ctx, r); err != nil { + t.Fatal(err) + } + got, _, _ := s.GetSession(ctx, r.ID) + if got.Lifecycle.TerminationReason != domain.TermManuallyKilled { + t.Fatalf("termination_reason = %q, want manually_killed", got.Lifecycle.TerminationReason) + } + if domain.DeriveStatus(got.Lifecycle, domain.PRFacts{}) != domain.StatusKilled { + t.Fatal("terminated+manually_killed should derive to killed") + } + + // an off-enum reason is rejected by the CHECK constraint. + r.Lifecycle.TerminationReason = domain.TerminationReason("definitely_not_a_reason") + if err := s.UpdateSession(ctx, r); err == nil { + t.Fatal("expected CHECK constraint to reject an invalid termination_reason") + } +} diff --git a/backend/lifecycle_wiring.go b/backend/lifecycle_wiring.go new file mode 100644 index 00000000..d736d653 --- /dev/null +++ b/backend/lifecycle_wiring.go @@ -0,0 +1,139 @@ +package main + +import ( + "context" + "log/slog" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" + "github.com/aoagents/agent-orchestrator/backend/internal/observe/reaper" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" +) + +// lifecycleStack owns the running LCM + reaper. 
The LCM is the sole writer of +// canonical transitions; the reaper is the OBSERVE-layer timer that probes live +// runtimes and reports facts back through it. +type lifecycleStack struct { + LCM *lifecycle.Manager + reaperDone <-chan struct{} +} + +// startLifecycle constructs the LCM over the store adapter and starts the reaper. +// The goroutine stops when ctx is cancelled; Stop waits for it to drain. +// +// TEMPORARY STUBS (replace as the daemon lane lands the collaborators): +// - noopNotifier — swap for the notifier multiplexer (desktop/Slack/webhook). +// - noopMessenger — swap for the runtime/agent-plugin-backed AgentMessenger. +// - reaper.MapRegistry{} — empty runtime registry, so the reaper ticks +// escalations but probes nothing until the runtime plugins exist. +func startLifecycle(ctx context.Context, store *sqlite.Store, logger *slog.Logger) (*lifecycleStack, error) { + a := storeAdapter{store} + lcm := lifecycle.New(a, a, noopNotifier{}, noopMessenger{}) + rp := reaper.New(lcm, reaper.MapRegistry{}, reaper.Config{Logger: logger}) + return &lifecycleStack{LCM: lcm, reaperDone: rp.Start(ctx)}, nil +} + +// Stop waits for the reaper goroutine to exit (the caller must have cancelled the +// ctx passed to startLifecycle). +func (l *lifecycleStack) Stop() { <-l.reaperDone } + +// storeAdapter bridges *sqlite.Store to the engine's ports. It embeds the store +// (so CreateSession/UpdateSession/GetSession/ListSessions/ListAllSessions and +// RecentCheckStatuses promote directly) and adds the PR conversions + the +// PRFacts read-model the display status needs. +type storeAdapter struct{ *sqlite.Store } + +var ( + _ ports.SessionStore = storeAdapter{} + _ ports.PRWriter = storeAdapter{} +) + +// PRFactsForSession picks the PR that drives display status — the most-recently +// updated non-closed PR, else the most recent — and folds in whether it has +// unresolved review comments. 
+func (a storeAdapter) PRFactsForSession(ctx context.Context, id domain.SessionID) (domain.PRFacts, error) {
+	prs, err := a.Store.ListPRsBySession(ctx, string(id)) // newest first
+	if err != nil {
+		return domain.PRFacts{}, err
+	}
+	if len(prs) == 0 {
+		// No PR at all: the zero PRFacts (Exists=false) is the answer.
+		return domain.PRFacts{}, nil
+	}
+
+	// Fall back to the newest PR, but prefer the newest one still draft or open.
+	chosen := prs[0]
+	for i := range prs {
+		if st := prs[i].State; st == "draft" || st == "open" {
+			chosen = prs[i]
+			break
+		}
+	}
+
+	comments, err := a.Store.ListPRComments(ctx, chosen.URL)
+	if err != nil {
+		return domain.PRFacts{}, err
+	}
+	// One unresolved comment is enough to raise the flag.
+	hasUnresolved := false
+	for i := range comments {
+		if !comments[i].Resolved {
+			hasUnresolved = true
+			break
+		}
+	}
+
+	return domain.PRFacts{
+		URL:            chosen.URL,
+		Number:         int(chosen.Number),
+		Exists:         true,
+		Draft:          chosen.State == "draft",
+		Merged:         chosen.State == "merged",
+		Closed:         chosen.State == "closed",
+		CI:             domain.CIState(chosen.CIState),
+		Review:         domain.ReviewDecision(chosen.ReviewDecision),
+		Mergeability:   domain.Mergeability(chosen.Mergeability),
+		ReviewComments: hasUnresolved,
+	}, nil
+}
+
+// WritePR converts one PR observation (the PR row, its check runs and its review
+// comments) from the ports types to the store's row types and hands all three
+// to the store's WritePRObservation.
+func (a storeAdapter) WritePR(ctx context.Context, pr ports.PRRow, checks []ports.PRCheckRow, comments []ports.PRComment) error {
+	prRow := sqlite.PRRow{
+		URL:            pr.URL,
+		SessionID:      pr.SessionID,
+		Number:         int64(pr.Number),
+		State:          prState(pr),
+		ReviewDecision: string(pr.Review),
+		CIState:        string(pr.CI),
+		Mergeability:   string(pr.Mergeability),
+		UpdatedAt:      pr.UpdatedAt,
+	}
+
+	// NOTE(review): check rows keep their own PRURL while comment rows below are
+	// forced to pr.URL — confirm callers always set PRURL on every check.
+	checkRows := make([]sqlite.PRCheckRow, 0, len(checks))
+	for _, chk := range checks {
+		checkRows = append(checkRows, sqlite.PRCheckRow{
+			PRURL:      chk.PRURL,
+			Name:       chk.Name,
+			CommitHash: chk.CommitHash,
+			Status:     chk.Status,
+			URL:        chk.URL,
+			LogTail:    chk.LogTail,
+			CreatedAt:  chk.CreatedAt,
+		})
+	}
+
+	commentRows := make([]sqlite.PRCommentRow, 0, len(comments))
+	for _, cm := range comments {
+		commentRows = append(commentRows, sqlite.PRCommentRow{
+			PRURL:     pr.URL,
+			CommentID: cm.ID,
+			Author:    cm.Author,
+			File:      cm.File,
+			Line:      int64(cm.Line),
+			Body:      cm.Body,
+			Resolved:  cm.Resolved,
+			CreatedAt: cm.CreatedAt,
+		})
+	}
+
+	return a.Store.WritePRObservation(ctx, prRow, checkRows, commentRows)
+}
+
+// prState collapses the
PR's bools into the single pr.state column value. +func prState(r ports.PRRow) string { + switch { + case r.Merged: + return "merged" + case r.Closed: + return "closed" + case r.Draft: + return "draft" + default: + return "open" + } +} + +// noopNotifier / noopMessenger are TEMPORARY stubs (see startLifecycle): the +// write path and CDC work without them; only the human push / agent nudge are +// absent until the real plugins are wired. +type noopNotifier struct{} + +func (noopNotifier) Notify(context.Context, ports.Event) error { return nil } + +type noopMessenger struct{} + +func (noopMessenger) Send(context.Context, domain.SessionID, string) error { return nil } diff --git a/backend/main.go b/backend/main.go index 78a23292..60d9e26e 100644 --- a/backend/main.go +++ b/backend/main.go @@ -15,6 +15,7 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/config" "github.com/aoagents/agent-orchestrator/backend/internal/httpd" "github.com/aoagents/agent-orchestrator/backend/internal/runfile" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" ) func main() { @@ -46,12 +47,60 @@ func run() error { return err } + // Open the durable store and bring up the CDC substrate: the DB triggers + // capture changes into change_log, the poller tails it, and the broadcaster + // fans events out to the SSE transport. The LCM/Session Manager and the HTTP + // API routes that drive and read this store are owned by the daemon lane and + // are wired there once their collaborators (Notifier, AgentMessenger, and the + // runtime/agent/workspace plugins) have production implementations; here we + // stand up the persistence + change-delivery foundation they build on. + store, err := sqlite.Open(cfg.DataDir) + if err != nil { + return fmt.Errorf("open store: %w", err) + } + defer store.Close() + // signal.NotifyContext cancels ctx on SIGINT/SIGTERM, which drives the - // graceful shutdown inside Server.Run. 
+ // graceful shutdown inside Server.Run and stops the background goroutines. ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) defer stop() - return srv.Run(ctx) + cdcPipe, err := startCDC(ctx, store, log) + if err != nil { + return err + } + + // Bring up the Lifecycle Manager (sole store writer) and the reaper (OBSERVE + // timer). This makes the write path live end-to-end: LCM write -> store -> DB + // trigger -> change_log -> poller -> broadcaster. The collaborators it needs + // that don't yet have production implementations (Notifier, AgentMessenger, + // runtime registry) are stubbed in lifecycle_wiring.go with TODO markers. + // + // NOT wired here yet — both await collaborators the daemon lane owns: + // - Session Manager: session.New needs Runtime/Agent/Workspace plugins to + // construct. Stubbing them would make Spawn a silent no-op (a footgun), + // so it's deferred rather than faked. The LCM already exposes the read + // surface (RunningSessions) the SM would wrap. + // - HTTP API routes: httpd.New takes no SM/LCM today; surfacing the store + // over HTTP needs a constructor signature change + handlers, tracked with + // the SM work since the routes call into it. + lcStack, err := startLifecycle(ctx, store, log) + if err != nil { + return err + } + + runErr := srv.Run(ctx) + + // Shut the background goroutines down in order: cancel the context FIRST so + // their loops exit, then wait for them to drain. Doing this explicitly (not + // via defer) avoids the LIFO trap where a Stop() that blocks on ctx-cancel + // runs before the cancel — which would hang any non-signal exit path. + stop() + lcStack.Stop() + if err := cdcPipe.Stop(); err != nil { + log.Error("cdc pipeline shutdown", "err", err) + } + return runErr } // newLogger returns the daemon's slog logger. 
It writes to stderr so the diff --git a/backend/sqlc.yaml b/backend/sqlc.yaml new file mode 100644 index 00000000..9659bf77 --- /dev/null +++ b/backend/sqlc.yaml @@ -0,0 +1,13 @@ +version: "2" +sql: + - engine: "sqlite" + schema: "internal/storage/sqlite/migrations" + queries: "internal/storage/sqlite/queries" + gen: + go: + package: "gen" + out: "internal/storage/sqlite/gen" + emit_json_tags: false + emit_prepared_queries: false + emit_interface: true + emit_empty_slices: true diff --git a/backend/wiring_test.go b/backend/wiring_test.go new file mode 100644 index 00000000..74b314b0 --- /dev/null +++ b/backend/wiring_test.go @@ -0,0 +1,71 @@ +package main + +import ( + "context" + "sync" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/cdc" + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" +) + +// TestWiring_WriteFlowsToBroadcaster exercises the real boot path end to end: +// a lifecycle write -> sqlite -> DB trigger -> change_log -> CDC poller -> +// broadcaster, through the production storeAdapter and cdcSource. 
+func TestWiring_WriteFlowsToBroadcaster(t *testing.T) { + ctx := context.Background() + store, err := sqlite.Open(t.TempDir()) + if err != nil { + t.Fatal(err) + } + defer store.Close() + + a := storeAdapter{store} + lcm := lifecycle.New(a, a, noopNotifier{}, noopMessenger{}) + + bcast := cdc.NewBroadcaster() + poller := cdc.NewPoller(cdcSource{store}, bcast, cdc.PollerConfig{}) + if err := poller.SeekToHead(ctx); err != nil { + t.Fatal(err) + } + + var mu sync.Mutex + var got []cdc.Event + bcast.Subscribe(func(e cdc.Event) { mu.Lock(); got = append(got, e); mu.Unlock() }) + + if err := store.UpsertProject(ctx, sqlite.ProjectRow{ID: "mer", Path: "/repo/mer"}); err != nil { + t.Fatal(err) + } + rec, err := store.CreateSession(ctx, domain.SessionRecord{ + ProjectID: "mer", Kind: domain.KindWorker, + Lifecycle: domain.CanonicalSessionLifecycle{Version: domain.LifecycleVersion, Session: domain.SessionSubstate{State: domain.SessionNotStarted}}, + }) + if err != nil { + t.Fatal(err) + } + // A real transition through the engine, which writes the row and fires the + // is_alive/activity_state CDC trigger. + if err := lcm.ApplyActivitySignal(ctx, rec.ID, ports.ActivitySignal{Valid: true, State: domain.ActivityActive, Timestamp: time.Now()}); err != nil { + t.Fatal(err) + } + + if err := poller.Poll(ctx); err != nil { + t.Fatal(err) + } + + mu.Lock() + defer mu.Unlock() + var sawSession bool + for _, e := range got { + if e.SessionID == string(rec.ID) { + sawSession = true + } + } + if !sawSession { + t.Fatalf("expected a change_log event for %s to reach the broadcaster, got %d events", rec.ID, len(got)) + } +}