From 4671d27307f05184c643f3d68f482717b9fc936c Mon Sep 17 00:00:00 2001 From: Dhruv Sharma Date: Sun, 31 May 2026 21:10:04 +0530 Subject: [PATCH 01/12] feat(backend): add cobra cli foundation --- README.md | 21 +- backend/cmd/ao/main.go | 15 + backend/go.mod | 3 + backend/go.sum | 8 + backend/internal/cli/completion.go | 31 ++ backend/internal/cli/doctor.go | 115 +++++ backend/internal/cli/output.go | 12 + backend/internal/cli/process.go | 30 ++ backend/internal/cli/process_unix.go | 24 ++ backend/internal/cli/process_windows.go | 29 ++ backend/internal/cli/root.go | 136 ++++++ backend/internal/cli/root_test.go | 159 +++++++ backend/internal/cli/start.go | 138 ++++++ backend/internal/cli/status.go | 187 +++++++++ backend/internal/cli/stop.go | 106 +++++ backend/internal/cli/version.go | 37 ++ backend/{ => internal/daemon}/cdc_wiring.go | 2 +- backend/internal/daemon/daemon.go | 126 ++++++ .../{ => internal/daemon}/lifecycle_wiring.go | 2 +- backend/{ => internal/daemon}/wiring_test.go | 2 +- backend/main.go | 125 +----- docs/README.md | 1 + docs/cli/README.md | 393 ++++++++++++++++++ 23 files changed, 1571 insertions(+), 131 deletions(-) create mode 100644 backend/cmd/ao/main.go create mode 100644 backend/internal/cli/completion.go create mode 100644 backend/internal/cli/doctor.go create mode 100644 backend/internal/cli/output.go create mode 100644 backend/internal/cli/process.go create mode 100644 backend/internal/cli/process_unix.go create mode 100644 backend/internal/cli/process_windows.go create mode 100644 backend/internal/cli/root.go create mode 100644 backend/internal/cli/root_test.go create mode 100644 backend/internal/cli/start.go create mode 100644 backend/internal/cli/status.go create mode 100644 backend/internal/cli/stop.go create mode 100644 backend/internal/cli/version.go rename backend/{ => internal/daemon}/cdc_wiring.go (99%) create mode 100644 backend/internal/daemon/daemon.go rename backend/{ => internal/daemon}/lifecycle_wiring.go (99%) rename backend/{ => internal/daemon}/wiring_test.go (99%) create mode 100644 docs/cli/README.md diff --git a/README.md b/README.md index f5c17deb..61a639d2 100644 --- a/README.md +++ b/README.md @@ -8,18 +8,24 @@ Lifecycle Manager + Session Manager lane in [`docs/architecture.md`](docs/archit ## Backend daemon -The Go binary in [`backend/`](backend/) is the HTTP daemon — a loopback-only -sidecar the Electron supervisor will spawn (Phase 1c). Phase 1a landed the -skeleton: chi router, middleware stack (recoverer → request-id → logger → -real-ip), `/healthz` + `/readyz`, atomic `running.json` PID/port handshake, -graceful shutdown on SIGINT/SIGTERM. +The Go backend now has a Cobra-based `ao` CLI in [`backend/cmd/ao`](backend/cmd/ao). +The CLI controls the HTTP daemon — a loopback-only sidecar the Electron +supervisor will also use. The daemon skeleton includes the chi router, +middleware stack (recoverer → request-id → logger → real-ip), `/healthz` + +`/readyz`, atomic `running.json` PID/port handshake, graceful shutdown on +SIGINT/SIGTERM, SQLite storage, CDC polling, and lifecycle/reaper wiring. ### Run ```bash cd backend -go run . # binds 127.0.0.1:3001 with all defaults -AO_PORT=3019 go run . # override per invocation +go run ./cmd/ao start # start the daemon and wait for readiness +go run ./cmd/ao status # inspect PID/port/health/readiness +go run ./cmd/ao stop # gracefully stop the daemon +go run ./cmd/ao daemon # internal daemon entrypoint + +go run . # compatibility wrapper; starts the daemon +AO_PORT=3019 go run ./cmd/ao start # override per invocation ``` Health check: @@ -48,4 +54,3 @@ is intentionally not env-configurable. cd backend gofmt -l . && go build ./... && go vet ./... && go test -race ./... ``` - diff --git a/backend/cmd/ao/main.go b/backend/cmd/ao/main.go new file mode 100644 index 00000000..1ee35d62 --- /dev/null +++ b/backend/cmd/ao/main.go @@ -0,0 +1,15 @@ +package main + +import ( + "fmt" + "os" + + "github.com/aoagents/agent-orchestrator/backend/internal/cli" +) + +func main() { + if err := cli.Execute(); err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } +} diff --git a/backend/go.mod b/backend/go.mod index f270a14c..adc56039 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -7,6 +7,7 @@ require ( github.com/creack/pty v1.1.24 github.com/go-chi/chi/v5 v5.1.0 github.com/pressly/goose/v3 v3.27.1 + github.com/spf13/cobra v1.10.1 gopkg.in/yaml.v3 v3.0.1 modernc.org/sqlite v1.51.0 ) @@ -14,11 +15,13 @@ require ( require ( github.com/dustin/go-humanize v1.0.1 // indirect github.com/google/uuid v1.6.0 // indirect + github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/mattn/go-isatty v0.0.21 // indirect github.com/mfridman/interpolate v0.0.2 // indirect github.com/ncruces/go-strftime v1.0.0 // indirect github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect github.com/sethvargo/go-retry v0.3.0 // indirect + github.com/spf13/pflag v1.0.9 // indirect go.uber.org/multierr v1.11.0 // indirect golang.org/x/sync v0.20.0 // indirect golang.org/x/sys v0.43.0 // indirect diff --git a/backend/go.sum b/backend/go.sum index 84823bbd..cf3f0029 100644 --- a/backend/go.sum +++ b/backend/go.sum @@ -1,5 +1,6 @@ github.com/coder/websocket v1.8.14 h1:9L0p0iKiNOibykf283eHkKUHHrpG7f65OE3BhhO7v9g= github.com/coder/websocket v1.8.14/go.mod h1:NX3SzP+inril6yawo5CQXx8+fk145lPDC6pumgx0mVg= +github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s= github.com/creack/pty v1.1.24/go.mod h1:08sCNb52WyoAwi2QDyzUCTgcvVFhUzewun7wtTfvcwE= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= @@ -14,6 +15,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= +github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= +github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/mattn/go-isatty v0.0.21 h1:xYae+lCNBP7QuW4PUnNG61ffM4hVIfm+zUzDuSzYLGs= github.com/mattn/go-isatty v0.0.21/go.mod h1:ZXfXG4SQHsB/w3ZeOYbR0PrPwLy+n6xiMrJlRFqopa4= github.com/mfridman/interpolate v0.0.2 h1:pnuTK7MQIxxFz1Gr+rjSIx9u7qVjf5VOoM/u6BbAxPY= @@ -26,8 +29,13 @@ github.com/pressly/goose/v3 v3.27.1 h1:6uEvcprBybDmW4hcz3gYujhARhye+GoWKhEWyzD5s github.com/pressly/goose/v3 v3.27.1/go.mod h1:maruOxsPnIG2yHHyo8UqKWXYKFcH7Q76csUV7+7KYoM= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sethvargo/go-retry v0.3.0 h1:EEt31A35QhrcRZtrYFDTBg91cqZVnFL2navjDrah2SE= github.com/sethvargo/go-retry v0.3.0/go.mod h1:mNX17F0C/HguQMyMyJxcnU471gOZGxCLyYaFyAZraas= +github.com/spf13/cobra v1.10.1 h1:lJeBwCfmrnXthfAupyUTzJ/J4Nc1RsHC/mSRU2dll/s= +github.com/spf13/cobra v1.10.1/go.mod h1:7SmJGaTHFVBY0jW4NXGluQoLvhqFQM+6XSKD+P4XaB0= +github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= +github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= diff --git a/backend/internal/cli/completion.go b/backend/internal/cli/completion.go new file mode 100644 index 00000000..97970395 --- /dev/null +++ b/backend/internal/cli/completion.go @@ -0,0 +1,31 @@ +package cli + +import ( + "fmt" + + "github.com/spf13/cobra" +) + +func newCompletionCommand() *cobra.Command { + return &cobra.Command{ + Use: "completion [bash|zsh|fish|powershell]", + Short: "Generate shell completion scripts", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + root := cmd.Root() + out := cmd.OutOrStdout() + switch args[0] { + case "bash": + return root.GenBashCompletion(out) + case "zsh": + return root.GenZshCompletion(out) + case "fish": + return root.GenFishCompletion(out, true) + case "powershell": + return root.GenPowerShellCompletion(out) + default: + return fmt.Errorf("unsupported shell %q", args[0]) + } + }, + } +} diff --git a/backend/internal/cli/doctor.go b/backend/internal/cli/doctor.go new file mode 100644 index 00000000..3a452ac1 --- /dev/null +++ b/backend/internal/cli/doctor.go @@ -0,0 +1,115 @@ +package cli + +import ( + "context" + "fmt" + "os" + + "github.com/spf13/cobra" + + "github.com/aoagents/agent-orchestrator/backend/internal/config" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" +) + +type doctorLevel string + +const ( + doctorPass doctorLevel = "PASS" + doctorWarn doctorLevel = "WARN" + doctorFail doctorLevel = "FAIL" +) + +type doctorCheck struct { + Level doctorLevel + Name string + Message string +} + +func newDoctorCommand(ctx *commandContext) *cobra.Command { + return &cobra.Command{ + Use: "doctor", + Short: "Run local AO health checks", + RunE: func(cmd *cobra.Command, args []string) error { + checks := ctx.runDoctor(cmd.Context()) + for _, check := range checks { + if _, err := fmt.Fprintf(cmd.OutOrStdout(), "%s %s: %s\n", check.Level, check.Name, check.Message); err != nil { + return err + } + } + var failures int + for _, check := range checks { + if check.Level == doctorFail { + failures++ + } + } + if failures > 0 { + return fmt.Errorf("doctor found %d failing check(s)", failures) + } + return nil + }, + } +} + +func (c *commandContext) runDoctor(ctx context.Context) []doctorCheck { + checks := []doctorCheck{} + + cfg, err := config.Load() + if err != nil { + return append(checks, doctorCheck{Level: doctorFail, Name: "config", Message: err.Error()}) + } + checks = append(checks, doctorCheck{ + Level: doctorPass, Name: "config", + Message: fmt.Sprintf("runFile=%s dataDir=%s port=%d", cfg.RunFilePath, cfg.DataDir, cfg.Port), + }) + + if err := os.MkdirAll(cfg.DataDir, 0o755); err != nil { + checks = append(checks, doctorCheck{Level: doctorFail, Name: "data-dir", Message: err.Error()}) + } else { + checks = append(checks, doctorCheck{Level: doctorPass, Name: "data-dir", Message: cfg.DataDir}) + } + + store, err := sqlite.Open(cfg.DataDir) + if err != nil { + checks = append(checks, doctorCheck{Level: doctorFail, Name: "sqlite", Message: err.Error()}) + } else { + _ = store.Close() + checks = append(checks, doctorCheck{Level: doctorPass, Name: "sqlite", Message: "opened database and applied migrations"}) + } + + st, err := c.inspectDaemon(ctx) + if err != nil { + checks = append(checks, doctorCheck{Level: doctorFail, Name: "daemon", Message: err.Error()}) + } else { + level := doctorPass + switch st.State { + case "stale", "not_ready": + level = doctorWarn + case "unhealthy": + level = doctorFail + } + msg := st.State + if st.PID != 0 { + msg = fmt.Sprintf("%s pid=%d port=%d", msg, st.PID, st.Port) + } + if st.Error != "" { + msg += " (" + st.Error + ")" + } + checks = append(checks, doctorCheck{Level: level, Name: "daemon", Message: msg}) + } + + checks = append(checks, c.checkTool("git", true)) + checks = append(checks, c.checkTool("tmux", false)) + checks = append(checks, c.checkTool("zellij", false)) + return checks +} + +func (c *commandContext) checkTool(name string, required bool) doctorCheck { + path, err := c.deps.LookPath(name) + if err == nil { + return doctorCheck{Level: doctorPass, Name: name, Message: path} + } + if required { + return doctorCheck{Level: doctorFail, Name: name, Message: "not found in PATH"} + } + return doctorCheck{Level: doctorWarn, Name: name, Message: "not found in PATH"} +} diff --git a/backend/internal/cli/output.go b/backend/internal/cli/output.go new file mode 100644 index 00000000..df76e23e --- /dev/null +++ b/backend/internal/cli/output.go @@ -0,0 +1,12 @@ +package cli + +import ( + "encoding/json" + "io" +) + +func writeJSON(w io.Writer, v any) error { + enc := json.NewEncoder(w) + enc.SetIndent("", " ") + return enc.Encode(v) +} diff --git a/backend/internal/cli/process.go b/backend/internal/cli/process.go new file mode 100644 index 00000000..3db43209 --- /dev/null +++ b/backend/internal/cli/process.go @@ -0,0 +1,30 @@ +package cli + +import ( + "os" + "os/exec" +) + +type processStartConfig struct { + Path string + Args []string + Env []string + Stdout *os.File + Stderr *os.File +} + +type processHandle struct { + PID int +} + +func startProcess(cfg processStartConfig) (processHandle, error) { + cmd := exec.Command(cfg.Path, cfg.Args...) + cmd.Env = cfg.Env + cmd.Stdout = cfg.Stdout + cmd.Stderr = cfg.Stderr + if err := cmd.Start(); err != nil { + return processHandle{}, err + } + go func() { _ = cmd.Wait() }() + return processHandle{PID: cmd.Process.Pid}, nil +} diff --git a/backend/internal/cli/process_unix.go b/backend/internal/cli/process_unix.go new file mode 100644 index 00000000..bd28047c --- /dev/null +++ b/backend/internal/cli/process_unix.go @@ -0,0 +1,24 @@ +//go:build !windows + +package cli + +import ( + "errors" + "os" + "syscall" +) + +func processAlive(pid int) bool { + if pid <= 0 { + return false + } + err := syscall.Kill(pid, 0) + return err == nil || errors.Is(err, syscall.EPERM) +} + +func signalTerm(pid int) error { + if pid <= 0 { + return os.ErrProcessDone + } + return syscall.Kill(pid, syscall.SIGTERM) +} diff --git a/backend/internal/cli/process_windows.go b/backend/internal/cli/process_windows.go new file mode 100644 index 00000000..da109f11 --- /dev/null +++ b/backend/internal/cli/process_windows.go @@ -0,0 +1,29 @@ +//go:build windows + +package cli + +import "os" + +func processAlive(pid int) bool { + if pid <= 0 { + return false + } + p, err := os.FindProcess(pid) + if err != nil { + return false + } + _ = p.Release() + return true +} + +func signalTerm(pid int) error { + if pid <= 0 { + return os.ErrProcessDone + } + p, err := os.FindProcess(pid) + if err != nil { + return err + } + defer p.Release() + return p.Kill() +} diff --git a/backend/internal/cli/root.go b/backend/internal/cli/root.go new file mode 100644 index 00000000..cb40363f --- /dev/null +++ b/backend/internal/cli/root.go @@ -0,0 +1,136 @@ +// Package cli implements the user-facing ao command. It stays thin: commands +// discover the local daemon, call its loopback HTTP API, and format output. +package cli + +import ( + "io" + "net/http" + "os" + "os/exec" + "time" + + "github.com/spf13/cobra" + + "github.com/aoagents/agent-orchestrator/backend/internal/daemon" +) + +// Execute runs the ao CLI with process stdio. +func Execute() error { + return NewRootCommand(DefaultDeps()).Execute() +} + +// Deps holds the small set of side effects the CLI needs. Tests replace these +// functions without reaching into process-global state. +type Deps struct { + In io.Reader + Out io.Writer + Err io.Writer + + HTTPClient *http.Client + Executable func() (string, error) + StartProcess func(processStartConfig) (processHandle, error) + SignalTerm func(pid int) error + ProcessAlive func(pid int) bool + LookPath func(file string) (string, error) + Now func() time.Time + Sleep func(time.Duration) +} + +// DefaultDeps returns production dependencies. +func DefaultDeps() Deps { + return Deps{ + In: os.Stdin, + Out: os.Stdout, + Err: os.Stderr, + HTTPClient: &http.Client{Timeout: 2 * time.Second}, + Executable: os.Executable, + StartProcess: startProcess, + SignalTerm: signalTerm, + ProcessAlive: processAlive, + LookPath: exec.LookPath, + Now: time.Now, + Sleep: time.Sleep, + } +} + +func (d Deps) withDefaults() Deps { + def := DefaultDeps() + if d.In == nil { + d.In = def.In + } + if d.Out == nil { + d.Out = def.Out + } + if d.Err == nil { + d.Err = def.Err + } + if d.HTTPClient == nil { + d.HTTPClient = def.HTTPClient + } + if d.Executable == nil { + d.Executable = def.Executable + } + if d.StartProcess == nil { + d.StartProcess = def.StartProcess + } + if d.SignalTerm == nil { + d.SignalTerm = def.SignalTerm + } + if d.ProcessAlive == nil { + d.ProcessAlive = def.ProcessAlive + } + if d.LookPath == nil { + d.LookPath = def.LookPath + } + if d.Now == nil { + d.Now = def.Now + } + if d.Sleep == nil { + d.Sleep = def.Sleep + } + return d +} + +// NewRootCommand builds a testable root command. +func NewRootCommand(deps Deps) *cobra.Command { + deps = deps.withDefaults() + ctx := &commandContext{deps: deps} + + root := &cobra.Command{ + Use: "ao", + Short: "Agent Orchestrator", + Long: "Agent Orchestrator manages the local daemon that supervises parallel coding-agent sessions.", + Version: VersionString(), + SilenceUsage: true, + SilenceErrors: true, + } + root.SetIn(deps.In) + root.SetOut(deps.Out) + root.SetErr(deps.Err) + root.CompletionOptions.DisableDefaultCmd = true + + root.AddCommand(newDaemonCommand()) + root.AddCommand(newStartCommand(ctx)) + root.AddCommand(newStopCommand(ctx)) + root.AddCommand(newStatusCommand(ctx)) + root.AddCommand(newDoctorCommand(ctx)) + root.AddCommand(newCompletionCommand()) + root.AddCommand(newVersionCommand()) + + return root +} + +type commandContext struct { + deps Deps +} + +func newDaemonCommand() *cobra.Command { + return &cobra.Command{ + Use: "daemon", + Short: "Run the AO backend daemon", + Hidden: true, + RunE: func(cmd *cobra.Command, args []string) error { + return daemon.Run() + }, + } +} diff --git a/backend/internal/cli/root_test.go b/backend/internal/cli/root_test.go new file mode 100644 index 00000000..119bf16a --- /dev/null +++ b/backend/internal/cli/root_test.go @@ -0,0 +1,159 @@ +package cli + +import ( + "bytes" + "net" + "net/http" + "net/http/httptest" + "net/url" + "os" + "path/filepath" + "strconv" + "strings" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/runfile" +) + +func TestRootHelpDoesNotShowDaemon(t *testing.T) { + out, _, err := executeCLI(t, Deps{}, "--help") + if err != nil { + t.Fatal(err) + } + if strings.Contains(out, "\n daemon") { + t.Fatalf("hidden daemon command leaked into help:\n%s", out) + } + for _, want := range []string{"start", "stop", "status", "doctor", "completion", "version"} { + if !strings.Contains(out, want) { + t.Fatalf("help missing %q:\n%s", want, out) + } + } +} + +func TestStatusStoppedJSON(t *testing.T) { + setConfigEnv(t) + + out, _, err := executeCLI(t, Deps{ProcessAlive: func(int) bool { return false }}, "status", "--json") + if err != nil { + t.Fatal(err) + } + if !strings.Contains(out, `"state": "stopped"`) { + t.Fatalf("status did not report stopped:\n%s", out) + } + if strings.Contains(out, "startedAt") { + t.Fatalf("stopped JSON should omit startedAt:\n%s", out) + } +} + +func TestStartReturnsExistingReadyDaemon(t *testing.T) { + cfg := setConfigEnv(t) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/healthz": + _, _ = w.Write([]byte(`{"status":"ok"}`)) + case "/readyz": + _, _ = w.Write([]byte(`{"status":"ready"}`)) + default: + http.NotFound(w, r) + } + })) + defer srv.Close() + + port := serverPort(t, srv.URL) + if err := runfile.Write(cfg.runFile, runfile.Info{PID: os.Getpid(), Port: port, StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + var started bool + out, _, err := executeCLI(t, Deps{ + ProcessAlive: func(pid int) bool { return pid == os.Getpid() }, + StartProcess: func(processStartConfig) (processHandle, error) { + started = true + return processHandle{}, nil + }, + Now: func() time.Time { return time.Unix(110, 0).UTC() }, + }, "start", "--json") + if err != nil { + t.Fatal(err) + } + if started { + t.Fatal("start should not spawn when daemon is already ready") + } + if !strings.Contains(out, `"state": "ready"`) { + t.Fatalf("start did not report ready:\n%s", out) + } +} + +func TestStopRemovesStaleRunFile(t *testing.T) { + cfg := setConfigEnv(t) + if err := runfile.Write(cfg.runFile, runfile.Info{PID: 999999, Port: 3001, StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + out, _, err := executeCLI(t, Deps{ProcessAlive: func(int) bool { return false }}, "stop", "--json") + if err != nil { + t.Fatal(err) + } + if !strings.Contains(out, `"state": "stopped"`) { + t.Fatalf("stop did not report stopped:\n%s", out) + } + info, err := runfile.Read(cfg.runFile) + if err != nil { + t.Fatal(err) + } + if info != nil { + t.Fatalf("stale run-file was not removed: %#v", info) + } +} + +type testConfig struct { + runFile string + dataDir string +} + +func setConfigEnv(t *testing.T) testConfig { + t.Helper() + dir := t.TempDir() + cfg := testConfig{ + runFile: filepath.Join(dir, "running.json"), + dataDir: filepath.Join(dir, "data"), + } + t.Setenv("AO_RUN_FILE", cfg.runFile) + t.Setenv("AO_DATA_DIR", cfg.dataDir) + t.Setenv("AO_PORT", "3001") + t.Setenv("AO_REQUEST_TIMEOUT", "") + t.Setenv("AO_SHUTDOWN_TIMEOUT", "") + return cfg +} + +func executeCLI(t *testing.T, deps Deps, args ...string) (string, string, error) { + t.Helper() + var out, errOut bytes.Buffer + deps.Out = &out + deps.Err = &errOut + if deps.Sleep == nil { + deps.Sleep = func(time.Duration) {} + } + cmd := NewRootCommand(deps) + cmd.SetArgs(args) + err := cmd.Execute() + return out.String(), errOut.String(), err +} + +func serverPort(t *testing.T, raw string) int { + t.Helper() + u, err := url.Parse(raw) + if err != nil { + t.Fatal(err) + } + _, portRaw, err := net.SplitHostPort(u.Host) + if err != nil { + t.Fatal(err) + } + port, err := strconv.Atoi(portRaw) + if err != nil { + t.Fatal(err) + } + return port +} diff --git a/backend/internal/cli/start.go b/backend/internal/cli/start.go new file mode 100644 index 00000000..9dc4a222 --- /dev/null +++ b/backend/internal/cli/start.go @@ -0,0 +1,138 @@ +package cli + +import ( + "context" + "fmt" + "os" + "path/filepath" + "time" + + "github.com/spf13/cobra" + + "github.com/aoagents/agent-orchestrator/backend/internal/config" +) + +const defaultStartTimeout = 10 * time.Second + +type startOptions struct { + timeout time.Duration + logFile string + json bool +} + +func newStartCommand(ctx *commandContext) *cobra.Command { + opts := startOptions{timeout: defaultStartTimeout} + cmd := &cobra.Command{ + Use: "start", + Short: "Start the AO daemon", + RunE: func(cmd *cobra.Command, args []string) error { + st, err := ctx.startDaemon(cmd.Context(), opts) + if err != nil { + return err + } + if opts.json { + return writeJSON(cmd.OutOrStdout(), st) + } + if st.State == "ready" { + _, err = fmt.Fprintf(cmd.OutOrStdout(), "AO daemon ready (pid %d, port %d)\n", st.PID, st.Port) + return err + } + return writeStatus(cmd, st) + }, + } + cmd.Flags().DurationVar(&opts.timeout, "timeout", defaultStartTimeout, "How long to wait for daemon readiness") + cmd.Flags().StringVar(&opts.logFile, "log-file", "", "Daemon log file path") + cmd.Flags().BoolVar(&opts.json, "json", false, "Output start result as JSON") + return cmd +} + +func (c *commandContext) startDaemon(ctx context.Context, opts startOptions) (daemonStatus, error) { + cfg, err := config.Load() + if err != nil { + return daemonStatus{}, err + } + + st, err := c.inspectDaemon(ctx) + if err != nil { + return daemonStatus{}, err + } + if st.State == "ready" { + return st, nil + } + if st.State != "stopped" && st.State != "stale" { + ready, waitErr := c.waitForReady(ctx, opts.timeout) + if waitErr == nil { + return ready, nil + } + return daemonStatus{}, fmt.Errorf("daemon process exists but did not become ready: %w", waitErr) + } + + exe, err := c.deps.Executable() + if err != nil { + return daemonStatus{}, fmt.Errorf("resolve executable: %w", err) + } + + logPath := opts.logFile + if logPath == "" { + logPath = filepath.Join(filepath.Dir(cfg.RunFilePath), "daemon.log") + } + if err := os.MkdirAll(filepath.Dir(logPath), 0o755); err != nil { + return daemonStatus{}, fmt.Errorf("create log dir: %w", err) + } + logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644) + if err != nil { + return daemonStatus{}, fmt.Errorf("open daemon log: %w", err) + } + defer logFile.Close() + + if _, err := c.deps.StartProcess(processStartConfig{ + Path: exe, + Args: []string{"daemon"}, + Env: os.Environ(), + Stdout: logFile, + Stderr: logFile, + }); err != nil { + return daemonStatus{}, fmt.Errorf("start daemon: %w", err) + } + + ready, err := c.waitForReady(ctx, opts.timeout) + if err != nil { + return daemonStatus{}, fmt.Errorf("%w; see daemon log %s", err, logPath) + } + return ready, nil +} + +func (c *commandContext) waitForReady(ctx context.Context, timeout time.Duration) (daemonStatus, error) { + if timeout <= 0 { + timeout = defaultStartTimeout + } + deadline := c.deps.Now().Add(timeout) + var last daemonStatus + var lastErr error + + for { + select { + case <-ctx.Done(): + return daemonStatus{}, ctx.Err() + default: + } + + st, err := c.inspectDaemon(ctx) + if err != nil { + lastErr = err + } else { + last = st + if st.State == "ready" { + return st, nil + } + } + + if !c.deps.Now().Before(deadline) { + if lastErr != nil { + return daemonStatus{}, fmt.Errorf("daemon did not become ready within %s: %w", timeout, lastErr) + } + return daemonStatus{}, fmt.Errorf("daemon did not become ready within %s (last state: %s)", timeout, last.State) + } + c.deps.Sleep(100 * time.Millisecond) + } +} diff --git a/backend/internal/cli/status.go b/backend/internal/cli/status.go new file mode 100644 index 00000000..04fd755b --- /dev/null +++ b/backend/internal/cli/status.go @@ -0,0 +1,187 @@ +package cli + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "time" + + "github.com/spf13/cobra" + + "github.com/aoagents/agent-orchestrator/backend/internal/config" + "github.com/aoagents/agent-orchestrator/backend/internal/runfile" +) + +const probeTimeout = 2 * time.Second + +type statusOptions struct { + json bool +} + +type daemonStatus struct { + State string `json:"state"` + PID int `json:"pid,omitempty"` + Port int `json:"port,omitempty"` + StartedAt *time.Time `json:"startedAt,omitempty"` + Uptime string `json:"uptime,omitempty"` + RunFile string `json:"runFile"` + DataDir string `json:"dataDir"` + Health string `json:"health,omitempty"` + Ready string `json:"ready,omitempty"` + Error string `json:"error,omitempty"` +} + +func newStatusCommand(ctx *commandContext) *cobra.Command { + var opts statusOptions + cmd := &cobra.Command{ + Use: "status", + Short: "Show AO daemon status", + RunE: func(cmd *cobra.Command, args []string) error { + st, err := ctx.inspectDaemon(cmd.Context()) + if err != nil { + return err + } + if opts.json { + return writeJSON(cmd.OutOrStdout(), st) + } + return writeStatus(cmd, st) + }, + } + cmd.Flags().BoolVar(&opts.json, "json", false, "Output status as JSON") + return cmd +} + +func (c *commandContext) inspectDaemon(ctx context.Context) (daemonStatus, error) { + cfg, err := config.Load() + if err != nil { + return daemonStatus{}, err + } + st := daemonStatus{State: "stopped", RunFile: cfg.RunFilePath, DataDir: cfg.DataDir} + + info, err := runfile.Read(cfg.RunFilePath) + if err != nil { + return daemonStatus{}, err + } + if info == nil { + return st, nil + } + + st.PID = info.PID + st.Port = info.Port + startedAt := info.StartedAt + st.StartedAt = &startedAt + st.Uptime = formatUptime(c.deps.Now().Sub(info.StartedAt)) + + if !c.deps.ProcessAlive(info.PID) { + st.State = "stale" + st.Error = "run-file points to a dead process" + return st, nil + } + + health, err := c.readProbe(ctx, info.Port, "healthz") + if err != nil { + st.State = "unhealthy" + st.Error = err.Error() + return st, nil + } + st.Health = health + + ready, err := c.readProbe(ctx, info.Port, "readyz") + if err != nil { + st.State = "not_ready" + st.Error = err.Error() + return st, nil + } + st.Ready = ready + if ready == "ready" { + st.State = "ready" + return st, nil + } + st.State = "not_ready" + return st, nil +} + +func (c *commandContext) readProbe(ctx context.Context, port int, path string) (string, error) { + reqCtx, cancel := context.WithTimeout(ctx, probeTimeout) + defer cancel() + + req, err := http.NewRequestWithContext(reqCtx, http.MethodGet, fmt.Sprintf("http://%s:%d/%s", config.LoopbackHost, port, path), nil) + if err != nil { + return "", err + } + resp, err := c.deps.HTTPClient.Do(req) + if err != nil { + return "", fmt.Errorf("%s: %w", path, err) + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return "", fmt.Errorf("%s: HTTP %d", path, resp.StatusCode) + } + var body struct { + Status string `json:"status"` + } + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + return "", fmt.Errorf("%s: decode response: %w", path, err) + } + if body.Status == "" { + return "", fmt.Errorf("%s: missing status", path) + } + return body.Status, nil +} + +func writeStatus(cmd *cobra.Command, st daemonStatus) error { + out := cmd.OutOrStdout() + if _, err := fmt.Fprintf(out, "AO daemon: %s\n", st.State); err != nil { + return err + } + if st.PID != 0 { + if _, err := fmt.Fprintf(out, " pid: %d\n", st.PID); err != nil { + return err + } + } + if st.Port != 0 { + if _, err := fmt.Fprintf(out, " port: %d\n", st.Port); err != nil { + return err + } + } + if st.StartedAt != nil && !st.StartedAt.IsZero() { + if _, err := fmt.Fprintf(out, " started: %s\n", st.StartedAt.Format(time.RFC3339)); err != nil { + return err + } + } + if st.Uptime != "" { + if _, err := fmt.Fprintf(out, " uptime: %s\n", st.Uptime); err != nil { + return err + } + } + if _, err := fmt.Fprintf(out, " run file: %s\n", st.RunFile); err != nil { + return err + } + if _, err := fmt.Fprintf(out, " data dir: %s\n", st.DataDir); err != nil { + return err + } + if st.Health != "" { + if _, err := fmt.Fprintf(out, " healthz: %s\n", st.Health); err != nil { + return err + } + } + if st.Ready != "" { + if _, err := fmt.Fprintf(out, " readyz: %s\n", st.Ready); err != nil { + return err + } + } + if st.Error != "" { + if _, err := fmt.Fprintf(out, " error: %s\n", st.Error); err != nil { + return err + } + } + return nil +} + +func formatUptime(d time.Duration) string { + if d < 0 { + d = 0 + } + return d.Round(time.Second).String() +} diff --git a/backend/internal/cli/stop.go b/backend/internal/cli/stop.go new file mode 100644 index 00000000..5ce43b4f --- /dev/null +++ b/backend/internal/cli/stop.go @@ -0,0 +1,106 @@ +package cli + +import ( + "context" + "fmt" + "time" + + "github.com/spf13/cobra" + + "github.com/aoagents/agent-orchestrator/backend/internal/config" + "github.com/aoagents/agent-orchestrator/backend/internal/runfile" +) + +const defaultStopTimeout = 10 * time.Second + +type stopOptions struct { + timeout time.Duration + json bool +} + +func newStopCommand(ctx *commandContext) *cobra.Command { + opts := stopOptions{timeout: defaultStopTimeout} + cmd := &cobra.Command{ + Use: "stop", + Short: "Stop the AO daemon", + RunE: func(cmd *cobra.Command, args []string) error { + st, err := ctx.stopDaemon(cmd.Context(), opts) + if err != nil { + return err + } + if opts.json { + return writeJSON(cmd.OutOrStdout(), st) + } + if st.State == "stopped" { + _, err = fmt.Fprintln(cmd.OutOrStdout(), "AO daemon stopped") + return err + } + return writeStatus(cmd, st) + }, + } + cmd.Flags().DurationVar(&opts.timeout, "timeout", defaultStopTimeout, "How long to wait for daemon shutdown") + cmd.Flags().BoolVar(&opts.json, "json", false, "Output stop result as JSON") + return cmd +} + +func (c *commandContext) stopDaemon(ctx context.Context, opts stopOptions) (daemonStatus, error) { + cfg, err := config.Load() + if err != nil { + return daemonStatus{}, err + } + st, err := c.inspectDaemon(ctx) + if err != nil { + return daemonStatus{}, err + } + switch st.State { + case "stopped": + return st, nil + case "stale": + if err := runfile.Remove(cfg.RunFilePath); err != nil { + return daemonStatus{}, err + } + return daemonStatus{State: "stopped", RunFile: cfg.RunFilePath, DataDir: cfg.DataDir}, nil + } + + if err := c.deps.SignalTerm(st.PID); err != nil { + if c.deps.ProcessAlive(st.PID) { + return daemonStatus{}, fmt.Errorf("signal daemon pid %d: %w", st.PID, err) + } + _ = runfile.Remove(cfg.RunFilePath) + return daemonStatus{State: "stopped", RunFile: cfg.RunFilePath, DataDir: cfg.DataDir}, nil + } + return c.waitForStopped(ctx, st.PID, cfg.RunFilePath, cfg.DataDir, opts.timeout) +} + +func (c *commandContext) waitForStopped(ctx context.Context, pid int, runFilePath, dataDir string, timeout time.Duration) (daemonStatus, error) { + if timeout <= 0 { + timeout = defaultStopTimeout + } + deadline := c.deps.Now().Add(timeout) + for { + select { + case <-ctx.Done(): + return daemonStatus{}, ctx.Err() + default: + } + + info, err := runfile.Read(runFilePath) + if err != nil { + return daemonStatus{}, err + } + alive := c.deps.ProcessAlive(pid) + if info == nil { + return daemonStatus{State: "stopped", RunFile: runFilePath, DataDir: dataDir}, nil + } + if !alive { + if err := runfile.Remove(runFilePath); err != nil { + return daemonStatus{}, err + } + return daemonStatus{State: "stopped", RunFile: runFilePath, DataDir: dataDir}, nil + } + if !c.deps.Now().Before(deadline) { + return daemonStatus{}, fmt.Errorf("daemon pid %d did not stop within %s", pid, timeout) + } + c.deps.Sleep(100 * time.Millisecond) + } +} diff --git a/backend/internal/cli/version.go b/backend/internal/cli/version.go new file mode 100644 index 00000000..dd8a2598 --- /dev/null +++ b/backend/internal/cli/version.go @@ -0,0 +1,37 @@ +package cli + +import ( + "fmt" + "strings" + + "github.com/spf13/cobra" +) + +// Build metadata. Release tooling can override these with -ldflags. +var ( + Version = "dev" + Commit = "" + Date = "" +) + +func VersionString() string { + parts := []string{Version} + if Commit != "" { + parts = append(parts, "commit "+Commit) + } + if Date != "" { + parts = append(parts, "built "+Date) + } + return strings.Join(parts, " ") +} + +func newVersionCommand() *cobra.Command { + return &cobra.Command{ + Use: "version", + Short: "Print version information", + RunE: func(cmd *cobra.Command, args []string) error { + _, err := fmt.Fprintln(cmd.OutOrStdout(), VersionString()) + return err + }, + } +} diff --git a/backend/cdc_wiring.go b/backend/internal/daemon/cdc_wiring.go similarity index 99% rename from backend/cdc_wiring.go rename to backend/internal/daemon/cdc_wiring.go index d824cbab..a76c5c78 100644 --- a/backend/cdc_wiring.go +++ b/backend/internal/daemon/cdc_wiring.go @@ -1,4 +1,4 @@ -package main +package daemon import ( "context" diff --git a/backend/internal/daemon/daemon.go b/backend/internal/daemon/daemon.go new file mode 100644 index 00000000..fd7dcc7c --- /dev/null +++ b/backend/internal/daemon/daemon.go @@ -0,0 +1,126 @@ +// Package daemon owns the Agent Orchestrator backend process: config loading, +// loopback HTTP serving, durable storage, CDC fan-out, lifecycle wiring, and +// graceful shutdown. +package daemon + +import ( + "context" + "fmt" + "log/slog" + "os" + "os/signal" + "syscall" + + "github.com/aoagents/agent-orchestrator/backend/internal/adapters/runtime/tmux" + "github.com/aoagents/agent-orchestrator/backend/internal/config" + "github.com/aoagents/agent-orchestrator/backend/internal/httpd" + "github.com/aoagents/agent-orchestrator/backend/internal/runfile" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" + "github.com/aoagents/agent-orchestrator/backend/internal/terminal" +) + +// Run starts the daemon and blocks until it exits. SIGINT/SIGTERM drive +// graceful shutdown through the HTTP server and background workers. +func Run() error { + cfg, err := config.Load() + if err != nil { + return err + } + + log := NewLogger() + + // Fail fast if a live daemon already owns the handshake file. A run-file + // left by a crashed predecessor (dead PID) is treated as stale and + // overwritten when the new server starts. + if live, err := runfile.CheckStale(cfg.RunFilePath); err != nil { + return fmt.Errorf("inspect run-file: %w", err) + } else if live != nil { + return fmt.Errorf("daemon already running (pid %d, port %d); refusing to start", live.PID, live.Port) + } + + // Open the durable store and bring up the CDC substrate: the DB triggers + // capture changes into change_log, the poller tails it, and the broadcaster + // fans events out to the SSE transport. The LCM/Session Manager and the HTTP + // API routes that drive and read this store are owned by the daemon lane and + // are wired there once their collaborators (Notifier, AgentMessenger, and the + // runtime/agent/workspace plugins) have production implementations; here we + // stand up the persistence + change-delivery foundation they build on. + store, err := sqlite.Open(cfg.DataDir) + if err != nil { + return fmt.Errorf("open store: %w", err) + } + defer store.Close() + + // signal.NotifyContext cancels ctx on SIGINT/SIGTERM, which drives the + // graceful shutdown inside Server.Run and stops the background goroutines. + ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer stop() + + cdcPipe, err := startCDC(ctx, store, log) + if err != nil { + return err + } + + // Terminal streaming: the tmux runtime supplies the PTY-attach command and + // liveness; the CDC broadcaster feeds the session-state channel. The manager + // is handed to httpd, which mounts it at /mux. Raw PTY bytes never flow + // through the CDC change_log — only session-state events do. + runtimeAdapter := tmux.New(tmux.Options{}) + termMgr := terminal.NewManager(runtimeAdapter, cdcPipe.Broadcaster, log) + defer termMgr.Close() + + srv, err := httpd.New(cfg, log, termMgr) + if err != nil { + return err + } + + // Bring up the Lifecycle Manager (sole store writer) and the reaper (OBSERVE + // timer). This makes the write path live end-to-end: LCM write -> store -> DB + // trigger -> change_log -> poller -> broadcaster. + lcStack, err := startLifecycle(ctx, store, log) + if err != nil { + return err + } + + // Bring up the Session Manager. Runtime (tmux) and Workspace (gitworktree) + // are real on main; ports.Agent has no production adapter yet, so a loud + // stub returns a sentinel command that makes any Spawn fail at the runtime + // layer rather than start a broken session quietly. Notifier and + // AgentMessenger remain stubbed alongside the LCM until their multiplexers + // land. No HTTP routes wire to this yet — the daemon lane (#10) owns API + // surfacing — so we hold the SM in a local until it does. + sStack, err := startSession(ctx, cfg, lcStack, log) + if err != nil { + // startSession is the first start* call after this point that can + // realistically fail while the cdc poller and the reaper are already + // running. Mirror the bottom-of-run shutdown sequence so both have + // drained before the deferred store.Close() fires. Defers would hit + // the LIFO trap (see comment after srv.Run), hence explicit. + stop() + lcStack.Stop() + if cdcErr := cdcPipe.Stop(); cdcErr != nil { + log.Error("cdc pipeline shutdown", "err", cdcErr) + } + return err + } + _ = sStack + + runErr := srv.Run(ctx) + + // Shut the background goroutines down in order: cancel the context FIRST so + // their loops exit, then wait for them to drain. Doing this explicitly (not + // via defer) avoids the LIFO trap where a Stop() that blocks on ctx-cancel + // runs before the cancel — which would hang any non-signal exit path. + stop() + lcStack.Stop() + if err := cdcPipe.Stop(); err != nil { + log.Error("cdc pipeline shutdown", "err", err) + } + return runErr +} + +// NewLogger returns the daemon's slog logger. It writes to stderr so supervisors +// can capture it separately from any structured stdout protocol added later. +func NewLogger() *slog.Logger { + return slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelDebug})) +} diff --git a/backend/lifecycle_wiring.go b/backend/internal/daemon/lifecycle_wiring.go similarity index 99% rename from backend/lifecycle_wiring.go rename to backend/internal/daemon/lifecycle_wiring.go index 35eac385..5a791054 100644 --- a/backend/lifecycle_wiring.go +++ b/backend/internal/daemon/lifecycle_wiring.go @@ -1,4 +1,4 @@ -package main +package daemon import ( "context" diff --git a/backend/wiring_test.go b/backend/internal/daemon/wiring_test.go similarity index 99% rename from backend/wiring_test.go rename to backend/internal/daemon/wiring_test.go index 6a372c6c..c2cfb721 100644 --- a/backend/wiring_test.go +++ b/backend/internal/daemon/wiring_test.go @@ -1,4 +1,4 @@ -package main +package daemon import ( "context" diff --git a/backend/main.go b/backend/main.go index e825039f..5315d510 100644 --- a/backend/main.go +++ b/backend/main.go @@ -1,133 +1,18 @@ -// Command backend is the Agent Orchestrator HTTP daemon: a loopback-only -// sidecar spawned and supervised by the Electron main process. Phase 1a brings -// up the server skeleton — config, 127.0.0.1 bind, middleware stack, health -// probes, the running.json handshake, and graceful shutdown. +// Command backend is a compatibility wrapper for the Agent Orchestrator daemon. +// The user-facing CLI lives at cmd/ao; keep this wrapper so existing `go run .` +// development workflows continue to start the daemon while scripts migrate. package main import ( - "context" "fmt" - "log/slog" "os" - "os/signal" - "syscall" - "github.com/aoagents/agent-orchestrator/backend/internal/adapters/runtime/tmux" - "github.com/aoagents/agent-orchestrator/backend/internal/config" - "github.com/aoagents/agent-orchestrator/backend/internal/httpd" - "github.com/aoagents/agent-orchestrator/backend/internal/runfile" - "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" - "github.com/aoagents/agent-orchestrator/backend/internal/terminal" + "github.com/aoagents/agent-orchestrator/backend/internal/daemon" ) func main() { - if err := run(); err != nil { + if err := daemon.Run(); err != nil { fmt.Fprintln(os.Stderr, "ao backend daemon: "+err.Error()) os.Exit(1) } } - -func run() error { - cfg, err := config.Load() - if err != nil { - return err - } - - log := newLogger() - - // Fail fast if a live daemon already owns the handshake file. A run-file - // left by a crashed predecessor (dead PID) is treated as stale and - // overwritten when the new server starts. - if live, err := runfile.CheckStale(cfg.RunFilePath); err != nil { - return fmt.Errorf("inspect run-file: %w", err) - } else if live != nil { - return fmt.Errorf("daemon already running (pid %d, port %d); refusing to start", live.PID, live.Port) - } - - // Open the durable store and bring up the CDC substrate: the DB triggers - // capture changes into change_log, the poller tails it, and the broadcaster - // fans events out to the SSE transport. The LCM/Session Manager and the HTTP - // API routes that drive and read this store are owned by the daemon lane and - // are wired there once their collaborators (Notifier, AgentMessenger, and the - // runtime/agent/workspace plugins) have production implementations; here we - // stand up the persistence + change-delivery foundation they build on. - store, err := sqlite.Open(cfg.DataDir) - if err != nil { - return fmt.Errorf("open store: %w", err) - } - defer store.Close() - - // signal.NotifyContext cancels ctx on SIGINT/SIGTERM, which drives the - // graceful shutdown inside Server.Run and stops the background goroutines. - ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) - defer stop() - - cdcPipe, err := startCDC(ctx, store, log) - if err != nil { - return err - } - - // Terminal streaming: the tmux runtime supplies the PTY-attach command and - // liveness; the CDC broadcaster feeds the session-state channel. The manager - // is handed to httpd, which mounts it at /mux. Raw PTY bytes never flow - // through the CDC change_log — only session-state events do. - runtimeAdapter := tmux.New(tmux.Options{}) - termMgr := terminal.NewManager(runtimeAdapter, cdcPipe.Broadcaster, log) - defer termMgr.Close() - - srv, err := httpd.New(cfg, log, termMgr) - if err != nil { - return err - } - - // Bring up the Lifecycle Manager (sole store writer) and the reaper (OBSERVE - // timer). This makes the write path live end-to-end: LCM write -> store -> DB - // trigger -> change_log -> poller -> broadcaster. - lcStack, err := startLifecycle(ctx, store, log) - if err != nil { - return err - } - - // Bring up the Session Manager. Runtime (tmux) and Workspace (gitworktree) - // are real on main; ports.Agent has no production adapter yet, so a loud - // stub returns a sentinel command that makes any Spawn fail at the runtime - // layer rather than start a broken session quietly. Notifier and - // AgentMessenger remain stubbed alongside the LCM until their multiplexers - // land. No HTTP routes wire to this yet — the daemon lane (#10) owns API - // surfacing — so we hold the SM in a local until it does. - sStack, err := startSession(ctx, cfg, lcStack, log) - if err != nil { - // startSession is the first start* call after this point that can - // realistically fail while the cdc poller and the reaper are already - // running. Mirror the bottom-of-run shutdown sequence so both have - // drained before the deferred store.Close() fires. Defers would hit - // the LIFO trap (see comment after srv.Run), hence explicit. - stop() - lcStack.Stop() - if cdcErr := cdcPipe.Stop(); cdcErr != nil { - log.Error("cdc pipeline shutdown", "err", cdcErr) - } - return err - } - _ = sStack - - runErr := srv.Run(ctx) - - // Shut the background goroutines down in order: cancel the context FIRST so - // their loops exit, then wait for them to drain. Doing this explicitly (not - // via defer) avoids the LIFO trap where a Stop() that blocks on ctx-cancel - // runs before the cancel — which would hang any non-signal exit path. - stop() - lcStack.Stop() - if err := cdcPipe.Stop(); err != nil { - log.Error("cdc pipeline shutdown", "err", err) - } - return runErr -} - -// newLogger returns the daemon's slog logger. It writes to stderr so the -// Electron supervisor can capture it separately from any structured stdout -// protocol added later. -func newLogger() *slog.Logger { - return slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelDebug})) -} diff --git a/docs/README.md b/docs/README.md index f42f222f..220dec40 100644 --- a/docs/README.md +++ b/docs/README.md @@ -15,6 +15,7 @@ fakes) on the `feat/lcm-sm-contracts` integration branch. |-----|----------------| | [architecture.md](architecture.md) | How the lane works: the OBSERVE→DECIDE→ACT loop, the canonical state model, the package layout, every component, and the load-bearing invariants. Read this first. | | [status.md](status.md) | What's done (PR by PR), what's left, the integration to-dos, the open cross-lane contract questions, and how to build/test. | +| [cli/README.md](cli/README.md) | CLI foundation decisions: Cobra, reference projects, old CLI inventory, and the first command surface. | ## The one-paragraph mental model diff --git a/docs/cli/README.md b/docs/cli/README.md new file mode 100644 index 00000000..3d49e00d --- /dev/null +++ b/docs/cli/README.md @@ -0,0 +1,393 @@ +# AO CLI Foundation + +This page is the running decision log for the Agent Orchestrator CLI. Keep new +CLI decisions here as the command surface grows. + +## Current State + +This branch implements the daemon-control foundation. AO now has a Go/Cobra +`ao` binary that can start, inspect, diagnose, and stop the local backend daemon +end to end. + +What works now: + +- `ao start` starts the daemon in the background and waits for `/readyz`. +- `ao status` and `ao status --json` report stopped, stale, unhealthy, + not-ready, or ready daemon state. +- `ao stop` gracefully stops the daemon using the PID in `running.json`. +- `ao daemon` is the hidden internal daemon entrypoint used by `ao start`. +- `ao doctor` checks config, data dir, SQLite migrations, daemon state, and + local tool availability for `git`, `tmux`, and `zellij`. +- `ao completion` generates shell completions for `bash`, `zsh`, `fish`, and + `powershell`. +- `ao version` and `ao --version` print build metadata. +- `go run .` still works as a compatibility wrapper around `internal/daemon.Run`. + +Manual smoke test: + +```bash +cd backend +go build -o /tmp/ao ./cmd/ao + +tmp=$(mktemp -d) +export AO_RUN_FILE="$tmp/running.json" +export AO_DATA_DIR="$tmp/data" +export AO_PORT=3037 + +/tmp/ao status --json +/tmp/ao doctor +/tmp/ao start +/tmp/ao status --json +/tmp/ao stop +/tmp/ao status --json +``` + +What is intentionally not implemented yet: + +- `ao project ...` +- `ao spawn` +- `ao session ...` +- `ao send` +- `ao events ...` + +Next steps: + +1. Add `/api/v1/projects` on the daemon over a small project service. +2. Implement `ao project list/add/show/remove`. +3. Wire production Session Manager dependencies: project-backed repo resolver, + tmux/zellij runtime registry, first agent adapter, and AgentMessenger. +4. Add `/api/v1/sessions`, then implement `ao spawn`, `ao session ...`, and + `ao send`. +5. Add `/events` SSE and durable event-list reads, then implement + `ao events tail/list`. + +## Decision + +AO will use a single Go CLI binary built with +[Cobra](https://github.com/spf13/cobra). + +The CLI is a thin client for the Go daemon. It should not call SQLite, runtime +adapters, agent adapters, workspace adapters, or SCM integrations directly. It +should start, discover, inspect, and command the daemon through the loopback API +and the existing `running.json` handshake. + +Initial rules: + +- The binary name is `ao`. +- `ao daemon` is the hidden/internal entrypoint for the long-running daemon. +- User-facing commands call the daemon over loopback after reading + `running.json`. +- Commands that mutate core AO state go through HTTP API routes, not direct + stores. +- Commands support predictable text output first and `--json` where automation + is likely. +- Do not introduce Viper in the foundation. Start with explicit flags and a + small config/client layer, then add config loading once the shape is real. + +## References + +These projects inform the direction, but AO should keep its own command surface +smaller at first. + +| Project | CLI stack | What to take | +|---|---|---| +| [Gastown](https://github.com/gastownhall/gastown) | Go + Cobra, with Charmbracelet packages for richer terminal UI | Simple `cmd//main.go` delegating to internal command construction. Useful confirmation that Cobra is the right default for this size of Go CLI. | +| [GitHub CLI](https://github.com/cli/cli) | Go + Cobra | Command factories, explicit IO streams, JSON output, and testable command construction. | +| [Docker CLI](https://github.com/docker/cli) | Go + Cobra | Daemon/client split, command groups, signal handling, and plugin-aware CLI layout. | +| [kubectl](https://github.com/kubernetes/kubectl) | Go + Cobra | Large command tree patterns and IO abstractions. It is a useful ceiling, not a shape to copy now. | +| [Tailscale CLI](https://github.com/tailscale/tailscale) | Go + ffcli | Useful daemon-backed product model: a CLI talks to a local daemon. Do not copy the framework choice. | + +The old AO TypeScript CLI is a product/workflow reference only. We should not +port its implementation because it mixes CLI, storage, runtime, and project +logic in-process. The rewrite needs the CLI to sit outside the core daemon. + +## Current Legacy CLI Inventory + +Inventory source: installed `ao` binary at version `0.9.2`, plus +`/Users/dhruvsharma/Development/agent-orchestrator/packages/cli/src/program.ts` +and `packages/cli/src/commands/*.ts`. + +Count: + +- 25 public top-level commands, excluding Commander-generated `help`. +- 26 visible top-level commands if generated `help` is counted. +- 64 explicit public command nodes when nested subcommands are counted. +- 1 hidden internal command: `completion __complete`. +- No aliases are registered in the old Commander source. + +Top-level commands: + +| Command | Legacy purpose | Foundation decision | +|---|---|---| +| `start` | Start orchestrator agent and dashboard | Keep, but redefine as daemon start. | +| `stop` | Stop orchestrator agent and dashboard | Keep, daemon stop. | +| `status` | Show all sessions and project/session health | Keep, daemon and session status. | +| `spawn` | Spawn a single agent session | Keep after session API exists. | +| `batch-spawn` | Spawn many sessions | Defer. | +| `session` | Manage sessions | Keep a smaller subset after session API exists. | +| `send` | Send a message to a session | Keep after messaging API exists. | +| `acknowledge` | Agent self-reporting hook | Defer or replace with internal API. | +| `report` | Agent workflow transition hook | Defer or replace with internal API. | +| `review-check` | Trigger agents from review comments | Defer. | +| `review` | Manage AO-local reviewer runs | Defer. | +| `dashboard` | Start web dashboard | Defer to Electron/frontend lane. | +| `open` | Open terminal/dashboard | Defer. | +| `verify` | Verify issue after staging check | Defer. | +| `doctor` | Run install/env/runtime checks | Keep. | +| `update` | Upgrade AO | Defer to packaging/release lane. | +| `setup` | Configure integrations | Defer. | +| `plugin` | Plugin marketplace/install flow | Defer. | +| `notify` | Notification test commands | Defer. | +| `project` | Manage registered projects | Keep after project API exists. | +| `migrate-storage` | Legacy storage migration | Drop for rewrite unless a real migration appears. | +| `completion` | Generate shell completions | Keep. | +| `events` | Query activity event log | Keep a small `tail`/`list` surface after event API exists. | +| `config` | Read/write old global config | Defer. Avoid until config shape is stable. | +| `config-help` | Print old config schema | Drop. | + +Nested legacy commands: + +| Parent | Subcommands | +|---|---| +| `session` | `ls`, `attach`, `kill`, `cleanup`, `claim-pr`, `restore`, `remap` | +| `review` | `run`, `execute`, `send`, `list` | +| `setup` | `dashboard`, `desktop`, `webhook`, `slack`, `discord`, `composio`, `composio-slack`, `composio-discord`, `composio-discord-bot`, `composio-mail`, `openclaw` | +| `plugin` | `list`, `search`, `create`, `install`, `update`, `uninstall` | +| `project` | `ls`, `add`, `rm`, `set-default` | +| `events` | `list`, `search`, `stats` | +| `config` | `set`, `get` | +| `notify` | `test` | +| `completion` | `zsh`, hidden `__complete` | + +## Initial Command Surface + +The first CLI should make AO installable, startable, inspectable, and stoppable +before trying to recreate the old product surface. + +### Foundation Commands + +These are the first commands to implement. + +| Command | Purpose | Notes | +|---|---|---| +| `ao start` | Start the daemon, wait for `/readyz`, and print PID/port. | Reads the same config env as the daemon. Should be idempotent when an existing healthy daemon is already running. | +| `ao stop` | Stop the running daemon. | Reads `running.json`, sends graceful termination, waits for run-file removal, and reports stale/dead daemon state clearly. | +| `ao status` | Show daemon status and, once APIs exist, project/session summary. | First version can show run-file, process liveness, `/healthz`, `/readyz`, uptime, and port. Add `--json`; add `--watch` once useful. | +| `ao daemon` | Hidden internal daemon entrypoint. | This replaces the current direct `go run .` daemon entrypoint once `main.go` is extracted into `internal/daemon`. | +| `ao doctor` | Diagnose the local environment. | Start with daemon/run-file/port checks, required binaries, config dir/data dir permissions, and runtime availability. | +| `ao completion` | Generate shell completions. | Cobra can support `bash`, `zsh`, `fish`, and `powershell`. | +| `ao version` | Print CLI and build metadata. | Implement as both `ao version` and Cobra's `--version` flag. | + +This gives a useful first release even before project/session mutation routes are +complete. + +### First Core Application Commands + +These are the next commands once daemon HTTP routes expose the needed managers. + +| Command | Purpose | Depends on | +|---|---|---| +| `ao project list` | List registered projects. | Project API. Alias `ls` is acceptable for old muscle memory. | +| `ao project add ` | Register a project. | Project API and project identity rules. | +| `ao project show ` | Inspect project config and health. | Project API. | +| `ao project remove ` | Archive/remove a project. | Project API. Alias `rm` is acceptable. | +| `ao spawn [issue]` | Spawn one coding-agent session. | Session Manager HTTP route, tracker lookup, workspace/runtime/agent adapters. | +| `ao session list` | List sessions across projects or one project. | Session API. Alias `ls` is acceptable. | +| `ao session show ` | Show one session with lifecycle, PR, CI, runtime, and paths. | Session API. | +| `ao session attach ` | Attach to the runtime terminal. | Runtime API or direct terminal attach contract exposed by daemon. | +| `ao session kill ` | Kill a session and clean up safely. | Session Manager `Kill`. | +| `ao session restore ` | Restore a terminated/crashed session. | Session Manager `Restore`. | +| `ao send [message...]` | Send instructions to a running session. | AgentMessenger route. | +| `ao events tail` | Follow daemon activity events. | SSE/CDC API. | +| `ao events list` | List recent activity events. | Event read API. | + +This is the smallest surface that covers the core product loop: + +1. Register a repo. +2. Start AO. +3. Spawn work. +4. Inspect work. +5. Intervene in work. +6. Stop AO. + +## Explicit Deferrals + +Do not include these in the CLI foundation: + +- `batch-spawn`: valuable, but it multiplies error handling before single-spawn + semantics are stable. +- `dashboard` and `open`: frontend/Electron should own the primary dashboard + launch path first. +- `review`, `review-check`, and `verify`: useful workflow automation, but not + required to run core AO. +- `setup`, `plugin`, and `notify`: integration/plugin surface should come after + the daemon API and config model settle. +- `update`: belongs with distribution and release packaging. +- `config` and `config-help`: wait for a stable Go config model. Avoid copying + the old TypeScript global config behavior. +- `migrate-storage`: old storage migration is not part of the rewrite unless a + concrete migration requirement appears. +- `acknowledge` and `report`: these are agent self-reporting hooks. Prefer a + daemon/internal protocol before exposing them as durable user CLI commands. + +## Implementation Plan + +1. Add Cobra to `backend/go.mod`. +2. Move current daemon startup from `backend/main.go` into + `backend/internal/daemon.Run(ctx, opts)`. +3. Add `backend/cmd/ao/main.go` as the only user binary entrypoint. +4. Add `backend/internal/cli` for command construction, IO streams, process + launching, run-file discovery, loopback HTTP client, and output formatting. +5. Implement `ao daemon` first so the current daemon behavior is preserved. +6. Implement `ao start`, `ao stop`, and `ao status` around `running.json` and + `/healthz`/`/readyz`. +7. Add `ao doctor`, `ao completion`, and `ao version`. +8. Add command tests using Cobra command construction with fake IO, fake process + runner, and fake daemon client. Keep daemon integration tests in the daemon + packages. + +Suggested package layout: + +```text +backend/ + cmd/ + ao/ + main.go + internal/ + cli/ + root.go + start.go + stop.go + status.go + doctor.go + completion.go + version.go + client.go + output.go + process.go + daemon/ + daemon.go +``` + +Acceptance criteria for the foundation: + +- `go run ./cmd/ao daemon` behaves like today's `go run .`. +- `go run ./cmd/ao start` starts the daemon and waits until `/readyz` returns + ready. +- `go run ./cmd/ao status --json` works when the daemon is running, stopped, and + stale. +- `go run ./cmd/ao stop` gracefully stops the daemon and removes `running.json`. +- `go test ./...`, `go vet ./...`, and `go test -race ./...` pass. + +## Implementation Readiness + +This section records what the CLI can connect to in the current codebase and +what still needs to be built. Inventory date: 2026-05-31 on `main` at +`0672dbb`. + +### Implemented Foundation + +The daemon-control foundation now exists in `backend/cmd/ao` and +`backend/internal/cli`. + +Implemented commands: + +- `ao daemon` hidden/internal daemon entrypoint. +- `ao start` starts the daemon, waits for `/readyz`, and supports `--json`, + `--timeout`, and `--log-file`. +- `ao stop` stops the daemon from `running.json`, removes stale run-files, and + supports `--json` and `--timeout`. +- `ao status` reports stopped/stale/unhealthy/not-ready/ready states and + supports `--json`. +- `ao doctor` checks config, data dir, SQLite open/migrations, daemon state, and + local tool availability for `git`, `tmux`, and `zellij`. +- `ao completion` generates `bash`, `zsh`, `fish`, and `powershell` + completions. +- `ao version` prints build metadata. + +The old `backend/main.go` remains as a compatibility wrapper around +`internal/daemon.Run`, so `go run .` still starts the daemon while scripts move +to `go run ./cmd/ao ...`. + +### Already Implemented and Directly Usable by the CLI + +These pieces are available now and are enough to build the daemon-management +part of the CLI. + +| Area | Existing code | CLI use | +|---|---|---| +| Daemon config | `backend/internal/config` loads `AO_PORT`, `AO_REQUEST_TIMEOUT`, `AO_SHUTDOWN_TIMEOUT`, `AO_RUN_FILE`, and `AO_DATA_DIR`. Host is fixed to `127.0.0.1`. | `ao start`, `ao daemon`, `ao status`, and `ao doctor` can share the same config resolution. | +| HTTP server lifecycle | `backend/internal/httpd.Server` binds loopback, writes `running.json`, serves until context cancellation, then removes `running.json`. | `ao daemon` can preserve today's daemon behavior after extraction into `internal/daemon`. | +| Health probes | `GET /healthz` and `GET /readyz`. | `ao start` can wait for readiness; `ao status` and `ao doctor` can check daemon health. | +| Run-file handshake | `backend/internal/runfile` reads, writes, removes, and stale-checks `running.json`. | `ao status` can discover PID/port; `ao stop` can find the process; `ao start` can detect an already-running daemon. | +| Durable store | `backend/internal/storage/sqlite` opens SQLite, runs goose migrations, uses WAL, stores projects/sessions/PR/check/comment rows, and reads `change_log`. | Not directly called by user CLI commands, but confirms the daemon has a durable backend once APIs expose it. | +| CDC substrate | `backend/internal/cdc` poller and broadcaster exist; daemon starts the poller with `startCDC`. | Future `ao events tail` can build on this once an SSE/API transport exists. | +| Lifecycle manager | `backend/internal/lifecycle` is implemented and currently wired in daemon startup. | Session/status APIs can use it; CLI must wait for HTTP routes rather than calling it directly. | +| Reaper timer | `backend/internal/observe/reaper` exists and is wired. | Runtime liveness will be available once runtime registry wiring exists. | + +### Implemented Internally but Not Reachable by CLI Yet + +These are real backend components, but the CLI cannot responsibly use them until +they are wired into the daemon and exposed through HTTP. + +| Area | Existing code | Missing before CLI can use it | +|---|---|---| +| Project persistence | `sqlite.Store` has `UpsertProject`, `GetProject`, `ListProjects`, and `ArchiveProject`. | Project domain/service layer, project ID/path/origin validation, and `/api/v1/projects` routes. | +| Session Manager | `backend/internal/session.Manager` implements `Spawn`, `Kill`, `Restore`, `List`, `Get`, `Send`, and `Cleanup`. | Production daemon wiring with real runtime, agent, workspace, messenger, and HTTP routes. | +| Runtime adapters | tmux and zellij adapters implement `ports.Runtime` and also have attach/send/output helpers. | Runtime registry wiring in daemon, attach/send abstractions in ports/API, and selection config. | +| Workspace adapter | git worktree adapter implements create/destroy/restore/list with safety checks. | Repo resolver backed by registered projects and daemon wiring into Session Manager. | +| GitHub issue tracker | `backend/internal/adapters/tracker/github` implements read-only issue `Get`, `List`, and `Preflight`. | Tracker registry/config, spawn prompt hydration, and project tracker metadata. | +| PR facts storage | SQLite PR/check/comment writes and CDC triggers exist. | SCM/PR observer that fetches GitHub PR/CI/review facts and calls `LCM.ApplyPRObservation`. | +| Session read model | `SessionManager.List/Get` derive display status from canonical lifecycle + PR facts. | HTTP response DTOs and API routes for CLI/frontend reads. | + +### Still Missing + +These are the main gaps before the full initial command set is real. + +| Gap | Blocks | +|---|---| +| Cobra dependency and CLI packages. | All CLI commands. | +| Daemon extraction from `backend/main.go` into `internal/daemon`. | `ao daemon`, `ao start`, tests around daemon startup. | +| CLI process runner and PID signal helpers. | `ao start`, `ao stop`. | +| Loopback HTTP client package with run-file discovery. | `ao status`, later all daemon-backed commands. | +| Shutdown mechanism choice: PID signal now, optional `POST /api/v1/daemon/shutdown` later. | `ao stop` polish and cross-platform behavior. | +| HTTP API route surface under `/api/v1`. | `project`, `spawn`, `session`, `send`, `events list`, richer `status`. | +| SSE route for live CDC events plus durable catch-up reads. | `ao events tail`, frontend live updates. | +| Agent adapters for supported harnesses (`codex`, `claude-code`, etc.). | `ao spawn`, `ao session restore`. | +| AgentMessenger implementation over tmux/zellij. | `ao send`, LCM auto-nudge reactions. | +| Runtime registry wired with tmux/zellij. | Reaper liveness, `session attach`, spawn/kill/restore runtime work. | +| Notifier implementation/multiplexer. | Human notifications and LCM escalation side effects. | +| Activity hooks or agent self-report protocol. | Accurate working/idle/needs-input status beyond runtime/PR facts. | +| Project/tracker config model. | `project add/show`, tracker-backed `spawn`, `doctor` config checks. | +| OpenAPI/DTO/error contract. | Stable CLI/frontend API clients and tests. | + +### Command Readiness Matrix + +| Command | Can implement now? | Existing support | Remaining work | +|---|---:|---|---| +| `ao daemon` | Implemented | Current daemon startup is extracted to `internal/daemon.Run`. | None for foundation. | +| `ao start` | Implemented | Config, run-file stale check, HTTP readiness probes. | Later: package-manager/service integration if needed. | +| `ao stop` | Implemented | Run-file discovery gives PID/port; server exits cleanly on SIGINT/SIGTERM. | Optional later shutdown HTTP route. | +| `ao status` | Partially implemented | Run-file, process liveness via PID, `/healthz`, `/readyz`. | Rich project/session summary waits for `/api/v1/projects` and `/api/v1/sessions`. | +| `ao doctor` | Partially implemented | Config resolution, run-file, storage open, runtime binary checks. | Deeper adapter preflights need daemon wiring/config. | +| `ao completion` | Implemented | Cobra generators. | None for foundation. | +| `ao version` | Implemented | Build metadata can be injected with `-ldflags`. | Release tooling needs to set metadata. | +| `ao project list/add/show/remove` | Not yet | SQLite project CRUD exists. | Project service and HTTP routes. CLI must not write SQLite directly. | +| `ao spawn` | Not yet | Session Manager exists; runtime/workspace/tracker pieces partly exist. | Agent adapters, registry/config wiring, project lookup, tracker hydration, HTTP route. | +| `ao session list/show` | Not yet | Store and Session Manager read model exist. | HTTP routes and response DTOs. | +| `ao session attach` | Not yet | tmux/zellij have attach command helpers. | Runtime attach port/API and terminal-launch policy. | +| `ao session kill/restore` | Not yet | Session Manager implements both. | Production wiring and HTTP routes. | +| `ao send` | Not yet | Session Manager has `Send`; tmux/zellij have send helpers. | AgentMessenger implementation, port/API wiring, busy/idle delivery policy. | +| `ao events tail/list` | Not yet | Durable `change_log`, CDC poller, in-process broadcaster. | SSE route and durable event-list route. | + +### Recommended Build Order + +1. Build CLI foundation around the daemon only: `daemon`, `start`, `stop`, + `status`, `doctor`, `completion`, `version`. +2. Add `/api/v1/projects` over a small project service, then implement + `project list/add/show/remove`. +3. Wire production Session Manager dependencies: project-backed repo resolver, + tmux/zellij runtime registry, first agent adapter, and AgentMessenger. +4. Add `/api/v1/sessions` and implement `spawn`, `session list/show/kill/restore`, + and `send`. +5. Add `/events` SSE plus event-list reads, then implement `events tail/list`. From f72facb9e5d345a112b221ba941b40dfd0b6ba89 Mon Sep 17 00:00:00 2001 From: Dhruv Sharma Date: Sun, 31 May 2026 21:37:43 +0530 Subject: [PATCH 02/12] fix(cli): address greptile review comments --- backend/go.mod | 2 +- backend/internal/cli/completion.go | 7 ++++--- backend/internal/cli/process_windows.go | 21 +++++++++++++++++---- docs/cli/README.md | 5 ++--- 4 files changed, 24 insertions(+), 11 deletions(-) diff --git a/backend/go.mod b/backend/go.mod index adc56039..a2de66a0 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -8,6 +8,7 @@ require ( github.com/go-chi/chi/v5 v5.1.0 github.com/pressly/goose/v3 v3.27.1 github.com/spf13/cobra v1.10.1 + golang.org/x/sys v0.43.0 gopkg.in/yaml.v3 v3.0.1 modernc.org/sqlite v1.51.0 ) @@ -24,7 +25,6 @@ require ( github.com/spf13/pflag v1.0.9 // indirect go.uber.org/multierr v1.11.0 // indirect golang.org/x/sync v0.20.0 // indirect - golang.org/x/sys v0.43.0 // indirect modernc.org/libc v1.72.3 // indirect modernc.org/mathutil v1.7.1 // indirect modernc.org/memory v1.11.0 // indirect diff --git a/backend/internal/cli/completion.go b/backend/internal/cli/completion.go index 97970395..61b9483e 100644 --- a/backend/internal/cli/completion.go +++ b/backend/internal/cli/completion.go @@ -8,9 +8,10 @@ import ( func newCompletionCommand() *cobra.Command { return &cobra.Command{ - Use: "completion [bash|zsh|fish|powershell]", - Short: "Generate shell completion scripts", - Args: cobra.ExactArgs(1), + Use: "completion [bash|zsh|fish|powershell]", + Short: "Generate shell completion scripts", + Args: cobra.ExactArgs(1), + ValidArgs: []string{"bash", "zsh", "fish", "powershell"}, RunE: func(cmd *cobra.Command, args []string) error { root := cmd.Root() out := cmd.OutOrStdout() diff --git a/backend/internal/cli/process_windows.go b/backend/internal/cli/process_windows.go index da109f11..430cea8b 100644 --- a/backend/internal/cli/process_windows.go +++ b/backend/internal/cli/process_windows.go @@ -2,18 +2,31 @@ package cli -import "os" +import ( + "errors" + "os" + + "golang.org/x/sys/windows" +) func processAlive(pid int) bool { if pid <= 0 { return false } - p, err := os.FindProcess(pid) + handle, err := windows.OpenProcess(windows.SYNCHRONIZE, false, uint32(pid)) + if err != nil { + if errors.Is(err, windows.ERROR_ACCESS_DENIED) { + return true + } + return false + } + defer windows.CloseHandle(handle) + + status, err := windows.WaitForSingleObject(handle, 0) if err != nil { return false } - _ = p.Release() - return true + return status == uint32(windows.WAIT_TIMEOUT) } func signalTerm(pid int) error { diff --git a/docs/cli/README.md b/docs/cli/README.md index 3d49e00d..dd83c306 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -103,9 +103,8 @@ logic in-process. The rewrite needs the CLI to sit outside the core daemon. ## Current Legacy CLI Inventory -Inventory source: installed `ao` binary at version `0.9.2`, plus -`/Users/dhruvsharma/Development/agent-orchestrator/packages/cli/src/program.ts` -and `packages/cli/src/commands/*.ts`. +Inventory source: installed `ao` binary at version `0.9.2`, plus the old +`packages/cli/src/program.ts` and `packages/cli/src/commands/*.ts` files. Count: From 925e70763da188d13869743140a569c97d833c3a Mon Sep 17 00:00:00 2001 From: Dhruv Sharma Date: Sun, 31 May 2026 21:45:00 +0530 Subject: [PATCH 03/12] fix(cli): verify daemon ownership before stop signal --- backend/internal/cli/root_test.go | 50 +++++++++++++++++++++++- backend/internal/cli/status.go | 60 ++++++++++++++++++++++------- backend/internal/cli/stop.go | 6 +++ backend/internal/daemonmeta/meta.go | 6 +++ backend/internal/httpd/router.go | 14 ++++++- 5 files changed, 118 insertions(+), 18 deletions(-) create mode 100644 backend/internal/daemonmeta/meta.go diff --git a/backend/internal/cli/root_test.go b/backend/internal/cli/root_test.go index 119bf16a..73387be8 100644 --- a/backend/internal/cli/root_test.go +++ b/backend/internal/cli/root_test.go @@ -2,6 +2,7 @@ package cli import ( "bytes" + "fmt" "net" "net/http" "net/http/httptest" @@ -13,6 +14,7 @@ import ( "testing" "time" + "github.com/aoagents/agent-orchestrator/backend/internal/daemonmeta" "github.com/aoagents/agent-orchestrator/backend/internal/runfile" ) @@ -51,9 +53,9 @@ func TestStartReturnsExistingReadyDaemon(t *testing.T) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { switch r.URL.Path { case "/healthz": - _, _ = w.Write([]byte(`{"status":"ok"}`)) + _, _ = fmt.Fprintf(w, `{"status":"ok","service":%q,"pid":%d}`, daemonmeta.ServiceName, os.Getpid()) case "/readyz": - _, _ = w.Write([]byte(`{"status":"ready"}`)) + _, _ = fmt.Fprintf(w, `{"status":"ready","service":%q,"pid":%d}`, daemonmeta.ServiceName, os.Getpid()) default: http.NotFound(w, r) } @@ -107,6 +109,50 @@ func TestStopRemovesStaleRunFile(t *testing.T) { } } +func TestStopDoesNotSignalUnverifiedReusedPID(t *testing.T) { + cfg := setConfigEnv(t) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/healthz": + _, _ = w.Write([]byte(`{"status":"ok"}`)) + case "/readyz": + _, _ = w.Write([]byte(`{"status":"ready"}`)) + default: + http.NotFound(w, r) + } + })) + defer srv.Close() + + if err := runfile.Write(cfg.runFile, runfile.Info{PID: 4242, Port: serverPort(t, srv.URL), StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + var signaled bool + out, _, err := executeCLI(t, Deps{ + ProcessAlive: func(pid int) bool { return pid == 4242 }, + SignalTerm: func(pid int) error { + signaled = true + return nil + }, + }, "stop", "--json") + if err != nil { + t.Fatal(err) + } + if signaled { + t.Fatal("stop signaled a PID whose health probe did not prove AO daemon ownership") + } + if !strings.Contains(out, `"state": "stopped"`) { + t.Fatalf("stop did not report stopped:\n%s", out) + } + info, err := runfile.Read(cfg.runFile) + if err != nil { + t.Fatal(err) + } + if info != nil { + t.Fatalf("unverified run-file was not removed: %#v", info) + } +} + type testConfig struct { runFile string dataDir string diff --git a/backend/internal/cli/status.go b/backend/internal/cli/status.go index 04fd755b..ea6cb528 100644 --- a/backend/internal/cli/status.go +++ b/backend/internal/cli/status.go @@ -10,6 +10,7 @@ import ( "github.com/spf13/cobra" "github.com/aoagents/agent-orchestrator/backend/internal/config" + "github.com/aoagents/agent-orchestrator/backend/internal/daemonmeta" "github.com/aoagents/agent-orchestrator/backend/internal/runfile" ) @@ -30,6 +31,13 @@ type daemonStatus struct { Health string `json:"health,omitempty"` Ready string `json:"ready,omitempty"` Error string `json:"error,omitempty"` + owned bool +} + +type probeResult struct { + Status string `json:"status"` + Service string `json:"service"` + PID int `json:"pid"` } func newStatusCommand(ctx *commandContext) *cobra.Command { @@ -81,11 +89,21 @@ func (c *commandContext) inspectDaemon(ctx context.Context) (daemonStatus, error health, err := c.readProbe(ctx, info.Port, "healthz") if err != nil { - st.State = "unhealthy" + st.State = "stale" st.Error = err.Error() return st, nil } - st.Health = health + if err := verifyProbeOwner(health, info.PID, "healthz"); err != nil { + st.State = "stale" + st.Error = err.Error() + return st, nil + } + st.owned = true + st.Health = health.Status + if health.Status != "ok" { + st.State = "unhealthy" + return st, nil + } ready, err := c.readProbe(ctx, info.Port, "readyz") if err != nil { @@ -93,8 +111,14 @@ func (c *commandContext) inspectDaemon(ctx context.Context) (daemonStatus, error st.Error = err.Error() return st, nil } - st.Ready = ready - if ready == "ready" { + if err := verifyProbeOwner(ready, info.PID, "readyz"); err != nil { + st.State = "stale" + st.owned = false + st.Error = err.Error() + return st, nil + } + st.Ready = ready.Status + if ready.Status == "ready" { st.State = "ready" return st, nil } @@ -102,32 +126,40 @@ func (c *commandContext) inspectDaemon(ctx context.Context) (daemonStatus, error return st, nil } -func (c *commandContext) readProbe(ctx context.Context, port int, path string) (string, error) { +func (c *commandContext) readProbe(ctx context.Context, port int, path string) (probeResult, error) { reqCtx, cancel := context.WithTimeout(ctx, probeTimeout) defer cancel() req, err := http.NewRequestWithContext(reqCtx, http.MethodGet, fmt.Sprintf("http://%s:%d/%s", config.LoopbackHost, port, path), nil) if err != nil { - return "", err + return probeResult{}, err } resp, err := c.deps.HTTPClient.Do(req) if err != nil { - return "", fmt.Errorf("%s: %w", path, err) + return probeResult{}, fmt.Errorf("%s: %w", path, err) } defer resp.Body.Close() if resp.StatusCode < 200 || resp.StatusCode >= 300 { - return "", fmt.Errorf("%s: HTTP %d", path, resp.StatusCode) - } - var body struct { - Status string `json:"status"` + return probeResult{}, fmt.Errorf("%s: HTTP %d", path, resp.StatusCode) } + var body probeResult if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { - return "", fmt.Errorf("%s: decode response: %w", path, err) + return probeResult{}, fmt.Errorf("%s: decode response: %w", path, err) } if body.Status == "" { - return "", fmt.Errorf("%s: missing status", path) + return probeResult{}, fmt.Errorf("%s: missing status", path) } - return body.Status, nil + return body, nil +} + +func verifyProbeOwner(probe probeResult, wantPID int, path string) error { + if probe.Service != daemonmeta.ServiceName { + return fmt.Errorf("%s: response is not from AO daemon", path) + } + if probe.PID != wantPID { + return fmt.Errorf("%s: daemon pid %d does not match run-file pid %d", path, probe.PID, wantPID) + } + return nil } func writeStatus(cmd *cobra.Command, st daemonStatus) error { diff --git a/backend/internal/cli/stop.go b/backend/internal/cli/stop.go index 5ce43b4f..1b626940 100644 --- a/backend/internal/cli/stop.go +++ b/backend/internal/cli/stop.go @@ -61,6 +61,12 @@ func (c *commandContext) stopDaemon(ctx context.Context, opts stopOptions) (daem } return daemonStatus{State: "stopped", RunFile: cfg.RunFilePath, DataDir: cfg.DataDir}, nil } + if !st.owned { + if err := runfile.Remove(cfg.RunFilePath); err != nil { + return daemonStatus{}, err + } + return daemonStatus{State: "stopped", RunFile: cfg.RunFilePath, DataDir: cfg.DataDir}, nil + } if err := c.deps.SignalTerm(st.PID); err != nil { if c.deps.ProcessAlive(st.PID) { diff --git a/backend/internal/daemonmeta/meta.go b/backend/internal/daemonmeta/meta.go new file mode 100644 index 00000000..72f322e5 --- /dev/null +++ b/backend/internal/daemonmeta/meta.go @@ -0,0 +1,6 @@ +package daemonmeta + +// ServiceName identifies the AO daemon in loopback health/readiness probes. +// The CLI uses it with the reported PID to avoid signaling an unrelated process +// when a stale run-file's PID has been reused. +const ServiceName = "agent-orchestrator-daemon" diff --git a/backend/internal/httpd/router.go b/backend/internal/httpd/router.go index 019f7efe..6f142d13 100644 --- a/backend/internal/httpd/router.go +++ b/backend/internal/httpd/router.go @@ -7,11 +7,13 @@ package httpd import ( "log/slog" "net/http" + "os" "github.com/go-chi/chi/v5" "github.com/go-chi/chi/v5/middleware" "github.com/aoagents/agent-orchestrator/backend/internal/config" + "github.com/aoagents/agent-orchestrator/backend/internal/daemonmeta" "github.com/aoagents/agent-orchestrator/backend/internal/terminal" ) @@ -68,12 +70,20 @@ func mountHealth(r chi.Router) { // handleHealthz is the liveness probe: it answers 200 as long as the process is // up and serving. It does no dependency checks by design. func handleHealthz(w http.ResponseWriter, _ *http.Request) { - writeJSON(w, http.StatusOK, map[string]string{"status": "ok"}) + writeJSON(w, http.StatusOK, map[string]any{ + "status": "ok", + "service": daemonmeta.ServiceName, + "pid": os.Getpid(), + }) } // handleReadyz is the readiness probe. In the 1a skeleton the daemon is ready // as soon as it is listening; later phases will gate this on dependency // initialisation (e.g. store/event-bus warm-up). func handleReadyz(w http.ResponseWriter, _ *http.Request) { - writeJSON(w, http.StatusOK, map[string]string{"status": "ready"}) + writeJSON(w, http.StatusOK, map[string]any{ + "status": "ready", + "service": daemonmeta.ServiceName, + "pid": os.Getpid(), + }) } From a614462d387d28dd8f03585e33a6fbac3aee47b4 Mon Sep 17 00:00:00 2001 From: Dhruv Sharma Date: Sun, 31 May 2026 21:52:08 +0530 Subject: [PATCH 04/12] fix(cli): preserve live daemon state on probe failures --- backend/internal/cli/root_test.go | 94 +++++++++++++++++++++++++++++++ backend/internal/cli/status.go | 2 +- backend/internal/cli/stop.go | 6 +- 3 files changed, 98 insertions(+), 4 deletions(-) diff --git a/backend/internal/cli/root_test.go b/backend/internal/cli/root_test.go index 73387be8..307b80a5 100644 --- a/backend/internal/cli/root_test.go +++ b/backend/internal/cli/root_test.go @@ -153,6 +153,81 @@ func TestStopDoesNotSignalUnverifiedReusedPID(t *testing.T) { } } +func TestStatusKeepsLiveProbeFailureUnhealthy(t *testing.T) { + cfg := setConfigEnv(t) + if err := runfile.Write(cfg.runFile, runfile.Info{PID: 4242, Port: closedPort(t), StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + out, _, err := executeCLI(t, Deps{ + ProcessAlive: func(pid int) bool { return pid == 4242 }, + }, "status", "--json") + if err != nil { + t.Fatal(err) + } + if !strings.Contains(out, `"state": "unhealthy"`) { + t.Fatalf("status should keep live probe failures unhealthy:\n%s", out) + } + info, err := runfile.Read(cfg.runFile) + if err != nil { + t.Fatal(err) + } + if info == nil { + t.Fatal("live probe failure should not remove run-file") + } +} + +func TestStopRefusesUnverifiedLivePID(t *testing.T) { + cfg := setConfigEnv(t) + if err := runfile.Write(cfg.runFile, runfile.Info{PID: 4242, Port: closedPort(t), StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + var signaled bool + _, _, err := executeCLI(t, Deps{ + ProcessAlive: func(pid int) bool { return pid == 4242 }, + SignalTerm: func(pid int) error { + signaled = true + return nil + }, + }, "stop", "--json") + if err == nil { + t.Fatal("stop should fail when daemon ownership cannot be verified") + } + if signaled { + t.Fatal("stop signaled a PID whose ownership was unknown") + } + info, err := runfile.Read(cfg.runFile) + if err != nil { + t.Fatal(err) + } + if info == nil { + t.Fatal("unverified live PID should remain tracked") + } +} + +func TestStartDoesNotSpawnWhenLiveProbeFails(t *testing.T) { + cfg := setConfigEnv(t) + if err := runfile.Write(cfg.runFile, runfile.Info{PID: 4242, Port: closedPort(t), StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + var started bool + _, _, err := executeCLI(t, Deps{ + ProcessAlive: func(pid int) bool { return pid == 4242 }, + StartProcess: func(processStartConfig) (processHandle, error) { + started = true + return processHandle{}, nil + }, + }, "start", "--timeout", "1ns", "--json") + if err == nil { + t.Fatal("start should fail instead of spawning over a live unverified PID") + } + if started { + t.Fatal("start spawned while run-file PID was still alive") + } +} + type testConfig struct { runFile string dataDir string @@ -203,3 +278,22 @@ func serverPort(t *testing.T, raw string) int { } return port } + +func closedPort(t *testing.T) int { + t.Helper() + ln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatal(err) + } + defer ln.Close() + + _, portRaw, err := net.SplitHostPort(ln.Addr().String()) + if err != nil { + t.Fatal(err) + } + port, err := strconv.Atoi(portRaw) + if err != nil { + t.Fatal(err) + } + return port +} diff --git a/backend/internal/cli/status.go b/backend/internal/cli/status.go index ea6cb528..85cbf5ac 100644 --- a/backend/internal/cli/status.go +++ b/backend/internal/cli/status.go @@ -89,7 +89,7 @@ func (c *commandContext) inspectDaemon(ctx context.Context) (daemonStatus, error health, err := c.readProbe(ctx, info.Port, "healthz") if err != nil { - st.State = "stale" + st.State = "unhealthy" st.Error = err.Error() return st, nil } diff --git a/backend/internal/cli/stop.go b/backend/internal/cli/stop.go index 1b626940..f6817c0f 100644 --- a/backend/internal/cli/stop.go +++ b/backend/internal/cli/stop.go @@ -62,10 +62,10 @@ func (c *commandContext) stopDaemon(ctx context.Context, opts stopOptions) (daem return daemonStatus{State: "stopped", RunFile: cfg.RunFilePath, DataDir: cfg.DataDir}, nil } if !st.owned { - if err := runfile.Remove(cfg.RunFilePath); err != nil { - return daemonStatus{}, err + if st.Error != "" { + return daemonStatus{}, fmt.Errorf("daemon pid %d is alive but ownership could not be verified: %s", st.PID, st.Error) } - return daemonStatus{State: "stopped", RunFile: cfg.RunFilePath, DataDir: cfg.DataDir}, nil + return daemonStatus{}, fmt.Errorf("daemon pid %d is alive but ownership could not be verified", st.PID) } if err := c.deps.SignalTerm(st.PID); err != nil { From 2f4662b470811ab77982a44598053a07cb485c16 Mon Sep 17 00:00:00 2001 From: Dhruv Sharma Date: Sun, 31 May 2026 22:10:54 +0530 Subject: [PATCH 05/12] fix(cli): handle stale start and graceful shutdown --- backend/internal/cli/process_unix.go | 8 -- backend/internal/cli/process_windows.go | 13 --- backend/internal/cli/root.go | 5 - backend/internal/cli/root_test.go | 118 ++++++++++++++++++++---- backend/internal/cli/start.go | 6 ++ backend/internal/cli/stop.go | 28 ++++-- backend/internal/httpd/router.go | 23 +++++ backend/internal/httpd/server.go | 33 +++++-- backend/internal/httpd/server_test.go | 43 +++++++++ 9 files changed, 220 insertions(+), 57 deletions(-) diff --git a/backend/internal/cli/process_unix.go b/backend/internal/cli/process_unix.go index bd28047c..61aa333b 100644 --- a/backend/internal/cli/process_unix.go +++ b/backend/internal/cli/process_unix.go @@ -4,7 +4,6 @@ package cli import ( "errors" - "os" "syscall" ) @@ -15,10 +14,3 @@ func processAlive(pid int) bool { err := syscall.Kill(pid, 0) return err == nil || errors.Is(err, syscall.EPERM) } - -func signalTerm(pid int) error { - if pid <= 0 { - return os.ErrProcessDone - } - return syscall.Kill(pid, syscall.SIGTERM) -} diff --git a/backend/internal/cli/process_windows.go b/backend/internal/cli/process_windows.go index 430cea8b..216431d8 100644 --- a/backend/internal/cli/process_windows.go +++ b/backend/internal/cli/process_windows.go @@ -4,7 +4,6 @@ package cli import ( "errors" - "os" "golang.org/x/sys/windows" ) @@ -28,15 +27,3 @@ func processAlive(pid int) bool { } return status == uint32(windows.WAIT_TIMEOUT) } - -func signalTerm(pid int) error { - if pid <= 0 { - return os.ErrProcessDone - } - p, err := os.FindProcess(pid) - if err != nil { - return err - } - defer p.Release() - return p.Kill() -} diff --git a/backend/internal/cli/root.go b/backend/internal/cli/root.go index cb40363f..c49a0339 100644 --- a/backend/internal/cli/root.go +++ b/backend/internal/cli/root.go @@ -29,7 +29,6 @@ type Deps struct { HTTPClient *http.Client Executable func() (string, error) StartProcess func(processStartConfig) (processHandle, error) - SignalTerm func(pid int) error ProcessAlive func(pid int) bool LookPath func(file string) (string, error) Now func() time.Time @@ -45,7 +44,6 @@ func DefaultDeps() Deps { HTTPClient: &http.Client{Timeout: 2 * time.Second}, Executable: os.Executable, StartProcess: startProcess, - SignalTerm: signalTerm, ProcessAlive: processAlive, LookPath: exec.LookPath, Now: time.Now, @@ -73,9 +71,6 @@ func (d Deps) withDefaults() Deps { if d.StartProcess == nil { d.StartProcess = def.StartProcess } - if d.SignalTerm == nil { - d.SignalTerm = def.SignalTerm - } if d.ProcessAlive == nil { d.ProcessAlive = def.ProcessAlive } diff --git a/backend/internal/cli/root_test.go b/backend/internal/cli/root_test.go index 307b80a5..5b920531 100644 --- a/backend/internal/cli/root_test.go +++ b/backend/internal/cli/root_test.go @@ -11,6 +11,7 @@ import ( "path/filepath" "strconv" "strings" + "sync/atomic" "testing" "time" @@ -87,6 +88,57 @@ func TestStartReturnsExistingReadyDaemon(t *testing.T) { } } +func TestStartClearsStaleRunFileBeforeSpawning(t *testing.T) { + cfg := setConfigEnv(t) + var spawned atomic.Bool + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if !spawned.Load() { + _, _ = fmt.Fprintf(w, `{"status":"ok","service":"not-ao","pid":4242}`) + return + } + switch r.URL.Path { + case "/healthz": + _, _ = fmt.Fprintf(w, `{"status":"ok","service":%q,"pid":%d}`, daemonmeta.ServiceName, os.Getpid()) + case "/readyz": + _, _ = fmt.Fprintf(w, `{"status":"ready","service":%q,"pid":%d}`, daemonmeta.ServiceName, os.Getpid()) + default: + http.NotFound(w, r) + } + })) + defer srv.Close() + + port := serverPort(t, srv.URL) + if err := runfile.Write(cfg.runFile, runfile.Info{PID: 4242, Port: port, StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + out, _, err := executeCLI(t, Deps{ + ProcessAlive: func(pid int) bool { return pid == 4242 || pid == os.Getpid() }, + StartProcess: func(processStartConfig) (processHandle, error) { + info, err := runfile.Read(cfg.runFile) + if err != nil { + t.Fatal(err) + } + if info != nil { + t.Fatalf("stale run-file was not removed before spawn: %#v", info) + } + spawned.Store(true) + if err := runfile.Write(cfg.runFile, runfile.Info{PID: os.Getpid(), Port: port, StartedAt: time.Unix(110, 0).UTC()}); err != nil { + t.Fatal(err) + } + return processHandle{PID: os.Getpid()}, nil + }, + Now: func() time.Time { return time.Unix(120, 0).UTC() }, + }, "start", "--json") + if err != nil { + t.Fatal(err) + } + if !strings.Contains(out, `"state": "ready"`) { + t.Fatalf("start did not report ready after clearing stale run-file:\n%s", out) + } +} + func TestStopRemovesStaleRunFile(t *testing.T) { cfg := setConfigEnv(t) if err := runfile.Write(cfg.runFile, runfile.Info{PID: 999999, Port: 3001, StartedAt: time.Unix(100, 0).UTC()}); err != nil { @@ -109,14 +161,18 @@ func TestStopRemovesStaleRunFile(t *testing.T) { } } -func TestStopDoesNotSignalUnverifiedReusedPID(t *testing.T) { +func TestStopDoesNotShutdownUnverifiedReusedPID(t *testing.T) { cfg := setConfigEnv(t) + shutdownCalled := make(chan struct{}, 1) srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { switch r.URL.Path { case "/healthz": _, _ = w.Write([]byte(`{"status":"ok"}`)) case "/readyz": _, _ = w.Write([]byte(`{"status":"ready"}`)) + case "/shutdown": + shutdownCalled <- struct{}{} + http.Error(w, "unexpected shutdown", http.StatusInternalServerError) default: http.NotFound(w, r) } @@ -127,19 +183,16 @@ func TestStopDoesNotSignalUnverifiedReusedPID(t *testing.T) { t.Fatal(err) } - var signaled bool out, _, err := executeCLI(t, Deps{ ProcessAlive: func(pid int) bool { return pid == 4242 }, - SignalTerm: func(pid int) error { - signaled = true - return nil - }, }, "stop", "--json") if err != nil { t.Fatal(err) } - if signaled { - t.Fatal("stop signaled a PID whose health probe did not prove AO daemon ownership") + select { + case <-shutdownCalled: + t.Fatal("stop requested shutdown from a process whose health probe did not prove AO daemon ownership") + default: } if !strings.Contains(out, `"state": "stopped"`) { t.Fatalf("stop did not report stopped:\n%s", out) @@ -153,6 +206,47 @@ func TestStopDoesNotSignalUnverifiedReusedPID(t *testing.T) { } } +func TestStopUsesShutdownEndpoint(t *testing.T) { + cfg := setConfigEnv(t) + shutdownCalled := make(chan struct{}, 1) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/healthz": + _, _ = fmt.Fprintf(w, `{"status":"ok","service":%q,"pid":%d}`, daemonmeta.ServiceName, os.Getpid()) + case "/readyz": + _, _ = fmt.Fprintf(w, `{"status":"ready","service":%q,"pid":%d}`, daemonmeta.ServiceName, os.Getpid()) + case "/shutdown": + if err := runfile.Remove(cfg.runFile); err != nil { + t.Fatal(err) + } + shutdownCalled <- struct{}{} + _, _ = fmt.Fprintf(w, `{"status":"shutting_down","service":%q,"pid":%d}`, daemonmeta.ServiceName, os.Getpid()) + default: + http.NotFound(w, r) + } + })) + defer srv.Close() + + if err := runfile.Write(cfg.runFile, runfile.Info{PID: os.Getpid(), Port: serverPort(t, srv.URL), StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + out, _, err := executeCLI(t, Deps{ + ProcessAlive: func(pid int) bool { return pid == os.Getpid() }, + }, "stop", "--json") + if err != nil { + t.Fatal(err) + } + select { + case <-shutdownCalled: + default: + t.Fatal("stop did not call daemon shutdown endpoint") + } + if !strings.Contains(out, `"state": "stopped"`) { + t.Fatalf("stop did not report stopped:\n%s", out) + } +} + func TestStatusKeepsLiveProbeFailureUnhealthy(t *testing.T) { cfg := setConfigEnv(t) if err := runfile.Write(cfg.runFile, runfile.Info{PID: 4242, Port: closedPort(t), StartedAt: time.Unix(100, 0).UTC()}); err != nil { @@ -183,20 +277,12 @@ func TestStopRefusesUnverifiedLivePID(t *testing.T) { t.Fatal(err) } - var signaled bool _, _, err := executeCLI(t, Deps{ ProcessAlive: func(pid int) bool { return pid == 4242 }, - SignalTerm: func(pid int) error { - signaled = true - return nil - }, }, "stop", "--json") if err == nil { t.Fatal("stop should fail when daemon ownership cannot be verified") } - if signaled { - t.Fatal("stop signaled a PID whose ownership was unknown") - } info, err := runfile.Read(cfg.runFile) if err != nil { t.Fatal(err) diff --git a/backend/internal/cli/start.go b/backend/internal/cli/start.go index 9dc4a222..0787eba9 100644 --- a/backend/internal/cli/start.go +++ b/backend/internal/cli/start.go @@ -10,6 +10,7 @@ import ( "github.com/spf13/cobra" "github.com/aoagents/agent-orchestrator/backend/internal/config" + "github.com/aoagents/agent-orchestrator/backend/internal/runfile" ) const defaultStartTimeout = 10 * time.Second @@ -66,6 +67,11 @@ func (c *commandContext) startDaemon(ctx context.Context, opts startOptions) (da } return daemonStatus{}, fmt.Errorf("daemon process exists but did not become ready: %w", waitErr) } + if st.State == "stale" { + if err := runfile.Remove(cfg.RunFilePath); err != nil { + return daemonStatus{}, err + } + } exe, err := c.deps.Executable() if err != nil { diff --git a/backend/internal/cli/stop.go b/backend/internal/cli/stop.go index f6817c0f..41d42d30 100644 --- a/backend/internal/cli/stop.go +++ b/backend/internal/cli/stop.go @@ -3,6 +3,7 @@ package cli import ( "context" "fmt" + "net/http" "time" "github.com/spf13/cobra" @@ -68,16 +69,31 @@ func (c *commandContext) stopDaemon(ctx context.Context, opts stopOptions) (daem return daemonStatus{}, fmt.Errorf("daemon pid %d is alive but ownership could not be verified", st.PID) } - if err := c.deps.SignalTerm(st.PID); err != nil { - if c.deps.ProcessAlive(st.PID) { - return daemonStatus{}, fmt.Errorf("signal daemon pid %d: %w", st.PID, err) - } - _ = runfile.Remove(cfg.RunFilePath) - return daemonStatus{State: "stopped", RunFile: cfg.RunFilePath, DataDir: cfg.DataDir}, nil + if err := c.requestShutdown(ctx, st.Port); err != nil { + return daemonStatus{}, fmt.Errorf("request daemon shutdown: %w", err) } return c.waitForStopped(ctx, st.PID, cfg.RunFilePath, cfg.DataDir, opts.timeout) } +func (c *commandContext) requestShutdown(ctx context.Context, port int) error { + reqCtx, cancel := context.WithTimeout(ctx, probeTimeout) + defer cancel() + + req, err := http.NewRequestWithContext(reqCtx, http.MethodPost, fmt.Sprintf("http://%s:%d/shutdown", config.LoopbackHost, port), nil) + if err != nil { + return err + } + resp, err := c.deps.HTTPClient.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return fmt.Errorf("HTTP %d", resp.StatusCode) + } + return nil +} + func (c *commandContext) waitForStopped(ctx context.Context, pid int, runFilePath, dataDir string, timeout time.Duration) (daemonStatus, error) { if timeout <= 0 { timeout = defaultStopTimeout diff --git a/backend/internal/httpd/router.go b/backend/internal/httpd/router.go index 6f142d13..d406b029 100644 --- a/backend/internal/httpd/router.go +++ b/backend/internal/httpd/router.go @@ -36,10 +36,18 @@ func NewRouter(cfg config.Config, log *slog.Logger, termMgr *terminal.Manager) c return NewRouterWithAPI(cfg, log, termMgr, APIDeps{}) } +type ControlDeps struct { + RequestShutdown func() +} + // NewRouterWithAPI is the dependency-injected variant. main.go calls it with // real Managers when they exist; tests/dev wiring inject mocks explicitly. // Missing Managers intentionally keep the route-shell 501 behavior. func NewRouterWithAPI(cfg config.Config, log *slog.Logger, termMgr *terminal.Manager, deps APIDeps) chi.Router { + return NewRouterWithControl(cfg, log, termMgr, deps, ControlDeps{}) +} + +func NewRouterWithControl(cfg config.Config, log *slog.Logger, termMgr *terminal.Manager, deps APIDeps, control ControlDeps) chi.Router { r := chi.NewRouter() r.Use(middleware.Recoverer) @@ -55,6 +63,7 @@ func NewRouterWithAPI(cfg config.Config, log *slog.Logger, termMgr *terminal.Man mountHealth(r) mountMux(r, termMgr, log) + mountControl(r, control) NewAPI(cfg, deps).Register(r) return r @@ -67,6 +76,20 @@ func mountHealth(r chi.Router) { r.Get("/readyz", handleReadyz) } +func mountControl(r chi.Router, deps ControlDeps) { + if deps.RequestShutdown == nil { + return + } + r.Post("/shutdown", func(w http.ResponseWriter, _ *http.Request) { + writeJSON(w, http.StatusAccepted, map[string]any{ + "status": "shutting_down", + "service": daemonmeta.ServiceName, + "pid": os.Getpid(), + }) + deps.RequestShutdown() + }) +} + // handleHealthz is the liveness probe: it answers 200 as long as the process is // up and serving. It does no dependency checks by design. func handleHealthz(w http.ResponseWriter, _ *http.Request) { diff --git a/backend/internal/httpd/server.go b/backend/internal/httpd/server.go index 506f78b5..0ed67eaf 100644 --- a/backend/internal/httpd/server.go +++ b/backend/internal/httpd/server.go @@ -8,6 +8,7 @@ import ( "net" "net/http" "os" + "sync" "time" "github.com/aoagents/agent-orchestrator/backend/internal/config" @@ -23,6 +24,9 @@ type Server struct { log *slog.Logger http *http.Server listen net.Listener + + shutdownRequested chan struct{} + shutdownOnce sync.Once } // New constructs a Server and binds the listener immediately so a port @@ -36,15 +40,18 @@ func New(cfg config.Config, log *slog.Logger, termMgr *terminal.Manager) (*Serve } srv := &Server{ - cfg: cfg, - log: log, - listen: ln, - http: &http.Server{ - Handler: NewRouter(cfg, log, termMgr), - // ReadHeaderTimeout guards against slow-loris even on loopback; - // per-request body/handler timeouts are applied per-surface. - ReadHeaderTimeout: 10 * time.Second, - }, + cfg: cfg, + log: log, + listen: ln, + shutdownRequested: make(chan struct{}), + } + srv.http = &http.Server{ + Handler: NewRouterWithControl(cfg, log, termMgr, APIDeps{}, ControlDeps{ + RequestShutdown: srv.requestShutdown, + }), + // ReadHeaderTimeout guards against slow-loris even on loopback; + // per-request body/handler timeouts are applied per-surface. + ReadHeaderTimeout: 10 * time.Second, } return srv, nil } @@ -89,6 +96,8 @@ func (s *Server) Run(ctx context.Context) error { // Serve died on its own (bind already happened, so this is a real // runtime failure) before any shutdown signal. return err + case <-s.shutdownRequested: + s.log.Info("shutdown requested over HTTP", "timeout", s.cfg.ShutdownTimeout) case <-ctx.Done(): s.log.Info("shutdown signal received, draining connections", "timeout", s.cfg.ShutdownTimeout) } @@ -113,3 +122,9 @@ func (s *Server) boundPort() int { } return s.cfg.Port } + +func (s *Server) requestShutdown() { + s.shutdownOnce.Do(func() { + close(s.shutdownRequested) + }) +} diff --git a/backend/internal/httpd/server_test.go b/backend/internal/httpd/server_test.go index 39270d1c..0248e866 100644 --- a/backend/internal/httpd/server_test.go +++ b/backend/internal/httpd/server_test.go @@ -91,6 +91,49 @@ func TestServerLifecycle(t *testing.T) { } } +func TestServerShutdownEndpoint(t *testing.T) { + runPath := filepath.Join(t.TempDir(), "running.json") + cfg := config.Config{ + Host: "127.0.0.1", + Port: 0, + ShutdownTimeout: 5 * time.Second, + RunFilePath: runPath, + } + + srv, err := New(cfg, discardLogger()) + if err != nil { + t.Fatalf("New: %v", err) + } + + runErr := make(chan error, 1) + go func() { runErr <- srv.Run(context.Background()) }() + + base := "http://" + srv.Addr().String() + waitForHealth(t, base) + + resp, err := http.Post(base+"/shutdown", "application/json", nil) + if err != nil { + t.Fatalf("POST /shutdown: %v", err) + } + resp.Body.Close() + if resp.StatusCode != http.StatusAccepted { + t.Fatalf("POST /shutdown = %d, want 202", resp.StatusCode) + } + + select { + case err := <-runErr: + if err != nil { + t.Fatalf("Run returned error on shutdown endpoint: %v", err) + } + case <-time.After(10 * time.Second): + t.Fatal("Run did not return after shutdown endpoint") + } + + if after, _ := runfile.Read(runPath); after != nil { + t.Error("run-file still present after shutdown endpoint; want it removed") + } +} + func waitForHealth(t *testing.T, base string) { t.Helper() // Per-request timeout so a stalled connect or hung handshake doesn't park From 0d8ffcd17a6769ff501610ae03d9fd787eb92c25 Mon Sep 17 00:00:00 2001 From: harshitsinghbhandari <24b4506@iitb.ac.in> Date: Mon, 1 Jun 2026 01:50:43 +0530 Subject: [PATCH 06/12] fix(httpd): update server_test for termMgr arg after rebase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The shutdown endpoint test was authored against the pre-rebase httpd.New(cfg, log) signature. After rebasing onto main, the terminal manager (from #50) made termMgr a required third arg. Pass nil — the test exercises /shutdown, not /mux, so the terminal surface stays off. Co-Authored-By: Claude Opus 4.7 --- backend/internal/httpd/server_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/internal/httpd/server_test.go b/backend/internal/httpd/server_test.go index 0248e866..2b7ba4f3 100644 --- a/backend/internal/httpd/server_test.go +++ b/backend/internal/httpd/server_test.go @@ -100,7 +100,7 @@ func TestServerShutdownEndpoint(t *testing.T) { RunFilePath: runPath, } - srv, err := New(cfg, discardLogger()) + srv, err := New(cfg, discardLogger(), nil) if err != nil { t.Fatalf("New: %v", err) } From 2d00e4675d1575a8942989e39cc75176c73b32f2 Mon Sep 17 00:00:00 2001 From: itrytoohard Date: Mon, 1 Jun 2026 01:55:14 +0530 Subject: [PATCH 07/12] fix(cli): harden daemon control surface and stop CLI from writing the store MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses review findings on PR #53 (on top of the rebase onto main). - doctor: stop opening/migrating SQLite. The daemon is the sole store writer/migrator (architecture.md §7); the CLI must not run migrations or open a second writer against a DB a live daemon owns. doctor now reports database-file presence and gains --json. - stop: only remove running.json when it still belongs to the PID we stopped, so a concurrent `ao start` that wrote a new run-file is not clobbered into looking stopped. - httpd: gate POST /shutdown to loopback callers with no Origin header, closing the CSRF / DNS-rebinding vector against an unauthenticated, state-changing endpoint. - start: detach the spawned daemon into its own session/process group so a Ctrl-C while `ao start` waits for readiness doesn't also kill it. - cli: exit 2 for usage errors (bad flag / arg count) vs 1 for runtime failures. - daemon: unexport newLogger (only used in-package). - tests: /shutdown guard (cross-origin + rebinding) and stop run-file ownership guard. Co-Authored-By: Claude Opus 4.8 (1M context) --- backend/cmd/ao/main.go | 2 +- backend/internal/cli/completion.go | 11 +++- backend/internal/cli/doctor.go | 76 ++++++++++++++++------ backend/internal/cli/process.go | 4 ++ backend/internal/cli/process_unix.go | 7 +++ backend/internal/cli/process_windows.go | 7 +++ backend/internal/cli/root.go | 27 ++++++++ backend/internal/cli/stop.go | 10 ++- backend/internal/cli/stop_test.go | 83 +++++++++++++++++++++++++ backend/internal/daemon/daemon.go | 6 +- backend/internal/httpd/control_test.go | 52 ++++++++++++++++ backend/internal/httpd/router.go | 37 ++++++++++- docs/cli/README.md | 41 ++++++------ 13 files changed, 316 insertions(+), 47 deletions(-) create mode 100644 backend/internal/cli/stop_test.go create mode 100644 backend/internal/httpd/control_test.go diff --git a/backend/cmd/ao/main.go b/backend/cmd/ao/main.go index 1ee35d62..d1ea897c 100644 --- a/backend/cmd/ao/main.go +++ b/backend/cmd/ao/main.go @@ -10,6 +10,6 @@ import ( func main() { if err := cli.Execute(); err != nil { fmt.Fprintln(os.Stderr, err) - os.Exit(1) + os.Exit(cli.ExitCode(err)) } } diff --git a/backend/internal/cli/completion.go b/backend/internal/cli/completion.go index 61b9483e..f4575de0 100644 --- a/backend/internal/cli/completion.go +++ b/backend/internal/cli/completion.go @@ -8,9 +8,14 @@ import ( func newCompletionCommand() *cobra.Command { return &cobra.Command{ - Use: "completion [bash|zsh|fish|powershell]", - Short: "Generate shell completion scripts", - Args: cobra.ExactArgs(1), + Use: "completion [bash|zsh|fish|powershell]", + Short: "Generate shell completion scripts", + Args: func(cmd *cobra.Command, args []string) error { + if err := cobra.ExactArgs(1)(cmd, args); err != nil { + return usageError{err} + } + return nil + }, ValidArgs: []string{"bash", "zsh", "fish", "powershell"}, RunE: func(cmd *cobra.Command, args []string) error { root := cmd.Root() diff --git a/backend/internal/cli/doctor.go b/backend/internal/cli/doctor.go index 3a452ac1..4c6953f2 100644 --- a/backend/internal/cli/doctor.go +++ b/backend/internal/cli/doctor.go @@ -2,13 +2,15 @@ package cli import ( "context" + "errors" "fmt" + "io/fs" "os" + "path/filepath" "github.com/spf13/cobra" "github.com/aoagents/agent-orchestrator/backend/internal/config" - "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" ) type doctorLevel string @@ -20,34 +22,53 @@ const ( ) type doctorCheck struct { - Level doctorLevel - Name string - Message string + Level doctorLevel `json:"level"` + Name string `json:"name"` + Message string `json:"message"` +} + +type doctorReport struct { + OK bool `json:"ok"` + Failures int `json:"failures"` + Checks []doctorCheck `json:"checks"` } func newDoctorCommand(ctx *commandContext) *cobra.Command { - return &cobra.Command{ + var asJSON bool + cmd := &cobra.Command{ Use: "doctor", Short: "Run local AO health checks", RunE: func(cmd *cobra.Command, args []string) error { checks := ctx.runDoctor(cmd.Context()) - for _, check := range checks { - if _, err := fmt.Fprintf(cmd.OutOrStdout(), "%s %s: %s\n", check.Level, check.Name, check.Message); err != nil { - return err - } - } - var failures int + failures := 0 for _, check := range checks { if check.Level == doctorFail { failures++ } } + + if asJSON { + if err := writeJSON(cmd.OutOrStdout(), doctorReport{ + OK: failures == 0, Failures: failures, Checks: checks, + }); err != nil { + return err + } + } else { + for _, check := range checks { + if _, err := fmt.Fprintf(cmd.OutOrStdout(), "%s %s: %s\n", check.Level, check.Name, check.Message); err != nil { + return err + } + } + } + if failures > 0 { return fmt.Errorf("doctor found %d failing check(s)", failures) } return nil }, } + cmd.Flags().BoolVar(&asJSON, "json", false, "Output health checks as JSON") + return cmd } func (c *commandContext) runDoctor(ctx context.Context) []doctorCheck { @@ -68,13 +89,7 @@ func (c *commandContext) runDoctor(ctx context.Context) []doctorCheck { checks = append(checks, doctorCheck{Level: doctorPass, Name: "data-dir", Message: cfg.DataDir}) } - store, err := sqlite.Open(cfg.DataDir) - if err != nil { - checks = append(checks, doctorCheck{Level: doctorFail, Name: "sqlite", Message: err.Error()}) - } else { - _ = store.Close() - checks = append(checks, doctorCheck{Level: doctorPass, Name: "sqlite", Message: "opened database and applied migrations"}) - } + checks = append(checks, checkStore(cfg.DataDir)) st, err := c.inspectDaemon(ctx) if err != nil { @@ -103,6 +118,31 @@ func (c *commandContext) runDoctor(ctx context.Context) []doctorCheck { return checks } +// checkStore inspects the SQLite store WITHOUT opening or migrating it. The +// daemon is the sole writer and migrator of the database (architecture.md §7); +// the CLI must never run migrations or open a second writer against a database +// a live daemon may already own. Migrations are validated by the daemon at +// startup and surfaced through /readyz, so doctor only confirms whether the +// database file exists yet. +func checkStore(dataDir string) doctorCheck { + dbPath := filepath.Join(dataDir, "ao.db") + info, err := os.Stat(dbPath) + switch { + case err == nil: + return doctorCheck{ + Level: doctorPass, Name: "sqlite", + Message: fmt.Sprintf("%s (%d bytes); migrations are applied by the daemon at startup", dbPath, info.Size()), + } + case errors.Is(err, fs.ErrNotExist): + return doctorCheck{ + Level: doctorWarn, Name: "sqlite", + Message: "database not created yet; run `ao start` to initialize and migrate it", + } + default: + return doctorCheck{Level: doctorFail, Name: "sqlite", Message: err.Error()} + } +} + func (c *commandContext) checkTool(name string, required bool) doctorCheck { path, err := c.deps.LookPath(name) if err == nil { diff --git a/backend/internal/cli/process.go b/backend/internal/cli/process.go index 3db43209..19c4d19f 100644 --- a/backend/internal/cli/process.go +++ b/backend/internal/cli/process.go @@ -22,6 +22,10 @@ func startProcess(cfg processStartConfig) (processHandle, error) { cmd.Env = cfg.Env cmd.Stdout = cfg.Stdout cmd.Stderr = cfg.Stderr + // Detach the daemon into its own session/process group so a Ctrl-C in the + // terminal where `ao start` is waiting for readiness doesn't also SIGINT the + // freshly spawned daemon (it would otherwise share the launcher's group). + cmd.SysProcAttr = detachSysProcAttr() if err := cmd.Start(); err != nil { return processHandle{}, err } diff --git a/backend/internal/cli/process_unix.go b/backend/internal/cli/process_unix.go index 61aa333b..9963d9e9 100644 --- a/backend/internal/cli/process_unix.go +++ b/backend/internal/cli/process_unix.go @@ -14,3 +14,10 @@ func processAlive(pid int) bool { err := syscall.Kill(pid, 0) return err == nil || errors.Is(err, syscall.EPERM) } + +// detachSysProcAttr puts the daemon in a new session (Setsid) so it is no +// longer in the launcher's foreground process group and won't receive the +// terminal's SIGINT/SIGHUP. +func detachSysProcAttr() *syscall.SysProcAttr { + return &syscall.SysProcAttr{Setsid: true} +} diff --git a/backend/internal/cli/process_windows.go b/backend/internal/cli/process_windows.go index 216431d8..3ff8190a 100644 --- a/backend/internal/cli/process_windows.go +++ b/backend/internal/cli/process_windows.go @@ -4,6 +4,7 @@ package cli import ( "errors" + "syscall" "golang.org/x/sys/windows" ) @@ -27,3 +28,9 @@ func processAlive(pid int) bool { } return status == uint32(windows.WAIT_TIMEOUT) } + +// detachSysProcAttr starts the daemon in a new process group so it does not +// receive the console's CTRL_C/CTRL_BREAK while `ao start` waits for readiness. +func detachSysProcAttr() *syscall.SysProcAttr { + return &syscall.SysProcAttr{CreationFlags: windows.CREATE_NEW_PROCESS_GROUP} +} diff --git a/backend/internal/cli/root.go b/backend/internal/cli/root.go index c49a0339..36e83e5a 100644 --- a/backend/internal/cli/root.go +++ b/backend/internal/cli/root.go @@ -3,6 +3,7 @@ package cli import ( + "errors" "io" "net/http" "os" @@ -19,6 +20,27 @@ func Execute() error { return NewRootCommand(DefaultDeps()).Execute() } +// usageError marks a command-line misuse (bad flag, wrong arg count). It lets +// the process entrypoint return exit code 2 for usage errors versus 1 for +// runtime failures, matching the convention CLIs are scripted against. +type usageError struct{ err error } + +func (e usageError) Error() string { return e.err.Error() } +func (e usageError) Unwrap() error { return e.err } + +// ExitCode maps a CLI error to a process exit code: 2 for usage errors, 1 for +// any other failure, 0 for success. +func ExitCode(err error) int { + if err == nil { + return 0 + } + var ue usageError + if errors.As(err, &ue) { + return 2 + } + return 1 +} + // Deps holds the small set of side effects the CLI needs. Tests replace these // functions without reaching into process-global state. type Deps struct { @@ -103,6 +125,11 @@ func NewRootCommand(deps Deps) *cobra.Command { root.SetOut(deps.Out) root.SetErr(deps.Err) root.CompletionOptions.DisableDefaultCmd = true + // Tag flag-parse failures as usage errors so the entrypoint can exit 2 for + // misuse versus 1 for runtime failures. Subcommands inherit this func. + root.SetFlagErrorFunc(func(_ *cobra.Command, err error) error { + return usageError{err} + }) root.AddCommand(newDaemonCommand()) root.AddCommand(newStartCommand(ctx)) diff --git a/backend/internal/cli/stop.go b/backend/internal/cli/stop.go index 41d42d30..9b00c1c4 100644 --- a/backend/internal/cli/stop.go +++ b/backend/internal/cli/stop.go @@ -115,8 +115,14 @@ func (c *commandContext) waitForStopped(ctx context.Context, pid int, runFilePat return daemonStatus{State: "stopped", RunFile: runFilePath, DataDir: dataDir}, nil } if !alive { - if err := runfile.Remove(runFilePath); err != nil { - return daemonStatus{}, err + // Only remove the run-file if it still belongs to the process we + // stopped. A concurrent `ao start` may have already written a new + // run-file for a different daemon; removing that would corrupt its + // handshake and make a live daemon look stopped. + if info.PID == pid { + if err := runfile.Remove(runFilePath); err != nil { + return daemonStatus{}, err + } } return daemonStatus{State: "stopped", RunFile: runFilePath, DataDir: dataDir}, nil } diff --git a/backend/internal/cli/stop_test.go b/backend/internal/cli/stop_test.go new file mode 100644 index 00000000..85b6a509 --- /dev/null +++ b/backend/internal/cli/stop_test.go @@ -0,0 +1,83 @@ +package cli + +import ( + "context" + "path/filepath" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/runfile" +) + +// TestWaitForStoppedKeepsRunFileFromConcurrentStart guards against deleting a +// fresh daemon's handshake: if a concurrent `ao start` replaces running.json +// with a new live PID while we are polling the PID we stopped, waitForStopped +// must report stopped but leave the new run-file intact. +func TestWaitForStoppedKeepsRunFileFromConcurrentStart(t *testing.T) { + dir := t.TempDir() + runFile := filepath.Join(dir, "running.json") + + const stoppedPID, newPID = 1111, 2222 + // running.json now belongs to a different, live daemon. + if err := runfile.Write(runFile, runfile.Info{PID: newPID, Port: 3001, StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + c := &commandContext{deps: Deps{ + ProcessAlive: func(pid int) bool { return pid == newPID }, // stoppedPID is dead + Now: func() time.Time { return time.Unix(200, 0).UTC() }, + Sleep: func(time.Duration) {}, + }.withDefaults()} + + st, err := c.waitForStopped(context.Background(), stoppedPID, runFile, dir, time.Second) + if err != nil { + t.Fatal(err) + } + if st.State != "stopped" { + t.Fatalf("state = %q, want stopped", st.State) + } + + info, err := runfile.Read(runFile) + if err != nil { + t.Fatal(err) + } + if info == nil { + t.Fatal("new daemon's run-file was deleted by stop of a different PID") + } + if info.PID != newPID { + t.Fatalf("run-file PID = %d, want %d (new daemon)", info.PID, newPID) + } +} + +// TestWaitForStoppedRemovesOwnRunFile confirms the normal path still cleans up: +// when the dead PID owns the run-file, it is removed. +func TestWaitForStoppedRemovesOwnRunFile(t *testing.T) { + dir := t.TempDir() + runFile := filepath.Join(dir, "running.json") + + const stoppedPID = 1111 + if err := runfile.Write(runFile, runfile.Info{PID: stoppedPID, Port: 3001, StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + c := &commandContext{deps: Deps{ + ProcessAlive: func(int) bool { return false }, + Now: func() time.Time { return time.Unix(200, 0).UTC() }, + Sleep: func(time.Duration) {}, + }.withDefaults()} + + st, err := c.waitForStopped(context.Background(), stoppedPID, runFile, dir, time.Second) + if err != nil { + t.Fatal(err) + } + if st.State != "stopped" { + t.Fatalf("state = %q, want stopped", st.State) + } + info, err := runfile.Read(runFile) + if err != nil { + t.Fatal(err) + } + if info != nil { + t.Fatalf("own run-file should have been removed, got %#v", info) + } +} diff --git a/backend/internal/daemon/daemon.go b/backend/internal/daemon/daemon.go index fd7dcc7c..556fe5f0 100644 --- a/backend/internal/daemon/daemon.go +++ b/backend/internal/daemon/daemon.go @@ -27,7 +27,7 @@ func Run() error { return err } - log := NewLogger() + log := newLogger() // Fail fast if a live daemon already owns the handshake file. A run-file // left by a crashed predecessor (dead PID) is treated as stale and @@ -119,8 +119,8 @@ func Run() error { return runErr } -// NewLogger returns the daemon's slog logger. It writes to stderr so supervisors +// newLogger returns the daemon's slog logger. It writes to stderr so supervisors // can capture it separately from any structured stdout protocol added later. -func NewLogger() *slog.Logger { +func newLogger() *slog.Logger { return slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelDebug})) } diff --git a/backend/internal/httpd/control_test.go b/backend/internal/httpd/control_test.go new file mode 100644 index 00000000..3e8456f8 --- /dev/null +++ b/backend/internal/httpd/control_test.go @@ -0,0 +1,52 @@ +package httpd + +import ( + "net/http" + "net/http/httptest" + "testing" + + "github.com/aoagents/agent-orchestrator/backend/internal/config" +) + +// TestShutdownGuard verifies that POST /shutdown only fires for a trusted local +// caller: a loopback Host with no Origin header. A cross-site Origin or a +// non-loopback (DNS-rebinding) Host must be rejected without triggering the +// shutdown side effect. +func TestShutdownGuard(t *testing.T) { + cases := []struct { + name string + host string + origin string + wantStatus int + wantFired bool + }{ + {name: "loopback no origin", host: "127.0.0.1:3001", wantStatus: http.StatusAccepted, wantFired: true}, + {name: "localhost no origin", host: "localhost:3001", wantStatus: http.StatusAccepted, wantFired: true}, + {name: "cross-site origin", host: "127.0.0.1:3001", origin: "https://evil.example", wantStatus: http.StatusForbidden, wantFired: false}, + {name: "rebinding host", host: "evil.example", wantStatus: http.StatusForbidden, wantFired: false}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + fired := false + r := NewRouterWithControl(config.Config{}, discardLogger(), nil, APIDeps{}, ControlDeps{ + RequestShutdown: func() { fired = true }, + }) + + req := httptest.NewRequest(http.MethodPost, "http://"+tc.host+"/shutdown", nil) + req.Host = tc.host + if tc.origin != "" { + req.Header.Set("Origin", tc.origin) + } + rec := httptest.NewRecorder() + r.ServeHTTP(rec, req) + + if rec.Code != tc.wantStatus { + t.Fatalf("status = %d, want %d", rec.Code, tc.wantStatus) + } + if fired != tc.wantFired { + t.Fatalf("shutdown fired = %v, want %v", fired, tc.wantFired) + } + }) + } +} diff --git a/backend/internal/httpd/router.go b/backend/internal/httpd/router.go index d406b029..5d132eb4 100644 --- a/backend/internal/httpd/router.go +++ b/backend/internal/httpd/router.go @@ -6,6 +6,7 @@ package httpd import ( "log/slog" + "net" "net/http" "os" @@ -76,11 +77,22 @@ func mountHealth(r chi.Router) { r.Get("/readyz", handleReadyz) } +// mountControl registers the loopback daemon-control endpoints. /shutdown is +// unauthenticated and state-changing, so it is gated by localControlRequest to +// keep a browser the user happens to have open (CSRF / DNS-rebinding) or a +// remote client from being able to kill the daemon. func mountControl(r chi.Router, deps ControlDeps) { if deps.RequestShutdown == nil { return } - r.Post("/shutdown", func(w http.ResponseWriter, _ *http.Request) { + r.Post("/shutdown", func(w http.ResponseWriter, req *http.Request) { + if !localControlRequest(req) { + writeJSON(w, http.StatusForbidden, map[string]any{ + "status": "forbidden", + "service": daemonmeta.ServiceName, + }) + return + } writeJSON(w, http.StatusAccepted, map[string]any{ "status": "shutting_down", "service": daemonmeta.ServiceName, @@ -90,6 +102,29 @@ func mountControl(r chi.Router, deps ControlDeps) { }) } +// localControlRequest reports whether a control request is a trusted local +// caller. The Go CLI client addresses the daemon by its loopback host and +// never sets an Origin header; a cross-site browser fetch always carries an +// Origin, and a DNS-rebinding attempt resolves a non-loopback Host. Rejecting +// either closes the CSRF/rebinding vector while leaving the CLI unaffected. +func localControlRequest(r *http.Request) bool { + if r.Header.Get("Origin") != "" { + return false + } + host := r.Host + if h, _, err := net.SplitHostPort(host); err == nil { + host = h + } + switch host { + case "127.0.0.1", "::1", "localhost": + return true + } + if ip := net.ParseIP(host); ip != nil { + return ip.IsLoopback() + } + return false +} + // handleHealthz is the liveness probe: it answers 200 as long as the process is // up and serving. It does no dependency checks by design. func handleHealthz(w http.ResponseWriter, _ *http.Request) { diff --git a/docs/cli/README.md b/docs/cli/README.md index dd83c306..d78539a0 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -14,10 +14,13 @@ What works now: - `ao start` starts the daemon in the background and waits for `/readyz`. - `ao status` and `ao status --json` report stopped, stale, unhealthy, not-ready, or ready daemon state. -- `ao stop` gracefully stops the daemon using the PID in `running.json`. +- `ao stop` gracefully stops the daemon via the loopback `POST /shutdown` + endpoint, only after verifying the daemon's identity from `running.json`. - `ao daemon` is the hidden internal daemon entrypoint used by `ao start`. -- `ao doctor` checks config, data dir, SQLite migrations, daemon state, and - local tool availability for `git`, `tmux`, and `zellij`. +- `ao doctor` (and `ao doctor --json`) checks config, data dir, the database + file's presence, daemon state, and local tool availability for `git`, `tmux`, + and `zellij`. It never opens or migrates the store — the daemon is the sole + writer/migrator, so doctor only reports whether the database exists yet. - `ao completion` generates shell completions for `bash`, `zsh`, `fish`, and `powershell`. - `ao version` and `ao --version` print build metadata. @@ -52,8 +55,9 @@ What is intentionally not implemented yet: Next steps: -1. Add `/api/v1/projects` on the daemon over a small project service. -2. Implement `ao project list/add/show/remove`. +1. Wire the existing project manager/controller shell into the daemon with a + durable SQLite-backed project store. +2. Implement `ao project list/add/show/remove` against `/api/v1/projects`. 3. Wire production Session Manager dependencies: project-backed repo resolver, tmux/zellij runtime registry, first agent adapter, and AgentMessenger. 4. Add `/api/v1/sessions`, then implement `ao spawn`, `ao session ...`, and @@ -281,8 +285,8 @@ Acceptance criteria for the foundation: ## Implementation Readiness This section records what the CLI can connect to in the current codebase and -what still needs to be built. Inventory date: 2026-05-31 on `main` at -`0672dbb`. +what still needs to be built. Inventory date: 2026-05-31 after merging +`origin/main` at `438b830`. ### Implemented Foundation @@ -298,8 +302,9 @@ Implemented commands: supports `--json` and `--timeout`. - `ao status` reports stopped/stale/unhealthy/not-ready/ready states and supports `--json`. -- `ao doctor` checks config, data dir, SQLite open/migrations, daemon state, and - local tool availability for `git`, `tmux`, and `zellij`. +- `ao doctor` checks config, data dir, database-file presence, daemon state, and + local tool availability for `git`, `tmux`, and `zellij`; supports `--json`. It + does not open or migrate the store (the daemon owns that). - `ao completion` generates `bash`, `zsh`, `fish`, and `powershell` completions. - `ao version` prints build metadata. @@ -331,7 +336,7 @@ they are wired into the daemon and exposed through HTTP. | Area | Existing code | Missing before CLI can use it | |---|---|---| -| Project persistence | `sqlite.Store` has `UpsertProject`, `GetProject`, `ListProjects`, and `ArchiveProject`. | Project domain/service layer, project ID/path/origin validation, and `/api/v1/projects` routes. | +| Project API pieces | `internal/project` has manager/controller DTOs, `/api/v1/projects` routes exist, and `sqlite.Store` has project CRUD. | Durable project-store adapter/wiring in the daemon and CLI commands. The daemon currently constructs the router with nil API deps, so project routes are not product-usable from `ao` yet. | | Session Manager | `backend/internal/session.Manager` implements `Spawn`, `Kill`, `Restore`, `List`, `Get`, `Send`, and `Cleanup`. | Production daemon wiring with real runtime, agent, workspace, messenger, and HTTP routes. | | Runtime adapters | tmux and zellij adapters implement `ports.Runtime` and also have attach/send/output helpers. | Runtime registry wiring in daemon, attach/send abstractions in ports/API, and selection config. | | Workspace adapter | git worktree adapter implements create/destroy/restore/list with safety checks. | Repo resolver backed by registered projects and daemon wiring into Session Manager. | @@ -345,12 +350,10 @@ These are the main gaps before the full initial command set is real. | Gap | Blocks | |---|---| -| Cobra dependency and CLI packages. | All CLI commands. | -| Daemon extraction from `backend/main.go` into `internal/daemon`. | `ao daemon`, `ao start`, tests around daemon startup. | -| CLI process runner and PID signal helpers. | `ao start`, `ao stop`. | -| Loopback HTTP client package with run-file discovery. | `ao status`, later all daemon-backed commands. | +| Product API client package with run-file discovery. | `project`, `spawn`, `session`, `send`, `events list`, richer `status`. | | Shutdown mechanism choice: PID signal now, optional `POST /api/v1/daemon/shutdown` later. | `ao stop` polish and cross-platform behavior. | -| HTTP API route surface under `/api/v1`. | `project`, `spawn`, `session`, `send`, `events list`, richer `status`. | +| Session/send API route surface under `/api/v1`. | `spawn`, `session`, `send`, richer `status`. | +| Project API daemon wiring. | `ao project list/add/show/remove`. | | SSE route for live CDC events plus durable catch-up reads. | `ao events tail`, frontend live updates. | | Agent adapters for supported harnesses (`codex`, `claude-code`, etc.). | `ao spawn`, `ao session restore`. | | AgentMessenger implementation over tmux/zellij. | `ao send`, LCM auto-nudge reactions. | @@ -368,10 +371,10 @@ These are the main gaps before the full initial command set is real. | `ao start` | Implemented | Config, run-file stale check, HTTP readiness probes. | Later: package-manager/service integration if needed. | | `ao stop` | Implemented | Run-file discovery gives PID/port; server exits cleanly on SIGINT/SIGTERM. | Optional later shutdown HTTP route. | | `ao status` | Partially implemented | Run-file, process liveness via PID, `/healthz`, `/readyz`. | Rich project/session summary waits for `/api/v1/projects` and `/api/v1/sessions`. | -| `ao doctor` | Partially implemented | Config resolution, run-file, storage open, runtime binary checks. | Deeper adapter preflights need daemon wiring/config. | +| `ao doctor` | Partially implemented | Config resolution, run-file, database-file presence (no open/migrate), runtime binary checks. | Deeper adapter preflights need daemon wiring/config and should be queried from the daemon, not run in-process. | | `ao completion` | Implemented | Cobra generators. | None for foundation. | | `ao version` | Implemented | Build metadata can be injected with `-ldflags`. | Release tooling needs to set metadata. | -| `ao project list/add/show/remove` | Not yet | SQLite project CRUD exists. | Project service and HTTP routes. CLI must not write SQLite directly. | +| `ao project list/add/show/remove` | Not yet | Project manager/controller route shell and SQLite project CRUD exist. | Durable project-store adapter, daemon API wiring, and CLI HTTP client. CLI must not write SQLite directly. | | `ao spawn` | Not yet | Session Manager exists; runtime/workspace/tracker pieces partly exist. | Agent adapters, registry/config wiring, project lookup, tracker hydration, HTTP route. | | `ao session list/show` | Not yet | Store and Session Manager read model exist. | HTTP routes and response DTOs. | | `ao session attach` | Not yet | tmux/zellij have attach command helpers. | Runtime attach port/API and terminal-launch policy. | @@ -383,8 +386,8 @@ These are the main gaps before the full initial command set is real. 1. Build CLI foundation around the daemon only: `daemon`, `start`, `stop`, `status`, `doctor`, `completion`, `version`. -2. Add `/api/v1/projects` over a small project service, then implement - `project list/add/show/remove`. +2. Wire the existing project manager/controller shell into the daemon with a + durable SQLite-backed store, then implement `project list/add/show/remove`. 3. Wire production Session Manager dependencies: project-backed repo resolver, tmux/zellij runtime registry, first agent adapter, and AgentMessenger. 4. Add `/api/v1/sessions` and implement `spawn`, `session list/show/kill/restore`, From 3680ac5474196faf080d8c20f3421e2968c3462f Mon Sep 17 00:00:00 2001 From: itrytoohard Date: Mon, 1 Jun 2026 02:33:56 +0530 Subject: [PATCH 08/12] test(cli): add end-to-end smoke test + Docker/CI harness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a fresh-machine, install→use→verify E2E test for the `ao` CLI and wires it into CI. The suite drives the real binary (start/status/doctor/stop + the daemon-control HTTP surface) against fully isolated state — its own temp run-file, data dir, and an auto-picked free loopback port — so it never collides with a developer's real AO install or daemon. - test/cli/smoke.sh: 40 assertions covering install resolution, version/help (daemon hidden), doctor text+json (and that it does NOT migrate SQLite), status stopped/stale/ready, start fresh+idempotent, daemon-created store, /healthz identity, the /shutdown CSRF + DNS-rebinding guard (403 + survives), graceful/stale/idempotent stop, run-file ownership cleanup, exit codes (2 usage / 1 runtime), and completion for all four shells. It deliberately ignores an inherited AO_PORT and self-allocates a free port for isolation. - test/cli/Dockerfile: models installing ao on a fresh machine — builds the binary, drops it on PATH in a clean Debian image with only runtime deps (git/tmux/curl), runs the suite as a non-root user. - test/cli/run-local.sh: build-from-source + native run convenience wrapper. - .github/workflows/cli-e2e.yml: two tiers — `native` runs the suite on a ubuntu+macos runner matrix (the real VMs, to cover the unix setsid detach and macOS os.UserConfigDir paths a Linux container can't), and `container` runs the fresh-machine Docker image with --init (real PID-1 reaper so the stale-daemon assertion is reliable). Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/cli-e2e.yml | 55 +++++++ test/cli/Dockerfile | 46 ++++++ test/cli/README.md | 73 +++++++++ test/cli/run-local.sh | 24 +++ test/cli/smoke.sh | 299 ++++++++++++++++++++++++++++++++++ 5 files changed, 497 insertions(+) create mode 100644 .github/workflows/cli-e2e.yml create mode 100644 test/cli/Dockerfile create mode 100644 test/cli/README.md create mode 100755 test/cli/run-local.sh create mode 100755 test/cli/smoke.sh diff --git a/.github/workflows/cli-e2e.yml b/.github/workflows/cli-e2e.yml new file mode 100644 index 00000000..e9daf56c --- /dev/null +++ b/.github/workflows/cli-e2e.yml @@ -0,0 +1,55 @@ +name: CLI E2E + +on: + push: + branches: [main] + pull_request: + paths: + - "backend/**" + - "test/cli/**" + - ".github/workflows/cli-e2e.yml" + +permissions: + contents: read + +jobs: + # Primary tier: run the REAL `ao` binary on GitHub's native VM runners. These + # runners are the "VMs" — the only place that exercises the OS-specific code + # paths (unix Setsid process-group detach + macOS os.UserConfigDir resolution). + # Bash is available on both ubuntu and macos runners, so the one smoke.sh runs + # unchanged. State is isolated per run (own temp dir + a free loopback port). + native: + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-go@v5 + with: + go-version: "1.25" + cache: false + + - name: Build ao + run: cd backend && CGO_ENABLED=0 go build -trimpath -o "$RUNNER_TEMP/ao" ./cmd/ao + + - name: CLI smoke test + run: AO_BIN="$RUNNER_TEMP/ao" bash test/cli/smoke.sh + + # Secondary hardening tier: model "install ao on a fresh machine" in a clean, + # locked-down Linux container with no access to a developer's real state. + # --init gives the container a real PID-1 reaper (tini) so the stale-daemon + # assertion is reliable — without it, an orphaned daemon can linger as a + # zombie and skew the check. + container: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Build smoke image (fresh-machine install) + run: docker build -f test/cli/Dockerfile -t ao-cli-smoke . + + - name: Run CLI smoke test in container + run: docker run --rm --init ao-cli-smoke diff --git a/test/cli/Dockerfile b/test/cli/Dockerfile new file mode 100644 index 00000000..dfeb4c46 --- /dev/null +++ b/test/cli/Dockerfile @@ -0,0 +1,46 @@ +# End-to-end CLI smoke test, modelling "install ao on a fresh machine, then use it". +# +# Build context is the REPO ROOT: +# docker build -f test/cli/Dockerfile -t ao-cli-smoke . +# docker run --rm --init ao-cli-smoke +# +# --init gives the container a real PID-1 reaper (tini) so a stopped daemon is +# reaped promptly; the test is written to not depend on it, but it keeps process +# accounting clean. + +# ---- stage 1: build the binary (the "release" a user would download) ---- +FROM golang:1.25-bookworm AS build +WORKDIR /src + +# Cache modules first. +COPY backend/go.mod backend/go.sum ./backend/ +RUN cd backend && go mod download + +COPY backend ./backend +# Pure-Go SQLite (modernc) builds fine with CGO disabled -> a static binary. +RUN cd backend && CGO_ENABLED=0 go build -trimpath -o /out/ao ./cmd/ao + +# ---- stage 2: a clean machine with NO Go toolchain, just like an end user ---- +FROM debian:bookworm-slim AS run + +# Runtime deps a fresh user would need: git is required by `ao doctor`; tmux is +# the optional runtime it probes for; curl drives the HTTP-level guard checks; +# ca-certificates for good measure. +RUN apt-get update \ + && apt-get install -y --no-install-recommends git tmux curl ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# "Install" the CLI the way a user would: drop the binary on PATH. +COPY --from=build /out/ao /usr/local/bin/ao +COPY test/cli/smoke.sh /usr/local/bin/ao-smoke.sh +RUN chmod +x /usr/local/bin/ao /usr/local/bin/ao-smoke.sh + +# Run as an unprivileged user with a real HOME, like a normal install. +RUN useradd --create-home --shell /bin/bash ao +USER ao +WORKDIR /home/ao + +# Sanity: prove the install resolved before the suite runs. +RUN ao version + +ENTRYPOINT ["/usr/local/bin/ao-smoke.sh"] diff --git a/test/cli/README.md b/test/cli/README.md new file mode 100644 index 00000000..51ce67fe --- /dev/null +++ b/test/cli/README.md @@ -0,0 +1,73 @@ +# `ao` CLI end-to-end tests + +These tests install and drive the **real `ao` binary** the way a user would — +`start` → `status` → `doctor` → `stop`, plus the daemon-control HTTP surface — +and assert the whole thing works end to end. They run against **isolated, +throwaway state** (their own temp run-file + data dir + a free loopback port), +so they never touch a developer's real AO installation. + +## Files + +| File | Purpose | +|------|---------| +| `smoke.sh` | The suite. Host-agnostic bash; drives the binary at `$AO_BIN` (default `ao` on PATH) and prints a PASS/FAIL line per assertion. | +| `Dockerfile` | Models **installing `ao` on a fresh machine**: builds the binary, drops it on `PATH` in a clean Debian image with only runtime deps (`git`, `tmux`, `curl`), then runs `smoke.sh` as a non-root user. | +| `run-local.sh` | Convenience wrapper: build from source and run `smoke.sh` natively against a temp binary. | + +## Run it + +**Native (fastest, uses your toolchain):** +```bash +test/cli/run-local.sh +# or, against a binary you already built: +AO_BIN=/path/to/ao test/cli/smoke.sh +``` + +**Fresh-machine install, in a clean container:** +```bash +docker build -f test/cli/Dockerfile -t ao-cli-smoke . +docker run --rm --init ao-cli-smoke +``` +> `--init` is important: it gives the container a real PID-1 reaper so the +> "stale daemon" assertion is reliable. Without it an orphaned daemon can linger +> as a zombie and skew the check. + +## What it covers + +Install resolves on PATH · `version`/`--version` · `--help` (and hides the +internal `daemon` command) · `doctor` text + `--json` (and that it **does not** +open/migrate SQLite) · `status` stopped/stale/ready · `start` (fresh + +idempotent) · daemon-created store · `/healthz` identity · the `/shutdown` +CSRF/DNS-rebinding guard (403 + daemon survives) · `stop` (graceful + stale + +idempotent) · run-file cleanup/ownership · exit codes (`2` usage, `1` runtime) · +completion for all four shells. + +## Testing strategy — why it's shaped this way + +We deliberately don't make Docker the *only* tier. A daemon that detaches with +`setsid` and outlives the launching process is exactly the workload that +container PID-1 semantics mishandle, and the OS-specific bits (`setsid` vs +Windows `CREATE_NEW_PROCESS_GROUP`, and `os.UserConfigDir()` resolving to +`~/Library/Application Support` on macOS, `%AppData%` on Windows, `~/.config` +on Linux) can't be observed from a Linux container at all. + +So CI (`.github/workflows/cli-e2e.yml`) runs two tiers: + +1. **`native`** — the primary signal. Builds and runs the real binary on a + GitHub matrix of `ubuntu-latest` + `macos-latest` (those runners *are* the + VMs), covering the unix detach path and macOS config-dir resolution. +2. **`container`** — a hardening tier. The `Dockerfile` proves a clean-machine + install works and that the CLI has no hidden dependence on developer state, + run with `--init`. + +### Extending + +- Add an assertion: drop a `step`/`assert_*` pair into the relevant section of + `smoke.sh`. The helpers (`assert_eq`, `assert_contains`, `assert_not_contains`, + `run_rc`) keep cases one-liners. +- Cover Windows: add a `windows-latest` leg to the `native` matrix (Git Bash + ships on the runner) once the suite is confirmed green there, or add Go-based + `os/exec` E2E tests for the Windows process-group path. +- Deeper per-OS path assertions (that state resolves under the OS-native config + dir when `AO_RUN_FILE`/`AO_DATA_DIR` are unset) are best added as Go unit + tests in `internal/config`. diff --git a/test/cli/run-local.sh b/test/cli/run-local.sh new file mode 100755 index 00000000..4cd786a6 --- /dev/null +++ b/test/cli/run-local.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# +# Convenience wrapper: build `ao` from source and run the CLI smoke test against +# it natively, using an isolated temp state dir and a free port. Touches nothing +# in your real AO installation. +# +# test/cli/run-local.sh +# +# To run the same suite the way a brand-new user would install it (clean Linux +# container, binary on PATH), use Docker instead: +# +# docker build -f test/cli/Dockerfile -t ao-cli-smoke . && docker run --rm --init ao-cli-smoke + +set -euo pipefail + +repo_root="$(cd "$(dirname "$0")/../.." && pwd)" +bindir="$(mktemp -d)" +trap 'rm -rf "$bindir"' EXIT + +echo "building ao ..." +( cd "$repo_root/backend" && CGO_ENABLED=0 go build -trimpath -o "$bindir/ao" ./cmd/ao ) + +echo "running smoke test ..." +AO_BIN="$bindir/ao" bash "$repo_root/test/cli/smoke.sh" diff --git a/test/cli/smoke.sh b/test/cli/smoke.sh new file mode 100755 index 00000000..46bd1a4b --- /dev/null +++ b/test/cli/smoke.sh @@ -0,0 +1,299 @@ +#!/usr/bin/env bash +# +# End-to-end smoke test for the `ao` CLI. +# +# It models a *fresh machine*: `ao` is expected to already be installed on PATH +# (the Dockerfile in this directory installs it, simulating a new user), and the +# whole test runs against isolated, throwaway state — its own temp run-file, +# data dir, and a free loopback port — so it never touches a developer's real +# AO installation. +# +# Run locally against a binary you built: +# AO_BIN=/path/to/ao test/cli/smoke.sh +# Or in the container (models install-on-a-fresh-machine): +# docker build -f test/cli/Dockerfile -t ao-cli-smoke . && docker run --rm --init ao-cli-smoke +# +# Exit code: 0 if every assertion passes, 1 otherwise. + +set -uo pipefail + +AO_BIN="${AO_BIN:-ao}" + +# --------------------------------------------------------------------------- +# Tiny assertion framework +# --------------------------------------------------------------------------- +PASS=0 +FAIL=0 +CURRENT="" + +section() { printf '\n\033[1m== %s ==\033[0m\n' "$1"; } +step() { CURRENT="$1"; printf ' • %s ... ' "$1"; } + +ok() { PASS=$((PASS + 1)); printf '\033[32mPASS\033[0m\n'; } +bad() { FAIL=$((FAIL + 1)); printf '\033[31mFAIL\033[0m\n %s\n' "$1"; } + +# assert_eq [msg] +assert_eq() { + if [ "$1" = "$2" ]; then ok; else bad "${3:-}: expected [$2], got [$1]"; fi +} +# assert_contains +assert_contains() { + case "$1" in + *"$2"*) ok ;; + *) bad "expected output to contain [$2]; got: $(printf '%s' "$1" | head -c 400)" ;; + esac +} +# assert_not_contains +assert_not_contains() { + case "$1" in + *"$2"*) bad "expected output to NOT contain [$2]" ;; + *) ok ;; + esac +} + +# run_rc -> sets RC and OUT (stdout+stderr combined) +run_rc() { + OUT="$("$@" 2>&1)" + RC=$? +} + +# --------------------------------------------------------------------------- +# Isolated, throwaway environment +# --------------------------------------------------------------------------- +TMP="$(mktemp -d)" +export AO_RUN_FILE="$TMP/running.json" +export AO_DATA_DIR="$TMP/data" + +# Pick a free loopback port (bash /dev/tcp probe; connect-refused == free). +find_free_port() { + local p + for p in $(seq 3071 3170); do + if ! (exec 3<>"/dev/tcp/127.0.0.1/$p") 2>/dev/null; then + echo "$p"; return 0 + fi + exec 3>&- 2>/dev/null || true + done + echo 3071 +} +# Always run against an isolated, free port. We deliberately do NOT honour an +# inherited AO_PORT — it might point at a real daemon, which is exactly the +# collision this isolation is meant to prevent. Override only via AO_SMOKE_PORT. +export AO_PORT="${AO_SMOKE_PORT:-$(find_free_port)}" + +cleanup() { + "$AO_BIN" stop >/dev/null 2>&1 || true + rm -rf "$TMP" +} +trap cleanup EXIT + +printf 'ao smoke test\n binary : %s\n port : %s\n state : %s\n' \ + "$(command -v "$AO_BIN" || echo "$AO_BIN")" "$AO_PORT" "$TMP" + +# --------------------------------------------------------------------------- +# 1. Install verification — `ao` is a real, runnable binary on this machine +# --------------------------------------------------------------------------- +section "install" + +step "ao resolves on PATH / at AO_BIN" +if command -v "$AO_BIN" >/dev/null 2>&1; then ok; else bad "ao not found"; fi + +step "ao version prints build metadata" +run_rc "$AO_BIN" version +if [ "$RC" -eq 0 ] && [ -n "$OUT" ]; then ok; else bad "rc=$RC out=$OUT"; fi + +step "ao --version works" +run_rc "$AO_BIN" --version +assert_eq "$RC" "0" "--version rc" + +step "ao --help lists product commands" +run_rc "$AO_BIN" --help +assert_contains "$OUT" "start" +step "ao --help lists status/stop/doctor" +assert_contains "$OUT" "doctor" +step "ao --help hides internal daemon command" +assert_not_contains "$OUT" $'\n daemon' + +# --------------------------------------------------------------------------- +# 2. doctor on a fresh machine (no daemon yet) +# --------------------------------------------------------------------------- +section "doctor (fresh)" + +step "doctor exits 0 when required tools present" +run_rc "$AO_BIN" doctor +assert_eq "$RC" "0" "doctor rc (git/tmux must be installed in the image)" + +step "doctor reports git found" +assert_contains "$OUT" "git" + +step "doctor does NOT migrate the store (sqlite WARN, db absent)" +assert_contains "$OUT" "database not created yet" + +step "doctor data dir was created but ao.db was NOT (CLI is not the store writer)" +if [ ! -f "$AO_DATA_DIR/ao.db" ]; then ok; else bad "ao.db exists — doctor must not create/migrate the DB"; fi + +step "doctor --json is valid JSON with ok=true" +run_rc "$AO_BIN" doctor --json +assert_contains "$OUT" '"ok": true' + +# --------------------------------------------------------------------------- +# 3. status when stopped +# --------------------------------------------------------------------------- +section "status (stopped)" + +step "status --json reports stopped" +run_rc "$AO_BIN" status --json +assert_contains "$OUT" '"state": "stopped"' +step "status exits 0 even when stopped (status never errors)" +assert_eq "$RC" "0" "status exit code" +step "stopped status omits startedAt" +assert_not_contains "$OUT" "startedAt" + +step "stop is idempotent when already stopped" +run_rc "$AO_BIN" stop +if [ "$RC" -eq 0 ]; then assert_contains "$OUT" "stopped"; else bad "stop-when-stopped rc=$RC"; fi + +# --------------------------------------------------------------------------- +# 4. start → ready, and status reflects it +# --------------------------------------------------------------------------- +section "start" + +step "start brings the daemon up and reports ready" +run_rc "$AO_BIN" start +if [ "$RC" -eq 0 ]; then assert_contains "$OUT" "ready"; else bad "start rc=$RC out=$OUT"; fi + +step "status --json reports ready with pid+port" +run_rc "$AO_BIN" status --json +assert_contains "$OUT" '"state": "ready"' +step "status carries the bound port" +assert_contains "$OUT" "\"port\": $AO_PORT" + +step "start is idempotent (second start returns ready, no error)" +run_rc "$AO_BIN" start +if [ "$RC" -eq 0 ]; then assert_contains "$OUT" "ready"; else bad "idempotent start rc=$RC"; fi + +# Capture the live pid for later assertions. +PID="$("$AO_BIN" status --json | sed -n 's/.*"pid": \([0-9]*\).*/\1/p' | head -1)" + +# --------------------------------------------------------------------------- +# 5. doctor while running — now the daemon (not the CLI) has created the store +# --------------------------------------------------------------------------- +section "doctor (running)" + +step "daemon created and migrated the store" +if [ -f "$AO_DATA_DIR/ao.db" ]; then ok; else bad "daemon should have created ao.db"; fi + +step "doctor now reports sqlite present + daemon-migrated" +run_rc "$AO_BIN" doctor +assert_contains "$OUT" "migrations are applied by the daemon" + +# --------------------------------------------------------------------------- +# 6. Health endpoint identity (loopback) +# --------------------------------------------------------------------------- +section "health endpoint" + +if command -v curl >/dev/null 2>&1; then + step "/healthz reports the AO daemon service + pid" + run_rc curl -fsS "http://127.0.0.1:$AO_PORT/healthz" + assert_contains "$OUT" "agent-orchestrator-daemon" + + # ------------------------------------------------------------------------- + # 7. /shutdown CSRF / DNS-rebinding guard (review fix M3) + # ------------------------------------------------------------------------- + section "/shutdown guard" + + step "cross-origin POST /shutdown is rejected (403)" + CODE="$(curl -s -o /dev/null -w '%{http_code}' -X POST \ + -H 'Origin: https://evil.example' "http://127.0.0.1:$AO_PORT/shutdown")" + assert_eq "$CODE" "403" "cross-origin shutdown" + + step "non-loopback Host POST /shutdown is rejected (403)" + CODE="$(curl -s -o /dev/null -w '%{http_code}' -X POST \ + -H 'Host: evil.example' "http://127.0.0.1:$AO_PORT/shutdown")" + assert_eq "$CODE" "403" "rebinding-host shutdown" + + step "daemon survived the rejected shutdown attempts" + run_rc "$AO_BIN" status --json + assert_contains "$OUT" '"state": "ready"' +else + section "/shutdown guard" + step "curl unavailable — skipping HTTP-level guard checks" + printf '\033[33mSKIP\033[0m\n' +fi + +# --------------------------------------------------------------------------- +# 8. stop → stopped, run-file cleaned up +# --------------------------------------------------------------------------- +section "stop" + +step "stop gracefully stops the daemon" +run_rc "$AO_BIN" stop +if [ "$RC" -eq 0 ]; then assert_contains "$OUT" "stopped"; else bad "stop rc=$RC out=$OUT"; fi + +step "run-file removed after stop" +if [ ! -f "$AO_RUN_FILE" ]; then ok; else bad "running.json still present"; fi + +step "status --json reports stopped after stop" +run_rc "$AO_BIN" status --json +assert_contains "$OUT" '"state": "stopped"' + +# --------------------------------------------------------------------------- +# 9. stale run-file (dead PID) — deterministic, no real process needed +# --------------------------------------------------------------------------- +section "stale run-file" + +# PID 2147483647 is never alive; the CLI must classify this as stale, not kill it. +printf '{"pid":2147483647,"port":%s,"startedAt":"2020-01-01T00:00:00Z"}\n' "$AO_PORT" > "$AO_RUN_FILE" + +step "status reports stale for a dead-PID run-file" +run_rc "$AO_BIN" status --json +assert_contains "$OUT" '"state": "stale"' +step "status still exits 0 for a stale daemon (reports, never errors)" +assert_eq "$RC" "0" "stale status exit code" + +step "stop clears a stale run-file and reports stopped" +run_rc "$AO_BIN" stop +assert_contains "$OUT" "stopped" +step "stale run-file removed" +if [ ! -f "$AO_RUN_FILE" ]; then ok; else bad "stale running.json not removed"; fi + +# --------------------------------------------------------------------------- +# 10. exit codes: 2 for usage errors, 1 for runtime errors +# --------------------------------------------------------------------------- +section "exit codes" + +step "unknown flag exits 2 (usage error)" +run_rc "$AO_BIN" status --definitely-not-a-flag +assert_eq "$RC" "2" "bad-flag exit code" + +step "missing required arg exits 2 (completion needs a shell)" +run_rc "$AO_BIN" completion +assert_eq "$RC" "2" "missing-arg exit code" + +step "unsupported shell exits non-zero (runtime error)" +run_rc "$AO_BIN" completion notashell +if [ "$RC" -ne 0 ]; then ok; else bad "expected non-zero for bad shell"; fi + +step "invalid config (AO_PORT out of range) exits 1, not 2" +OUT="$(AO_PORT=99999 "$AO_BIN" status 2>&1)"; RC=$? +assert_eq "$RC" "1" "config-error exit code (runtime, not usage)" + +# --------------------------------------------------------------------------- +# 11. shell completion generators +# --------------------------------------------------------------------------- +section "completion" + +for sh in bash zsh fish powershell; do + step "completion $sh generates a script" + run_rc "$AO_BIN" completion "$sh" + if [ "$RC" -eq 0 ] && [ -n "$OUT" ]; then ok; else bad "completion $sh rc=$RC"; fi +done + +# --------------------------------------------------------------------------- +# Result +# --------------------------------------------------------------------------- +printf '\n\033[1m== result ==\033[0m\n passed: %s\n failed: %s\n' "$PASS" "$FAIL" +if [ "$FAIL" -ne 0 ]; then + printf '\033[31mSMOKE TEST FAILED\033[0m\n' + exit 1 +fi +printf '\033[32mSMOKE TEST PASSED\033[0m\n' From a9e83011f188ccbe106bb56a8c35356832bc792b Mon Sep 17 00:00:00 2001 From: itrytoohard Date: Mon, 1 Jun 2026 02:37:51 +0530 Subject: [PATCH 09/12] docs(test): correct --init rationale (suite uses a fabricated dead PID) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The stale-daemon assertion does not depend on container PID-1 reaping — it writes a fabricated dead PID rather than killing a real process. --init is still run so the real daemon spawned by the `start` test (detached via setsid) is reaped after `stop` instead of lingering as a zombie. Reword the README, Dockerfile, and workflow comments to say that accurately. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/cli-e2e.yml | 7 ++++--- test/cli/Dockerfile | 7 ++++--- test/cli/README.md | 7 ++++--- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/.github/workflows/cli-e2e.yml b/.github/workflows/cli-e2e.yml index e9daf56c..35ef26cd 100644 --- a/.github/workflows/cli-e2e.yml +++ b/.github/workflows/cli-e2e.yml @@ -40,9 +40,10 @@ jobs: # Secondary hardening tier: model "install ao on a fresh machine" in a clean, # locked-down Linux container with no access to a developer's real state. - # --init gives the container a real PID-1 reaper (tini) so the stale-daemon - # assertion is reliable — without it, an orphaned daemon can linger as a - # zombie and skew the check. + # --init gives the container a real PID-1 reaper (tini) so the live daemon the + # `start` test spawns (it detaches via setsid) is reaped after `stop` rather + # than lingering as a zombie. The suite doesn't depend on it (the stale case + # uses a fabricated dead PID), but it keeps process accounting clean. container: runs-on: ubuntu-latest steps: diff --git a/test/cli/Dockerfile b/test/cli/Dockerfile index dfeb4c46..ce429a9c 100644 --- a/test/cli/Dockerfile +++ b/test/cli/Dockerfile @@ -4,9 +4,10 @@ # docker build -f test/cli/Dockerfile -t ao-cli-smoke . # docker run --rm --init ao-cli-smoke # -# --init gives the container a real PID-1 reaper (tini) so a stopped daemon is -# reaped promptly; the test is written to not depend on it, but it keeps process -# accounting clean. +# Run with --init so the real daemon spawned during the `start` test (it detaches +# via setsid) is reaped promptly after `stop` instead of lingering as a zombie. +# The suite does NOT depend on it — the stale-daemon case uses a fabricated dead +# PID — but --init keeps process accounting clean. # ---- stage 1: build the binary (the "release" a user would download) ---- FROM golang:1.25-bookworm AS build diff --git a/test/cli/README.md b/test/cli/README.md index 51ce67fe..5eb1ecb5 100644 --- a/test/cli/README.md +++ b/test/cli/README.md @@ -28,9 +28,10 @@ AO_BIN=/path/to/ao test/cli/smoke.sh docker build -f test/cli/Dockerfile -t ao-cli-smoke . docker run --rm --init ao-cli-smoke ``` -> `--init` is important: it gives the container a real PID-1 reaper so the -> "stale daemon" assertion is reliable. Without it an orphaned daemon can linger -> as a zombie and skew the check. +> `--init` gives the container a real PID-1 reaper (tini) so the live daemon +> spawned during the `start` test is reaped after `stop` instead of lingering as +> a zombie. The suite itself doesn't depend on it — the stale-daemon case uses a +> fabricated dead PID — but it keeps process accounting clean. ## What it covers From e6661e3f3b57144989da374569700a1e9e8df192 Mon Sep 17 00:00:00 2001 From: itrytoohard Date: Mon, 1 Jun 2026 02:38:25 +0530 Subject: [PATCH 10/12] test(cli): pin ExitCode mapping (usage=2, runtime=1, nil=0) Closes the one nit from the regression audit: the exit-code wiring was correct and covered end-to-end by the smoke test, but not pinned by a unit test. Co-Authored-By: Claude Opus 4.8 (1M context) --- backend/internal/cli/exitcode_test.go | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 backend/internal/cli/exitcode_test.go diff --git a/backend/internal/cli/exitcode_test.go b/backend/internal/cli/exitcode_test.go new file mode 100644 index 00000000..bd8817c6 --- /dev/null +++ b/backend/internal/cli/exitcode_test.go @@ -0,0 +1,27 @@ +package cli + +import ( + "errors" + "fmt" + "testing" +) + +func TestExitCode(t *testing.T) { + cases := []struct { + name string + err error + want int + }{ + {"nil is success", nil, 0}, + {"runtime error is 1", errors.New("boom"), 1}, + {"usage error is 2", usageError{errors.New("bad flag")}, 2}, + {"wrapped usage error is still 2", fmt.Errorf("ctx: %w", usageError{errors.New("x")}), 2}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := ExitCode(tc.err); got != tc.want { + t.Errorf("ExitCode(%v) = %d, want %d", tc.err, got, tc.want) + } + }) + } +} From 4f13dd1b83871ce6c87e37e28405a6684c319291 Mon Sep 17 00:00:00 2001 From: itrytoohard Date: Mon, 1 Jun 2026 03:04:54 +0530 Subject: [PATCH 11/12] test(cli): add -v/--verbose mode that prints each command and full output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit run-local.sh -v (or AO_SMOKE_VERBOSE=1) makes smoke.sh echo every command and its complete output, indented, alongside the PASS/FAIL — for auditing exactly what the suite runs and what the CLI returns. Default output is unchanged. Co-Authored-By: Claude Opus 4.8 (1M context) --- test/cli/README.md | 5 +++-- test/cli/run-local.sh | 15 +++++++++++++-- test/cli/smoke.sh | 41 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 54 insertions(+), 7 deletions(-) diff --git a/test/cli/README.md b/test/cli/README.md index 5eb1ecb5..231c44e9 100644 --- a/test/cli/README.md +++ b/test/cli/README.md @@ -18,9 +18,10 @@ so they never touch a developer's real AO installation. **Native (fastest, uses your toolchain):** ```bash -test/cli/run-local.sh +test/cli/run-local.sh # PASS/FAIL per assertion +test/cli/run-local.sh -v # also print every command and its full output # or, against a binary you already built: -AO_BIN=/path/to/ao test/cli/smoke.sh +AO_BIN=/path/to/ao test/cli/smoke.sh [-v] ``` **Fresh-machine install, in a clean container:** diff --git a/test/cli/run-local.sh b/test/cli/run-local.sh index 4cd786a6..f2f560ac 100755 --- a/test/cli/run-local.sh +++ b/test/cli/run-local.sh @@ -4,7 +4,8 @@ # it natively, using an isolated temp state dir and a free port. Touches nothing # in your real AO installation. # -# test/cli/run-local.sh +# test/cli/run-local.sh # PASS/FAIL per assertion +# test/cli/run-local.sh -v # also print every command and its full output # # To run the same suite the way a brand-new user would install it (clean Linux # container, binary on PATH), use Docker instead: @@ -13,6 +14,16 @@ set -euo pipefail +# Pass through -v/--verbose (anything else is forwarded to smoke.sh too). +smoke_args=() +for arg in "$@"; do + case "$arg" in + -v|--verbose) smoke_args+=("--verbose") ;; + -h|--help) echo "usage: run-local.sh [-v|--verbose]"; exit 0 ;; + *) smoke_args+=("$arg") ;; + esac +done + repo_root="$(cd "$(dirname "$0")/../.." && pwd)" bindir="$(mktemp -d)" trap 'rm -rf "$bindir"' EXIT @@ -21,4 +32,4 @@ echo "building ao ..." ( cd "$repo_root/backend" && CGO_ENABLED=0 go build -trimpath -o "$bindir/ao" ./cmd/ao ) echo "running smoke test ..." -AO_BIN="$bindir/ao" bash "$repo_root/test/cli/smoke.sh" +AO_BIN="$bindir/ao" bash "$repo_root/test/cli/smoke.sh" "${smoke_args[@]+"${smoke_args[@]}"}" diff --git a/test/cli/smoke.sh b/test/cli/smoke.sh index 46bd1a4b..4682d53b 100755 --- a/test/cli/smoke.sh +++ b/test/cli/smoke.sh @@ -10,6 +10,8 @@ # # Run locally against a binary you built: # AO_BIN=/path/to/ao test/cli/smoke.sh +# Verbose — print each command and its full output: +# AO_BIN=/path/to/ao test/cli/smoke.sh -v (or AO_SMOKE_VERBOSE=1) # Or in the container (models install-on-a-fresh-machine): # docker build -f test/cli/Dockerfile -t ao-cli-smoke . && docker run --rm --init ao-cli-smoke # @@ -19,6 +21,16 @@ set -uo pipefail AO_BIN="${AO_BIN:-ao}" +# Verbose mode prints every command and its complete output, not just PASS/FAIL. +VERBOSE="${AO_SMOKE_VERBOSE:-0}" +for arg in "$@"; do + case "$arg" in + -v|--verbose) VERBOSE=1 ;; + -h|--help) echo "usage: [AO_BIN=...] smoke.sh [-v|--verbose]"; exit 0 ;; + *) echo "unknown argument: $arg" >&2; exit 2 ;; + esac +done + # --------------------------------------------------------------------------- # Tiny assertion framework # --------------------------------------------------------------------------- @@ -27,10 +39,29 @@ FAIL=0 CURRENT="" section() { printf '\n\033[1m== %s ==\033[0m\n' "$1"; } -step() { CURRENT="$1"; printf ' • %s ... ' "$1"; } -ok() { PASS=$((PASS + 1)); printf '\033[32mPASS\033[0m\n'; } -bad() { FAIL=$((FAIL + 1)); printf '\033[31mFAIL\033[0m\n %s\n' "$1"; } +step() { + CURRENT="$1" + if [ "$VERBOSE" = 1 ]; then printf ' • %s\n' "$1"; else printf ' • %s ... ' "$1"; fi +} + +ok() { + PASS=$((PASS + 1)) + if [ "$VERBOSE" = 1 ]; then printf ' \033[32m→ PASS\033[0m\n'; else printf '\033[32mPASS\033[0m\n'; fi +} +bad() { + FAIL=$((FAIL + 1)) + if [ "$VERBOSE" = 1 ]; then printf ' \033[31m→ FAIL\033[0m %s\n' "$1"; else printf '\033[31mFAIL\033[0m\n %s\n' "$1"; fi +} + +# vdump : in verbose mode, echo the command +# and its complete output, indented. A no-op otherwise. +vdump() { + [ "$VERBOSE" = 1 ] || return 0 + printf ' \033[2m$ %s\033[0m\n' "$1" + if [ -n "$2" ]; then printf '%s\n' "$2" | sed 's/^/ | /'; fi + printf ' \033[2m(exit %s)\033[0m\n' "$3" +} # assert_eq [msg] assert_eq() { @@ -55,6 +86,7 @@ assert_not_contains() { run_rc() { OUT="$("$@" 2>&1)" RC=$? + vdump "$*" "$OUT" "$RC" } # --------------------------------------------------------------------------- @@ -204,11 +236,13 @@ if command -v curl >/dev/null 2>&1; then step "cross-origin POST /shutdown is rejected (403)" CODE="$(curl -s -o /dev/null -w '%{http_code}' -X POST \ -H 'Origin: https://evil.example' "http://127.0.0.1:$AO_PORT/shutdown")" + vdump "curl -X POST -H 'Origin: https://evil.example' http://127.0.0.1:$AO_PORT/shutdown" "HTTP $CODE" "-" assert_eq "$CODE" "403" "cross-origin shutdown" step "non-loopback Host POST /shutdown is rejected (403)" CODE="$(curl -s -o /dev/null -w '%{http_code}' -X POST \ -H 'Host: evil.example' "http://127.0.0.1:$AO_PORT/shutdown")" + vdump "curl -X POST -H 'Host: evil.example' http://127.0.0.1:$AO_PORT/shutdown" "HTTP $CODE" "-" assert_eq "$CODE" "403" "rebinding-host shutdown" step "daemon survived the rejected shutdown attempts" @@ -275,6 +309,7 @@ if [ "$RC" -ne 0 ]; then ok; else bad "expected non-zero for bad shell"; fi step "invalid config (AO_PORT out of range) exits 1, not 2" OUT="$(AO_PORT=99999 "$AO_BIN" status 2>&1)"; RC=$? +vdump "AO_PORT=99999 $AO_BIN status" "$OUT" "$RC" assert_eq "$RC" "1" "config-error exit code (runtime, not usage)" # --------------------------------------------------------------------------- From c0bf99eb2281cf4e3385dde087418df6ac1a4c1c Mon Sep 17 00:00:00 2001 From: itrytoohard Date: Mon, 1 Jun 2026 03:14:08 +0530 Subject: [PATCH 12/12] test(cli): port the E2E suite to cross-platform Go; slim the Docker harness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the growing bash smoke test with a Go os/exec suite behind the `e2e` build tag (backend/internal/cli/e2e_test.go). It builds the real binary and drives start/status/doctor/stop + the daemon-control HTTP surface against isolated state (temp dir + OS-assigned free port), and now runs natively on ubuntu + macOS + WINDOWS in CI — finally covering the Windows CREATE_NEW_PROCESS_GROUP detach path and per-OS os.UserConfigDir resolution that a Linux container can't observe. `go test -tags e2e -v` logs every command and its output, replacing the bash -v flag. - backend/internal/cli/e2e_test.go: 8 table-style TestE2E_* cases; strips any inherited AO_* env so a real daemon's AO_PORT can't leak in. - test/cli/install-check.sh: small, linear fresh-install proof the Dockerfile runs (binary on PATH, no toolchain) — kept as the hardening tier. - test/cli/Dockerfile: run install-check.sh instead of the full bash suite. - .github/workflows/cli-e2e.yml: `native` is now a go test matrix over ubuntu+macos+windows; `container` builds the image and runs it with --init. - Removes test/cli/smoke.sh and test/cli/run-local.sh (superseded by `go test`). Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/cli-e2e.yml | 37 ++-- backend/internal/cli/e2e_test.go | 346 +++++++++++++++++++++++++++++++ test/cli/Dockerfile | 8 +- test/cli/README.md | 98 ++++----- test/cli/install-check.sh | 36 ++++ test/cli/run-local.sh | 35 ---- test/cli/smoke.sh | 334 ----------------------------- 7 files changed, 448 insertions(+), 446 deletions(-) create mode 100644 backend/internal/cli/e2e_test.go create mode 100755 test/cli/install-check.sh delete mode 100755 test/cli/run-local.sh delete mode 100755 test/cli/smoke.sh diff --git a/.github/workflows/cli-e2e.yml b/.github/workflows/cli-e2e.yml index 35ef26cd..30738432 100644 --- a/.github/workflows/cli-e2e.yml +++ b/.github/workflows/cli-e2e.yml @@ -13,17 +13,20 @@ permissions: contents: read jobs: - # Primary tier: run the REAL `ao` binary on GitHub's native VM runners. These - # runners are the "VMs" — the only place that exercises the OS-specific code - # paths (unix Setsid process-group detach + macOS os.UserConfigDir resolution). - # Bash is available on both ubuntu and macos runners, so the one smoke.sh runs - # unchanged. State is isolated per run (own temp dir + a free loopback port). + # Primary tier: the cross-platform Go E2E suite (build tag `e2e`) runs the real + # `ao` binary against isolated state on every OS GitHub hosts. These runners + # are the "VMs" — the only place that exercises the OS-specific process-detach + # paths (unix Setsid vs Windows CREATE_NEW_PROCESS_GROUP) and os.UserConfigDir + # resolution. The suite builds its own binary and self-allocates a free port. native: strategy: fail-fast: false matrix: - os: [ubuntu-latest, macos-latest] + os: [ubuntu-latest, macos-latest, windows-latest] runs-on: ${{ matrix.os }} + defaults: + run: + working-directory: backend steps: - uses: actions/checkout@v4 @@ -32,25 +35,21 @@ jobs: go-version: "1.25" cache: false - - name: Build ao - run: cd backend && CGO_ENABLED=0 go build -trimpath -o "$RUNNER_TEMP/ao" ./cmd/ao + - name: CLI E2E (native) + run: go test -tags e2e -v ./internal/cli/... - - name: CLI smoke test - run: AO_BIN="$RUNNER_TEMP/ao" bash test/cli/smoke.sh - - # Secondary hardening tier: model "install ao on a fresh machine" in a clean, - # locked-down Linux container with no access to a developer's real state. - # --init gives the container a real PID-1 reaper (tini) so the live daemon the - # `start` test spawns (it detaches via setsid) is reaped after `stop` rather - # than lingering as a zombie. The suite doesn't depend on it (the stale case - # uses a fabricated dead PID), but it keeps process accounting clean. + # Secondary hardening tier: prove that a freshly installed binary works on a + # clean machine with no Go toolchain and no developer state. The Dockerfile + # installs `ao` on PATH in a slim image and runs test/cli/install-check.sh. + # --init gives a real PID-1 reaper so the daemon the check starts is reaped + # after `stop` instead of lingering as a zombie. container: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - name: Build smoke image (fresh-machine install) + - name: Build fresh-install image run: docker build -f test/cli/Dockerfile -t ao-cli-smoke . - - name: Run CLI smoke test in container + - name: Fresh-install check (container) run: docker run --rm --init ao-cli-smoke diff --git a/backend/internal/cli/e2e_test.go b/backend/internal/cli/e2e_test.go new file mode 100644 index 00000000..f89e4671 --- /dev/null +++ b/backend/internal/cli/e2e_test.go @@ -0,0 +1,346 @@ +//go:build e2e + +// Package cli_test holds the end-to-end suite for the `ao` CLI. It builds the +// real binary and drives it (start/status/doctor/stop + the daemon-control HTTP +// surface) against fully isolated state — a per-test temp run-file, data dir, +// and an OS-assigned free loopback port — so it never touches a developer's real +// AO install. Unlike the Linux-only container smoke test, this runs natively on +// every OS in CI (ubuntu/macos/windows), which is the only way to exercise the +// unix setsid vs Windows CREATE_NEW_PROCESS_GROUP detach paths and the per-OS +// os.UserConfigDir resolution. +// +// It is gated behind the `e2e` build tag so it never runs in the normal +// `go test ./...` lane (it spawns processes and binds ports): +// +// go test -tags e2e ./internal/cli/... # run it +// go test -tags e2e -v -run TestE2E ./internal/cli/... # verbose, see every command +package cli_test + +import ( + "fmt" + "net" + "net/http" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" + "testing" + "time" +) + +// aoBin is the path to the binary built once for the whole suite. +var aoBin string + +func TestMain(m *testing.M) { + dir, err := os.MkdirTemp("", "ao-e2e-bin") + if err != nil { + fmt.Fprintln(os.Stderr, "e2e: mktemp:", err) + os.Exit(1) + } + aoBin = filepath.Join(dir, "ao") + if runtime.GOOS == "windows" { + aoBin += ".exe" + } + build := exec.Command("go", "build", "-o", aoBin, "github.com/aoagents/agent-orchestrator/backend/cmd/ao") + build.Stdout, build.Stderr = os.Stderr, os.Stderr + if err := build.Run(); err != nil { + fmt.Fprintln(os.Stderr, "e2e: build ao:", err) + os.Exit(1) + } + code := m.Run() + _ = os.RemoveAll(dir) + os.Exit(code) +} + +// env is an isolated CLI environment: its own state files and free port. +type env struct { + runFile string + dataDir string + port int +} + +func newEnv(t *testing.T) env { + t.Helper() + dir := t.TempDir() + return env{ + runFile: filepath.Join(dir, "running.json"), + dataDir: filepath.Join(dir, "data"), + port: freePort(t), + } +} + +// environ builds the child env: the ambient environment with every inherited +// AO_* var stripped (so a real daemon's AO_PORT can't leak in) plus our isolated +// settings. portOverride, when non-empty, replaces the numeric AO_PORT — used to +// inject an invalid value. +func (e env) environ(portOverride string) []string { + out := make([]string, 0, len(os.Environ())+3) + for _, kv := range os.Environ() { + if strings.HasPrefix(kv, "AO_") { + continue + } + out = append(out, kv) + } + port := fmt.Sprintf("%d", e.port) + if portOverride != "" { + port = portOverride + } + return append(out, "AO_RUN_FILE="+e.runFile, "AO_DATA_DIR="+e.dataDir, "AO_PORT="+port) +} + +func freePort(t *testing.T) int { + t.Helper() + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("alloc free port: %v", err) + } + defer l.Close() + return l.Addr().(*net.TCPAddr).Port +} + +// run executes `ao args...` in env e and returns combined output + exit code. +func (e env) run(t *testing.T, args ...string) (string, int) { + t.Helper() + return e.runEnv(t, e.environ(""), args...) +} + +func (e env) runEnv(t *testing.T, environ []string, args ...string) (string, int) { + t.Helper() + cmd := exec.Command(aoBin, args...) + cmd.Env = environ + b, err := cmd.CombinedOutput() + out := string(b) + code := 0 + if err != nil { + var ee *exec.ExitError + if asExit(err, &ee) { + code = ee.ExitCode() + } else { + t.Fatalf("run %v: %v\n%s", args, err, out) + } + } + t.Logf("$ ao %s\n%s(exit %d)", strings.Join(args, " "), out, code) + return out, code +} + +func asExit(err error, target **exec.ExitError) bool { + if ee, ok := err.(*exec.ExitError); ok { + *target = ee + return true + } + return false +} + +// startDaemon brings the daemon up and registers a stop on cleanup. +func (e env) startDaemon(t *testing.T) { + t.Helper() + out, code := e.run(t, "start") + if code != 0 { + t.Fatalf("start failed (exit %d): %s", code, out) + } + t.Cleanup(func() { e.run(t, "stop") }) +} + +func mustContain(t *testing.T, out, want string) { + t.Helper() + if !strings.Contains(out, want) { + t.Fatalf("expected output to contain %q; got:\n%s", want, out) + } +} + +func mustNotContain(t *testing.T, out, notWant string) { + t.Helper() + if strings.Contains(out, notWant) { + t.Fatalf("expected output NOT to contain %q; got:\n%s", notWant, out) + } +} + +// --------------------------------------------------------------------------- + +func TestE2E_VersionAndHelp(t *testing.T) { + e := newEnv(t) + + if out, code := e.run(t, "version"); code != 0 || strings.TrimSpace(out) == "" { + t.Fatalf("version: exit %d, out %q", code, out) + } + if _, code := e.run(t, "--version"); code != 0 { + t.Fatalf("--version exit %d", code) + } + + out, code := e.run(t, "--help") + if code != 0 { + t.Fatalf("--help exit %d", code) + } + for _, want := range []string{"start", "stop", "status", "doctor", "completion", "version"} { + mustContain(t, out, want) + } + // the internal daemon command is hidden from help (rendered as "\n daemon") + mustNotContain(t, out, "\n daemon") +} + +func TestE2E_DoctorDoesNotTouchTheStore(t *testing.T) { + e := newEnv(t) + + out, code := e.run(t, "doctor") + if code != 0 { + t.Fatalf("doctor (fresh) exit %d: %s", code, out) + } + mustContain(t, out, "git") + mustContain(t, out, "database not created yet") // sqlite WARN, never migrated + + // doctor must NOT create/migrate the DB — the daemon is the sole writer. + if _, err := os.Stat(filepath.Join(e.dataDir, "ao.db")); err == nil { + t.Fatal("doctor created ao.db; the CLI must not open/migrate the store") + } + + if out, code := e.run(t, "doctor", "--json"); code != 0 || !strings.Contains(out, `"ok": true`) { + t.Fatalf("doctor --json: exit %d, out %s", code, out) + } +} + +func TestE2E_StatusStopped(t *testing.T) { + e := newEnv(t) + out, code := e.run(t, "status", "--json") + if code != 0 { // status always exits 0 + t.Fatalf("status exit %d", code) + } + mustContain(t, out, `"state": "stopped"`) + mustNotContain(t, out, "startedAt") + + if out, code := e.run(t, "stop"); code != 0 || !strings.Contains(out, "stopped") { + t.Fatalf("stop-when-stopped: exit %d, out %s", code, out) // idempotent + } +} + +func TestE2E_Lifecycle(t *testing.T) { + e := newEnv(t) + e.startDaemon(t) + + out, _ := e.run(t, "status", "--json") + mustContain(t, out, `"state": "ready"`) + mustContain(t, out, fmt.Sprintf(`"port": %d`, e.port)) + + // idempotent + if out, code := e.run(t, "start"); code != 0 || !strings.Contains(out, "ready") { + t.Fatalf("idempotent start: exit %d, out %s", code, out) + } + + // now the daemon (not the CLI) has created + migrated the store + if _, err := os.Stat(filepath.Join(e.dataDir, "ao.db")); err != nil { + t.Fatalf("daemon should have created ao.db: %v", err) + } + out, _ = e.run(t, "doctor") + mustContain(t, out, "migrations are applied by the daemon") + + // /healthz identity + body := httpGet(t, e.port, "/healthz") + mustContain(t, body, "agent-orchestrator-daemon") + + if out, code := e.run(t, "stop"); code != 0 || !strings.Contains(out, "stopped") { + t.Fatalf("stop: exit %d, out %s", code, out) + } + if _, err := os.Stat(e.runFile); !os.IsNotExist(err) { + t.Fatal("run-file should be removed after stop") + } +} + +func TestE2E_ShutdownGuard(t *testing.T) { + e := newEnv(t) + e.startDaemon(t) + + // A cross-site Origin header must be rejected without stopping the daemon. + if code := postShutdown(t, e.port, func(r *http.Request) { r.Header.Set("Origin", "https://evil.example") }); code != http.StatusForbidden { + t.Fatalf("cross-origin /shutdown = %d, want 403", code) + } + // A non-loopback Host (DNS-rebinding) must be rejected too. + if code := postShutdown(t, e.port, func(r *http.Request) { r.Host = "evil.example" }); code != http.StatusForbidden { + t.Fatalf("rebinding-host /shutdown = %d, want 403", code) + } + // The daemon survived both. + out, _ := e.run(t, "status", "--json") + mustContain(t, out, `"state": "ready"`) +} + +func TestE2E_StaleRunFile(t *testing.T) { + e := newEnv(t) + // PID 2147483647 is never alive -> the CLI must classify this as stale. + content := fmt.Sprintf(`{"pid":2147483647,"port":%d,"startedAt":"2020-01-01T00:00:00Z"}`, e.port) + if err := os.MkdirAll(filepath.Dir(e.runFile), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(e.runFile, []byte(content), 0o644); err != nil { + t.Fatal(err) + } + + out, _ := e.run(t, "status", "--json") + mustContain(t, out, `"state": "stale"`) + + if out, code := e.run(t, "stop"); code != 0 || !strings.Contains(out, "stopped") { + t.Fatalf("stop stale: exit %d, out %s", code, out) + } + if _, err := os.Stat(e.runFile); !os.IsNotExist(err) { + t.Fatal("stale run-file should be removed") + } +} + +func TestE2E_ExitCodes(t *testing.T) { + e := newEnv(t) + + if _, code := e.run(t, "status", "--definitely-not-a-flag"); code != 2 { + t.Fatalf("bad flag exit %d, want 2", code) + } + if _, code := e.run(t, "completion"); code != 2 { // missing required arg + t.Fatalf("missing-arg exit %d, want 2", code) + } + if _, code := e.run(t, "completion", "notashell"); code == 0 { // runtime error + t.Fatal("unsupported shell should be non-zero") + } + // invalid config is a runtime error (1), not a usage error (2). + if _, code := e.runEnv(t, e.environ("notaport"), "status"); code != 1 { + t.Fatalf("invalid AO_PORT exit %d, want 1", code) + } +} + +func TestE2E_Completion(t *testing.T) { + e := newEnv(t) + for _, sh := range []string{"bash", "zsh", "fish", "powershell"} { + out, code := e.run(t, "completion", sh) + if code != 0 || strings.TrimSpace(out) == "" { + t.Fatalf("completion %s: exit %d, empty=%v", sh, code, strings.TrimSpace(out) == "") + } + } +} + +// --------------------------------------------------------------------------- +// HTTP helpers (loopback) + +func httpClient() *http.Client { return &http.Client{Timeout: 3 * time.Second} } + +func httpGet(t *testing.T, port int, path string) string { + t.Helper() + resp, err := httpClient().Get(fmt.Sprintf("http://127.0.0.1:%d%s", port, path)) + if err != nil { + t.Fatalf("GET %s: %v", path, err) + } + defer resp.Body.Close() + b := make([]byte, 4096) + n, _ := resp.Body.Read(b) + return string(b[:n]) +} + +// postShutdown issues POST /shutdown with mutator applied, returns the status code. +func postShutdown(t *testing.T, port int, mutate func(*http.Request)) int { + t.Helper() + req, err := http.NewRequest(http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/shutdown", port), nil) + if err != nil { + t.Fatal(err) + } + mutate(req) + resp, err := httpClient().Do(req) + if err != nil { + t.Fatalf("POST /shutdown: %v", err) + } + defer resp.Body.Close() + return resp.StatusCode +} diff --git a/test/cli/Dockerfile b/test/cli/Dockerfile index ce429a9c..fb5d85b2 100644 --- a/test/cli/Dockerfile +++ b/test/cli/Dockerfile @@ -33,15 +33,15 @@ RUN apt-get update \ # "Install" the CLI the way a user would: drop the binary on PATH. COPY --from=build /out/ao /usr/local/bin/ao -COPY test/cli/smoke.sh /usr/local/bin/ao-smoke.sh -RUN chmod +x /usr/local/bin/ao /usr/local/bin/ao-smoke.sh +COPY test/cli/install-check.sh /usr/local/bin/ao-install-check.sh +RUN chmod +x /usr/local/bin/ao /usr/local/bin/ao-install-check.sh # Run as an unprivileged user with a real HOME, like a normal install. RUN useradd --create-home --shell /bin/bash ao USER ao WORKDIR /home/ao -# Sanity: prove the install resolved before the suite runs. +# Sanity: prove the install resolved before the check runs. RUN ao version -ENTRYPOINT ["/usr/local/bin/ao-smoke.sh"] +ENTRYPOINT ["/usr/local/bin/ao-install-check.sh"] diff --git a/test/cli/README.md b/test/cli/README.md index 231c44e9..14c3a56c 100644 --- a/test/cli/README.md +++ b/test/cli/README.md @@ -1,75 +1,65 @@ # `ao` CLI end-to-end tests -These tests install and drive the **real `ao` binary** the way a user would — -`start` → `status` → `doctor` → `stop`, plus the daemon-control HTTP surface — -and assert the whole thing works end to end. They run against **isolated, -throwaway state** (their own temp run-file + data dir + a free loopback port), -so they never touch a developer's real AO installation. +These tests drive the **real `ao` binary** the way a user would — `start` → +`status` → `doctor` → `stop`, plus the daemon-control HTTP surface — and assert +the whole thing works. They run against **isolated, throwaway state** (a per-test +temp run-file + data dir + an OS-assigned free loopback port), so they never +touch a developer's real AO installation. -## Files +## Two tiers -| File | Purpose | -|------|---------| -| `smoke.sh` | The suite. Host-agnostic bash; drives the binary at `$AO_BIN` (default `ao` on PATH) and prints a PASS/FAIL line per assertion. | -| `Dockerfile` | Models **installing `ao` on a fresh machine**: builds the binary, drops it on `PATH` in a clean Debian image with only runtime deps (`git`, `tmux`, `curl`), then runs `smoke.sh` as a non-root user. | -| `run-local.sh` | Convenience wrapper: build from source and run `smoke.sh` natively against a temp binary. | +| Tier | What | Where | +|------|------|-------| +| **Comprehensive (primary)** | A cross-platform Go suite that builds `ao` and exercises the full behaviour. Runs natively on **ubuntu + macOS + windows** — the only way to cover the OS-specific process-detach paths (`setsid` vs `CREATE_NEW_PROCESS_GROUP`) and `os.UserConfigDir()` resolution. | `backend/internal/cli/e2e_test.go` (build tag `e2e`) | +| **Fresh-install (hardening)** | Proves a freshly installed binary works on a clean machine with no Go toolchain and no developer state. | `test/cli/Dockerfile` + `test/cli/install-check.sh` | ## Run it -**Native (fastest, uses your toolchain):** +**The Go suite (fastest, cross-platform):** ```bash -test/cli/run-local.sh # PASS/FAIL per assertion -test/cli/run-local.sh -v # also print every command and its full output -# or, against a binary you already built: -AO_BIN=/path/to/ao test/cli/smoke.sh [-v] +cd backend +go test -tags e2e ./internal/cli/... # run it +go test -tags e2e -v -run TestE2E ./internal/cli/... # verbose: prints every command + output ``` +It builds its own `ao` binary; `git` must be on PATH (required by `doctor`). +`-v` logs each `ao` invocation and its full output, which is the audit trail you +get for free from `go test`. **Fresh-machine install, in a clean container:** ```bash docker build -f test/cli/Dockerfile -t ao-cli-smoke . docker run --rm --init ao-cli-smoke ``` -> `--init` gives the container a real PID-1 reaper (tini) so the live daemon -> spawned during the `start` test is reaped after `stop` instead of lingering as -> a zombie. The suite itself doesn't depend on it — the stale-daemon case uses a -> fabricated dead PID — but it keeps process accounting clean. +> `--init` gives the container a real PID-1 reaper (tini) so the daemon the +> check starts is reaped after `stop` instead of lingering as a zombie. -## What it covers +## What the Go suite covers -Install resolves on PATH · `version`/`--version` · `--help` (and hides the -internal `daemon` command) · `doctor` text + `--json` (and that it **does not** -open/migrate SQLite) · `status` stopped/stale/ready · `start` (fresh + -idempotent) · daemon-created store · `/healthz` identity · the `/shutdown` -CSRF/DNS-rebinding guard (403 + daemon survives) · `stop` (graceful + stale + -idempotent) · run-file cleanup/ownership · exit codes (`2` usage, `1` runtime) · -completion for all four shells. +`TestE2E_VersionAndHelp` (version/`--version`/help, daemon hidden) · +`TestE2E_DoctorDoesNotTouchTheStore` (doctor text + `--json`; proves it does +**not** create/migrate `ao.db`) · `TestE2E_StatusStopped` (stopped + idempotent +stop) · `TestE2E_Lifecycle` (start, ready, idempotent, daemon-created store, +`/healthz` identity, stop, run-file cleanup) · `TestE2E_ShutdownGuard` (the +`/shutdown` CSRF + DNS-rebinding 403 guard, daemon survives) · +`TestE2E_StaleRunFile` (dead-PID run-file → stale → cleaned) · `TestE2E_ExitCodes` +(2 usage / 1 runtime / config error) · `TestE2E_Completion` (all four shells). -## Testing strategy — why it's shaped this way +## Why a Go suite (not bash, not Python) -We deliberately don't make Docker the *only* tier. A daemon that detaches with -`setsid` and outlives the launching process is exactly the workload that -container PID-1 semantics mishandle, and the OS-specific bits (`setsid` vs -Windows `CREATE_NEW_PROCESS_GROUP`, and `os.UserConfigDir()` resolving to -`~/Library/Application Support` on macOS, `%AppData%` on Windows, `~/.config` -on Linux) can't be observed from a Linux container at all. +The bash version grew past the point where bash was a good fit, and a Linux +container can't observe the macOS/Windows code paths at all. A Go `os/exec` +suite is the right home: it uses the repo's own toolchain (runs under `go test`), +gives real assertions and structured data, and — critically — runs natively on +the Windows and macOS runners, finally covering the `CREATE_NEW_PROCESS_GROUP` +detach path and per-OS config-dir resolution. The container stays as a thin +"clean install actually works" check. -So CI (`.github/workflows/cli-e2e.yml`) runs two tiers: +## Extending -1. **`native`** — the primary signal. Builds and runs the real binary on a - GitHub matrix of `ubuntu-latest` + `macos-latest` (those runners *are* the - VMs), covering the unix detach path and macOS config-dir resolution. -2. **`container`** — a hardening tier. The `Dockerfile` proves a clean-machine - install works and that the CLI has no hidden dependence on developer state, - run with `--init`. - -### Extending - -- Add an assertion: drop a `step`/`assert_*` pair into the relevant section of - `smoke.sh`. The helpers (`assert_eq`, `assert_contains`, `assert_not_contains`, - `run_rc`) keep cases one-liners. -- Cover Windows: add a `windows-latest` leg to the `native` matrix (Git Bash - ships on the runner) once the suite is confirmed green there, or add Go-based - `os/exec` E2E tests for the Windows process-group path. -- Deeper per-OS path assertions (that state resolves under the OS-native config - dir when `AO_RUN_FILE`/`AO_DATA_DIR` are unset) are best added as Go unit - tests in `internal/config`. +- **Add a case:** a new `TestE2E_*` function (or a `t.Run` subtest) in + `e2e_test.go`. Use `newEnv(t)` for isolated state and the `env.run`/`httpGet`/ + `postShutdown` helpers. +- **Add an OS:** extend the `matrix.os` list in `.github/workflows/cli-e2e.yml`. +- Deeper per-OS path assertions (state resolves under the OS-native config dir + when `AO_RUN_FILE`/`AO_DATA_DIR` are unset) fit best as unit tests in + `internal/config`. diff --git a/test/cli/install-check.sh b/test/cli/install-check.sh new file mode 100755 index 00000000..f4bcacd4 --- /dev/null +++ b/test/cli/install-check.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# +# Fresh-machine install check. The Dockerfile installs `ao` on PATH in a clean +# image and runs this; it proves a freshly installed binary actually works on a +# machine with no Go toolchain and no developer state. The COMPREHENSIVE, +# cross-platform behavioural suite lives in Go (backend/internal/cli/e2e_test.go, +# `go test -tags e2e`); this stays deliberately small and linear. + +set -euo pipefail + +AO_BIN="${AO_BIN:-ao}" +tmp="$(mktemp -d)" +export AO_RUN_FILE="$tmp/running.json" +export AO_DATA_DIR="$tmp/data" +export AO_PORT="${AO_PORT:-3001}" # the container is isolated; 3001 is free +trap '"$AO_BIN" stop >/dev/null 2>&1 || true; rm -rf "$tmp"' EXIT + +fail() { echo "FAIL: $1" >&2; exit 1; } + +echo "ao binary : $(command -v "$AO_BIN")" +"$AO_BIN" version >/dev/null || fail "version" +"$AO_BIN" doctor >/dev/null || fail "doctor" +"$AO_BIN" start >/dev/null || fail "start" + +"$AO_BIN" status --json | grep -q '"state": "ready"' || fail "daemon not ready after start" + +# the /shutdown control endpoint rejects a cross-origin caller (403) and survives +code="$(curl -s -o /dev/null -w '%{http_code}' -X POST \ + -H 'Origin: https://evil.example' "http://127.0.0.1:$AO_PORT/shutdown")" +[ "$code" = "403" ] || fail "cross-origin /shutdown returned $code, want 403" +"$AO_BIN" status --json | grep -q '"state": "ready"' || fail "daemon died after rejected shutdown" + +"$AO_BIN" stop >/dev/null || fail "stop" +"$AO_BIN" status --json | grep -q '"state": "stopped"' || fail "daemon not stopped" + +echo "fresh-install check: OK" diff --git a/test/cli/run-local.sh b/test/cli/run-local.sh deleted file mode 100755 index f2f560ac..00000000 --- a/test/cli/run-local.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash -# -# Convenience wrapper: build `ao` from source and run the CLI smoke test against -# it natively, using an isolated temp state dir and a free port. Touches nothing -# in your real AO installation. -# -# test/cli/run-local.sh # PASS/FAIL per assertion -# test/cli/run-local.sh -v # also print every command and its full output -# -# To run the same suite the way a brand-new user would install it (clean Linux -# container, binary on PATH), use Docker instead: -# -# docker build -f test/cli/Dockerfile -t ao-cli-smoke . && docker run --rm --init ao-cli-smoke - -set -euo pipefail - -# Pass through -v/--verbose (anything else is forwarded to smoke.sh too). -smoke_args=() -for arg in "$@"; do - case "$arg" in - -v|--verbose) smoke_args+=("--verbose") ;; - -h|--help) echo "usage: run-local.sh [-v|--verbose]"; exit 0 ;; - *) smoke_args+=("$arg") ;; - esac -done - -repo_root="$(cd "$(dirname "$0")/../.." && pwd)" -bindir="$(mktemp -d)" -trap 'rm -rf "$bindir"' EXIT - -echo "building ao ..." -( cd "$repo_root/backend" && CGO_ENABLED=0 go build -trimpath -o "$bindir/ao" ./cmd/ao ) - -echo "running smoke test ..." -AO_BIN="$bindir/ao" bash "$repo_root/test/cli/smoke.sh" "${smoke_args[@]+"${smoke_args[@]}"}" diff --git a/test/cli/smoke.sh b/test/cli/smoke.sh deleted file mode 100755 index 4682d53b..00000000 --- a/test/cli/smoke.sh +++ /dev/null @@ -1,334 +0,0 @@ -#!/usr/bin/env bash -# -# End-to-end smoke test for the `ao` CLI. -# -# It models a *fresh machine*: `ao` is expected to already be installed on PATH -# (the Dockerfile in this directory installs it, simulating a new user), and the -# whole test runs against isolated, throwaway state — its own temp run-file, -# data dir, and a free loopback port — so it never touches a developer's real -# AO installation. -# -# Run locally against a binary you built: -# AO_BIN=/path/to/ao test/cli/smoke.sh -# Verbose — print each command and its full output: -# AO_BIN=/path/to/ao test/cli/smoke.sh -v (or AO_SMOKE_VERBOSE=1) -# Or in the container (models install-on-a-fresh-machine): -# docker build -f test/cli/Dockerfile -t ao-cli-smoke . && docker run --rm --init ao-cli-smoke -# -# Exit code: 0 if every assertion passes, 1 otherwise. - -set -uo pipefail - -AO_BIN="${AO_BIN:-ao}" - -# Verbose mode prints every command and its complete output, not just PASS/FAIL. -VERBOSE="${AO_SMOKE_VERBOSE:-0}" -for arg in "$@"; do - case "$arg" in - -v|--verbose) VERBOSE=1 ;; - -h|--help) echo "usage: [AO_BIN=...] smoke.sh [-v|--verbose]"; exit 0 ;; - *) echo "unknown argument: $arg" >&2; exit 2 ;; - esac -done - -# --------------------------------------------------------------------------- -# Tiny assertion framework -# --------------------------------------------------------------------------- -PASS=0 -FAIL=0 -CURRENT="" - -section() { printf '\n\033[1m== %s ==\033[0m\n' "$1"; } - -step() { - CURRENT="$1" - if [ "$VERBOSE" = 1 ]; then printf ' • %s\n' "$1"; else printf ' • %s ... ' "$1"; fi -} - -ok() { - PASS=$((PASS + 1)) - if [ "$VERBOSE" = 1 ]; then printf ' \033[32m→ PASS\033[0m\n'; else printf '\033[32mPASS\033[0m\n'; fi -} -bad() { - FAIL=$((FAIL + 1)) - if [ "$VERBOSE" = 1 ]; then printf ' \033[31m→ FAIL\033[0m %s\n' "$1"; else printf '\033[31mFAIL\033[0m\n %s\n' "$1"; fi -} - -# vdump : in verbose mode, echo the command -# and its complete output, indented. A no-op otherwise. -vdump() { - [ "$VERBOSE" = 1 ] || return 0 - printf ' \033[2m$ %s\033[0m\n' "$1" - if [ -n "$2" ]; then printf '%s\n' "$2" | sed 's/^/ | /'; fi - printf ' \033[2m(exit %s)\033[0m\n' "$3" -} - -# assert_eq [msg] -assert_eq() { - if [ "$1" = "$2" ]; then ok; else bad "${3:-}: expected [$2], got [$1]"; fi -} -# assert_contains -assert_contains() { - case "$1" in - *"$2"*) ok ;; - *) bad "expected output to contain [$2]; got: $(printf '%s' "$1" | head -c 400)" ;; - esac -} -# assert_not_contains -assert_not_contains() { - case "$1" in - *"$2"*) bad "expected output to NOT contain [$2]" ;; - *) ok ;; - esac -} - -# run_rc -> sets RC and OUT (stdout+stderr combined) -run_rc() { - OUT="$("$@" 2>&1)" - RC=$? - vdump "$*" "$OUT" "$RC" -} - -# --------------------------------------------------------------------------- -# Isolated, throwaway environment -# --------------------------------------------------------------------------- -TMP="$(mktemp -d)" -export AO_RUN_FILE="$TMP/running.json" -export AO_DATA_DIR="$TMP/data" - -# Pick a free loopback port (bash /dev/tcp probe; connect-refused == free). -find_free_port() { - local p - for p in $(seq 3071 3170); do - if ! (exec 3<>"/dev/tcp/127.0.0.1/$p") 2>/dev/null; then - echo "$p"; return 0 - fi - exec 3>&- 2>/dev/null || true - done - echo 3071 -} -# Always run against an isolated, free port. We deliberately do NOT honour an -# inherited AO_PORT — it might point at a real daemon, which is exactly the -# collision this isolation is meant to prevent. Override only via AO_SMOKE_PORT. -export AO_PORT="${AO_SMOKE_PORT:-$(find_free_port)}" - -cleanup() { - "$AO_BIN" stop >/dev/null 2>&1 || true - rm -rf "$TMP" -} -trap cleanup EXIT - -printf 'ao smoke test\n binary : %s\n port : %s\n state : %s\n' \ - "$(command -v "$AO_BIN" || echo "$AO_BIN")" "$AO_PORT" "$TMP" - -# --------------------------------------------------------------------------- -# 1. Install verification — `ao` is a real, runnable binary on this machine -# --------------------------------------------------------------------------- -section "install" - -step "ao resolves on PATH / at AO_BIN" -if command -v "$AO_BIN" >/dev/null 2>&1; then ok; else bad "ao not found"; fi - -step "ao version prints build metadata" -run_rc "$AO_BIN" version -if [ "$RC" -eq 0 ] && [ -n "$OUT" ]; then ok; else bad "rc=$RC out=$OUT"; fi - -step "ao --version works" -run_rc "$AO_BIN" --version -assert_eq "$RC" "0" "--version rc" - -step "ao --help lists product commands" -run_rc "$AO_BIN" --help -assert_contains "$OUT" "start" -step "ao --help lists status/stop/doctor" -assert_contains "$OUT" "doctor" -step "ao --help hides internal daemon command" -assert_not_contains "$OUT" $'\n daemon' - -# --------------------------------------------------------------------------- -# 2. doctor on a fresh machine (no daemon yet) -# --------------------------------------------------------------------------- -section "doctor (fresh)" - -step "doctor exits 0 when required tools present" -run_rc "$AO_BIN" doctor -assert_eq "$RC" "0" "doctor rc (git/tmux must be installed in the image)" - -step "doctor reports git found" -assert_contains "$OUT" "git" - -step "doctor does NOT migrate the store (sqlite WARN, db absent)" -assert_contains "$OUT" "database not created yet" - -step "doctor data dir was created but ao.db was NOT (CLI is not the store writer)" -if [ ! -f "$AO_DATA_DIR/ao.db" ]; then ok; else bad "ao.db exists — doctor must not create/migrate the DB"; fi - -step "doctor --json is valid JSON with ok=true" -run_rc "$AO_BIN" doctor --json -assert_contains "$OUT" '"ok": true' - -# --------------------------------------------------------------------------- -# 3. status when stopped -# --------------------------------------------------------------------------- -section "status (stopped)" - -step "status --json reports stopped" -run_rc "$AO_BIN" status --json -assert_contains "$OUT" '"state": "stopped"' -step "status exits 0 even when stopped (status never errors)" -assert_eq "$RC" "0" "status exit code" -step "stopped status omits startedAt" -assert_not_contains "$OUT" "startedAt" - -step "stop is idempotent when already stopped" -run_rc "$AO_BIN" stop -if [ "$RC" -eq 0 ]; then assert_contains "$OUT" "stopped"; else bad "stop-when-stopped rc=$RC"; fi - -# --------------------------------------------------------------------------- -# 4. start → ready, and status reflects it -# --------------------------------------------------------------------------- -section "start" - -step "start brings the daemon up and reports ready" -run_rc "$AO_BIN" start -if [ "$RC" -eq 0 ]; then assert_contains "$OUT" "ready"; else bad "start rc=$RC out=$OUT"; fi - -step "status --json reports ready with pid+port" -run_rc "$AO_BIN" status --json -assert_contains "$OUT" '"state": "ready"' -step "status carries the bound port" -assert_contains "$OUT" "\"port\": $AO_PORT" - -step "start is idempotent (second start returns ready, no error)" -run_rc "$AO_BIN" start -if [ "$RC" -eq 0 ]; then assert_contains "$OUT" "ready"; else bad "idempotent start rc=$RC"; fi - -# Capture the live pid for later assertions. -PID="$("$AO_BIN" status --json | sed -n 's/.*"pid": \([0-9]*\).*/\1/p' | head -1)" - -# --------------------------------------------------------------------------- -# 5. doctor while running — now the daemon (not the CLI) has created the store -# --------------------------------------------------------------------------- -section "doctor (running)" - -step "daemon created and migrated the store" -if [ -f "$AO_DATA_DIR/ao.db" ]; then ok; else bad "daemon should have created ao.db"; fi - -step "doctor now reports sqlite present + daemon-migrated" -run_rc "$AO_BIN" doctor -assert_contains "$OUT" "migrations are applied by the daemon" - -# --------------------------------------------------------------------------- -# 6. Health endpoint identity (loopback) -# --------------------------------------------------------------------------- -section "health endpoint" - -if command -v curl >/dev/null 2>&1; then - step "/healthz reports the AO daemon service + pid" - run_rc curl -fsS "http://127.0.0.1:$AO_PORT/healthz" - assert_contains "$OUT" "agent-orchestrator-daemon" - - # ------------------------------------------------------------------------- - # 7. /shutdown CSRF / DNS-rebinding guard (review fix M3) - # ------------------------------------------------------------------------- - section "/shutdown guard" - - step "cross-origin POST /shutdown is rejected (403)" - CODE="$(curl -s -o /dev/null -w '%{http_code}' -X POST \ - -H 'Origin: https://evil.example' "http://127.0.0.1:$AO_PORT/shutdown")" - vdump "curl -X POST -H 'Origin: https://evil.example' http://127.0.0.1:$AO_PORT/shutdown" "HTTP $CODE" "-" - assert_eq "$CODE" "403" "cross-origin shutdown" - - step "non-loopback Host POST /shutdown is rejected (403)" - CODE="$(curl -s -o /dev/null -w '%{http_code}' -X POST \ - -H 'Host: evil.example' "http://127.0.0.1:$AO_PORT/shutdown")" - vdump "curl -X POST -H 'Host: evil.example' http://127.0.0.1:$AO_PORT/shutdown" "HTTP $CODE" "-" - assert_eq "$CODE" "403" "rebinding-host shutdown" - - step "daemon survived the rejected shutdown attempts" - run_rc "$AO_BIN" status --json - assert_contains "$OUT" '"state": "ready"' -else - section "/shutdown guard" - step "curl unavailable — skipping HTTP-level guard checks" - printf '\033[33mSKIP\033[0m\n' -fi - -# --------------------------------------------------------------------------- -# 8. stop → stopped, run-file cleaned up -# --------------------------------------------------------------------------- -section "stop" - -step "stop gracefully stops the daemon" -run_rc "$AO_BIN" stop -if [ "$RC" -eq 0 ]; then assert_contains "$OUT" "stopped"; else bad "stop rc=$RC out=$OUT"; fi - -step "run-file removed after stop" -if [ ! -f "$AO_RUN_FILE" ]; then ok; else bad "running.json still present"; fi - -step "status --json reports stopped after stop" -run_rc "$AO_BIN" status --json -assert_contains "$OUT" '"state": "stopped"' - -# --------------------------------------------------------------------------- -# 9. stale run-file (dead PID) — deterministic, no real process needed -# --------------------------------------------------------------------------- -section "stale run-file" - -# PID 2147483647 is never alive; the CLI must classify this as stale, not kill it. -printf '{"pid":2147483647,"port":%s,"startedAt":"2020-01-01T00:00:00Z"}\n' "$AO_PORT" > "$AO_RUN_FILE" - -step "status reports stale for a dead-PID run-file" -run_rc "$AO_BIN" status --json -assert_contains "$OUT" '"state": "stale"' -step "status still exits 0 for a stale daemon (reports, never errors)" -assert_eq "$RC" "0" "stale status exit code" - -step "stop clears a stale run-file and reports stopped" -run_rc "$AO_BIN" stop -assert_contains "$OUT" "stopped" -step "stale run-file removed" -if [ ! -f "$AO_RUN_FILE" ]; then ok; else bad "stale running.json not removed"; fi - -# --------------------------------------------------------------------------- -# 10. exit codes: 2 for usage errors, 1 for runtime errors -# --------------------------------------------------------------------------- -section "exit codes" - -step "unknown flag exits 2 (usage error)" -run_rc "$AO_BIN" status --definitely-not-a-flag -assert_eq "$RC" "2" "bad-flag exit code" - -step "missing required arg exits 2 (completion needs a shell)" -run_rc "$AO_BIN" completion -assert_eq "$RC" "2" "missing-arg exit code" - -step "unsupported shell exits non-zero (runtime error)" -run_rc "$AO_BIN" completion notashell -if [ "$RC" -ne 0 ]; then ok; else bad "expected non-zero for bad shell"; fi - -step "invalid config (AO_PORT out of range) exits 1, not 2" -OUT="$(AO_PORT=99999 "$AO_BIN" status 2>&1)"; RC=$? -vdump "AO_PORT=99999 $AO_BIN status" "$OUT" "$RC" -assert_eq "$RC" "1" "config-error exit code (runtime, not usage)" - -# --------------------------------------------------------------------------- -# 11. shell completion generators -# --------------------------------------------------------------------------- -section "completion" - -for sh in bash zsh fish powershell; do - step "completion $sh generates a script" - run_rc "$AO_BIN" completion "$sh" - if [ "$RC" -eq 0 ] && [ -n "$OUT" ]; then ok; else bad "completion $sh rc=$RC"; fi -done - -# --------------------------------------------------------------------------- -# Result -# --------------------------------------------------------------------------- -printf '\n\033[1m== result ==\033[0m\n passed: %s\n failed: %s\n' "$PASS" "$FAIL" -if [ "$FAIL" -ne 0 ]; then - printf '\033[31mSMOKE TEST FAILED\033[0m\n' - exit 1 -fi -printf '\033[32mSMOKE TEST PASSED\033[0m\n'