diff --git a/.github/workflows/cli-e2e.yml b/.github/workflows/cli-e2e.yml new file mode 100644 index 00000000..30738432 --- /dev/null +++ b/.github/workflows/cli-e2e.yml @@ -0,0 +1,55 @@ +name: CLI E2E + +on: + push: + branches: [main] + pull_request: + paths: + - "backend/**" + - "test/cli/**" + - ".github/workflows/cli-e2e.yml" + +permissions: + contents: read + +jobs: + # Primary tier: the cross-platform Go E2E suite (build tag `e2e`) runs the real + # `ao` binary against isolated state on every OS GitHub hosts. These runners + # are the "VMs" — the only place that exercises the OS-specific process-detach + # paths (unix Setsid vs Windows CREATE_NEW_PROCESS_GROUP) and os.UserConfigDir + # resolution. The suite builds its own binary and self-allocates a free port. + native: + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + runs-on: ${{ matrix.os }} + defaults: + run: + working-directory: backend + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-go@v5 + with: + go-version: "1.25" + cache: false + + - name: CLI E2E (native) + run: go test -tags e2e -v ./internal/cli/... + + # Secondary hardening tier: prove that a freshly installed binary works on a + # clean machine with no Go toolchain and no developer state. The Dockerfile + # installs `ao` on PATH in a slim image and runs test/cli/install-check.sh. + # --init gives a real PID-1 reaper so the daemon the check starts is reaped + # after `stop` instead of lingering as a zombie. + container: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Build fresh-install image + run: docker build -f test/cli/Dockerfile -t ao-cli-smoke . + + - name: Fresh-install check (container) + run: docker run --rm --init ao-cli-smoke diff --git a/README.md b/README.md index f5c17deb..61a639d2 100644 --- a/README.md +++ b/README.md @@ -8,18 +8,24 @@ Lifecycle Manager + Session Manager lane in [`docs/architecture.md`](docs/archit ## Backend daemon -The Go binary in [`backend/`](backend/) is the HTTP daemon — a loopback-only -sidecar the Electron supervisor will spawn (Phase 1c). Phase 1a landed the -skeleton: chi router, middleware stack (recoverer → request-id → logger → -real-ip), `/healthz` + `/readyz`, atomic `running.json` PID/port handshake, -graceful shutdown on SIGINT/SIGTERM. +The Go backend now has a Cobra-based `ao` CLI in [`backend/cmd/ao`](backend/cmd/ao). +The CLI controls the HTTP daemon — a loopback-only sidecar the Electron +supervisor will also use. The daemon skeleton includes the chi router, +middleware stack (recoverer → request-id → logger → real-ip), `/healthz` + +`/readyz`, atomic `running.json` PID/port handshake, graceful shutdown on +SIGINT/SIGTERM, SQLite storage, CDC polling, and lifecycle/reaper wiring. ### Run ```bash cd backend -go run . # binds 127.0.0.1:3001 with all defaults -AO_PORT=3019 go run . # override per invocation +go run ./cmd/ao start # start the daemon and wait for readiness +go run ./cmd/ao status # inspect PID/port/health/readiness +go run ./cmd/ao stop # gracefully stop the daemon +go run ./cmd/ao daemon # internal daemon entrypoint + +go run . # compatibility wrapper; starts the daemon +AO_PORT=3019 go run ./cmd/ao start # override per invocation ``` Health check: @@ -48,4 +54,3 @@ is intentionally not env-configurable. cd backend gofmt -l . && go build ./... && go vet ./... && go test -race ./... ``` - diff --git a/backend/cmd/ao/main.go b/backend/cmd/ao/main.go new file mode 100644 index 00000000..d1ea897c --- /dev/null +++ b/backend/cmd/ao/main.go @@ -0,0 +1,15 @@ +package main + +import ( + "fmt" + "os" + + "github.com/aoagents/agent-orchestrator/backend/internal/cli" +) + +func main() { + if err := cli.Execute(); err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(cli.ExitCode(err)) + } +} diff --git a/backend/go.mod b/backend/go.mod index f270a14c..a2de66a0 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -7,6 +7,8 @@ require ( github.com/creack/pty v1.1.24 github.com/go-chi/chi/v5 v5.1.0 github.com/pressly/goose/v3 v3.27.1 + github.com/spf13/cobra v1.10.1 + golang.org/x/sys v0.43.0 gopkg.in/yaml.v3 v3.0.1 modernc.org/sqlite v1.51.0 ) @@ -14,14 +16,15 @@ require ( require ( github.com/dustin/go-humanize v1.0.1 // indirect github.com/google/uuid v1.6.0 // indirect + github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/mattn/go-isatty v0.0.21 // indirect github.com/mfridman/interpolate v0.0.2 // indirect github.com/ncruces/go-strftime v1.0.0 // indirect github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect github.com/sethvargo/go-retry v0.3.0 // indirect + github.com/spf13/pflag v1.0.9 // indirect go.uber.org/multierr v1.11.0 // indirect golang.org/x/sync v0.20.0 // indirect - golang.org/x/sys v0.43.0 // indirect modernc.org/libc v1.72.3 // indirect modernc.org/mathutil v1.7.1 // indirect modernc.org/memory v1.11.0 // indirect diff --git a/backend/go.sum b/backend/go.sum index 84823bbd..cf3f0029 100644 --- a/backend/go.sum +++ b/backend/go.sum @@ -1,5 +1,6 @@ github.com/coder/websocket v1.8.14 h1:9L0p0iKiNOibykf283eHkKUHHrpG7f65OE3BhhO7v9g= github.com/coder/websocket v1.8.14/go.mod h1:NX3SzP+inril6yawo5CQXx8+fk145lPDC6pumgx0mVg= +github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s= github.com/creack/pty v1.1.24/go.mod h1:08sCNb52WyoAwi2QDyzUCTgcvVFhUzewun7wtTfvcwE= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= @@ -14,6 +15,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= +github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= +github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/mattn/go-isatty v0.0.21 h1:xYae+lCNBP7QuW4PUnNG61ffM4hVIfm+zUzDuSzYLGs= github.com/mattn/go-isatty v0.0.21/go.mod h1:ZXfXG4SQHsB/w3ZeOYbR0PrPwLy+n6xiMrJlRFqopa4= github.com/mfridman/interpolate v0.0.2 h1:pnuTK7MQIxxFz1Gr+rjSIx9u7qVjf5VOoM/u6BbAxPY= @@ -26,8 +29,13 @@ github.com/pressly/goose/v3 v3.27.1 h1:6uEvcprBybDmW4hcz3gYujhARhye+GoWKhEWyzD5s github.com/pressly/goose/v3 v3.27.1/go.mod h1:maruOxsPnIG2yHHyo8UqKWXYKFcH7Q76csUV7+7KYoM= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sethvargo/go-retry v0.3.0 h1:EEt31A35QhrcRZtrYFDTBg91cqZVnFL2navjDrah2SE= github.com/sethvargo/go-retry v0.3.0/go.mod h1:mNX17F0C/HguQMyMyJxcnU471gOZGxCLyYaFyAZraas= +github.com/spf13/cobra v1.10.1 h1:lJeBwCfmrnXthfAupyUTzJ/J4Nc1RsHC/mSRU2dll/s= +github.com/spf13/cobra v1.10.1/go.mod h1:7SmJGaTHFVBY0jW4NXGluQoLvhqFQM+6XSKD+P4XaB0= +github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= +github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= diff --git a/backend/internal/cli/completion.go b/backend/internal/cli/completion.go new file mode 100644 index 00000000..f4575de0 --- /dev/null +++ b/backend/internal/cli/completion.go @@ -0,0 +1,37 @@ +package cli + +import ( + "fmt" + + "github.com/spf13/cobra" +) + +func newCompletionCommand() *cobra.Command { + return &cobra.Command{ + Use: "completion [bash|zsh|fish|powershell]", + Short: "Generate shell completion scripts", + Args: func(cmd *cobra.Command, args []string) error { + if err := cobra.ExactArgs(1)(cmd, args); err != nil { + return usageError{err} + } + return nil + }, + ValidArgs: []string{"bash", "zsh", "fish", "powershell"}, + RunE: func(cmd *cobra.Command, args []string) error { + root := cmd.Root() + out := cmd.OutOrStdout() + switch args[0] { + case "bash": + return root.GenBashCompletion(out) + case "zsh": + return root.GenZshCompletion(out) + case "fish": + return root.GenFishCompletion(out, true) + case "powershell": + return root.GenPowerShellCompletion(out) + default: + return fmt.Errorf("unsupported shell %q", args[0]) + } + }, + } +} diff --git a/backend/internal/cli/doctor.go b/backend/internal/cli/doctor.go new file mode 100644 index 00000000..4c6953f2 --- /dev/null +++ b/backend/internal/cli/doctor.go @@ -0,0 +1,155 @@ +package cli + +import ( + "context" + "errors" + "fmt" + "io/fs" + "os" + "path/filepath" + + "github.com/spf13/cobra" + + "github.com/aoagents/agent-orchestrator/backend/internal/config" +) + +type doctorLevel string + +const ( + doctorPass doctorLevel = "PASS" + doctorWarn doctorLevel = "WARN" + doctorFail doctorLevel = "FAIL" +) + +type doctorCheck struct { + Level doctorLevel `json:"level"` + Name string `json:"name"` + Message string `json:"message"` +} + +type doctorReport struct { + OK bool `json:"ok"` + Failures int `json:"failures"` + Checks []doctorCheck `json:"checks"` +} + +func newDoctorCommand(ctx *commandContext) *cobra.Command { + var asJSON bool + cmd := &cobra.Command{ + Use: "doctor", + Short: "Run local AO health checks", + RunE: func(cmd *cobra.Command, args []string) error { + checks := ctx.runDoctor(cmd.Context()) + failures := 0 + for _, check := range checks { + if check.Level == doctorFail { + failures++ + } + } + + if asJSON { + if err := writeJSON(cmd.OutOrStdout(), doctorReport{ + OK: failures == 0, Failures: failures, Checks: checks, + }); err != nil { + return err + } + } else { + for _, check := range checks { + if _, err := fmt.Fprintf(cmd.OutOrStdout(), "%s %s: %s\n", check.Level, check.Name, check.Message); err != nil { + return err + } + } + } + + if failures > 0 { + return fmt.Errorf("doctor found %d failing check(s)", failures) + } + return nil + }, + } + cmd.Flags().BoolVar(&asJSON, "json", false, "Output health checks as JSON") + return cmd +} + +func (c *commandContext) runDoctor(ctx context.Context) []doctorCheck { + checks := []doctorCheck{} + + cfg, err := config.Load() + if err != nil { + return append(checks, doctorCheck{Level: doctorFail, Name: "config", Message: err.Error()}) + } + checks = append(checks, doctorCheck{ + Level: doctorPass, Name: "config", + Message: fmt.Sprintf("runFile=%s dataDir=%s port=%d", cfg.RunFilePath, cfg.DataDir, cfg.Port), + }) + + if err := os.MkdirAll(cfg.DataDir, 0o755); err != nil { + checks = append(checks, doctorCheck{Level: doctorFail, Name: "data-dir", Message: err.Error()}) + } else { + checks = append(checks, doctorCheck{Level: doctorPass, Name: "data-dir", Message: cfg.DataDir}) + } + + checks = append(checks, checkStore(cfg.DataDir)) + + st, err := c.inspectDaemon(ctx) + if err != nil { + checks = append(checks, doctorCheck{Level: doctorFail, Name: "daemon", Message: err.Error()}) + } else { + level := doctorPass + switch st.State { + case "stale", "not_ready": + level = doctorWarn + case "unhealthy": + level = doctorFail + } + msg := st.State + if st.PID != 0 { + msg = fmt.Sprintf("%s pid=%d port=%d", msg, st.PID, st.Port) + } + if st.Error != "" { + msg += " (" + st.Error + ")" + } + checks = append(checks, doctorCheck{Level: level, Name: "daemon", Message: msg}) + } + + checks = append(checks, c.checkTool("git", true)) + checks = append(checks, c.checkTool("tmux", false)) + checks = append(checks, c.checkTool("zellij", false)) + return checks +} + +// checkStore inspects the SQLite store WITHOUT opening or migrating it. The +// daemon is the sole writer and migrator of the database (architecture.md §7); +// the CLI must never run migrations or open a second writer against a database +// a live daemon may already own. Migrations are validated by the daemon at +// startup and surfaced through /readyz, so doctor only confirms whether the +// database file exists yet. +func checkStore(dataDir string) doctorCheck { + dbPath := filepath.Join(dataDir, "ao.db") + info, err := os.Stat(dbPath) + switch { + case err == nil: + return doctorCheck{ + Level: doctorPass, Name: "sqlite", + Message: fmt.Sprintf("%s (%d bytes); migrations are applied by the daemon at startup", dbPath, info.Size()), + } + case errors.Is(err, fs.ErrNotExist): + return doctorCheck{ + Level: doctorWarn, Name: "sqlite", + Message: "database not created yet; run `ao start` to initialize and migrate it", + } + default: + return doctorCheck{Level: doctorFail, Name: "sqlite", Message: err.Error()} + } +} + +func (c *commandContext) checkTool(name string, required bool) doctorCheck { + path, err := c.deps.LookPath(name) + if err == nil { + return doctorCheck{Level: doctorPass, Name: name, Message: path} + } + if required { + return doctorCheck{Level: doctorFail, Name: name, Message: "not found in PATH"} + } + return doctorCheck{Level: doctorWarn, Name: name, Message: "not found in PATH"} +} diff --git a/backend/internal/cli/e2e_test.go b/backend/internal/cli/e2e_test.go new file mode 100644 index 00000000..f89e4671 --- /dev/null +++ b/backend/internal/cli/e2e_test.go @@ -0,0 +1,346 @@ +//go:build e2e + +// Package cli_test holds the end-to-end suite for the `ao` CLI. It builds the +// real binary and drives it (start/status/doctor/stop + the daemon-control HTTP +// surface) against fully isolated state — a per-test temp run-file, data dir, +// and an OS-assigned free loopback port — so it never touches a developer's real +// AO install. Unlike the Linux-only container smoke test, this runs natively on +// every OS in CI (ubuntu/macos/windows), which is the only way to exercise the +// unix setsid vs Windows CREATE_NEW_PROCESS_GROUP detach paths and the per-OS +// os.UserConfigDir resolution. +// +// It is gated behind the `e2e` build tag so it never runs in the normal +// `go test ./...` lane (it spawns processes and binds ports): +// +// go test -tags e2e ./internal/cli/... # run it +// go test -tags e2e -v -run TestE2E ./internal/cli/... # verbose, see every command +package cli_test + +import ( + "fmt" + "net" + "net/http" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" + "testing" + "time" +) + +// aoBin is the path to the binary built once for the whole suite. +var aoBin string + +func TestMain(m *testing.M) { + dir, err := os.MkdirTemp("", "ao-e2e-bin") + if err != nil { + fmt.Fprintln(os.Stderr, "e2e: mktemp:", err) + os.Exit(1) + } + aoBin = filepath.Join(dir, "ao") + if runtime.GOOS == "windows" { + aoBin += ".exe" + } + build := exec.Command("go", "build", "-o", aoBin, "github.com/aoagents/agent-orchestrator/backend/cmd/ao") + build.Stdout, build.Stderr = os.Stderr, os.Stderr + if err := build.Run(); err != nil { + fmt.Fprintln(os.Stderr, "e2e: build ao:", err) + os.Exit(1) + } + code := m.Run() + _ = os.RemoveAll(dir) + os.Exit(code) +} + +// env is an isolated CLI environment: its own state files and free port. +type env struct { + runFile string + dataDir string + port int +} + +func newEnv(t *testing.T) env { + t.Helper() + dir := t.TempDir() + return env{ + runFile: filepath.Join(dir, "running.json"), + dataDir: filepath.Join(dir, "data"), + port: freePort(t), + } +} + +// environ builds the child env: the ambient environment with every inherited +// AO_* var stripped (so a real daemon's AO_PORT can't leak in) plus our isolated +// settings. portOverride, when non-empty, replaces the numeric AO_PORT — used to +// inject an invalid value. +func (e env) environ(portOverride string) []string { + out := make([]string, 0, len(os.Environ())+3) + for _, kv := range os.Environ() { + if strings.HasPrefix(kv, "AO_") { + continue + } + out = append(out, kv) + } + port := fmt.Sprintf("%d", e.port) + if portOverride != "" { + port = portOverride + } + return append(out, "AO_RUN_FILE="+e.runFile, "AO_DATA_DIR="+e.dataDir, "AO_PORT="+port) +} + +func freePort(t *testing.T) int { + t.Helper() + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("alloc free port: %v", err) + } + defer l.Close() + return l.Addr().(*net.TCPAddr).Port +} + +// run executes `ao args...` in env e and returns combined output + exit code. +func (e env) run(t *testing.T, args ...string) (string, int) { + t.Helper() + return e.runEnv(t, e.environ(""), args...) +} + +func (e env) runEnv(t *testing.T, environ []string, args ...string) (string, int) { + t.Helper() + cmd := exec.Command(aoBin, args...) + cmd.Env = environ + b, err := cmd.CombinedOutput() + out := string(b) + code := 0 + if err != nil { + var ee *exec.ExitError + if asExit(err, &ee) { + code = ee.ExitCode() + } else { + t.Fatalf("run %v: %v\n%s", args, err, out) + } + } + t.Logf("$ ao %s\n%s(exit %d)", strings.Join(args, " "), out, code) + return out, code +} + +func asExit(err error, target **exec.ExitError) bool { + if ee, ok := err.(*exec.ExitError); ok { + *target = ee + return true + } + return false +} + +// startDaemon brings the daemon up and registers a stop on cleanup. +func (e env) startDaemon(t *testing.T) { + t.Helper() + out, code := e.run(t, "start") + if code != 0 { + t.Fatalf("start failed (exit %d): %s", code, out) + } + t.Cleanup(func() { e.run(t, "stop") }) +} + +func mustContain(t *testing.T, out, want string) { + t.Helper() + if !strings.Contains(out, want) { + t.Fatalf("expected output to contain %q; got:\n%s", want, out) + } +} + +func mustNotContain(t *testing.T, out, notWant string) { + t.Helper() + if strings.Contains(out, notWant) { + t.Fatalf("expected output NOT to contain %q; got:\n%s", notWant, out) + } +} + +// --------------------------------------------------------------------------- + +func TestE2E_VersionAndHelp(t *testing.T) { + e := newEnv(t) + + if out, code := e.run(t, "version"); code != 0 || strings.TrimSpace(out) == "" { + t.Fatalf("version: exit %d, out %q", code, out) + } + if _, code := e.run(t, "--version"); code != 0 { + t.Fatalf("--version exit %d", code) + } + + out, code := e.run(t, "--help") + if code != 0 { + t.Fatalf("--help exit %d", code) + } + for _, want := range []string{"start", "stop", "status", "doctor", "completion", "version"} { + mustContain(t, out, want) + } + // the internal daemon command is hidden from help (rendered as "\n daemon") + mustNotContain(t, out, "\n daemon") +} + +func TestE2E_DoctorDoesNotTouchTheStore(t *testing.T) { + e := newEnv(t) + + out, code := e.run(t, "doctor") + if code != 0 { + t.Fatalf("doctor (fresh) exit %d: %s", code, out) + } + mustContain(t, out, "git") + mustContain(t, out, "database not created yet") // sqlite WARN, never migrated + + // doctor must NOT create/migrate the DB — the daemon is the sole writer. + if _, err := os.Stat(filepath.Join(e.dataDir, "ao.db")); err == nil { + t.Fatal("doctor created ao.db; the CLI must not open/migrate the store") + } + + if out, code := e.run(t, "doctor", "--json"); code != 0 || !strings.Contains(out, `"ok": true`) { + t.Fatalf("doctor --json: exit %d, out %s", code, out) + } +} + +func TestE2E_StatusStopped(t *testing.T) { + e := newEnv(t) + out, code := e.run(t, "status", "--json") + if code != 0 { // status always exits 0 + t.Fatalf("status exit %d", code) + } + mustContain(t, out, `"state": "stopped"`) + mustNotContain(t, out, "startedAt") + + if out, code := e.run(t, "stop"); code != 0 || !strings.Contains(out, "stopped") { + t.Fatalf("stop-when-stopped: exit %d, out %s", code, out) // idempotent + } +} + +func TestE2E_Lifecycle(t *testing.T) { + e := newEnv(t) + e.startDaemon(t) + + out, _ := e.run(t, "status", "--json") + mustContain(t, out, `"state": "ready"`) + mustContain(t, out, fmt.Sprintf(`"port": %d`, e.port)) + + // idempotent + if out, code := e.run(t, "start"); code != 0 || !strings.Contains(out, "ready") { + t.Fatalf("idempotent start: exit %d, out %s", code, out) + } + + // now the daemon (not the CLI) has created + migrated the store + if _, err := os.Stat(filepath.Join(e.dataDir, "ao.db")); err != nil { + t.Fatalf("daemon should have created ao.db: %v", err) + } + out, _ = e.run(t, "doctor") + mustContain(t, out, "migrations are applied by the daemon") + + // /healthz identity + body := httpGet(t, e.port, "/healthz") + mustContain(t, body, "agent-orchestrator-daemon") + + if out, code := e.run(t, "stop"); code != 0 || !strings.Contains(out, "stopped") { + t.Fatalf("stop: exit %d, out %s", code, out) + } + if _, err := os.Stat(e.runFile); !os.IsNotExist(err) { + t.Fatal("run-file should be removed after stop") + } +} + +func TestE2E_ShutdownGuard(t *testing.T) { + e := newEnv(t) + e.startDaemon(t) + + // A cross-site Origin header must be rejected without stopping the daemon. + if code := postShutdown(t, e.port, func(r *http.Request) { r.Header.Set("Origin", "https://evil.example") }); code != http.StatusForbidden { + t.Fatalf("cross-origin /shutdown = %d, want 403", code) + } + // A non-loopback Host (DNS-rebinding) must be rejected too. + if code := postShutdown(t, e.port, func(r *http.Request) { r.Host = "evil.example" }); code != http.StatusForbidden { + t.Fatalf("rebinding-host /shutdown = %d, want 403", code) + } + // The daemon survived both. + out, _ := e.run(t, "status", "--json") + mustContain(t, out, `"state": "ready"`) +} + +func TestE2E_StaleRunFile(t *testing.T) { + e := newEnv(t) + // PID 2147483647 is never alive -> the CLI must classify this as stale. + content := fmt.Sprintf(`{"pid":2147483647,"port":%d,"startedAt":"2020-01-01T00:00:00Z"}`, e.port) + if err := os.MkdirAll(filepath.Dir(e.runFile), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(e.runFile, []byte(content), 0o644); err != nil { + t.Fatal(err) + } + + out, _ := e.run(t, "status", "--json") + mustContain(t, out, `"state": "stale"`) + + if out, code := e.run(t, "stop"); code != 0 || !strings.Contains(out, "stopped") { + t.Fatalf("stop stale: exit %d, out %s", code, out) + } + if _, err := os.Stat(e.runFile); !os.IsNotExist(err) { + t.Fatal("stale run-file should be removed") + } +} + +func TestE2E_ExitCodes(t *testing.T) { + e := newEnv(t) + + if _, code := e.run(t, "status", "--definitely-not-a-flag"); code != 2 { + t.Fatalf("bad flag exit %d, want 2", code) + } + if _, code := e.run(t, "completion"); code != 2 { // missing required arg + t.Fatalf("missing-arg exit %d, want 2", code) + } + if _, code := e.run(t, "completion", "notashell"); code == 0 { // runtime error + t.Fatal("unsupported shell should be non-zero") + } + // invalid config is a runtime error (1), not a usage error (2). + if _, code := e.runEnv(t, e.environ("notaport"), "status"); code != 1 { + t.Fatalf("invalid AO_PORT exit %d, want 1", code) + } +} + +func TestE2E_Completion(t *testing.T) { + e := newEnv(t) + for _, sh := range []string{"bash", "zsh", "fish", "powershell"} { + out, code := e.run(t, "completion", sh) + if code != 0 || strings.TrimSpace(out) == "" { + t.Fatalf("completion %s: exit %d, empty=%v", sh, code, strings.TrimSpace(out) == "") + } + } +} + +// --------------------------------------------------------------------------- +// HTTP helpers (loopback) + +func httpClient() *http.Client { return &http.Client{Timeout: 3 * time.Second} } + +func httpGet(t *testing.T, port int, path string) string { + t.Helper() + resp, err := httpClient().Get(fmt.Sprintf("http://127.0.0.1:%d%s", port, path)) + if err != nil { + t.Fatalf("GET %s: %v", path, err) + } + defer resp.Body.Close() + b := make([]byte, 4096) + n, _ := resp.Body.Read(b) + return string(b[:n]) +} + +// postShutdown issues POST /shutdown with mutator applied, returns the status code. +func postShutdown(t *testing.T, port int, mutate func(*http.Request)) int { + t.Helper() + req, err := http.NewRequest(http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/shutdown", port), nil) + if err != nil { + t.Fatal(err) + } + mutate(req) + resp, err := httpClient().Do(req) + if err != nil { + t.Fatalf("POST /shutdown: %v", err) + } + defer resp.Body.Close() + return resp.StatusCode +} diff --git a/backend/internal/cli/exitcode_test.go b/backend/internal/cli/exitcode_test.go new file mode 100644 index 00000000..bd8817c6 --- /dev/null +++ b/backend/internal/cli/exitcode_test.go @@ -0,0 +1,27 @@ +package cli + +import ( + "errors" + "fmt" + "testing" +) + +func TestExitCode(t *testing.T) { + cases := []struct { + name string + err error + want int + }{ + {"nil is success", nil, 0}, + {"runtime error is 1", errors.New("boom"), 1}, + {"usage error is 2", usageError{errors.New("bad flag")}, 2}, + {"wrapped usage error is still 2", fmt.Errorf("ctx: %w", usageError{errors.New("x")}), 2}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := ExitCode(tc.err); got != tc.want { + t.Errorf("ExitCode(%v) = %d, want %d", tc.err, got, tc.want) + } + }) + } +} diff --git a/backend/internal/cli/output.go b/backend/internal/cli/output.go new file mode 100644 index 00000000..df76e23e --- /dev/null +++ b/backend/internal/cli/output.go @@ -0,0 +1,12 @@ +package cli + +import ( + "encoding/json" + "io" +) + +func writeJSON(w io.Writer, v any) error { + enc := json.NewEncoder(w) + enc.SetIndent("", " ") + return enc.Encode(v) +} diff --git a/backend/internal/cli/process.go b/backend/internal/cli/process.go new file mode 100644 index 00000000..19c4d19f --- /dev/null +++ b/backend/internal/cli/process.go @@ -0,0 +1,34 @@ +package cli + +import ( + "os" + "os/exec" +) + +type processStartConfig struct { + Path string + Args []string + Env []string + Stdout *os.File + Stderr *os.File +} + +type processHandle struct { + PID int +} + +func startProcess(cfg processStartConfig) (processHandle, error) { + cmd := exec.Command(cfg.Path, cfg.Args...) + cmd.Env = cfg.Env + cmd.Stdout = cfg.Stdout + cmd.Stderr = cfg.Stderr + // Detach the daemon into its own session/process group so a Ctrl-C in the + // terminal where `ao start` is waiting for readiness doesn't also SIGINT the + // freshly spawned daemon (it would otherwise share the launcher's group). + cmd.SysProcAttr = detachSysProcAttr() + if err := cmd.Start(); err != nil { + return processHandle{}, err + } + go func() { _ = cmd.Wait() }() + return processHandle{PID: cmd.Process.Pid}, nil +} diff --git a/backend/internal/cli/process_unix.go b/backend/internal/cli/process_unix.go new file mode 100644 index 00000000..9963d9e9 --- /dev/null +++ b/backend/internal/cli/process_unix.go @@ -0,0 +1,23 @@ +//go:build !windows + +package cli + +import ( + "errors" + "syscall" +) + +func processAlive(pid int) bool { + if pid <= 0 { + return false + } + err := syscall.Kill(pid, 0) + return err == nil || errors.Is(err, syscall.EPERM) +} + +// detachSysProcAttr puts the daemon in a new session (Setsid) so it is no +// longer in the launcher's foreground process group and won't receive the +// terminal's SIGINT/SIGHUP. +func detachSysProcAttr() *syscall.SysProcAttr { + return &syscall.SysProcAttr{Setsid: true} +} diff --git a/backend/internal/cli/process_windows.go b/backend/internal/cli/process_windows.go new file mode 100644 index 00000000..3ff8190a --- /dev/null +++ b/backend/internal/cli/process_windows.go @@ -0,0 +1,36 @@ +//go:build windows + +package cli + +import ( + "errors" + "syscall" + + "golang.org/x/sys/windows" +) + +func processAlive(pid int) bool { + if pid <= 0 { + return false + } + handle, err := windows.OpenProcess(windows.SYNCHRONIZE, false, uint32(pid)) + if err != nil { + if errors.Is(err, windows.ERROR_ACCESS_DENIED) { + return true + } + return false + } + defer windows.CloseHandle(handle) + + status, err := windows.WaitForSingleObject(handle, 0) + if err != nil { + return false + } + return status == uint32(windows.WAIT_TIMEOUT) +} + +// detachSysProcAttr starts the daemon in a new process group so it does not +// receive the console's CTRL_C/CTRL_BREAK while `ao start` waits for readiness. +func detachSysProcAttr() *syscall.SysProcAttr { + return &syscall.SysProcAttr{CreationFlags: windows.CREATE_NEW_PROCESS_GROUP} +} diff --git a/backend/internal/cli/root.go b/backend/internal/cli/root.go new file mode 100644 index 00000000..36e83e5a --- /dev/null +++ b/backend/internal/cli/root.go @@ -0,0 +1,158 @@ +// Package cli implements the user-facing ao command. It stays thin: commands +// discover the local daemon, call its loopback HTTP API, and format output. +package cli + +import ( + "errors" + "io" + "net/http" + "os" + "os/exec" + "time" + + "github.com/spf13/cobra" + + "github.com/aoagents/agent-orchestrator/backend/internal/daemon" +) + +// Execute runs the ao CLI with process stdio. +func Execute() error { + return NewRootCommand(DefaultDeps()).Execute() +} + +// usageError marks a command-line misuse (bad flag, wrong arg count). It lets +// the process entrypoint return exit code 2 for usage errors versus 1 for +// runtime failures, matching the convention CLIs are scripted against. +type usageError struct{ err error } + +func (e usageError) Error() string { return e.err.Error() } +func (e usageError) Unwrap() error { return e.err } + +// ExitCode maps a CLI error to a process exit code: 2 for usage errors, 1 for +// any other failure, 0 for success. +func ExitCode(err error) int { + if err == nil { + return 0 + } + var ue usageError + if errors.As(err, &ue) { + return 2 + } + return 1 +} + +// Deps holds the small set of side effects the CLI needs. Tests replace these +// functions without reaching into process-global state. +type Deps struct { + In io.Reader + Out io.Writer + Err io.Writer + + HTTPClient *http.Client + Executable func() (string, error) + StartProcess func(processStartConfig) (processHandle, error) + ProcessAlive func(pid int) bool + LookPath func(file string) (string, error) + Now func() time.Time + Sleep func(time.Duration) +} + +// DefaultDeps returns production dependencies. +func DefaultDeps() Deps { + return Deps{ + In: os.Stdin, + Out: os.Stdout, + Err: os.Stderr, + HTTPClient: &http.Client{Timeout: 2 * time.Second}, + Executable: os.Executable, + StartProcess: startProcess, + ProcessAlive: processAlive, + LookPath: exec.LookPath, + Now: time.Now, + Sleep: time.Sleep, + } +} + +func (d Deps) withDefaults() Deps { + def := DefaultDeps() + if d.In == nil { + d.In = def.In + } + if d.Out == nil { + d.Out = def.Out + } + if d.Err == nil { + d.Err = def.Err + } + if d.HTTPClient == nil { + d.HTTPClient = def.HTTPClient + } + if d.Executable == nil { + d.Executable = def.Executable + } + if d.StartProcess == nil { + d.StartProcess = def.StartProcess + } + if d.ProcessAlive == nil { + d.ProcessAlive = def.ProcessAlive + } + if d.LookPath == nil { + d.LookPath = def.LookPath + } + if d.Now == nil { + d.Now = def.Now + } + if d.Sleep == nil { + d.Sleep = def.Sleep + } + return d +} + +// NewRootCommand builds a testable root command. +func NewRootCommand(deps Deps) *cobra.Command { + deps = deps.withDefaults() + ctx := &commandContext{deps: deps} + + root := &cobra.Command{ + Use: "ao", + Short: "Agent Orchestrator", + Long: "Agent Orchestrator manages the local daemon that supervises parallel coding-agent sessions.", + Version: VersionString(), + SilenceUsage: true, + SilenceErrors: true, + } + root.SetIn(deps.In) + root.SetOut(deps.Out) + root.SetErr(deps.Err) + root.CompletionOptions.DisableDefaultCmd = true + // Tag flag-parse failures as usage errors so the entrypoint can exit 2 for + // misuse versus 1 for runtime failures. Subcommands inherit this func. + root.SetFlagErrorFunc(func(_ *cobra.Command, err error) error { + return usageError{err} + }) + + root.AddCommand(newDaemonCommand()) + root.AddCommand(newStartCommand(ctx)) + root.AddCommand(newStopCommand(ctx)) + root.AddCommand(newStatusCommand(ctx)) + root.AddCommand(newDoctorCommand(ctx)) + root.AddCommand(newCompletionCommand()) + root.AddCommand(newVersionCommand()) + + return root +} + +type commandContext struct { + deps Deps +} + +func newDaemonCommand() *cobra.Command { + return &cobra.Command{ + Use: "daemon", + Short: "Run the AO backend daemon", + Hidden: true, + RunE: func(cmd *cobra.Command, args []string) error { + return daemon.Run() + }, + } +} diff --git a/backend/internal/cli/root_test.go b/backend/internal/cli/root_test.go new file mode 100644 index 00000000..5b920531 --- /dev/null +++ b/backend/internal/cli/root_test.go @@ -0,0 +1,385 @@ +package cli + +import ( + "bytes" + "fmt" + "net" + "net/http" + "net/http/httptest" + "net/url" + "os" + "path/filepath" + "strconv" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/daemonmeta" + "github.com/aoagents/agent-orchestrator/backend/internal/runfile" +) + +func TestRootHelpDoesNotShowDaemon(t *testing.T) { + out, _, err := executeCLI(t, Deps{}, "--help") + if err != nil { + t.Fatal(err) + } + if strings.Contains(out, "\n daemon") { + t.Fatalf("hidden daemon command leaked into help:\n%s", out) + } + for _, want := range []string{"start", "stop", "status", "doctor", "completion", "version"} { + if !strings.Contains(out, want) { + t.Fatalf("help missing %q:\n%s", want, out) + } + } +} + +func TestStatusStoppedJSON(t *testing.T) { + setConfigEnv(t) + + out, _, err := executeCLI(t, Deps{ProcessAlive: func(int) bool { return false }}, "status", "--json") + if err != nil { + t.Fatal(err) + } + if !strings.Contains(out, `"state": "stopped"`) { + t.Fatalf("status did not report stopped:\n%s", out) + } + if strings.Contains(out, "startedAt") { + t.Fatalf("stopped JSON should omit startedAt:\n%s", out) + } +} + +func TestStartReturnsExistingReadyDaemon(t *testing.T) { + cfg := setConfigEnv(t) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/healthz": + _, _ = fmt.Fprintf(w, `{"status":"ok","service":%q,"pid":%d}`, daemonmeta.ServiceName, os.Getpid()) + case "/readyz": + _, _ = fmt.Fprintf(w, `{"status":"ready","service":%q,"pid":%d}`, daemonmeta.ServiceName, os.Getpid()) + default: + http.NotFound(w, r) + } + })) + defer srv.Close() + + port := serverPort(t, srv.URL) + if err := runfile.Write(cfg.runFile, runfile.Info{PID: os.Getpid(), Port: port, StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + var started bool + out, _, err := executeCLI(t, Deps{ + ProcessAlive: func(pid int) bool { return pid == os.Getpid() }, + StartProcess: func(processStartConfig) (processHandle, error) { + started = true + return processHandle{}, nil + }, + Now: func() time.Time { return time.Unix(110, 0).UTC() }, + }, "start", "--json") + if err != nil { + t.Fatal(err) + } + if started { + t.Fatal("start should not spawn when daemon is already ready") + } + if !strings.Contains(out, `"state": "ready"`) { + t.Fatalf("start did not report ready:\n%s", out) + } +} + +func TestStartClearsStaleRunFileBeforeSpawning(t *testing.T) { + cfg := setConfigEnv(t) + var spawned atomic.Bool + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if !spawned.Load() { + _, _ = fmt.Fprintf(w, `{"status":"ok","service":"not-ao","pid":4242}`) + return + } + switch r.URL.Path { + case "/healthz": + _, _ = fmt.Fprintf(w, `{"status":"ok","service":%q,"pid":%d}`, daemonmeta.ServiceName, os.Getpid()) + case "/readyz": + _, _ = fmt.Fprintf(w, `{"status":"ready","service":%q,"pid":%d}`, daemonmeta.ServiceName, os.Getpid()) + default: + http.NotFound(w, r) + } + })) + defer srv.Close() + + port := serverPort(t, srv.URL) + if err := runfile.Write(cfg.runFile, runfile.Info{PID: 4242, Port: port, StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + out, _, err := executeCLI(t, Deps{ + ProcessAlive: func(pid int) bool { return pid == 4242 || pid == os.Getpid() }, + StartProcess: func(processStartConfig) (processHandle, error) { + info, err := runfile.Read(cfg.runFile) + if err != nil { + t.Fatal(err) + } + if info != nil { + t.Fatalf("stale run-file was not removed before spawn: %#v", info) + } + spawned.Store(true) + if err := runfile.Write(cfg.runFile, runfile.Info{PID: os.Getpid(), Port: port, StartedAt: time.Unix(110, 0).UTC()}); err != nil { + t.Fatal(err) + } + return processHandle{PID: os.Getpid()}, nil + }, + Now: func() time.Time { return time.Unix(120, 0).UTC() }, + }, "start", "--json") + if err != nil { + t.Fatal(err) + } + if !strings.Contains(out, `"state": "ready"`) { + t.Fatalf("start did not report ready after clearing stale run-file:\n%s", out) + } +} + +func TestStopRemovesStaleRunFile(t *testing.T) { + cfg := setConfigEnv(t) + if err := runfile.Write(cfg.runFile, runfile.Info{PID: 999999, Port: 3001, StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + out, _, err := executeCLI(t, Deps{ProcessAlive: func(int) bool { return false }}, "stop", "--json") + if err != nil { + t.Fatal(err) + } + if !strings.Contains(out, `"state": "stopped"`) { + t.Fatalf("stop did not report stopped:\n%s", out) + } + info, err := runfile.Read(cfg.runFile) + if err != nil { + t.Fatal(err) + } + if info != nil { + t.Fatalf("stale run-file was not removed: %#v", info) + } +} + +func TestStopDoesNotShutdownUnverifiedReusedPID(t *testing.T) { + cfg := setConfigEnv(t) + shutdownCalled := make(chan struct{}, 1) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/healthz": + _, _ = w.Write([]byte(`{"status":"ok"}`)) + case "/readyz": + _, _ = w.Write([]byte(`{"status":"ready"}`)) + case "/shutdown": + shutdownCalled <- struct{}{} + http.Error(w, "unexpected shutdown", http.StatusInternalServerError) + default: + http.NotFound(w, r) + } + })) + defer srv.Close() + + if err := runfile.Write(cfg.runFile, runfile.Info{PID: 4242, Port: serverPort(t, srv.URL), StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + out, _, err := executeCLI(t, Deps{ + ProcessAlive: func(pid int) bool { return pid == 4242 }, + }, "stop", "--json") + if err != nil { + t.Fatal(err) + } + select { + case <-shutdownCalled: + t.Fatal("stop requested shutdown from a process whose health probe did not prove AO daemon ownership") + default: + } + if !strings.Contains(out, `"state": "stopped"`) { + t.Fatalf("stop did not report stopped:\n%s", out) + } + info, err := runfile.Read(cfg.runFile) + if err != nil { + t.Fatal(err) + } + if info != nil { + t.Fatalf("unverified run-file was not removed: %#v", info) + } +} + +func TestStopUsesShutdownEndpoint(t *testing.T) { + cfg := setConfigEnv(t) + shutdownCalled := make(chan struct{}, 1) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/healthz": + _, _ = fmt.Fprintf(w, `{"status":"ok","service":%q,"pid":%d}`, daemonmeta.ServiceName, os.Getpid()) + case "/readyz": + _, _ = fmt.Fprintf(w, `{"status":"ready","service":%q,"pid":%d}`, daemonmeta.ServiceName, os.Getpid()) + case "/shutdown": + if err := runfile.Remove(cfg.runFile); err != nil { + t.Fatal(err) + } + shutdownCalled <- struct{}{} + _, _ = fmt.Fprintf(w, `{"status":"shutting_down","service":%q,"pid":%d}`, daemonmeta.ServiceName, os.Getpid()) + default: + http.NotFound(w, r) + } + })) + defer srv.Close() + + if err := runfile.Write(cfg.runFile, runfile.Info{PID: os.Getpid(), Port: serverPort(t, srv.URL), StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + out, _, err := executeCLI(t, Deps{ + ProcessAlive: func(pid int) bool { return pid == os.Getpid() }, + }, "stop", "--json") + if err != nil { + t.Fatal(err) + } + select { + case <-shutdownCalled: + default: + t.Fatal("stop did not call daemon shutdown endpoint") + } + if !strings.Contains(out, `"state": "stopped"`) { + t.Fatalf("stop did not report stopped:\n%s", out) + } +} + +func TestStatusKeepsLiveProbeFailureUnhealthy(t *testing.T) { + cfg := setConfigEnv(t) + if err := runfile.Write(cfg.runFile, runfile.Info{PID: 4242, Port: closedPort(t), StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + out, _, err := executeCLI(t, Deps{ + ProcessAlive: func(pid int) bool { return pid == 4242 }, + }, "status", "--json") + if err != nil { + t.Fatal(err) + } + if !strings.Contains(out, `"state": "unhealthy"`) { + t.Fatalf("status should keep live probe failures unhealthy:\n%s", out) + } + info, err := runfile.Read(cfg.runFile) + if err != nil { + t.Fatal(err) + } + if info == nil { + t.Fatal("live probe failure should not remove run-file") + } +} + +func TestStopRefusesUnverifiedLivePID(t *testing.T) { + cfg := setConfigEnv(t) + if err := runfile.Write(cfg.runFile, runfile.Info{PID: 4242, Port: closedPort(t), StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + _, _, err := executeCLI(t, Deps{ + ProcessAlive: func(pid int) bool { return pid == 4242 }, + }, "stop", "--json") + if err == nil { + t.Fatal("stop should fail when daemon ownership cannot be verified") + } + info, err := runfile.Read(cfg.runFile) + if err != nil { + t.Fatal(err) + } + if info == nil { + t.Fatal("unverified live PID should remain tracked") + } +} + +func TestStartDoesNotSpawnWhenLiveProbeFails(t *testing.T) { + cfg := setConfigEnv(t) + if err := runfile.Write(cfg.runFile, runfile.Info{PID: 4242, Port: closedPort(t), StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + var started bool + _, _, err := executeCLI(t, Deps{ + ProcessAlive: func(pid int) bool { return pid == 4242 }, + StartProcess: func(processStartConfig) (processHandle, error) { + started = true + return processHandle{}, nil + }, + }, "start", "--timeout", "1ns", "--json") + if err == nil { + t.Fatal("start should fail instead of spawning over a live unverified PID") + } + if started { + t.Fatal("start spawned while run-file PID was still alive") + } +} + +type testConfig struct { + runFile string + dataDir string +} + +func setConfigEnv(t *testing.T) testConfig { + t.Helper() + dir := t.TempDir() + cfg := testConfig{ + runFile: filepath.Join(dir, "running.json"), + dataDir: filepath.Join(dir, "data"), + } + t.Setenv("AO_RUN_FILE", cfg.runFile) + t.Setenv("AO_DATA_DIR", cfg.dataDir) + t.Setenv("AO_PORT", "3001") + t.Setenv("AO_REQUEST_TIMEOUT", "") + t.Setenv("AO_SHUTDOWN_TIMEOUT", "") + return cfg +} + +func executeCLI(t *testing.T, deps Deps, args ...string) (string, string, error) { + t.Helper() + var out, errOut bytes.Buffer + deps.Out = &out + deps.Err = &errOut + if deps.Sleep == nil { + deps.Sleep = func(time.Duration) {} + } + cmd := NewRootCommand(deps) + cmd.SetArgs(args) + err := cmd.Execute() + return out.String(), errOut.String(), err +} + +func serverPort(t *testing.T, raw string) int { + t.Helper() + u, err := url.Parse(raw) + if err != nil { + t.Fatal(err) + } + _, portRaw, err := net.SplitHostPort(u.Host) + if err != nil { + t.Fatal(err) + } + port, err := strconv.Atoi(portRaw) + if err != nil { + t.Fatal(err) + } + return port +} + +func closedPort(t *testing.T) int { + t.Helper() + ln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatal(err) + } + defer ln.Close() + + _, portRaw, err := net.SplitHostPort(ln.Addr().String()) + if err != nil { + t.Fatal(err) + } + port, err := strconv.Atoi(portRaw) + if err != nil { + t.Fatal(err) + } + return port +} diff --git a/backend/internal/cli/start.go b/backend/internal/cli/start.go new file mode 100644 index 00000000..0787eba9 --- /dev/null +++ b/backend/internal/cli/start.go @@ -0,0 +1,144 @@ +package cli + +import ( + "context" + "fmt" + "os" + "path/filepath" + "time" + + "github.com/spf13/cobra" + + "github.com/aoagents/agent-orchestrator/backend/internal/config" + "github.com/aoagents/agent-orchestrator/backend/internal/runfile" +) + +const defaultStartTimeout = 10 * time.Second + +type startOptions struct { + timeout time.Duration + logFile string + json bool +} + +func newStartCommand(ctx *commandContext) *cobra.Command { + opts := startOptions{timeout: defaultStartTimeout} + cmd := &cobra.Command{ + Use: "start", + Short: "Start the AO daemon", + RunE: func(cmd *cobra.Command, args []string) error { + st, err := ctx.startDaemon(cmd.Context(), opts) + if err != nil { + return err + } + if opts.json { + return writeJSON(cmd.OutOrStdout(), st) + } + if st.State == "ready" { + _, err = fmt.Fprintf(cmd.OutOrStdout(), "AO daemon ready (pid %d, port %d)\n", st.PID, st.Port) + return err + } + return writeStatus(cmd, st) + }, + } + cmd.Flags().DurationVar(&opts.timeout, "timeout", defaultStartTimeout, "How long to wait for daemon readiness") + cmd.Flags().StringVar(&opts.logFile, "log-file", "", "Daemon log file path") + cmd.Flags().BoolVar(&opts.json, "json", false, "Output start result as JSON") + return cmd +} + +func (c *commandContext) startDaemon(ctx context.Context, opts startOptions) (daemonStatus, error) { + cfg, err := config.Load() + if err != nil { + return daemonStatus{}, err + } + + st, err := c.inspectDaemon(ctx) + if err != nil { + return daemonStatus{}, err + } + if st.State == "ready" { + return st, nil + } + if st.State != "stopped" && st.State != "stale" { + ready, waitErr := c.waitForReady(ctx, opts.timeout) + if waitErr == nil { + return ready, nil + } + return daemonStatus{}, fmt.Errorf("daemon process exists but did not become ready: %w", waitErr) + } + if st.State == "stale" { + if err := runfile.Remove(cfg.RunFilePath); err != nil { + return daemonStatus{}, err + } + } + + exe, err := c.deps.Executable() + if err != nil { + return daemonStatus{}, fmt.Errorf("resolve executable: %w", err) + } + + logPath := opts.logFile + if logPath == "" { + logPath = filepath.Join(filepath.Dir(cfg.RunFilePath), "daemon.log") + } + if err := os.MkdirAll(filepath.Dir(logPath), 0o755); err != nil { + return daemonStatus{}, fmt.Errorf("create log dir: %w", err) + } + logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644) + if err != nil { + return daemonStatus{}, fmt.Errorf("open daemon log: %w", err) + } + defer logFile.Close() + + if _, err := c.deps.StartProcess(processStartConfig{ + Path: exe, + Args: []string{"daemon"}, + Env: os.Environ(), + Stdout: logFile, + Stderr: logFile, + }); err != nil { + return daemonStatus{}, fmt.Errorf("start daemon: %w", err) + } + + ready, err := c.waitForReady(ctx, opts.timeout) + if err != nil { + return daemonStatus{}, fmt.Errorf("%w; see daemon log %s", err, logPath) + } + return ready, nil +} + +func (c *commandContext) waitForReady(ctx context.Context, timeout time.Duration) (daemonStatus, error) { + if timeout <= 0 { + timeout = defaultStartTimeout + } + deadline := c.deps.Now().Add(timeout) + var last daemonStatus + var lastErr error + + for { + select { + case <-ctx.Done(): + return daemonStatus{}, ctx.Err() + default: + } + + st, err := c.inspectDaemon(ctx) + if err != nil { + lastErr = err + } else { + last = st + if st.State == "ready" { + return st, nil + } + } + + if !c.deps.Now().Before(deadline) { + if lastErr != nil { + return daemonStatus{}, fmt.Errorf("daemon did not become ready within %s: %w", timeout, lastErr) + } + return daemonStatus{}, fmt.Errorf("daemon did not become ready within %s (last state: %s)", timeout, last.State) + } + c.deps.Sleep(100 * time.Millisecond) + } +} diff --git a/backend/internal/cli/status.go b/backend/internal/cli/status.go new file mode 100644 index 00000000..85cbf5ac --- /dev/null +++ b/backend/internal/cli/status.go @@ -0,0 +1,219 @@ +package cli + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "time" + + "github.com/spf13/cobra" + + "github.com/aoagents/agent-orchestrator/backend/internal/config" + "github.com/aoagents/agent-orchestrator/backend/internal/daemonmeta" + "github.com/aoagents/agent-orchestrator/backend/internal/runfile" +) + +const probeTimeout = 2 * time.Second + +type statusOptions struct { + json bool +} + +type daemonStatus struct { + State string `json:"state"` + PID int `json:"pid,omitempty"` + Port int `json:"port,omitempty"` + StartedAt *time.Time `json:"startedAt,omitempty"` + Uptime string `json:"uptime,omitempty"` + RunFile string `json:"runFile"` + DataDir string `json:"dataDir"` + Health string `json:"health,omitempty"` + Ready string `json:"ready,omitempty"` + Error string `json:"error,omitempty"` + owned bool +} + +type probeResult struct { + Status string `json:"status"` + Service string `json:"service"` + PID int `json:"pid"` +} + +func newStatusCommand(ctx *commandContext) *cobra.Command { + var opts statusOptions + cmd := &cobra.Command{ + Use: "status", + Short: "Show AO daemon status", + RunE: func(cmd *cobra.Command, args []string) error { + st, err := ctx.inspectDaemon(cmd.Context()) + if err != nil { + return err + } + if opts.json { + return writeJSON(cmd.OutOrStdout(), st) + } + return writeStatus(cmd, st) + }, + } + cmd.Flags().BoolVar(&opts.json, "json", false, "Output status as JSON") + return cmd +} + +func (c *commandContext) inspectDaemon(ctx context.Context) (daemonStatus, error) { + cfg, err := config.Load() + if err != nil { + return daemonStatus{}, err + } + st := daemonStatus{State: "stopped", RunFile: cfg.RunFilePath, DataDir: cfg.DataDir} + + info, err := runfile.Read(cfg.RunFilePath) + if err != nil { + return daemonStatus{}, err + } + if info == nil { + return st, nil + } + + st.PID = info.PID + st.Port = info.Port + startedAt := info.StartedAt + st.StartedAt = &startedAt + st.Uptime = formatUptime(c.deps.Now().Sub(info.StartedAt)) + + if !c.deps.ProcessAlive(info.PID) { + st.State = "stale" + st.Error = "run-file points to a dead process" + return st, nil + } + + health, err := c.readProbe(ctx, info.Port, "healthz") + if err != nil { + st.State = "unhealthy" + st.Error = err.Error() + return st, nil + } + if err := verifyProbeOwner(health, info.PID, "healthz"); err != nil { + st.State = "stale" + st.Error = err.Error() + return st, nil + } + st.owned = true + st.Health = health.Status + if health.Status != "ok" { + st.State = "unhealthy" + return st, nil + } + + ready, err := c.readProbe(ctx, info.Port, "readyz") + if err != nil { + st.State = "not_ready" + st.Error = err.Error() + return st, nil + } + if err := verifyProbeOwner(ready, info.PID, "readyz"); err != nil { + st.State = "stale" + st.owned = false + st.Error = err.Error() + return st, nil + } + st.Ready = ready.Status + if ready.Status == "ready" { + st.State = "ready" + return st, nil + } + st.State = "not_ready" + return st, nil +} + +func (c *commandContext) readProbe(ctx context.Context, port int, path string) (probeResult, error) { + reqCtx, cancel := context.WithTimeout(ctx, probeTimeout) + defer cancel() + + req, err := http.NewRequestWithContext(reqCtx, http.MethodGet, fmt.Sprintf("http://%s:%d/%s", config.LoopbackHost, port, path), nil) + if err != nil { + return probeResult{}, err + } + resp, err := c.deps.HTTPClient.Do(req) + if err != nil { + return probeResult{}, fmt.Errorf("%s: %w", path, err) + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return probeResult{}, fmt.Errorf("%s: HTTP %d", path, resp.StatusCode) + } + var body probeResult + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + return probeResult{}, fmt.Errorf("%s: decode response: %w", path, err) + } + if body.Status == "" { + return probeResult{}, fmt.Errorf("%s: missing status", path) + } + return body, nil +} + +func verifyProbeOwner(probe probeResult, wantPID int, path string) error { + if probe.Service != daemonmeta.ServiceName { + return fmt.Errorf("%s: response is not from AO daemon", path) + } + if probe.PID != wantPID { + return fmt.Errorf("%s: daemon pid %d does not match run-file pid %d", path, probe.PID, wantPID) + } + return nil +} + +func writeStatus(cmd *cobra.Command, st daemonStatus) error { + out := cmd.OutOrStdout() + if _, err := fmt.Fprintf(out, "AO daemon: %s\n", st.State); err != nil { + return err + } + if st.PID != 0 { + if _, err := fmt.Fprintf(out, " pid: %d\n", st.PID); err != nil { + return err + } + } + if st.Port != 0 { + if _, err := fmt.Fprintf(out, " port: %d\n", st.Port); err != nil { + return err + } + } + if st.StartedAt != nil && !st.StartedAt.IsZero() { + if _, err := fmt.Fprintf(out, " started: %s\n", st.StartedAt.Format(time.RFC3339)); err != nil { + return err + } + } + if st.Uptime != "" { + if _, err := fmt.Fprintf(out, " uptime: %s\n", st.Uptime); err != nil { + return err + } + } + if _, err := fmt.Fprintf(out, " run file: %s\n", st.RunFile); err != nil { + return err + } + if _, err := fmt.Fprintf(out, " data dir: %s\n", st.DataDir); err != nil { + return err + } + if st.Health != "" { + if _, err := fmt.Fprintf(out, " healthz: %s\n", st.Health); err != nil { + return err + } + } + if st.Ready != "" { + if _, err := fmt.Fprintf(out, " readyz: %s\n", st.Ready); err != nil { + return err + } + } + if st.Error != "" { + if _, err := fmt.Fprintf(out, " error: %s\n", st.Error); err != nil { + return err + } + } + return nil +} + +func formatUptime(d time.Duration) string { + if d < 0 { + d = 0 + } + return d.Round(time.Second).String() +} diff --git a/backend/internal/cli/stop.go b/backend/internal/cli/stop.go new file mode 100644 index 00000000..9b00c1c4 --- /dev/null +++ b/backend/internal/cli/stop.go @@ -0,0 +1,134 @@ +package cli + +import ( + "context" + "fmt" + "net/http" + "time" + + "github.com/spf13/cobra" + + "github.com/aoagents/agent-orchestrator/backend/internal/config" + "github.com/aoagents/agent-orchestrator/backend/internal/runfile" +) + +const defaultStopTimeout = 10 * time.Second + +type stopOptions struct { + timeout time.Duration + json bool +} + +func newStopCommand(ctx *commandContext) *cobra.Command { + opts := stopOptions{timeout: defaultStopTimeout} + cmd := &cobra.Command{ + Use: "stop", + Short: "Stop the AO daemon", + RunE: func(cmd *cobra.Command, args []string) error { + st, err := ctx.stopDaemon(cmd.Context(), opts) + if err != nil { + return err + } + if opts.json { + return writeJSON(cmd.OutOrStdout(), st) + } + if st.State == "stopped" { + _, err = fmt.Fprintln(cmd.OutOrStdout(), "AO daemon stopped") + return err + } + return writeStatus(cmd, st) + }, + } + cmd.Flags().DurationVar(&opts.timeout, "timeout", defaultStopTimeout, "How long to wait for daemon shutdown") + cmd.Flags().BoolVar(&opts.json, "json", false, "Output stop result as JSON") + return cmd +} + +func (c *commandContext) stopDaemon(ctx context.Context, opts stopOptions) (daemonStatus, error) { + cfg, err := config.Load() + if err != nil { + return daemonStatus{}, err + } + st, err := c.inspectDaemon(ctx) + if err != nil { + return daemonStatus{}, err + } + switch st.State { + case "stopped": + return st, nil + case "stale": + if err := runfile.Remove(cfg.RunFilePath); err != nil { + return daemonStatus{}, err + } + return daemonStatus{State: "stopped", RunFile: cfg.RunFilePath, DataDir: cfg.DataDir}, nil + } + if !st.owned { + if st.Error != "" { + return daemonStatus{}, fmt.Errorf("daemon pid %d is alive but ownership could not be verified: %s", st.PID, st.Error) + } + return daemonStatus{}, fmt.Errorf("daemon pid %d is alive but ownership could not be verified", st.PID) + } + + if err := c.requestShutdown(ctx, st.Port); err != nil { + return daemonStatus{}, fmt.Errorf("request daemon shutdown: %w", err) + } + return c.waitForStopped(ctx, st.PID, cfg.RunFilePath, cfg.DataDir, opts.timeout) +} + +func (c *commandContext) requestShutdown(ctx context.Context, port int) error { + reqCtx, cancel := context.WithTimeout(ctx, probeTimeout) + defer cancel() + + req, err := http.NewRequestWithContext(reqCtx, http.MethodPost, fmt.Sprintf("http://%s:%d/shutdown", config.LoopbackHost, port), nil) + if err != nil { + return err + } + resp, err := c.deps.HTTPClient.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return fmt.Errorf("HTTP %d", resp.StatusCode) + } + return nil +} + +func (c *commandContext) waitForStopped(ctx context.Context, pid int, runFilePath, dataDir string, timeout time.Duration) (daemonStatus, error) { + if timeout <= 0 { + timeout = defaultStopTimeout + } + deadline := c.deps.Now().Add(timeout) + for { + select { + case <-ctx.Done(): + return daemonStatus{}, ctx.Err() + default: + } + + info, err := runfile.Read(runFilePath) + if err != nil { + return daemonStatus{}, err + } + alive := c.deps.ProcessAlive(pid) + if info == nil { + return daemonStatus{State: "stopped", RunFile: runFilePath, DataDir: dataDir}, nil + } + if !alive { + // Only remove the run-file if it still belongs to the process we + // stopped. A concurrent `ao start` may have already written a new + // run-file for a different daemon; removing that would corrupt its + // handshake and make a live daemon look stopped. + if info.PID == pid { + if err := runfile.Remove(runFilePath); err != nil { + return daemonStatus{}, err + } + } + return daemonStatus{State: "stopped", RunFile: runFilePath, DataDir: dataDir}, nil + } + if !c.deps.Now().Before(deadline) { + return daemonStatus{}, fmt.Errorf("daemon pid %d did not stop within %s", pid, timeout) + } + c.deps.Sleep(100 * time.Millisecond) + } +} diff --git a/backend/internal/cli/stop_test.go b/backend/internal/cli/stop_test.go new file mode 100644 index 00000000..85b6a509 --- /dev/null +++ b/backend/internal/cli/stop_test.go @@ -0,0 +1,83 @@ +package cli + +import ( + "context" + "path/filepath" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/runfile" +) + +// TestWaitForStoppedKeepsRunFileFromConcurrentStart guards against deleting a +// fresh daemon's handshake: if a concurrent `ao start` replaces running.json +// with a new live PID while we are polling the PID we stopped, waitForStopped +// must report stopped but leave the new run-file intact. +func TestWaitForStoppedKeepsRunFileFromConcurrentStart(t *testing.T) { + dir := t.TempDir() + runFile := filepath.Join(dir, "running.json") + + const stoppedPID, newPID = 1111, 2222 + // running.json now belongs to a different, live daemon. + if err := runfile.Write(runFile, runfile.Info{PID: newPID, Port: 3001, StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + c := &commandContext{deps: Deps{ + ProcessAlive: func(pid int) bool { return pid == newPID }, // stoppedPID is dead + Now: func() time.Time { return time.Unix(200, 0).UTC() }, + Sleep: func(time.Duration) {}, + }.withDefaults()} + + st, err := c.waitForStopped(context.Background(), stoppedPID, runFile, dir, time.Second) + if err != nil { + t.Fatal(err) + } + if st.State != "stopped" { + t.Fatalf("state = %q, want stopped", st.State) + } + + info, err := runfile.Read(runFile) + if err != nil { + t.Fatal(err) + } + if info == nil { + t.Fatal("new daemon's run-file was deleted by stop of a different PID") + } + if info.PID != newPID { + t.Fatalf("run-file PID = %d, want %d (new daemon)", info.PID, newPID) + } +} + +// TestWaitForStoppedRemovesOwnRunFile confirms the normal path still cleans up: +// when the dead PID owns the run-file, it is removed. +func TestWaitForStoppedRemovesOwnRunFile(t *testing.T) { + dir := t.TempDir() + runFile := filepath.Join(dir, "running.json") + + const stoppedPID = 1111 + if err := runfile.Write(runFile, runfile.Info{PID: stoppedPID, Port: 3001, StartedAt: time.Unix(100, 0).UTC()}); err != nil { + t.Fatal(err) + } + + c := &commandContext{deps: Deps{ + ProcessAlive: func(int) bool { return false }, + Now: func() time.Time { return time.Unix(200, 0).UTC() }, + Sleep: func(time.Duration) {}, + }.withDefaults()} + + st, err := c.waitForStopped(context.Background(), stoppedPID, runFile, dir, time.Second) + if err != nil { + t.Fatal(err) + } + if st.State != "stopped" { + t.Fatalf("state = %q, want stopped", st.State) + } + info, err := runfile.Read(runFile) + if err != nil { + t.Fatal(err) + } + if info != nil { + t.Fatalf("own run-file should have been removed, got %#v", info) + } +} diff --git a/backend/internal/cli/version.go b/backend/internal/cli/version.go new file mode 100644 index 00000000..dd8a2598 --- /dev/null +++ b/backend/internal/cli/version.go @@ -0,0 +1,37 @@ +package cli + +import ( + "fmt" + "strings" + + "github.com/spf13/cobra" +) + +// Build metadata. Release tooling can override these with -ldflags. +var ( + Version = "dev" + Commit = "" + Date = "" +) + +func VersionString() string { + parts := []string{Version} + if Commit != "" { + parts = append(parts, "commit "+Commit) + } + if Date != "" { + parts = append(parts, "built "+Date) + } + return strings.Join(parts, " ") +} + +func newVersionCommand() *cobra.Command { + return &cobra.Command{ + Use: "version", + Short: "Print version information", + RunE: func(cmd *cobra.Command, args []string) error { + _, err := fmt.Fprintln(cmd.OutOrStdout(), VersionString()) + return err + }, + } +} diff --git a/backend/cdc_wiring.go b/backend/internal/daemon/cdc_wiring.go similarity index 99% rename from backend/cdc_wiring.go rename to backend/internal/daemon/cdc_wiring.go index d824cbab..a76c5c78 100644 --- a/backend/cdc_wiring.go +++ b/backend/internal/daemon/cdc_wiring.go @@ -1,4 +1,4 @@ -package main +package daemon import ( "context" diff --git a/backend/internal/daemon/daemon.go b/backend/internal/daemon/daemon.go new file mode 100644 index 00000000..556fe5f0 --- /dev/null +++ b/backend/internal/daemon/daemon.go @@ -0,0 +1,126 @@ +// Package daemon owns the Agent Orchestrator backend process: config loading, +// loopback HTTP serving, durable storage, CDC fan-out, lifecycle wiring, and +// graceful shutdown. +package daemon + +import ( + "context" + "fmt" + "log/slog" + "os" + "os/signal" + "syscall" + + "github.com/aoagents/agent-orchestrator/backend/internal/adapters/runtime/tmux" + "github.com/aoagents/agent-orchestrator/backend/internal/config" + "github.com/aoagents/agent-orchestrator/backend/internal/httpd" + "github.com/aoagents/agent-orchestrator/backend/internal/runfile" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" + "github.com/aoagents/agent-orchestrator/backend/internal/terminal" +) + +// Run starts the daemon and blocks until it exits. SIGINT/SIGTERM drive +// graceful shutdown through the HTTP server and background workers. +func Run() error { + cfg, err := config.Load() + if err != nil { + return err + } + + log := newLogger() + + // Fail fast if a live daemon already owns the handshake file. A run-file + // left by a crashed predecessor (dead PID) is treated as stale and + // overwritten when the new server starts. + if live, err := runfile.CheckStale(cfg.RunFilePath); err != nil { + return fmt.Errorf("inspect run-file: %w", err) + } else if live != nil { + return fmt.Errorf("daemon already running (pid %d, port %d); refusing to start", live.PID, live.Port) + } + + // Open the durable store and bring up the CDC substrate: the DB triggers + // capture changes into change_log, the poller tails it, and the broadcaster + // fans events out to the SSE transport. The LCM/Session Manager and the HTTP + // API routes that drive and read this store are owned by the daemon lane and + // are wired there once their collaborators (Notifier, AgentMessenger, and the + // runtime/agent/workspace plugins) have production implementations; here we + // stand up the persistence + change-delivery foundation they build on. + store, err := sqlite.Open(cfg.DataDir) + if err != nil { + return fmt.Errorf("open store: %w", err) + } + defer store.Close() + + // signal.NotifyContext cancels ctx on SIGINT/SIGTERM, which drives the + // graceful shutdown inside Server.Run and stops the background goroutines. + ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer stop() + + cdcPipe, err := startCDC(ctx, store, log) + if err != nil { + return err + } + + // Terminal streaming: the tmux runtime supplies the PTY-attach command and + // liveness; the CDC broadcaster feeds the session-state channel. The manager + // is handed to httpd, which mounts it at /mux. Raw PTY bytes never flow + // through the CDC change_log — only session-state events do. + runtimeAdapter := tmux.New(tmux.Options{}) + termMgr := terminal.NewManager(runtimeAdapter, cdcPipe.Broadcaster, log) + defer termMgr.Close() + + srv, err := httpd.New(cfg, log, termMgr) + if err != nil { + return err + } + + // Bring up the Lifecycle Manager (sole store writer) and the reaper (OBSERVE + // timer). This makes the write path live end-to-end: LCM write -> store -> DB + // trigger -> change_log -> poller -> broadcaster. + lcStack, err := startLifecycle(ctx, store, log) + if err != nil { + return err + } + + // Bring up the Session Manager. Runtime (tmux) and Workspace (gitworktree) + // are real on main; ports.Agent has no production adapter yet, so a loud + // stub returns a sentinel command that makes any Spawn fail at the runtime + // layer rather than start a broken session quietly. Notifier and + // AgentMessenger remain stubbed alongside the LCM until their multiplexers + // land. No HTTP routes wire to this yet — the daemon lane (#10) owns API + // surfacing — so we hold the SM in a local until it does. + sStack, err := startSession(ctx, cfg, lcStack, log) + if err != nil { + // startSession is the first start* call after this point that can + // realistically fail while the cdc poller and the reaper are already + // running. Mirror the bottom-of-run shutdown sequence so both have + // drained before the deferred store.Close() fires. Defers would hit + // the LIFO trap (see comment after srv.Run), hence explicit. + stop() + lcStack.Stop() + if cdcErr := cdcPipe.Stop(); cdcErr != nil { + log.Error("cdc pipeline shutdown", "err", cdcErr) + } + return err + } + _ = sStack + + runErr := srv.Run(ctx) + + // Shut the background goroutines down in order: cancel the context FIRST so + // their loops exit, then wait for them to drain. Doing this explicitly (not + // via defer) avoids the LIFO trap where a Stop() that blocks on ctx-cancel + // runs before the cancel — which would hang any non-signal exit path. + stop() + lcStack.Stop() + if err := cdcPipe.Stop(); err != nil { + log.Error("cdc pipeline shutdown", "err", err) + } + return runErr +} + +// newLogger returns the daemon's slog logger. It writes to stderr so supervisors +// can capture it separately from any structured stdout protocol added later. +func newLogger() *slog.Logger { + return slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelDebug})) +} diff --git a/backend/lifecycle_wiring.go b/backend/internal/daemon/lifecycle_wiring.go similarity index 99% rename from backend/lifecycle_wiring.go rename to backend/internal/daemon/lifecycle_wiring.go index 35eac385..5a791054 100644 --- a/backend/lifecycle_wiring.go +++ b/backend/internal/daemon/lifecycle_wiring.go @@ -1,4 +1,4 @@ -package main +package daemon import ( "context" diff --git a/backend/wiring_test.go b/backend/internal/daemon/wiring_test.go similarity index 99% rename from backend/wiring_test.go rename to backend/internal/daemon/wiring_test.go index 6a372c6c..c2cfb721 100644 --- a/backend/wiring_test.go +++ b/backend/internal/daemon/wiring_test.go @@ -1,4 +1,4 @@ -package main +package daemon import ( "context" diff --git a/backend/internal/daemonmeta/meta.go b/backend/internal/daemonmeta/meta.go new file mode 100644 index 00000000..72f322e5 --- /dev/null +++ b/backend/internal/daemonmeta/meta.go @@ -0,0 +1,6 @@ +package daemonmeta + +// ServiceName identifies the AO daemon in loopback health/readiness probes. +// The CLI uses it with the reported PID to avoid signaling an unrelated process +// when a stale run-file's PID has been reused. +const ServiceName = "agent-orchestrator-daemon" diff --git a/backend/internal/httpd/control_test.go b/backend/internal/httpd/control_test.go new file mode 100644 index 00000000..3e8456f8 --- /dev/null +++ b/backend/internal/httpd/control_test.go @@ -0,0 +1,52 @@ +package httpd + +import ( + "net/http" + "net/http/httptest" + "testing" + + "github.com/aoagents/agent-orchestrator/backend/internal/config" +) + +// TestShutdownGuard verifies that POST /shutdown only fires for a trusted local +// caller: a loopback Host with no Origin header. A cross-site Origin or a +// non-loopback (DNS-rebinding) Host must be rejected without triggering the +// shutdown side effect. +func TestShutdownGuard(t *testing.T) { + cases := []struct { + name string + host string + origin string + wantStatus int + wantFired bool + }{ + {name: "loopback no origin", host: "127.0.0.1:3001", wantStatus: http.StatusAccepted, wantFired: true}, + {name: "localhost no origin", host: "localhost:3001", wantStatus: http.StatusAccepted, wantFired: true}, + {name: "cross-site origin", host: "127.0.0.1:3001", origin: "https://evil.example", wantStatus: http.StatusForbidden, wantFired: false}, + {name: "rebinding host", host: "evil.example", wantStatus: http.StatusForbidden, wantFired: false}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + fired := false + r := NewRouterWithControl(config.Config{}, discardLogger(), nil, APIDeps{}, ControlDeps{ + RequestShutdown: func() { fired = true }, + }) + + req := httptest.NewRequest(http.MethodPost, "http://"+tc.host+"/shutdown", nil) + req.Host = tc.host + if tc.origin != "" { + req.Header.Set("Origin", tc.origin) + } + rec := httptest.NewRecorder() + r.ServeHTTP(rec, req) + + if rec.Code != tc.wantStatus { + t.Fatalf("status = %d, want %d", rec.Code, tc.wantStatus) + } + if fired != tc.wantFired { + t.Fatalf("shutdown fired = %v, want %v", fired, tc.wantFired) + } + }) + } +} diff --git a/backend/internal/httpd/router.go b/backend/internal/httpd/router.go index 019f7efe..5d132eb4 100644 --- a/backend/internal/httpd/router.go +++ b/backend/internal/httpd/router.go @@ -6,12 +6,15 @@ package httpd import ( "log/slog" + "net" "net/http" + "os" "github.com/go-chi/chi/v5" "github.com/go-chi/chi/v5/middleware" "github.com/aoagents/agent-orchestrator/backend/internal/config" + "github.com/aoagents/agent-orchestrator/backend/internal/daemonmeta" "github.com/aoagents/agent-orchestrator/backend/internal/terminal" ) @@ -34,10 +37,18 @@ func NewRouter(cfg config.Config, log *slog.Logger, termMgr *terminal.Manager) c return NewRouterWithAPI(cfg, log, termMgr, APIDeps{}) } +type ControlDeps struct { + RequestShutdown func() +} + // NewRouterWithAPI is the dependency-injected variant. main.go calls it with // real Managers when they exist; tests/dev wiring inject mocks explicitly. // Missing Managers intentionally keep the route-shell 501 behavior. func NewRouterWithAPI(cfg config.Config, log *slog.Logger, termMgr *terminal.Manager, deps APIDeps) chi.Router { + return NewRouterWithControl(cfg, log, termMgr, deps, ControlDeps{}) +} + +func NewRouterWithControl(cfg config.Config, log *slog.Logger, termMgr *terminal.Manager, deps APIDeps, control ControlDeps) chi.Router { r := chi.NewRouter() r.Use(middleware.Recoverer) @@ -53,6 +64,7 @@ func NewRouterWithAPI(cfg config.Config, log *slog.Logger, termMgr *terminal.Man mountHealth(r) mountMux(r, termMgr, log) + mountControl(r, control) NewAPI(cfg, deps).Register(r) return r @@ -65,15 +77,71 @@ func mountHealth(r chi.Router) { r.Get("/readyz", handleReadyz) } +// mountControl registers the loopback daemon-control endpoints. /shutdown is +// unauthenticated and state-changing, so it is gated by localControlRequest to +// keep a browser the user happens to have open (CSRF / DNS-rebinding) or a +// remote client from being able to kill the daemon. +func mountControl(r chi.Router, deps ControlDeps) { + if deps.RequestShutdown == nil { + return + } + r.Post("/shutdown", func(w http.ResponseWriter, req *http.Request) { + if !localControlRequest(req) { + writeJSON(w, http.StatusForbidden, map[string]any{ + "status": "forbidden", + "service": daemonmeta.ServiceName, + }) + return + } + writeJSON(w, http.StatusAccepted, map[string]any{ + "status": "shutting_down", + "service": daemonmeta.ServiceName, + "pid": os.Getpid(), + }) + deps.RequestShutdown() + }) +} + +// localControlRequest reports whether a control request is a trusted local +// caller. The Go CLI client addresses the daemon by its loopback host and +// never sets an Origin header; a cross-site browser fetch always carries an +// Origin, and a DNS-rebinding attempt resolves a non-loopback Host. Rejecting +// either closes the CSRF/rebinding vector while leaving the CLI unaffected. +func localControlRequest(r *http.Request) bool { + if r.Header.Get("Origin") != "" { + return false + } + host := r.Host + if h, _, err := net.SplitHostPort(host); err == nil { + host = h + } + switch host { + case "127.0.0.1", "::1", "localhost": + return true + } + if ip := net.ParseIP(host); ip != nil { + return ip.IsLoopback() + } + return false +} + // handleHealthz is the liveness probe: it answers 200 as long as the process is // up and serving. It does no dependency checks by design. func handleHealthz(w http.ResponseWriter, _ *http.Request) { - writeJSON(w, http.StatusOK, map[string]string{"status": "ok"}) + writeJSON(w, http.StatusOK, map[string]any{ + "status": "ok", + "service": daemonmeta.ServiceName, + "pid": os.Getpid(), + }) } // handleReadyz is the readiness probe. In the 1a skeleton the daemon is ready // as soon as it is listening; later phases will gate this on dependency // initialisation (e.g. store/event-bus warm-up). func handleReadyz(w http.ResponseWriter, _ *http.Request) { - writeJSON(w, http.StatusOK, map[string]string{"status": "ready"}) + writeJSON(w, http.StatusOK, map[string]any{ + "status": "ready", + "service": daemonmeta.ServiceName, + "pid": os.Getpid(), + }) } diff --git a/backend/internal/httpd/server.go b/backend/internal/httpd/server.go index 506f78b5..0ed67eaf 100644 --- a/backend/internal/httpd/server.go +++ b/backend/internal/httpd/server.go @@ -8,6 +8,7 @@ import ( "net" "net/http" "os" + "sync" "time" "github.com/aoagents/agent-orchestrator/backend/internal/config" @@ -23,6 +24,9 @@ type Server struct { log *slog.Logger http *http.Server listen net.Listener + + shutdownRequested chan struct{} + shutdownOnce sync.Once } // New constructs a Server and binds the listener immediately so a port @@ -36,15 +40,18 @@ func New(cfg config.Config, log *slog.Logger, termMgr *terminal.Manager) (*Serve } srv := &Server{ - cfg: cfg, - log: log, - listen: ln, - http: &http.Server{ - Handler: NewRouter(cfg, log, termMgr), - // ReadHeaderTimeout guards against slow-loris even on loopback; - // per-request body/handler timeouts are applied per-surface. - ReadHeaderTimeout: 10 * time.Second, - }, + cfg: cfg, + log: log, + listen: ln, + shutdownRequested: make(chan struct{}), + } + srv.http = &http.Server{ + Handler: NewRouterWithControl(cfg, log, termMgr, APIDeps{}, ControlDeps{ + RequestShutdown: srv.requestShutdown, + }), + // ReadHeaderTimeout guards against slow-loris even on loopback; + // per-request body/handler timeouts are applied per-surface. + ReadHeaderTimeout: 10 * time.Second, } return srv, nil } @@ -89,6 +96,8 @@ func (s *Server) Run(ctx context.Context) error { // Serve died on its own (bind already happened, so this is a real // runtime failure) before any shutdown signal. return err + case <-s.shutdownRequested: + s.log.Info("shutdown requested over HTTP", "timeout", s.cfg.ShutdownTimeout) case <-ctx.Done(): s.log.Info("shutdown signal received, draining connections", "timeout", s.cfg.ShutdownTimeout) } @@ -113,3 +122,9 @@ func (s *Server) boundPort() int { } return s.cfg.Port } + +func (s *Server) requestShutdown() { + s.shutdownOnce.Do(func() { + close(s.shutdownRequested) + }) +} diff --git a/backend/internal/httpd/server_test.go b/backend/internal/httpd/server_test.go index 39270d1c..2b7ba4f3 100644 --- a/backend/internal/httpd/server_test.go +++ b/backend/internal/httpd/server_test.go @@ -91,6 +91,49 @@ func TestServerLifecycle(t *testing.T) { } } +func TestServerShutdownEndpoint(t *testing.T) { + runPath := filepath.Join(t.TempDir(), "running.json") + cfg := config.Config{ + Host: "127.0.0.1", + Port: 0, + ShutdownTimeout: 5 * time.Second, + RunFilePath: runPath, + } + + srv, err := New(cfg, discardLogger(), nil) + if err != nil { + t.Fatalf("New: %v", err) + } + + runErr := make(chan error, 1) + go func() { runErr <- srv.Run(context.Background()) }() + + base := "http://" + srv.Addr().String() + waitForHealth(t, base) + + resp, err := http.Post(base+"/shutdown", "application/json", nil) + if err != nil { + t.Fatalf("POST /shutdown: %v", err) + } + resp.Body.Close() + if resp.StatusCode != http.StatusAccepted { + t.Fatalf("POST /shutdown = %d, want 202", resp.StatusCode) + } + + select { + case err := <-runErr: + if err != nil { + t.Fatalf("Run returned error on shutdown endpoint: %v", err) + } + case <-time.After(10 * time.Second): + t.Fatal("Run did not return after shutdown endpoint") + } + + if after, _ := runfile.Read(runPath); after != nil { + t.Error("run-file still present after shutdown endpoint; want it removed") + } +} + func waitForHealth(t *testing.T, base string) { t.Helper() // Per-request timeout so a stalled connect or hung handshake doesn't park diff --git a/backend/main.go b/backend/main.go index e825039f..5315d510 100644 --- a/backend/main.go +++ b/backend/main.go @@ -1,133 +1,18 @@ -// Command backend is the Agent Orchestrator HTTP daemon: a loopback-only -// sidecar spawned and supervised by the Electron main process. Phase 1a brings -// up the server skeleton — config, 127.0.0.1 bind, middleware stack, health -// probes, the running.json handshake, and graceful shutdown. +// Command backend is a compatibility wrapper for the Agent Orchestrator daemon. +// The user-facing CLI lives at cmd/ao; keep this wrapper so existing `go run .` +// development workflows continue to start the daemon while scripts migrate. package main import ( - "context" "fmt" - "log/slog" "os" - "os/signal" - "syscall" - "github.com/aoagents/agent-orchestrator/backend/internal/adapters/runtime/tmux" - "github.com/aoagents/agent-orchestrator/backend/internal/config" - "github.com/aoagents/agent-orchestrator/backend/internal/httpd" - "github.com/aoagents/agent-orchestrator/backend/internal/runfile" - "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" - "github.com/aoagents/agent-orchestrator/backend/internal/terminal" + "github.com/aoagents/agent-orchestrator/backend/internal/daemon" ) func main() { - if err := run(); err != nil { + if err := daemon.Run(); err != nil { fmt.Fprintln(os.Stderr, "ao backend daemon: "+err.Error()) os.Exit(1) } } - -func run() error { - cfg, err := config.Load() - if err != nil { - return err - } - - log := newLogger() - - // Fail fast if a live daemon already owns the handshake file. A run-file - // left by a crashed predecessor (dead PID) is treated as stale and - // overwritten when the new server starts. - if live, err := runfile.CheckStale(cfg.RunFilePath); err != nil { - return fmt.Errorf("inspect run-file: %w", err) - } else if live != nil { - return fmt.Errorf("daemon already running (pid %d, port %d); refusing to start", live.PID, live.Port) - } - - // Open the durable store and bring up the CDC substrate: the DB triggers - // capture changes into change_log, the poller tails it, and the broadcaster - // fans events out to the SSE transport. The LCM/Session Manager and the HTTP - // API routes that drive and read this store are owned by the daemon lane and - // are wired there once their collaborators (Notifier, AgentMessenger, and the - // runtime/agent/workspace plugins) have production implementations; here we - // stand up the persistence + change-delivery foundation they build on. - store, err := sqlite.Open(cfg.DataDir) - if err != nil { - return fmt.Errorf("open store: %w", err) - } - defer store.Close() - - // signal.NotifyContext cancels ctx on SIGINT/SIGTERM, which drives the - // graceful shutdown inside Server.Run and stops the background goroutines. - ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) - defer stop() - - cdcPipe, err := startCDC(ctx, store, log) - if err != nil { - return err - } - - // Terminal streaming: the tmux runtime supplies the PTY-attach command and - // liveness; the CDC broadcaster feeds the session-state channel. The manager - // is handed to httpd, which mounts it at /mux. Raw PTY bytes never flow - // through the CDC change_log — only session-state events do. - runtimeAdapter := tmux.New(tmux.Options{}) - termMgr := terminal.NewManager(runtimeAdapter, cdcPipe.Broadcaster, log) - defer termMgr.Close() - - srv, err := httpd.New(cfg, log, termMgr) - if err != nil { - return err - } - - // Bring up the Lifecycle Manager (sole store writer) and the reaper (OBSERVE - // timer). This makes the write path live end-to-end: LCM write -> store -> DB - // trigger -> change_log -> poller -> broadcaster. - lcStack, err := startLifecycle(ctx, store, log) - if err != nil { - return err - } - - // Bring up the Session Manager. Runtime (tmux) and Workspace (gitworktree) - // are real on main; ports.Agent has no production adapter yet, so a loud - // stub returns a sentinel command that makes any Spawn fail at the runtime - // layer rather than start a broken session quietly. Notifier and - // AgentMessenger remain stubbed alongside the LCM until their multiplexers - // land. No HTTP routes wire to this yet — the daemon lane (#10) owns API - // surfacing — so we hold the SM in a local until it does. - sStack, err := startSession(ctx, cfg, lcStack, log) - if err != nil { - // startSession is the first start* call after this point that can - // realistically fail while the cdc poller and the reaper are already - // running. Mirror the bottom-of-run shutdown sequence so both have - // drained before the deferred store.Close() fires. Defers would hit - // the LIFO trap (see comment after srv.Run), hence explicit. - stop() - lcStack.Stop() - if cdcErr := cdcPipe.Stop(); cdcErr != nil { - log.Error("cdc pipeline shutdown", "err", cdcErr) - } - return err - } - _ = sStack - - runErr := srv.Run(ctx) - - // Shut the background goroutines down in order: cancel the context FIRST so - // their loops exit, then wait for them to drain. Doing this explicitly (not - // via defer) avoids the LIFO trap where a Stop() that blocks on ctx-cancel - // runs before the cancel — which would hang any non-signal exit path. - stop() - lcStack.Stop() - if err := cdcPipe.Stop(); err != nil { - log.Error("cdc pipeline shutdown", "err", err) - } - return runErr -} - -// newLogger returns the daemon's slog logger. It writes to stderr so the -// Electron supervisor can capture it separately from any structured stdout -// protocol added later. -func newLogger() *slog.Logger { - return slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelDebug})) -} diff --git a/docs/README.md b/docs/README.md index f42f222f..220dec40 100644 --- a/docs/README.md +++ b/docs/README.md @@ -15,6 +15,7 @@ fakes) on the `feat/lcm-sm-contracts` integration branch. |-----|----------------| | [architecture.md](architecture.md) | How the lane works: the OBSERVE→DECIDE→ACT loop, the canonical state model, the package layout, every component, and the load-bearing invariants. Read this first. | | [status.md](status.md) | What's done (PR by PR), what's left, the integration to-dos, the open cross-lane contract questions, and how to build/test. | +| [cli/README.md](cli/README.md) | CLI foundation decisions: Cobra, reference projects, old CLI inventory, and the first command surface. | ## The one-paragraph mental model diff --git a/docs/cli/README.md b/docs/cli/README.md new file mode 100644 index 00000000..d78539a0 --- /dev/null +++ b/docs/cli/README.md @@ -0,0 +1,395 @@ +# AO CLI Foundation + +This page is the running decision log for the Agent Orchestrator CLI. Keep new +CLI decisions here as the command surface grows. + +## Current State + +This branch implements the daemon-control foundation. AO now has a Go/Cobra +`ao` binary that can start, inspect, diagnose, and stop the local backend daemon +end to end. + +What works now: + +- `ao start` starts the daemon in the background and waits for `/readyz`. +- `ao status` and `ao status --json` report stopped, stale, unhealthy, + not-ready, or ready daemon state. +- `ao stop` gracefully stops the daemon via the loopback `POST /shutdown` + endpoint, only after verifying the daemon's identity from `running.json`. +- `ao daemon` is the hidden internal daemon entrypoint used by `ao start`. +- `ao doctor` (and `ao doctor --json`) checks config, data dir, the database + file's presence, daemon state, and local tool availability for `git`, `tmux`, + and `zellij`. It never opens or migrates the store — the daemon is the sole + writer/migrator, so doctor only reports whether the database exists yet. +- `ao completion` generates shell completions for `bash`, `zsh`, `fish`, and + `powershell`. +- `ao version` and `ao --version` print build metadata. +- `go run .` still works as a compatibility wrapper around `internal/daemon.Run`. + +Manual smoke test: + +```bash +cd backend +go build -o /tmp/ao ./cmd/ao + +tmp=$(mktemp -d) +export AO_RUN_FILE="$tmp/running.json" +export AO_DATA_DIR="$tmp/data" +export AO_PORT=3037 + +/tmp/ao status --json +/tmp/ao doctor +/tmp/ao start +/tmp/ao status --json +/tmp/ao stop +/tmp/ao status --json +``` + +What is intentionally not implemented yet: + +- `ao project ...` +- `ao spawn` +- `ao session ...` +- `ao send` +- `ao events ...` + +Next steps: + +1. Wire the existing project manager/controller shell into the daemon with a + durable SQLite-backed project store. +2. Implement `ao project list/add/show/remove` against `/api/v1/projects`. +3. Wire production Session Manager dependencies: project-backed repo resolver, + tmux/zellij runtime registry, first agent adapter, and AgentMessenger. +4. Add `/api/v1/sessions`, then implement `ao spawn`, `ao session ...`, and + `ao send`. +5. Add `/events` SSE and durable event-list reads, then implement + `ao events tail/list`. + +## Decision + +AO will use a single Go CLI binary built with +[Cobra](https://github.com/spf13/cobra). + +The CLI is a thin client for the Go daemon. It should not call SQLite, runtime +adapters, agent adapters, workspace adapters, or SCM integrations directly. It +should start, discover, inspect, and command the daemon through the loopback API +and the existing `running.json` handshake. + +Initial rules: + +- The binary name is `ao`. +- `ao daemon` is the hidden/internal entrypoint for the long-running daemon. +- User-facing commands call the daemon over loopback after reading + `running.json`. +- Commands that mutate core AO state go through HTTP API routes, not direct + stores. +- Commands support predictable text output first and `--json` where automation + is likely. +- Do not introduce Viper in the foundation. Start with explicit flags and a + small config/client layer, then add config loading once the shape is real. + +## References + +These projects inform the direction, but AO should keep its own command surface +smaller at first. + +| Project | CLI stack | What to take | +|---|---|---| +| [Gastown](https://github.com/gastownhall/gastown) | Go + Cobra, with Charmbracelet packages for richer terminal UI | Simple `cmd//main.go` delegating to internal command construction. Useful confirmation that Cobra is the right default for this size of Go CLI. | +| [GitHub CLI](https://github.com/cli/cli) | Go + Cobra | Command factories, explicit IO streams, JSON output, and testable command construction. | +| [Docker CLI](https://github.com/docker/cli) | Go + Cobra | Daemon/client split, command groups, signal handling, and plugin-aware CLI layout. | +| [kubectl](https://github.com/kubernetes/kubectl) | Go + Cobra | Large command tree patterns and IO abstractions. It is a useful ceiling, not a shape to copy now. | +| [Tailscale CLI](https://github.com/tailscale/tailscale) | Go + ffcli | Useful daemon-backed product model: a CLI talks to a local daemon. Do not copy the framework choice. | + +The old AO TypeScript CLI is a product/workflow reference only. We should not +port its implementation because it mixes CLI, storage, runtime, and project +logic in-process. The rewrite needs the CLI to sit outside the core daemon. + +## Current Legacy CLI Inventory + +Inventory source: installed `ao` binary at version `0.9.2`, plus the old +`packages/cli/src/program.ts` and `packages/cli/src/commands/*.ts` files. + +Count: + +- 25 public top-level commands, excluding Commander-generated `help`. +- 26 visible top-level commands if generated `help` is counted. +- 64 explicit public command nodes when nested subcommands are counted. +- 1 hidden internal command: `completion __complete`. +- No aliases are registered in the old Commander source. + +Top-level commands: + +| Command | Legacy purpose | Foundation decision | +|---|---|---| +| `start` | Start orchestrator agent and dashboard | Keep, but redefine as daemon start. | +| `stop` | Stop orchestrator agent and dashboard | Keep, daemon stop. | +| `status` | Show all sessions and project/session health | Keep, daemon and session status. | +| `spawn` | Spawn a single agent session | Keep after session API exists. | +| `batch-spawn` | Spawn many sessions | Defer. | +| `session` | Manage sessions | Keep a smaller subset after session API exists. | +| `send` | Send a message to a session | Keep after messaging API exists. | +| `acknowledge` | Agent self-reporting hook | Defer or replace with internal API. | +| `report` | Agent workflow transition hook | Defer or replace with internal API. | +| `review-check` | Trigger agents from review comments | Defer. | +| `review` | Manage AO-local reviewer runs | Defer. | +| `dashboard` | Start web dashboard | Defer to Electron/frontend lane. | +| `open` | Open terminal/dashboard | Defer. | +| `verify` | Verify issue after staging check | Defer. | +| `doctor` | Run install/env/runtime checks | Keep. | +| `update` | Upgrade AO | Defer to packaging/release lane. | +| `setup` | Configure integrations | Defer. | +| `plugin` | Plugin marketplace/install flow | Defer. | +| `notify` | Notification test commands | Defer. | +| `project` | Manage registered projects | Keep after project API exists. | +| `migrate-storage` | Legacy storage migration | Drop for rewrite unless a real migration appears. | +| `completion` | Generate shell completions | Keep. | +| `events` | Query activity event log | Keep a small `tail`/`list` surface after event API exists. | +| `config` | Read/write old global config | Defer. Avoid until config shape is stable. | +| `config-help` | Print old config schema | Drop. | + +Nested legacy commands: + +| Parent | Subcommands | +|---|---| +| `session` | `ls`, `attach`, `kill`, `cleanup`, `claim-pr`, `restore`, `remap` | +| `review` | `run`, `execute`, `send`, `list` | +| `setup` | `dashboard`, `desktop`, `webhook`, `slack`, `discord`, `composio`, `composio-slack`, `composio-discord`, `composio-discord-bot`, `composio-mail`, `openclaw` | +| `plugin` | `list`, `search`, `create`, `install`, `update`, `uninstall` | +| `project` | `ls`, `add`, `rm`, `set-default` | +| `events` | `list`, `search`, `stats` | +| `config` | `set`, `get` | +| `notify` | `test` | +| `completion` | `zsh`, hidden `__complete` | + +## Initial Command Surface + +The first CLI should make AO installable, startable, inspectable, and stoppable +before trying to recreate the old product surface. + +### Foundation Commands + +These are the first commands to implement. + +| Command | Purpose | Notes | +|---|---|---| +| `ao start` | Start the daemon, wait for `/readyz`, and print PID/port. | Reads the same config env as the daemon. Should be idempotent when an existing healthy daemon is already running. | +| `ao stop` | Stop the running daemon. | Reads `running.json`, sends graceful termination, waits for run-file removal, and reports stale/dead daemon state clearly. | +| `ao status` | Show daemon status and, once APIs exist, project/session summary. | First version can show run-file, process liveness, `/healthz`, `/readyz`, uptime, and port. Add `--json`; add `--watch` once useful. | +| `ao daemon` | Hidden internal daemon entrypoint. | This replaces the current direct `go run .` daemon entrypoint once `main.go` is extracted into `internal/daemon`. | +| `ao doctor` | Diagnose the local environment. | Start with daemon/run-file/port checks, required binaries, config dir/data dir permissions, and runtime availability. | +| `ao completion` | Generate shell completions. | Cobra can support `bash`, `zsh`, `fish`, and `powershell`. | +| `ao version` | Print CLI and build metadata. | Implement as both `ao version` and Cobra's `--version` flag. | + +This gives a useful first release even before project/session mutation routes are +complete. + +### First Core Application Commands + +These are the next commands once daemon HTTP routes expose the needed managers. + +| Command | Purpose | Depends on | +|---|---|---| +| `ao project list` | List registered projects. | Project API. Alias `ls` is acceptable for old muscle memory. | +| `ao project add ` | Register a project. | Project API and project identity rules. | +| `ao project show ` | Inspect project config and health. | Project API. | +| `ao project remove ` | Archive/remove a project. | Project API. Alias `rm` is acceptable. | +| `ao spawn [issue]` | Spawn one coding-agent session. | Session Manager HTTP route, tracker lookup, workspace/runtime/agent adapters. | +| `ao session list` | List sessions across projects or one project. | Session API. Alias `ls` is acceptable. | +| `ao session show ` | Show one session with lifecycle, PR, CI, runtime, and paths. | Session API. | +| `ao session attach ` | Attach to the runtime terminal. | Runtime API or direct terminal attach contract exposed by daemon. | +| `ao session kill ` | Kill a session and clean up safely. | Session Manager `Kill`. | +| `ao session restore ` | Restore a terminated/crashed session. | Session Manager `Restore`. | +| `ao send [message...]` | Send instructions to a running session. | AgentMessenger route. | +| `ao events tail` | Follow daemon activity events. | SSE/CDC API. | +| `ao events list` | List recent activity events. | Event read API. | + +This is the smallest surface that covers the core product loop: + +1. Register a repo. +2. Start AO. +3. Spawn work. +4. Inspect work. +5. Intervene in work. +6. Stop AO. + +## Explicit Deferrals + +Do not include these in the CLI foundation: + +- `batch-spawn`: valuable, but it multiplies error handling before single-spawn + semantics are stable. +- `dashboard` and `open`: frontend/Electron should own the primary dashboard + launch path first. +- `review`, `review-check`, and `verify`: useful workflow automation, but not + required to run core AO. +- `setup`, `plugin`, and `notify`: integration/plugin surface should come after + the daemon API and config model settle. +- `update`: belongs with distribution and release packaging. +- `config` and `config-help`: wait for a stable Go config model. Avoid copying + the old TypeScript global config behavior. +- `migrate-storage`: old storage migration is not part of the rewrite unless a + concrete migration requirement appears. +- `acknowledge` and `report`: these are agent self-reporting hooks. Prefer a + daemon/internal protocol before exposing them as durable user CLI commands. + +## Implementation Plan + +1. Add Cobra to `backend/go.mod`. +2. Move current daemon startup from `backend/main.go` into + `backend/internal/daemon.Run(ctx, opts)`. +3. Add `backend/cmd/ao/main.go` as the only user binary entrypoint. +4. Add `backend/internal/cli` for command construction, IO streams, process + launching, run-file discovery, loopback HTTP client, and output formatting. +5. Implement `ao daemon` first so the current daemon behavior is preserved. +6. Implement `ao start`, `ao stop`, and `ao status` around `running.json` and + `/healthz`/`/readyz`. +7. Add `ao doctor`, `ao completion`, and `ao version`. +8. Add command tests using Cobra command construction with fake IO, fake process + runner, and fake daemon client. Keep daemon integration tests in the daemon + packages. + +Suggested package layout: + +```text +backend/ + cmd/ + ao/ + main.go + internal/ + cli/ + root.go + start.go + stop.go + status.go + doctor.go + completion.go + version.go + client.go + output.go + process.go + daemon/ + daemon.go +``` + +Acceptance criteria for the foundation: + +- `go run ./cmd/ao daemon` behaves like today's `go run .`. +- `go run ./cmd/ao start` starts the daemon and waits until `/readyz` returns + ready. +- `go run ./cmd/ao status --json` works when the daemon is running, stopped, and + stale. +- `go run ./cmd/ao stop` gracefully stops the daemon and removes `running.json`. +- `go test ./...`, `go vet ./...`, and `go test -race ./...` pass. + +## Implementation Readiness + +This section records what the CLI can connect to in the current codebase and +what still needs to be built. Inventory date: 2026-05-31 after merging +`origin/main` at `438b830`. + +### Implemented Foundation + +The daemon-control foundation now exists in `backend/cmd/ao` and +`backend/internal/cli`. + +Implemented commands: + +- `ao daemon` hidden/internal daemon entrypoint. +- `ao start` starts the daemon, waits for `/readyz`, and supports `--json`, + `--timeout`, and `--log-file`. +- `ao stop` stops the daemon from `running.json`, removes stale run-files, and + supports `--json` and `--timeout`. +- `ao status` reports stopped/stale/unhealthy/not-ready/ready states and + supports `--json`. +- `ao doctor` checks config, data dir, database-file presence, daemon state, and + local tool availability for `git`, `tmux`, and `zellij`; supports `--json`. It + does not open or migrate the store (the daemon owns that). +- `ao completion` generates `bash`, `zsh`, `fish`, and `powershell` + completions. +- `ao version` prints build metadata. + +The old `backend/main.go` remains as a compatibility wrapper around +`internal/daemon.Run`, so `go run .` still starts the daemon while scripts move +to `go run ./cmd/ao ...`. + +### Already Implemented and Directly Usable by the CLI + +These pieces are available now and are enough to build the daemon-management +part of the CLI. + +| Area | Existing code | CLI use | +|---|---|---| +| Daemon config | `backend/internal/config` loads `AO_PORT`, `AO_REQUEST_TIMEOUT`, `AO_SHUTDOWN_TIMEOUT`, `AO_RUN_FILE`, and `AO_DATA_DIR`. Host is fixed to `127.0.0.1`. | `ao start`, `ao daemon`, `ao status`, and `ao doctor` can share the same config resolution. | +| HTTP server lifecycle | `backend/internal/httpd.Server` binds loopback, writes `running.json`, serves until context cancellation, then removes `running.json`. | `ao daemon` can preserve today's daemon behavior after extraction into `internal/daemon`. | +| Health probes | `GET /healthz` and `GET /readyz`. | `ao start` can wait for readiness; `ao status` and `ao doctor` can check daemon health. | +| Run-file handshake | `backend/internal/runfile` reads, writes, removes, and stale-checks `running.json`. | `ao status` can discover PID/port; `ao stop` can find the process; `ao start` can detect an already-running daemon. | +| Durable store | `backend/internal/storage/sqlite` opens SQLite, runs goose migrations, uses WAL, stores projects/sessions/PR/check/comment rows, and reads `change_log`. | Not directly called by user CLI commands, but confirms the daemon has a durable backend once APIs expose it. | +| CDC substrate | `backend/internal/cdc` poller and broadcaster exist; daemon starts the poller with `startCDC`. | Future `ao events tail` can build on this once an SSE/API transport exists. | +| Lifecycle manager | `backend/internal/lifecycle` is implemented and currently wired in daemon startup. | Session/status APIs can use it; CLI must wait for HTTP routes rather than calling it directly. | +| Reaper timer | `backend/internal/observe/reaper` exists and is wired. | Runtime liveness will be available once runtime registry wiring exists. | + +### Implemented Internally but Not Reachable by CLI Yet + +These are real backend components, but the CLI cannot responsibly use them until +they are wired into the daemon and exposed through HTTP. + +| Area | Existing code | Missing before CLI can use it | +|---|---|---| +| Project API pieces | `internal/project` has manager/controller DTOs, `/api/v1/projects` routes exist, and `sqlite.Store` has project CRUD. | Durable project-store adapter/wiring in the daemon and CLI commands. The daemon currently constructs the router with nil API deps, so project routes are not product-usable from `ao` yet. | +| Session Manager | `backend/internal/session.Manager` implements `Spawn`, `Kill`, `Restore`, `List`, `Get`, `Send`, and `Cleanup`. | Production daemon wiring with real runtime, agent, workspace, messenger, and HTTP routes. | +| Runtime adapters | tmux and zellij adapters implement `ports.Runtime` and also have attach/send/output helpers. | Runtime registry wiring in daemon, attach/send abstractions in ports/API, and selection config. | +| Workspace adapter | git worktree adapter implements create/destroy/restore/list with safety checks. | Repo resolver backed by registered projects and daemon wiring into Session Manager. | +| GitHub issue tracker | `backend/internal/adapters/tracker/github` implements read-only issue `Get`, `List`, and `Preflight`. | Tracker registry/config, spawn prompt hydration, and project tracker metadata. | +| PR facts storage | SQLite PR/check/comment writes and CDC triggers exist. | SCM/PR observer that fetches GitHub PR/CI/review facts and calls `LCM.ApplyPRObservation`. | +| Session read model | `SessionManager.List/Get` derive display status from canonical lifecycle + PR facts. | HTTP response DTOs and API routes for CLI/frontend reads. | + +### Still Missing + +These are the main gaps before the full initial command set is real. + +| Gap | Blocks | +|---|---| +| Product API client package with run-file discovery. | `project`, `spawn`, `session`, `send`, `events list`, richer `status`. | +| Shutdown mechanism choice: PID signal now, optional `POST /api/v1/daemon/shutdown` later. | `ao stop` polish and cross-platform behavior. | +| Session/send API route surface under `/api/v1`. | `spawn`, `session`, `send`, richer `status`. | +| Project API daemon wiring. | `ao project list/add/show/remove`. | +| SSE route for live CDC events plus durable catch-up reads. | `ao events tail`, frontend live updates. | +| Agent adapters for supported harnesses (`codex`, `claude-code`, etc.). | `ao spawn`, `ao session restore`. | +| AgentMessenger implementation over tmux/zellij. | `ao send`, LCM auto-nudge reactions. | +| Runtime registry wired with tmux/zellij. | Reaper liveness, `session attach`, spawn/kill/restore runtime work. | +| Notifier implementation/multiplexer. | Human notifications and LCM escalation side effects. | +| Activity hooks or agent self-report protocol. | Accurate working/idle/needs-input status beyond runtime/PR facts. | +| Project/tracker config model. | `project add/show`, tracker-backed `spawn`, `doctor` config checks. | +| OpenAPI/DTO/error contract. | Stable CLI/frontend API clients and tests. | + +### Command Readiness Matrix + +| Command | Can implement now? | Existing support | Remaining work | +|---|---:|---|---| +| `ao daemon` | Implemented | Current daemon startup is extracted to `internal/daemon.Run`. | None for foundation. | +| `ao start` | Implemented | Config, run-file stale check, HTTP readiness probes. | Later: package-manager/service integration if needed. | +| `ao stop` | Implemented | Run-file discovery gives PID/port; server exits cleanly on SIGINT/SIGTERM. | Optional later shutdown HTTP route. | +| `ao status` | Partially implemented | Run-file, process liveness via PID, `/healthz`, `/readyz`. | Rich project/session summary waits for `/api/v1/projects` and `/api/v1/sessions`. | +| `ao doctor` | Partially implemented | Config resolution, run-file, database-file presence (no open/migrate), runtime binary checks. | Deeper adapter preflights need daemon wiring/config and should be queried from the daemon, not run in-process. | +| `ao completion` | Implemented | Cobra generators. | None for foundation. | +| `ao version` | Implemented | Build metadata can be injected with `-ldflags`. | Release tooling needs to set metadata. | +| `ao project list/add/show/remove` | Not yet | Project manager/controller route shell and SQLite project CRUD exist. | Durable project-store adapter, daemon API wiring, and CLI HTTP client. CLI must not write SQLite directly. | +| `ao spawn` | Not yet | Session Manager exists; runtime/workspace/tracker pieces partly exist. | Agent adapters, registry/config wiring, project lookup, tracker hydration, HTTP route. | +| `ao session list/show` | Not yet | Store and Session Manager read model exist. | HTTP routes and response DTOs. | +| `ao session attach` | Not yet | tmux/zellij have attach command helpers. | Runtime attach port/API and terminal-launch policy. | +| `ao session kill/restore` | Not yet | Session Manager implements both. | Production wiring and HTTP routes. | +| `ao send` | Not yet | Session Manager has `Send`; tmux/zellij have send helpers. | AgentMessenger implementation, port/API wiring, busy/idle delivery policy. | +| `ao events tail/list` | Not yet | Durable `change_log`, CDC poller, in-process broadcaster. | SSE route and durable event-list route. | + +### Recommended Build Order + +1. Build CLI foundation around the daemon only: `daemon`, `start`, `stop`, + `status`, `doctor`, `completion`, `version`. +2. Wire the existing project manager/controller shell into the daemon with a + durable SQLite-backed store, then implement `project list/add/show/remove`. +3. Wire production Session Manager dependencies: project-backed repo resolver, + tmux/zellij runtime registry, first agent adapter, and AgentMessenger. +4. Add `/api/v1/sessions` and implement `spawn`, `session list/show/kill/restore`, + and `send`. +5. Add `/events` SSE plus event-list reads, then implement `events tail/list`. diff --git a/test/cli/Dockerfile b/test/cli/Dockerfile new file mode 100644 index 00000000..fb5d85b2 --- /dev/null +++ b/test/cli/Dockerfile @@ -0,0 +1,47 @@ +# End-to-end CLI smoke test, modelling "install ao on a fresh machine, then use it". +# +# Build context is the REPO ROOT: +# docker build -f test/cli/Dockerfile -t ao-cli-smoke . +# docker run --rm --init ao-cli-smoke +# +# Run with --init so the real daemon spawned during the `start` test (it detaches +# via setsid) is reaped promptly after `stop` instead of lingering as a zombie. +# The suite does NOT depend on it — the stale-daemon case uses a fabricated dead +# PID — but --init keeps process accounting clean. + +# ---- stage 1: build the binary (the "release" a user would download) ---- +FROM golang:1.25-bookworm AS build +WORKDIR /src + +# Cache modules first. +COPY backend/go.mod backend/go.sum ./backend/ +RUN cd backend && go mod download + +COPY backend ./backend +# Pure-Go SQLite (modernc) builds fine with CGO disabled -> a static binary. +RUN cd backend && CGO_ENABLED=0 go build -trimpath -o /out/ao ./cmd/ao + +# ---- stage 2: a clean machine with NO Go toolchain, just like an end user ---- +FROM debian:bookworm-slim AS run + +# Runtime deps a fresh user would need: git is required by `ao doctor`; tmux is +# the optional runtime it probes for; curl drives the HTTP-level guard checks; +# ca-certificates for good measure. +RUN apt-get update \ + && apt-get install -y --no-install-recommends git tmux curl ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# "Install" the CLI the way a user would: drop the binary on PATH. +COPY --from=build /out/ao /usr/local/bin/ao +COPY test/cli/install-check.sh /usr/local/bin/ao-install-check.sh +RUN chmod +x /usr/local/bin/ao /usr/local/bin/ao-install-check.sh + +# Run as an unprivileged user with a real HOME, like a normal install. +RUN useradd --create-home --shell /bin/bash ao +USER ao +WORKDIR /home/ao + +# Sanity: prove the install resolved before the check runs. +RUN ao version + +ENTRYPOINT ["/usr/local/bin/ao-install-check.sh"] diff --git a/test/cli/README.md b/test/cli/README.md new file mode 100644 index 00000000..14c3a56c --- /dev/null +++ b/test/cli/README.md @@ -0,0 +1,65 @@ +# `ao` CLI end-to-end tests + +These tests drive the **real `ao` binary** the way a user would — `start` → +`status` → `doctor` → `stop`, plus the daemon-control HTTP surface — and assert +the whole thing works. They run against **isolated, throwaway state** (a per-test +temp run-file + data dir + an OS-assigned free loopback port), so they never +touch a developer's real AO installation. + +## Two tiers + +| Tier | What | Where | +|------|------|-------| +| **Comprehensive (primary)** | A cross-platform Go suite that builds `ao` and exercises the full behaviour. Runs natively on **ubuntu + macOS + windows** — the only way to cover the OS-specific process-detach paths (`setsid` vs `CREATE_NEW_PROCESS_GROUP`) and `os.UserConfigDir()` resolution. | `backend/internal/cli/e2e_test.go` (build tag `e2e`) | +| **Fresh-install (hardening)** | Proves a freshly installed binary works on a clean machine with no Go toolchain and no developer state. | `test/cli/Dockerfile` + `test/cli/install-check.sh` | + +## Run it + +**The Go suite (fastest, cross-platform):** +```bash +cd backend +go test -tags e2e ./internal/cli/... # run it +go test -tags e2e -v -run TestE2E ./internal/cli/... # verbose: prints every command + output +``` +It builds its own `ao` binary; `git` must be on PATH (required by `doctor`). +`-v` logs each `ao` invocation and its full output, which is the audit trail you +get for free from `go test`. + +**Fresh-machine install, in a clean container:** +```bash +docker build -f test/cli/Dockerfile -t ao-cli-smoke . +docker run --rm --init ao-cli-smoke +``` +> `--init` gives the container a real PID-1 reaper (tini) so the daemon the +> check starts is reaped after `stop` instead of lingering as a zombie. + +## What the Go suite covers + +`TestE2E_VersionAndHelp` (version/`--version`/help, daemon hidden) · +`TestE2E_DoctorDoesNotTouchTheStore` (doctor text + `--json`; proves it does +**not** create/migrate `ao.db`) · `TestE2E_StatusStopped` (stopped + idempotent +stop) · `TestE2E_Lifecycle` (start, ready, idempotent, daemon-created store, +`/healthz` identity, stop, run-file cleanup) · `TestE2E_ShutdownGuard` (the +`/shutdown` CSRF + DNS-rebinding 403 guard, daemon survives) · +`TestE2E_StaleRunFile` (dead-PID run-file → stale → cleaned) · `TestE2E_ExitCodes` +(2 usage / 1 runtime / config error) · `TestE2E_Completion` (all four shells). + +## Why a Go suite (not bash, not Python) + +The bash version grew past the point where bash was a good fit, and a Linux +container can't observe the macOS/Windows code paths at all. A Go `os/exec` +suite is the right home: it uses the repo's own toolchain (runs under `go test`), +gives real assertions and structured data, and — critically — runs natively on +the Windows and macOS runners, finally covering the `CREATE_NEW_PROCESS_GROUP` +detach path and per-OS config-dir resolution. The container stays as a thin +"clean install actually works" check. + +## Extending + +- **Add a case:** a new `TestE2E_*` function (or a `t.Run` subtest) in + `e2e_test.go`. Use `newEnv(t)` for isolated state and the `env.run`/`httpGet`/ + `postShutdown` helpers. +- **Add an OS:** extend the `matrix.os` list in `.github/workflows/cli-e2e.yml`. +- Deeper per-OS path assertions (state resolves under the OS-native config dir + when `AO_RUN_FILE`/`AO_DATA_DIR` are unset) fit best as unit tests in + `internal/config`. diff --git a/test/cli/install-check.sh b/test/cli/install-check.sh new file mode 100755 index 00000000..f4bcacd4 --- /dev/null +++ b/test/cli/install-check.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# +# Fresh-machine install check. The Dockerfile installs `ao` on PATH in a clean +# image and runs this; it proves a freshly installed binary actually works on a +# machine with no Go toolchain and no developer state. The COMPREHENSIVE, +# cross-platform behavioural suite lives in Go (backend/internal/cli/e2e_test.go, +# `go test -tags e2e`); this stays deliberately small and linear. + +set -euo pipefail + +AO_BIN="${AO_BIN:-ao}" +tmp="$(mktemp -d)" +export AO_RUN_FILE="$tmp/running.json" +export AO_DATA_DIR="$tmp/data" +export AO_PORT="${AO_PORT:-3001}" # the container is isolated; 3001 is free +trap '"$AO_BIN" stop >/dev/null 2>&1 || true; rm -rf "$tmp"' EXIT + +fail() { echo "FAIL: $1" >&2; exit 1; } + +echo "ao binary : $(command -v "$AO_BIN")" +"$AO_BIN" version >/dev/null || fail "version" +"$AO_BIN" doctor >/dev/null || fail "doctor" +"$AO_BIN" start >/dev/null || fail "start" + +"$AO_BIN" status --json | grep -q '"state": "ready"' || fail "daemon not ready after start" + +# the /shutdown control endpoint rejects a cross-origin caller (403) and survives +code="$(curl -s -o /dev/null -w '%{http_code}' -X POST \ + -H 'Origin: https://evil.example' "http://127.0.0.1:$AO_PORT/shutdown")" +[ "$code" = "403" ] || fail "cross-origin /shutdown returned $code, want 403" +"$AO_BIN" status --json | grep -q '"state": "ready"' || fail "daemon died after rejected shutdown" + +"$AO_BIN" stop >/dev/null || fail "stop" +"$AO_BIN" status --json | grep -q '"state": "stopped"' || fail "daemon not stopped" + +echo "fresh-install check: OK"