From 689861535c56fea13b88b79b4053acbbd9cd7134 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Mon, 15 Jun 2026 21:59:24 +0200 Subject: [PATCH 01/10] feat(agent): Pi-backed agent workflow service, template, tracing, and docs - New agent workflow service wrapping the Pi harness, served same-origin like chat/completion at /services/agent/v0 (Python service + Node Pi sidecar, ports/adapters). - Builtin 'agent' app type + create-app template; config is model + AGENTS.md. - /inspect chat schema and OpenTelemetry tracing into Agenta. - EE dev compose agent-pi sidecar; design docs under docs/design/agent-workflows. --- docs/design/agent-workflows/README.md | 123 ++ .../agent-workflows/research/auth-secrets.md | 441 ++++ .../research/daytona-sandbox.md | 482 +++++ .../research/diskless-in-memory-config.md | 460 ++++ .../research/open-questions.md | 312 +++ .../research/otel-instrumentation.md | 379 ++++ .../research/pi-interaction.md | 584 ++++++ .../research/sandbox-sharing.md | 359 ++++ .../agent-workflows/wp-1-pi-tracing/README.md | 73 + .../integrating-the-tracing-extension.md | 186 ++ .../wp-1-pi-tracing/poc/.env.example | 7 + .../wp-1-pi-tracing/poc/README.md | 86 + .../wp-1-pi-tracing/poc/agenta-otel.ts | 414 ++++ .../wp-1-pi-tracing/poc/package.json | 25 + .../wp-1-pi-tracing/poc/pnpm-lock.yaml | 1842 +++++++++++++++++ .../wp-1-pi-tracing/poc/run.ts | 197 ++ .../tracing-in-the-agent-service.md | 113 + .../wp-2-agent-service/README.md | 124 ++ .../wp-2-agent-service/implementation-plan.md | 273 +++ .../wp-3-daytona-sandbox/README.md | 99 + .../wp-3-daytona-sandbox/poc/README.md | 118 ++ .../poc/bench_coldstart.py | 49 + .../poc/build_snapshot.py | 95 + .../wp-3-daytona-sandbox/poc/cleanup.py | 43 + .../wp-3-daytona-sandbox/poc/run_agent.py | 325 +++ .../wp-4-multi-message-output/README.md | 55 + .../wp-5-chat-vs-completion/README.md | 51 + .../wp-6-workflow-type-and-template/README.md | 84 + .../agent-workflows/wp-7-tools/README.md | 214 ++ 
.../docker-compose/ee/docker-compose.dev.yml | 36 + .../agenta/sdk/engines/running/interfaces.py | 36 + .../agenta/sdk/engines/running/utils.py | 25 +- services/agent/.dockerignore | 3 + services/agent/README.md | 73 + services/agent/config/AGENTS.md | 7 + services/agent/config/agent.json | 4 + services/agent/docker-compose.agent.yml | 98 + services/agent/docker-compose.stack.yml | 86 + services/agent/docker/Dockerfile.dev | 28 + services/agent/package.json | 27 + services/agent/pnpm-lock.yaml | 1826 ++++++++++++++++ services/agent/scripts/register_agent_app.py | 166 ++ services/agent/src/agenta-otel.ts | 551 +++++ services/agent/src/cli.ts | 44 + services/agent/src/runPi.ts | 231 +++ services/agent/src/server.ts | 64 + services/agent/tsconfig.json | 16 + services/entrypoints/agent_main.py | 47 + services/entrypoints/main.py | 2 + services/oss/src/agent.py | 140 ++ services/oss/src/agent_pi/__init__.py | 11 + services/oss/src/agent_pi/config.py | 68 + services/oss/src/agent_pi/local_runtime.py | 59 + services/oss/src/agent_pi/pi_harness.py | 84 + services/oss/src/agent_pi/pi_http_harness.py | 64 + services/oss/src/agent_pi/ports.py | 121 ++ services/oss/src/agent_pi/schemas.py | 71 + .../components/CreateAppDropdown/index.tsx | 6 + .../modals/CreateAppTypeModal/index.tsx | 6 + .../pages/prompts/assets/iconHelpers.tsx | 4 +- .../src/workflow/state/appUtils.ts | 6 +- 61 files changed, 11619 insertions(+), 4 deletions(-) create mode 100644 docs/design/agent-workflows/README.md create mode 100644 docs/design/agent-workflows/research/auth-secrets.md create mode 100644 docs/design/agent-workflows/research/daytona-sandbox.md create mode 100644 docs/design/agent-workflows/research/diskless-in-memory-config.md create mode 100644 docs/design/agent-workflows/research/open-questions.md create mode 100644 docs/design/agent-workflows/research/otel-instrumentation.md create mode 100644 docs/design/agent-workflows/research/pi-interaction.md create mode 100644 
docs/design/agent-workflows/research/sandbox-sharing.md create mode 100644 docs/design/agent-workflows/wp-1-pi-tracing/README.md create mode 100644 docs/design/agent-workflows/wp-1-pi-tracing/integrating-the-tracing-extension.md create mode 100644 docs/design/agent-workflows/wp-1-pi-tracing/poc/.env.example create mode 100644 docs/design/agent-workflows/wp-1-pi-tracing/poc/README.md create mode 100644 docs/design/agent-workflows/wp-1-pi-tracing/poc/agenta-otel.ts create mode 100644 docs/design/agent-workflows/wp-1-pi-tracing/poc/package.json create mode 100644 docs/design/agent-workflows/wp-1-pi-tracing/poc/pnpm-lock.yaml create mode 100644 docs/design/agent-workflows/wp-1-pi-tracing/poc/run.ts create mode 100644 docs/design/agent-workflows/wp-1-pi-tracing/tracing-in-the-agent-service.md create mode 100644 docs/design/agent-workflows/wp-2-agent-service/README.md create mode 100644 docs/design/agent-workflows/wp-2-agent-service/implementation-plan.md create mode 100644 docs/design/agent-workflows/wp-3-daytona-sandbox/README.md create mode 100644 docs/design/agent-workflows/wp-3-daytona-sandbox/poc/README.md create mode 100644 docs/design/agent-workflows/wp-3-daytona-sandbox/poc/bench_coldstart.py create mode 100644 docs/design/agent-workflows/wp-3-daytona-sandbox/poc/build_snapshot.py create mode 100644 docs/design/agent-workflows/wp-3-daytona-sandbox/poc/cleanup.py create mode 100644 docs/design/agent-workflows/wp-3-daytona-sandbox/poc/run_agent.py create mode 100644 docs/design/agent-workflows/wp-4-multi-message-output/README.md create mode 100644 docs/design/agent-workflows/wp-5-chat-vs-completion/README.md create mode 100644 docs/design/agent-workflows/wp-6-workflow-type-and-template/README.md create mode 100644 docs/design/agent-workflows/wp-7-tools/README.md create mode 100644 services/agent/.dockerignore create mode 100644 services/agent/README.md create mode 100644 services/agent/config/AGENTS.md create mode 100644 services/agent/config/agent.json create 
mode 100644 services/agent/docker-compose.agent.yml create mode 100644 services/agent/docker-compose.stack.yml create mode 100644 services/agent/docker/Dockerfile.dev create mode 100644 services/agent/package.json create mode 100644 services/agent/pnpm-lock.yaml create mode 100644 services/agent/scripts/register_agent_app.py create mode 100644 services/agent/src/agenta-otel.ts create mode 100644 services/agent/src/cli.ts create mode 100644 services/agent/src/runPi.ts create mode 100644 services/agent/src/server.ts create mode 100644 services/agent/tsconfig.json create mode 100644 services/entrypoints/agent_main.py create mode 100644 services/oss/src/agent.py create mode 100644 services/oss/src/agent_pi/__init__.py create mode 100644 services/oss/src/agent_pi/config.py create mode 100644 services/oss/src/agent_pi/local_runtime.py create mode 100644 services/oss/src/agent_pi/pi_harness.py create mode 100644 services/oss/src/agent_pi/pi_http_harness.py create mode 100644 services/oss/src/agent_pi/ports.py create mode 100644 services/oss/src/agent_pi/schemas.py diff --git a/docs/design/agent-workflows/README.md b/docs/design/agent-workflows/README.md new file mode 100644 index 0000000000..7d5784dfc2 --- /dev/null +++ b/docs/design/agent-workflows/README.md @@ -0,0 +1,123 @@ +# Agent Workflows + +Status: context draft. Research and design to follow. + +## Summary + +Add a new workflow type to the backend: **agents**. Today the backend runs +prompt-style workflows (completion, chat, LLM-as-a-judge). Agents are different. An +agent runs inside a sandbox, executes tools over multiple turns, returns a multi-message +output, and is instrumented end to end. Agents run on a **pi.dev** harness by default, +and the same harness can run locally so a configuration pulled from the server behaves +the same on a developer machine. + +This document only captures context. It does not propose a solution yet. 
The research +topics in [Open research topics](#open-research-topics) will be assigned to subagents and +written up in sibling files. + +## What an agent is + +An agent is a configured, sandboxed, instrumented runtime that: + +- Boots a sandbox through startup hooks that lay down files and inject secrets. +- Runs a harness (pi by default, configurable) that drives the model and its tools. +- Produces a multi-message output rather than a single completion. +- Carries a `session_id` so a run can be identified and, later, have its state stored. +- Emits instrumentation through pi instruments for tracing and observability. + +## Agent configuration + +The agent configuration is what gets stored on the server, versioned as a workflow +revision, and pulled down to run locally. It includes: + +- **`AGENTS.md`** — the agent's instructions. +- **Skills** — the skills available to the agent. +- **Model** — the model the agent runs on. +- **Tools** — the tools the agent has access to. +- **Files** — files that are part of the config and are laid into the sandbox by the + startup hook. +- **Secrets** — for example an OpenAI key, injected into the sandbox by the startup + hook. +- **Harness** — which harness runs the agent. Defaults to pi; configurable. + +## Runtime model + +- **Sandbox.** Agents run in a Daytona sandbox, or any sandbox provider that works with + our port. The sandbox is initialized by startup hooks: file setup, then secrets setup. +- **Harness.** The harness (pi by default) is the layer that exposes tools and drives the + agent loop. It is configurable per agent. +- **Output.** A run returns multiple messages, not one completion. +- **Instrumentation.** Runs are instrumented with pi instruments. +- **Sessions.** Each run has a `session_id`. Future work adds session storage alongside + global storage so session state can persist across runs. 
+ +## Local execution parity + +The same harness that runs server-side must run locally on pi.dev abstractions (tools and +the rest). A user can pull an agent's configuration from the server and run it locally +with the same behavior. Local-server parity is a first-class requirement, not an +afterthought. + +## What the research established + +Full write-ups live in [`research/`](research/). The load-bearing conclusions: + +- **pi.dev is "Pi"**, an open-source TypeScript/Node agent harness by Earendil Inc. (MIT, + ~v0.79.4). It is local-first (a CLI/SDK/RPC, not a hosted service) and moves fast (0.x, + roughly weekly releases). There is no Python SDK. + See [`research/pi-interaction.md`](research/pi-interaction.md), + [`research/open-questions.md`](research/open-questions.md). +- **Pi can run fully diskless.** Via the SDK's `createAgentSession`, AGENTS.md + (`systemPromptOverride`/`agentsFilesOverride`), skills (`skillsOverride`), tools + (`customTools`), LLM auth (`setRuntimeApiKey` / `AuthStorage.inMemory()` / env), and + session/settings/model state (`*.inMemory()`) are all in-memory. The only forced disk + write is bash output spillover to `os.tmpdir()`, redirected with `TMPDIR` to a per-run + tmpfs. See [`research/diskless-in-memory-config.md`](research/diskless-in-memory-config.md). +- **"pi instruments" is not a product.** Pi emits no OTel by itself. Instrumentation is a + Pi extension on the `pi.on(...)` event bus that turns lifecycle events into OTLP spans. + Agenta already ingests OTLP at `POST /otlp/v1/traces` with adapters for GenAI semconv + and OpenInference, so `gen_ai.*` spans flow with little new backend code. Watch the + token-attribute drift (`input_tokens`/`output_tokens` vs the mapped + `prompt_tokens`/`completion_tokens`). See + [`research/otel-instrumentation.md`](research/otel-instrumentation.md). +- **The harness seam is ours to build.** Pi's own "harness" concept is not a swap point + for Codex or Claude Code. 
The recommended shape is a thin TypeScript wrapper that drives + Pi's SDK with the in-memory overrides above and exposes our own protocol on a port. That + wrapper is the "works with our port" contract, the swappable-harness boundary, and the + local/server parity point. See [`research/auth-secrets.md`](research/auth-secrets.md). +- **One shared sandbox is viable for v1.** Daytona supports one long-lived sandbox reused + across runs. It does not support swapping a volume per execution (volumes mount at create + time only). Per-run isolation comes from process memory plus a per-run tmpfs, not a + volume, which the diskless finding makes clean. Concurrency is contended, so bound it. + See [`research/sandbox-sharing.md`](research/sandbox-sharing.md), + [`research/daytona-sandbox.md`](research/daytona-sandbox.md). + +## POC work packages + +The POC runs as parallel tracks. Each has its own folder with scope and a definition of +done. WP-1 and WP-2 run against a local Pi install first (no Daytona). WP-3 takes the +sandbox path in parallel. WP-4 and WP-5 are design tasks that feed the WP-2 interface. WP-6 registers the agent as a +backend workflow type and template, and defines its configuration and connection to the +running agent. + +- [`wp-1-pi-tracing/`](wp-1-pi-tracing/README.md) — install Pi locally and send its agent + telemetry to Agenta as clean, structured traces. +- [`wp-2-agent-service/`](wp-2-agent-service/README.md) — a new service that wraps Pi and + exposes a completion/chat-style interface, with auth and AGENTS.md set up in memory. +- [`wp-3-daytona-sandbox/`](wp-3-daytona-sandbox/README.md) — create a Daytona sandbox with + Pi installed, inject files and secrets, run an agent, and stream output back. +- [`wp-4-multi-message-output/`](wp-4-multi-message-output/README.md) — define how an + agent's multi-message output is shaped, streamed, stored, and surfaced. 
+- [`wp-5-chat-vs-completion/`](wp-5-chat-vs-completion/README.md) — decide the interface + contract; start with chat that takes a single input. +- [`wp-6-workflow-type-and-template/`](wp-6-workflow-type-and-template/README.md) — register + the agent as a new backend workflow type and template; define its config (model) and the + connection to the running agent. +- [`wp-7-tools/`](wp-7-tools/README.md) — make runnable tools part of the agent config; resolve + Composio actions into Pi tools and route tool calls back through the existing + `POST /tools/call`, with MCP and workflow-as-tool as future adapters. + +## Related work + +- [`../prompt-runtime-unification/`](../prompt-runtime-unification/README.md) — the + prompt-side runtime that "future agent-style services" were already anticipated against. diff --git a/docs/design/agent-workflows/research/auth-secrets.md b/docs/design/agent-workflows/research/auth-secrets.md new file mode 100644 index 0000000000..b90af4ace5 --- /dev/null +++ b/docs/design/agent-workflows/research/auth-secrets.md @@ -0,0 +1,441 @@ +# Research: Auth and Secrets for the pi.dev Agent Harness + +Status: research only. No code changes. This file answers the five auth/secrets +questions for the agent-workflows feature (see +[`../README.md`](../README.md)). Every claim is cited. Items I could not verify +from a primary source are marked **UNVERIFIED**. + +## Summary + +- **pi is a local CLI/SDK, not a hosted service.** "pi.dev" is the marketing and + docs site plus a package registry. There is no pi.dev account, no pi-issued API + key, and no pi-managed model gateway. You authenticate to *model providers*, not + to pi. ("Pi is a local coding agent. It runs with the permissions of the user + account that starts it." — `security.md`.) +- **Provider auth is bring-your-own-key (BYOK) or provider OAuth.** pi reaches + OpenAI/Anthropic/etc. 
with the user's own provider keys, or with a provider's + subscription OAuth (Claude Pro/Max, ChatGPT Plus/Pro (Codex), GitHub Copilot). + Keys live in env vars or `~/.pi/agent/auth.json`. There is no pi gateway in the + middle, though pi can be *pointed at* a gateway you run (Cloudflare AI Gateway, + OpenShell inference routing, a corporate proxy). +- **There is no first-class "secrets vault" in pi core.** pi has an *auth* + concept (provider credentials) and a flexible key-resolution syntax + (`$ENV`, `${ENV}`, `!shell-command`, literal). Anything beyond provider creds is + just environment variables / files the host process already has. The "named + secrets, scoped, agent-never-sees-the-value" feature surfaced in searches is a + set of **third-party community extensions** (e.g. `pi-secret-guard`, + `pi-secured-setup`, `pi-heimdall`, "Greywall"), not pi core. +- **The Codex secret has two shapes.** (a) Keep pi as the harness and use pi's + native `openai-codex-responses` API + the built-in "ChatGPT Plus/Pro (Codex)" + OAuth login — the credential is a pi `OAuthCredentials` object in + `~/.pi/agent/auth.json`. (b) Swap the harness to the real **OpenAI Codex CLI** + (`codex exec`), in which case the "codex secret" is either an `OPENAI_API_KEY` + /`CODEX_API_KEY` value or a ChatGPT access token, materialized into + `~/.codex/auth.json` (or `$CODEX_HOME/auth.json`) before the headless run. +- **For the Agenta feature: manage secrets in Agenta and inject them.** pi has no + vault to delegate to. Agenta should store secrets at rest (encrypted), then the + startup/secrets hook lays them into the sandbox as env vars and/or the right + auth file. pi's observability layer is already designed to keep keys/headers/ + payloads out of traces by default — lean on that and verify it. + +## 1. pi.dev auth model + +### Authenticating to pi itself + +There is nothing to authenticate to. pi is installed locally (npm/pnpm/bun/curl) +and runs as the local user. 
The only network calls pi makes on its own behalf are +version/telemetry pings to `pi.dev`, which are opt-out: + +- `enableInstallTelemetry` -> `https://pi.dev/api/report-install` +- version check -> `https://pi.dev/api/latest-version` +- `PI_OFFLINE=1` / `--offline` disables all startup network ops; + `PI_SKIP_VERSION_CHECK=1` disables the version check; `PI_TELEMETRY=0` disables + the ping. (Source: `settings.md`, `usage.md`.) + +So "auth to pi.dev" is not a concept we need to model. There is no pi account, +no pi org, no pi-issued token. (Source: `security.md`; `pi.dev` landing page.) + +### How pi authenticates to model providers + +Four credential sources, in precedence order. From `sdk.md` (AuthStorage) and +`providers.md`: + +1. CLI `--api-key <key>` flag (or SDK runtime override `setRuntimeApiKey`, not + persisted). +2. `~/.pi/agent/auth.json` entry (API key **or** OAuth tokens). Stored with `0600` + perms. Auth-file entries take priority over env vars. +3. Provider env var (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `GEMINI_API_KEY`, ...). +4. Fallback resolver for custom-provider keys from `models.json`. + +`auth.json` is a flat object keyed by provider name. API-key shape +(`providers.md`): + +```json +{ + "anthropic": { "type": "api_key", "key": "sk-ant-..." }, + "openai": { "type": "api_key", "key": "sk-..." } +} +``` + +Provider **OAuth / subscription login** is also first-class. `/login` (interactive) +supports Claude Pro/Max, **ChatGPT Plus/Pro (Codex)**, and GitHub Copilot. OAuth +tokens auto-refresh and persist in the same `auth.json` as an `OAuthCredentials` +object (`providers.md`, `custom-provider.md`): + +```ts +interface OAuthCredentials { + refresh: string; // refresh token + access: string; // access token (what getApiKey() returns) + expires: number; // ms epoch expiry +} +``` + +So the answer to "pass-through provider keys, a pi-managed gateway, or both?" is: +**pass-through only.** No pi-managed gateway exists. 
pi *can* be pointed at a +gateway you operate — Cloudflare AI Gateway as a unified-billing/observability +proxy ([issue #3850](https://github.com/earendil-works/pi/issues/3850)), a +corporate proxy via `pi.registerProvider("openai", { baseUrl, headers })` +(`custom-provider.md`), or OpenShell inference routing where the gateway injects +upstream provider creds and the sandbox only sees `https://inference.local` +(`containerization.md`). Those are *your* gateways, not pi's. + +## 2. Provider-key handling and the key-resolution syntax + +This matters because it is how a secret gets indirected instead of pasted as a +literal. `apiKey`, custom header values, and `auth.json` `key` values share one +resolution syntax (`providers.md`, `custom-provider.md`): + +- `!command` at the **start** of the value runs a shell command and uses its + output (e.g. `"!security find-generic-password -ws 'anthropic'"`, or + `"!op read 'op://vault/item/secret'"` for 1Password). +- `$ENV_VAR` and `${ENV_VAR}` interpolate environment variables. +- `$$` -> literal `$`; `$!` -> literal `!`. +- Otherwise the value is a literal. + +Custom providers/proxies can carry secrets in headers using the same syntax: + +```ts +pi.registerProvider("google", { + baseUrl: "https://ai-gateway.corp.com/google", + headers: { "X-Corp-Auth": "$CORP_AUTH_TOKEN" } // env var or literal +}); +``` + +Implication for Agenta: we do **not** have to write raw secrets into pi config +files. We can inject env vars into the sandbox and reference them as `$VAR` in +pi's `auth.json`/provider config, or reference a secrets manager via `!command`. + +## 3. Secrets concept + injection + +### Is there a first-class "secrets" feature in pi core? No. + +pi core has an **auth** concept (provider credentials, above) and project +**trust** (an input-loading guard for `.pi/` resources, not a secret store — +`security.md`). It does **not** ship a named-secret/vault/scoped-secret feature. 
+The "secrets with a value + allowed host patterns, where the agent never sees the +real value" model that searches surface is from **third-party extensions**, not +Earendil: + +- `pi-secret-guard` — author **acarerdinc**, third-party. Scans `git commit`/ + `git push` via the `tool_call` event and blocks if secrets are detected; + regex + LLM review. (Source: `https://pi.dev/packages/pi-secret-guard` package + page.) This is a *leak-prevention* tool, not a secret *store*. +- `pi-secured-setup`, `pi-heimdall`, "Greywall" — third-party permission/redaction + layers (community blogs; **UNVERIFIED** beyond existence — treat as ecosystem + examples, not core). + +Conclusion: if Agenta wants named, scoped secrets, Agenta owns that. pi gives us +the *injection surface* (env vars, files, `$ENV`/`!cmd` references), not a vault. + +### How secrets reach a pi run and the tools inside it + +Because pi runs as the local user with the local environment, **every secret a +tool sees is whatever is in the process environment / filesystem of the pi +process**. There is no per-tool secret broker in core. Built-in tools +(`read`, `write`, `edit`, `bash`, `grep`, `find`, `ls`) and extension tools run +"with the permissions of the pi process" (`security.md`). So a `bash` tool can +read any env var or file the process can. Scope is the *process/sandbox boundary*, +not a pi ACL. + +This is exactly why the Agenta design runs pi in a **sandbox** (Daytona) and uses +**startup hooks** to lay down files then inject secrets — that sandbox *is* the +secret-scoping boundary. pi's own docs say the same: for unattended/untrusted +work, "run pi in a contained environment ... with only the files and credentials +required for the task" and "pass the minimum required API keys or use short-lived +credentials" (`security.md`, `containerization.md`). + +### Where to inject (three concrete options, all supported by pi) + +1. **Env vars in the sandbox** (simplest; matches pi's BYOK model). 
Set + `OPENAI_API_KEY` etc. in the sandbox env; pi resolves them via precedence rule + #3. The Docker example does exactly this: `docker run -e ANTHROPIC_API_KEY ...` + (`containerization.md`). +2. **`~/.pi/agent/auth.json` file** laid into the sandbox (precedence #2, beats + env). Either literal keys or `$ENV`/`!cmd` indirection. Note the doc warning: + "Mounting your host `~/.pi/agent` exposes host auth and session files to the + container." For a sandbox we generate a fresh `auth.json`, we do not mount the + host's. +3. **Gateway / inference routing** (strongest isolation): the sandbox calls + `https://inference.local` and a gateway injects the real provider key upstream, + so "OpenShell providers can keep raw model API keys outside the sandbox" + (`containerization.md`). This keeps the model key out of the sandbox entirely. + +### Scoping per-agent / per-session + +- **Per-agent**: each agent revision's secrets become that sandbox's env/auth + files. Different agent => different sandbox => different secret set. pi's + precedence model means a per-sandbox `auth.json` or per-sandbox env fully + determines what that agent can use. +- **Per-session**: the SDK exposes `authStorage.setRuntimeApiKey(provider, key)` + (runtime override, **not persisted**) and a "custom auth storage location" + (`sdk.md`). A session can be given a short-lived key in memory without writing + it to disk — useful for per-`session_id` credentials that should not outlive the + run. **UNVERIFIED**: exact API for a fully custom per-session AuthStorage path + beyond `setRuntimeApiKey` and the "custom auth storage location" mention. + +## 4. The Codex secret (the swappable-harness question) + +The README says the harness is swappable and could run OpenAI Codex instead of +pi's own loop. There are two genuinely different ways to do this, and the "codex +secret" means something different in each. 
+ +### Option A — keep pi as the harness, talk to the Codex backend through pi + +pi already speaks Codex natively. `custom-provider.md` lists an API type +**`openai-codex-responses`** ("OpenAI Codex Responses API"), and `/login` offers +**"ChatGPT Plus/Pro (Codex)"** OAuth login ("Officially endorsed by OpenAI: Codex +for OSS", per `providers.md`). In this option: + +- The "codex secret" is just a pi credential: either an `OPENAI_API_KEY` (env or + `auth.json` `{"openai": {"type":"api_key","key":"..."}}`) for API-key access, or + a pi `OAuthCredentials` object for ChatGPT-subscription Codex access. +- Injection is identical to any other pi provider (section 3). No separate Codex + install needed. This is the lowest-friction path and stays inside pi's + instrumentation/observability. + +### Option B — swap in the real OpenAI Codex CLI as the harness + +Here pi is replaced (or wrapped) by the `codex` CLI, run headless with +`codex exec`. The "codex secret" is Codex's own credential. How Codex authenticates +(OpenAI Codex docs): + +- **ChatGPT login (default)** when no valid session exists — interactive, browser + or device flow. Not suitable headless unless you transplant a token. +- **API key** — recommended for "programmatic Codex CLI workflows, such as CI/CD + jobs" (`developers.openai.com/codex/auth`). +- **Access token** — ChatGPT-workspace token for "trusted, non-interactive + workflows" (`developers.openai.com/codex/enterprise/access-tokens`). + +Credential storage: `~/.codex/auth.json` (plaintext) or an OS keyring, controlled +by `cli_auth_credentials_store` = `file` | `keyring` | `auto`; the file lives +under `CODEX_HOME` (default `~/.codex`). Treat `auth.json` "like a password" +(`developers.openai.com/codex/auth`). + +Headless injection patterns: + +1. 
**Per-invocation API key (no persisted login):** + ```bash + CODEX_API_KEY=<key> codex exec --json "your task" + ``` + Set it only for the single invocation, not as a job-level env var, "in workflows + that execute untrusted code" (`developers.openai.com/codex/noninteractive`). +2. **Persisted API-key login (writes `auth.json`):** + ```bash + printenv OPENAI_API_KEY | codex login --with-api-key # reads key from stdin + codex login status # -> "Logged in using an API key - sk-proj-***ABCD1" + ``` + (`developers.openai.com/codex/auth`, simplified.guide.) Note: setting + `OPENAI_API_KEY` env var **alone does not persist a login** — you must run a + login command or use `CODEX_API_KEY` per invocation. A request to honor + `OPENAI_API_KEY` without writing `auth.json` was closed "not planned" + ([issue #5212](https://github.com/openai/codex/issues/5212)); the documented + workaround is a custom `[model_providers.*]` with `env_key = "OPENAI_API_KEY"`. +3. **ChatGPT access token via stdin (subscription/workspace, headless):** + ```bash + printenv CODEX_ACCESS_TOKEN | codex login --with-access-token + ``` + (`developers.openai.com/codex/auth`.) +4. **Transplant a prepared `auth.json`** generated on a machine that did the + browser login, copied into `$CODEX_HOME/auth.json` in the sandbox (SSH/Docker + copy pattern; `developers.openai.com/codex/auth`). + +Custom-provider config (e.g. proxy/Azure) uses `config.toml` with `env_key` so the +secret is never checked into the dotfile (`developers.openai.com/codex/config-advanced`): + +```toml +model = "gpt-5.4" +model_provider = "proxy" + +[model_providers.proxy] +name = "OpenAI using LLM proxy" +base_url = "http://proxy.example.com" +env_key = "OPENAI_API_KEY" +``` + +Useful headless flags: `codex exec --json`, `--output-schema <path>`, +`--ephemeral` (don't persist session files), `--skip-git-repo-check`, +`--ignore-user-config`, `--sandbox <mode>` (`developers.openai.com/codex/noninteractive`, +`/codex/cli/reference`). 
+ +**Gotcha to design around:** Codex's API-key-via-env sign-in is blocked while a +ChatGPT subscription login is active in the same `CODEX_HOME` +([issue #3286](https://github.com/openai/codex/issues/3286)). For deterministic +headless runs give each agent run a clean `CODEX_HOME` and exactly one credential +mode. + +### Recommendation on the Codex secret + +Model a **harness-typed "codex secret"** in the agent config that can carry either +(i) an OpenAI API key or (ii) a ChatGPT access token, plus a target mode. The +startup/secrets hook then materializes it for whichever harness is selected: + +- pi harness, `openai-codex-responses` -> write to pi `auth.json` / env as the + `openai` credential. +- Codex CLI harness -> either export `CODEX_API_KEY` for the single `codex exec`, + or render a fresh `$CODEX_HOME/auth.json`, or pipe a token to + `codex login --with-access-token`. + +This keeps the secret abstraction harness-agnostic and matches the README's +"swappable harness" requirement. + +## 5. Security best practices + +### Keeping secrets out of logs / traces / instrumentation + +pi's observability design (`packages/agent/docs/observability.md`) already treats +this as a first-class concern. pi emits structured lifecycle events +(`pi.agent.prompt`, `pi.ai.provider.request`, `pi.agent.tool_call`, ...) that an +adapter turns into OTel/Sentry spans. The doc defines an explicit allow/deny list: + +- **Safe by default** (emitted): provider, model, API id, session id, entry type, + tool name, status code, stop reason, token counts, costs, durations. +- **Unsafe by default** (NOT emitted): prompts, completions, tool args, tool + results, shell output, file contents, provider request payloads, provider + response bodies, **API keys**, **headers**. "Content capture can be opt-in later + with explicit redaction hooks." + +So if Agenta maps pi observability events to its tracing/instrumentation, secrets +in keys/headers/payloads are excluded by default. 
**Action for Agenta:** verify our +adapter does not turn on content capture, and confirm we never log resolved +`auth.json` values or the sandbox env. Also: the `before_provider_request` / +`before_provider_payload` hooks can inspect/replace the outgoing payload, which is +the right place to add redaction if we ever capture content +(`packages/agent/docs/hooks.md`, `extensions.md`). + +Additional bleed paths to guard (pi-specific): + +- `!command` key resolution runs a shell; ensure the command itself does not echo + the secret to a place pi captures. +- pi tools include `bash`; agent-run shell output is large and can contain secrets. + pi keeps tool/shell output out of traces by default, but if we surface the + multi-message agent output to users, scrub it. +- Do not mount the host `~/.pi/agent` into the sandbox (would leak host + auth/sessions) — generate fresh files per sandbox (`containerization.md`). + +### Storage at rest + +pi stores provider creds in `~/.pi/agent/auth.json` at `0600`. pi core does not +offer an OS keyring; that option is Codex's `cli_auth_credentials_store`, not pi. +**For Agenta:** the agent config carries secrets that get versioned as a workflow +revision, so they must be **encrypted at rest in Agenta's store**, not persisted in +plaintext alongside the rest of the config, and decrypted only at injection time. +pi gives no at-rest encryption beyond file perms, so this is Agenta's +responsibility. Prefer short-lived/scoped credentials where the provider supports +them (pi docs explicitly recommend this for sandboxed runs). + +### How secrets reach the sandbox: env vs file vs API + +Ranked by isolation: + +1. **Gateway / inference routing (best):** raw provider key stays *outside* the + sandbox; sandbox calls `inference.local`; gateway injects upstream + (`containerization.md`). Use when we don't want the model key in the sandbox at + all. +2. 
**Mounted auth file** (`auth.json` / `$CODEX_HOME/auth.json`): file perms + `0600`, generated per run, removed on teardown. Can use `$ENV`/`!cmd` + indirection so the file holds a reference, not the literal. +3. **Env vars (simplest, matches pi BYOK):** fine inside a per-run sandbox; avoid + job-level env in any context that runs untrusted code (Codex doc warning). + +In all cases the **sandbox is the scope**: one agent/session -> one sandbox -> one +minimal credential set, torn down after the run. + +## Open questions + +- **Per-session custom AuthStorage in pi SDK.** `setRuntimeApiKey` (non-persisted) + and a "custom auth storage location" are documented in `sdk.md`, but the full + API for a per-`session_id` in-memory credential store is not spelled out. + Confirm against `@earendil-works/pi-agent-core` / `pi-coding-agent` types. +- **Does Agenta want pi-harness Codex (`openai-codex-responses`) or the real Codex + CLI as the swappable harness?** They have different secret shapes and different + instrumentation stories (pi events vs Codex `--json` stream). Decide before + designing the "codex secret" type. +- **Daytona secret primitives.** This file covers pi + Codex. Whether Daytona has + its own secret/env-injection API that the startup hook should use (vs writing + files/env ourselves) is out of scope here — covered by the Daytona research + topic in the README. +- **Codex `CODEX_HOME` isolation per run.** Confirm we give each Codex-harness run + a clean `CODEX_HOME` to avoid the ChatGPT-vs-API-key conflict + ([issue #3286](https://github.com/openai/codex/issues/3286)). +- **Third-party secret extensions.** `pi-secured-setup` / `pi-heimdall` / + "Greywall" exist but are **UNVERIFIED** as to maintenance and fit; do not depend + on them. If we want redaction, build it on the core `before_provider_*` hooks. +- **pi's `enableAnalytics` / `trackingId`.** Opt-in analytics exists + (`PI_EXPERIMENTAL=1` setup). 
Confirm it is off in our sandbox image so nothing + leaves the box unexpectedly. + +## Sources + +pi.dev (Earendil) — primary: + +- pi.dev landing page — product overview, providers, modes: https://pi.dev +- providers.md (auth.json, provider env vars, /login, OAuth, ChatGPT Plus/Pro + (Codex)): https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/providers.md +- custom-provider.md (registerProvider, apiKey/header syntax, + `openai-codex-responses` API type, OAuthCredentials, authHeader): + https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/custom-provider.md +- security.md (local trust boundary, no built-in sandbox, "minimum credentials"): + https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/security.md +- containerization.md (Docker `-e` keys, Gondolin, OpenShell inference routing): + https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/containerization.md +- settings.md (telemetry endpoints, PI_OFFLINE, analytics, sessionDir): + https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/settings.md +- usage.md (env vars, /login, --api-key): + https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/usage.md +- quickstart.md / index.md (subscription vs API-key first run): + https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/quickstart.md +- extensions.md (events: session_start, tool_call, before_provider_request): + https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md +- sdk.md (AuthStorage precedence, setRuntimeApiKey, custom auth storage): + https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md +- packages/agent/docs/observability.md (safe/unsafe-by-default trace fields): + https://github.com/earendil-works/pi/blob/main/packages/agent/docs/observability.md +- packages/agent/docs/hooks.md (before_provider_request/payload transform hooks): + 
https://github.com/earendil-works/pi/blob/main/packages/agent/docs/hooks.md +- Cloudflare AI Gateway request (gateway is user-operated): + https://github.com/earendil-works/pi/issues/3850 +- pi-secret-guard package page (third-party, author acarerdinc): + https://pi.dev/packages/pi-secret-guard + +OpenAI Codex — primary: + +- Codex authentication (ChatGPT vs API key, auth.json, CODEX_HOME, + cli_auth_credentials_store, --with-api-key, --with-access-token): + https://developers.openai.com/codex/auth +- Codex non-interactive (codex exec, CODEX_API_KEY, --ephemeral, --json, sandbox): + https://developers.openai.com/codex/noninteractive +- Codex CLI reference (flags): https://developers.openai.com/codex/cli/reference +- Codex advanced config (model_providers, env_key): + https://developers.openai.com/codex/config-advanced +- Codex enterprise access tokens: + https://developers.openai.com/codex/enterprise/access-tokens +- Issue #5212 (OPENAI_API_KEY without writing auth.json — closed not planned): + https://github.com/openai/codex/issues/5212 +- Issue #3286 (env API-key sign-in blocked when ChatGPT login active): + https://github.com/openai/codex/issues/3286 + +Secondary / corroborating (not load-bearing): + +- simplified.guide Codex API-key login (codex login --with-api-key, login status): + https://www.simplified.guide/codex/api-key-login +- Mario Zechner (pi author) build notes: https://mariozechner.at/posts/2025-11-30-pi-coding-agent/ diff --git a/docs/design/agent-workflows/research/daytona-sandbox.md b/docs/design/agent-workflows/research/daytona-sandbox.md new file mode 100644 index 0000000000..df794d25c8 --- /dev/null +++ b/docs/design/agent-workflows/research/daytona-sandbox.md @@ -0,0 +1,482 @@ +# Daytona sandbox integration for agent workflows + +Research only. 
This file documents how the backend would programmatically create a +Daytona sandbox, install and run the pi.dev harness inside it, lay down files, inject +secrets, run the agent, stream output, and tear down. Every claim is cited. Items I could +not confirm from a primary source are marked UNVERIFIED. + +Context: see [`../README.md`](../README.md). Agents run on a pi.dev harness inside a +Daytona sandbox ("or any provider that works with our port"). Startup hooks lay down +config files, then inject secrets. + +## Summary + +- Daytona is an open-source (AGPL 3.0) "secure and elastic infrastructure for running + AI-generated code." Sandboxes are isolated machines with their own kernel, filesystem, + and network. It advertises sandbox start "under 90ms from code to execution." + [README](https://github.com/daytonaio/daytona), [docs](https://www.daytona.io/docs/en/). +- There is a first-class **Python SDK** (`pip install daytona`, package `daytona`, with + both sync `Daytona` and async `AsyncDaytona` clients), plus TypeScript, Go, Ruby, and + Java SDKs, a REST API, and a CLI. + [Python SDK](https://www.daytona.io/docs/en/python-sdk/), + [docs landing](https://www.daytona.io/docs/en/). +- Lifecycle: `daytona.create(...)` → `sandbox.process.exec(...)` / sessions → + `sandbox.stop()` / `sandbox.delete()`. States are creating/started/stopping/stopped/ + archiving/archived/deleting/deleted/error. Auto-stop (default 15 min), auto-archive + (default 7 days), and auto-delete (off by default) timers manage idle sandboxes. + [Sandboxes](https://www.daytona.io/docs/en/sandboxes/), + [SDK reference](https://www.daytona.io/docs/python-sdk/sync/daytona/). +- **Installing pi**: best fit is to bake pi into a custom **snapshot** (reusable image + template) so cold start does not pay an `npm install`. 
Build the snapshot from a base + image plus install commands using the **declarative Image builder** or a Dockerfile, or + install pi at runtime via `npm i -g @earendil-works/pi-coding-agent` / + `curl -fsSL https://pi.dev/install.sh | sh`. pi runs headless in print/JSON/RPC modes. + [Snapshots](https://www.daytona.io/docs/en/snapshots/), + [Declarative builder](https://www.daytona.io/docs/en/declarative-builder/), + [pi README](https://github.com/earendil-works/pi/blob/main/packages/coding-agent/README.md). +- **Files**: `sandbox.fs.upload_file` / `upload_files` (in-memory bytes → remote path), + plus `git` clone and mounted **volumes**. **Secrets/env**: `env_vars={...}` at create + time, `env={...}` per `exec`, baked `.env` in the image, or write a `.env`-style file + via the filesystem API. [File system](https://www.daytona.io/docs/en/file-system-operations/), + [SDK reference](https://www.daytona.io/docs/python-sdk/sync/daytona/). +- **Streaming**: run the agent in a **session** with `run_async=True`, then stream + stdout/stderr through `get_session_command_logs_async(session_id, cmd_id, on_stdout, + on_stderr)`. This maps cleanly onto pi's multi-message output if pi runs in JSON/RPC + mode (each emitted JSON line is one log chunk). [Process execution](https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/process-code-execution.mdx). +- **Ports / "works with our port"**: `sandbox.get_preview_link(port)` returns a public URL + `https://{port}-{sandboxId}.proxy.daytona.work` plus an auth `token` (sent as + `x-daytona-preview-token`). Any HTTP port 1–65535 can be previewed. This is the + provider-agnostic "port contract" the design alludes to. + [Preview](https://www.daytona.io/docs/en/preview/). +- **Self-host**: yes, AGPL, via docker-compose (local) or a domain deployment behind + Caddy. Auth is API keys (`DAYTONA_API_KEY`, `X-Daytona-Organization-ID` for JWT) backed + by Dex/Auth0 OIDC. 
[OSS deployment](https://www.daytona.io/docs/en/oss-deployment/), + [API keys](https://www.daytona.io/docs/en/api-keys/). + +## Daytona SDK and lifecycle (Python, with code) + +### Install and client + +```bash +pip install daytona # package name: "daytona"; module import: "daytona" +``` + +```python +from daytona import Daytona, DaytonaConfig + +# From env vars: DAYTONA_API_KEY, DAYTONA_API_URL, DAYTONA_TARGET +daytona = Daytona() + +# Or explicit config +daytona = Daytona(DaytonaConfig( + api_key="YOUR_API_KEY", + api_url="https://app.daytona.io/api", # point at self-hosted URL for own infra + target="us", +)) +``` + +Async client (recommended for a FastAPI backend): + +```python +from daytona import AsyncDaytona + +async with AsyncDaytona() as daytona: + sandbox = await daytona.create() +``` + +Source: [Python SDK](https://www.daytona.io/docs/en/python-sdk/), +[API keys](https://www.daytona.io/docs/en/api-keys/). + +### Create / exec / stop / delete + +```python +# Create (defaults: python language, 1 vCPU / 1GB RAM / 3GiB disk) +sandbox = daytona.create() + +# Run a command +resp = sandbox.process.exec("echo 'Hello, World!'") +print(resp.result) + +# Stop, then delete (method names per SDK reference and sandboxes doc) +sandbox.stop() +sandbox.delete() +``` + +`Daytona.create()` signatures (note the default 60s creation timeout): + +```python +create(params: CreateSandboxFromSnapshotParams | None = None, + *, timeout: float = 60) -> Sandbox + +create(params: CreateSandboxFromImageParams | None = None, + *, timeout: float = 60, + on_snapshot_create_logs: Callable[[str], None] | None = None) -> Sandbox +``` + +`Sandbox` exposes submodules: `process`, `fs` / `file_system`, `git`, `object_storage`, +`volume`. Source: [SDK reference](https://www.daytona.io/docs/python-sdk/sync/daytona/), +[Sandboxes](https://www.daytona.io/docs/en/sandboxes/). 
+ +### Creation params (the important fields) + +`CreateSandboxFromSnapshotParams` and `CreateSandboxFromImageParams` both inherit +`CreateSandboxBaseParams`: + +- `snapshot: str` (snapshot params) or `image: str | Image` (image params) +- `resources: Resources | None` — only on the image params variant +- `name`, `language` (default `"python"`), `os_user` +- `env_vars: dict[str, str] | None` — **environment variables in the sandbox** +- `labels: dict[str, str] | None` +- `public: bool | None` +- `timeout: float | None` +- `auto_stop_interval: int | None` — minutes; default 15; `0` disables +- `auto_archive_interval: int | None` — minutes; default 7 days; `0` = max +- `auto_delete_interval: int | None` — minutes; off by default; `0` deletes immediately +- `volumes: list[VolumeMount] | None` +- `network_block_all: bool | None`, `network_allow_list: str | None` (CIDRs) +- `ephemeral: bool | None` — sets `auto_delete_interval=0` when True +- `linked_sandbox: str | None` + +Source: [SDK reference](https://www.daytona.io/docs/python-sdk/sync/daytona/). + +## Installing pi (image / snapshot strategy) + +pi.dev (the "pi coding agent") is a minimal, swappable agent harness. Install options +([pi README](https://github.com/earendil-works/pi/blob/main/packages/coding-agent/README.md)): + +```bash +npm install -g --ignore-scripts @earendil-works/pi-coding-agent +# or +curl -fsSL https://pi.dev/install.sh | sh +``` + +Three baking strategies, in order of recommendation for the agent loop: + +### 1. Prebuilt snapshot (recommended) + +A **snapshot** is a reusable sandbox template built from a Docker/OCI image. Bake pi (and +Node) into it once, reuse for every run, and you avoid paying `npm install` on each cold +start. [Snapshots](https://www.daytona.io/docs/en/snapshots/). 
+ +```python +from daytona import Daytona, CreateSnapshotParams, Image, Resources + +daytona = Daytona() + +image = ( + Image.base("node:22-bookworm") + .run_commands("npm install -g --ignore-scripts @earendil-works/pi-coding-agent") + .workdir("/home/daytona") +) + +daytona.snapshot.create( + CreateSnapshotParams( + name="agenta-pi-harness", + image=image, + resources=Resources(cpu=2, memory=4, disk=8), + ), + on_logs=print, # build logs +) +``` + +Then create sandboxes from it (fast path): + +```python +from daytona import CreateSandboxFromSnapshotParams + +sandbox = daytona.create( + CreateSandboxFromSnapshotParams(snapshot="agenta-pi-harness") +) +``` + +CLI equivalents: `daytona snapshot create --image <image>`, +`daytona snapshot create --dockerfile ./Dockerfile`, +`daytona snapshot push --name <name>`, `daytona snapshot list|activate|delete`. + +### 2. Declarative Image built on demand + +Pass an `Image` object straight to `create()` and Daytona builds it on the fly. Good for +iteration, slower than a prebuilt snapshot on first use. +[Declarative builder](https://www.daytona.io/docs/en/declarative-builder/). 
+ +```python +from daytona import CreateSandboxFromImageParams, Image + +image = ( + Image.debian_slim("3.12") + .run_commands( + "apt-get update && apt-get install -y curl", + "curl -fsSL https://pi.dev/install.sh | sh", + ) + .add_local_file("AGENTS.md", "/home/daytona/AGENTS.md") # config files + .env({"PI_HOME": "/home/daytona/.pi"}) + .workdir("/home/daytona") +) + +sandbox = daytona.create( + CreateSandboxFromImageParams(image=image), + timeout=0, # 0 = no timeout while the image builds + on_snapshot_create_logs=print, # stream build logs +) +``` + +Builder methods available: `Image.debian_slim(py_ver)`, `Image.base(ref)`, +`Image.from_dockerfile(path)`, `.pip_install([...])`, +`.pip_install_from_requirements(path)`, `.pip_install_from_pyproject(path, ...)`, +`.run_commands(...)`, `.env({...})`, `.workdir(path)`, `.add_local_file(src, dst)`, +`.add_local_dir(src, dst)`, `.dockerfile_commands([...])`. + +### 3. Install at runtime + +Create a plain sandbox, then `sandbox.process.exec("npm i -g @earendil-works/pi-coding-agent")`. +Simplest but pays install latency on every run; only sensible for prototyping. + +Note on local parity (design requirement): the same `@earendil-works/pi-coding-agent` +package and `AGENTS.md` / skills layout work identically on a developer machine, so a +config pulled from the server runs the same locally. pi resolves `AGENTS.md` from +`~/.pi/agent/agent.md` (global), parent dirs, and cwd; skills live in +`~/.pi/agent/skills/`, `.pi/skills/`, or project dirs. +[pi README](https://github.com/earendil-works/pi/blob/main/packages/coding-agent/README.md). + +## Files + secrets injection + +Order matches the design's startup hooks: files first, secrets second. 
+ +### Files into the sandbox + +In-memory upload (no local temp file needed — good for config blobs pulled from the DB): + +```python +# Single file: source bytes -> remote path +sandbox.fs.upload_file(agents_md_bytes, "/home/daytona/AGENTS.md") + +# Bulk +from daytona import FileUpload +sandbox.fs.upload_files([ + FileUpload(source=agents_md_bytes, destination="/home/daytona/AGENTS.md"), + FileUpload(source=skill_bytes, destination="/home/daytona/.pi/agent/skills/x/SKILL.md"), +]) + +sandbox.fs.create_folder("/home/daytona/.pi/agent/skills", "755") +sandbox.fs.set_file_permissions("/home/daytona/AGENTS.md", "644") +``` + +Source: [File system operations](https://www.daytona.io/docs/en/file-system-operations/). + +Other ways to get files in: `sandbox.git` clone; mounted **volumes** (`VolumeMount`, +shared persistent storage); baking files into the image with `.add_local_file` / +`.add_local_dir`. [Volumes](https://www.daytona.io/docs/en/volumes/) (UNVERIFIED on exact +volume API surface; listed in SDK submodules and snapshots doc). + +### Secrets / env vars + +Several layers, pick by sensitivity and lifetime: + +```python +# A) Whole-sandbox env at creation +sandbox = daytona.create(CreateSandboxFromSnapshotParams( + snapshot="agenta-pi-harness", + env_vars={"OPENAI_API_KEY": "sk-...", "ANTHROPIC_API_KEY": "sk-ant-..."}, +)) + +# B) Per-command env (scoped to one exec) +sandbox.process.exec("echo $CUSTOM_SECRET", env={"CUSTOM_SECRET": "DAYTONA"}) + +# C) Write a .env file via the filesystem API, then have pi/harness read it +sandbox.fs.upload_file(b"ANTHROPIC_API_KEY=sk-ant-...\n", "/home/daytona/.env") +``` + +`env_vars` is a field on `CreateSandboxBaseParams` +([SDK reference](https://www.daytona.io/docs/python-sdk/sync/daytona/)); per-exec `env` +is shown in [process execution](https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/process-code-execution.mdx). 
+pi reads provider keys from standard env vars (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, +etc.), so `env_vars` at create time is the cleanest secret injection path +([pi README](https://github.com/earendil-works/pi/blob/main/packages/coding-agent/README.md)). +The OpenClaw guide confirms the same pattern: extra keys (e.g. `ANTHROPIC_API_KEY`) added +to `.env.sandbox` are loaded into the sandbox +([OpenClaw guide](https://www.daytona.io/docs/en/guides/openclaw/openclaw-sdk-sandbox/)). + +Daytona also has a server-side **secrets** concept (scoped secret injection) referenced in +its security program, but I did not find a dedicated public SDK method for an +organization secret vault; treat that as UNVERIFIED and prefer `env_vars` for now. +[SECURITY.md](https://github.com/daytonaio/daytona/blob/main/SECURITY.md). + +## Process exec + streaming + ports + +### One-shot exec + +```python +resp = sandbox.process.exec("pi -p 'analyze repo'", cwd="/home/daytona", timeout=600) +print(resp.result) # buffered stdout; returned after the command finishes +``` + +`exec` supports `cwd`, `env`, and `timeout`. +[process execution](https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/process-code-execution.mdx). 
+ +### Long-running agent + live stdout/stderr streaming (the agent loop) + +Run the harness async inside a **session** and stream stdout and stderr via callbacks: + +```python +import asyncio +from daytona import SessionExecuteRequest + +session_id = "agent-run-<run-id>" +sandbox.process.create_session(session_id) + +command = sandbox.process.execute_session_command( + session_id, + SessionExecuteRequest( + command="pi --mode json -p 'do the task'", + run_async=True, + ), +) + +logs_task = asyncio.create_task( + sandbox.process.get_session_command_logs_async( + session_id, + command.cmd_id, + lambda chunk: handle_stdout(chunk), # each chunk = pi JSON line(s) + lambda chunk: handle_stderr(chunk), + ) +) + +# Optional interactive input back into the process +sandbox.process.send_session_command_input(session_id, command.cmd_id, "y") + +await logs_task +``` + +This is the recommended shape for the multi-message agent output: run pi in +`--mode json` (or `--mode rpc`), and each emitted JSON line becomes a streamed log chunk +the backend forwards to the client. pi's JSON/RPC event stream emits typed events +(`agent_start`, `message_update` with `text_delta`, `tool_execution_start/update/end`, +`agent_end`), so the backend can map each event to an agent message / tool span for +tracing. RPC framing is strict LF-delimited JSONL — split on `\n` only. +Sources: [process execution](https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/process-code-execution.mdx), +[pi RPC](https://github.com/badlogic/pi-mono/blob/main/packages/coding-agent/docs/rpc.md), +[pi README](https://github.com/earendil-works/pi/blob/main/packages/coding-agent/README.md). + +pi mode summary for headless use: +- `pi -p "<prompt>"` — print mode, runs once and exits (buffered text). +- `pi --mode json` — same as print but emits all events as JSON lines (best for parsing). 
+- `pi --mode rpc` — bidirectional JSONL over stdin/stdout; send + `{"type":"prompt","message":"..."}`, receive `response` + streamed events; supports + `steer` / `followUp` mid-run, `get_state`, `fork`, `switch_session`. +- Flags: `--provider`, `--model` (or `--model anthropic/claude-opus`), `--name`, + `--no-session`. +[pi RPC](https://github.com/badlogic/pi-mono/blob/main/packages/coding-agent/docs/rpc.md). + +### Ports / preview ("works with our port") + +If the harness or a tool serves HTTP, expose it with a preview link: + +```python +preview = sandbox.get_preview_link(3000) +print(preview.url) # https://3000-<sandboxId>.proxy.daytona.work +print(preview.token) # send as header: x-daytona-preview-token +``` + +Any HTTP port 1–65535 is previewable; the port opens automatically if closed. For private +sandboxes the `token` is required (header `x-daytona-preview-token`), and the token resets +when the sandbox restarts, so re-fetch after a restart. This preview/port mechanism is the +provider-agnostic "port contract" the design refers to. A self-hosted deployment serves +the equivalent under `*.proxy.<your-domain>`. +[Preview](https://www.daytona.io/docs/en/preview/), +[Preview & auth](https://www.daytona.io/docs/en/preview-and-authentication/). + +## Cold start, lifecycle states, timeouts, limits + +- **Cold start:** advertised "under 90ms from code to execution" + ([README](https://github.com/daytonaio/daytona)). UNVERIFIED how that interacts with + on-demand image builds; a *prebuilt snapshot* should hit the fast path, whereas building + a declarative `Image` on first `create()` is a separate, slower one-time build. +- **States:** creating, started, stopping, stopped, archiving, archived, deleting, + deleted, error. Archived preserves state cheaply (on object storage); restarting from + archived is slower than from stopped. [Sandboxes](https://www.daytona.io/docs/en/sandboxes/). 
+- **Timeouts / timers:** + - `create(..., timeout=60)` default 60s creation timeout (use `timeout=0` for builds). + - `auto_stop_interval`: default **15 min** of inactivity → stop; `0` disables. + - `auto_archive_interval`: default **7 days** stopped → archive; `0` = max (30 days). + - `auto_delete_interval`: **disabled by default**; `0` = delete immediately on stop; + `-1` disables. `ephemeral=True` sets it to 0. + [SDK reference](https://www.daytona.io/docs/python-sdk/sync/daytona/), + [Sandboxes](https://www.daytona.io/docs/en/sandboxes/). +- **Resources:** default **1 vCPU / 1GB RAM / 3GiB disk**; per-sandbox org max + **4 vCPU / 8GB RAM / 10GB disk**. Set via `Resources(cpu=2, memory=4, disk=8)` on the + from-image path. [Sandboxes](https://www.daytona.io/docs/en/sandboxes/). + +Implication for an agent loop: a long agent run will hit the 15-min auto-stop unless you +raise `auto_stop_interval` or keep the session active; set it explicitly for runs expected +to exceed 15 minutes, and `delete()`/`ephemeral=True` to guarantee teardown. + +## Self-host + auth + +- **Self-hostable:** yes. AGPL 3.0; "free to deploy and run in any environment," + community-supported. If you modify it and expose over a network, AGPL requires releasing + your modifications. [OSS deployment](https://www.daytona.io/docs/en/oss-deployment/). +- **Deploy modes:** local docker-compose, or a domain deployment behind Caddy (TLS, DNS + provider token, ports 80/443/2222, 4GB+ RAM). Components: API (3000, dashboard + REST), + Proxy (4000, preview routing), SSH Gateway (2222), PostgreSQL, Redis, Dex (OIDC), + Registry, MinIO (S3-compatible storage). + ```bash + git clone https://github.com/daytonaio/daytona + docker compose -f docker/docker-compose.yaml up -d # http://localhost:3000 + # or: ./scripts/setup-domain-oss-deployment.sh # guided domain + TLS setup + ``` + Local default login: `dev@daytona.io` / `password` (Dex). 
Domain setup generates + `ENCRYPTION_KEY`, `ENCRYPTION_SALT`, `PROXY_API_KEY`, `RUNNER_API_KEY`, + `SSH_GATEWAY_API_KEY`. Auth0 OIDC is an optional alternative. + [OSS deployment](https://www.daytona.io/docs/en/oss-deployment/). +- **Auth model (API):** API keys created in the Dashboard or via the API; SDK/CLI read + `DAYTONA_API_KEY` (and `DAYTONA_API_URL` to point at self-hosted). JWT-authenticated + requests additionally need `X-Daytona-Organization-ID`. For self-host, set + `api_url` / `DAYTONA_API_URL` to your deployment. + [API keys](https://www.daytona.io/docs/en/api-keys/). + +## Open questions + +- **Snapshot build pipeline ownership.** Who builds/owns the `agenta-pi-harness` snapshot + and how is it pinned/versioned per agent revision? Building a declarative `Image` on the + hot path is slow; we likely need a prebuild step in CI or at config-publish time. +- **Cold start with custom image.** The "<90ms" figure is for sandbox start; the + first-time build of a custom image/snapshot is separate and unmeasured here. UNVERIFIED: + start time from a *prebuilt* pi snapshot vs. the default image. +- **pi output → Agenta tracing mapping.** Which pi events (`message_update`, + `tool_execution_*`) map to Agenta's multi-message output and pi-instruments tracing, and + whether RPC mode (bidirectional, supports steering) or JSON print mode is the better fit + for our streaming endpoint. RPC's "bash output appears in context on the *next* prompt" + semantics needs design attention. +- **Secrets vault.** Whether Daytona exposes a real scoped-secret API beyond `env_vars` + (referenced in SECURITY.md but no public SDK method found). For now `env_vars` at + create time. UNVERIFIED. +- **Provider abstraction.** The design says "any provider that works with our port." The + Daytona preview-URL/port + token model is concrete; a sandbox-provider interface would + need to abstract create/exec/stream/preview across providers (e.g. E2B, Modal). 
Out of + scope here but the port + streaming-logs contract is the seam. +- **Volume API surface.** Exact `VolumeMount` / `daytona.volume` Python API not fully + confirmed here. UNVERIFIED. +- **Long-run auto-stop.** Confirm whether an actively streaming session resets the + `auto_stop_interval` idle timer or whether we must raise it explicitly. UNVERIFIED. + +## Sources + +- Daytona docs landing — https://www.daytona.io/docs/en/ +- Daytona GitHub (README, license, "<90ms") — https://github.com/daytonaio/daytona +- Python SDK overview — https://www.daytona.io/docs/en/python-sdk/ +- Python SDK reference (params, fields, create signatures) — https://www.daytona.io/docs/python-sdk/sync/daytona/ +- Sandboxes (lifecycle, states, resources, timers) — https://www.daytona.io/docs/en/sandboxes/ +- Snapshots (custom images, CLI) — https://www.daytona.io/docs/en/snapshots/ +- Declarative builder (Image API) — https://www.daytona.io/docs/en/declarative-builder/ +- Process & code execution (exec, sessions, async log streaming) — https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/process-code-execution.mdx +- File system operations (upload/download/permissions) — https://www.daytona.io/docs/en/file-system-operations/ +- Preview / ports / token — https://www.daytona.io/docs/en/preview/ +- Preview & authentication — https://www.daytona.io/docs/en/preview-and-authentication/ +- OSS deployment (self-host, components, auth) — https://www.daytona.io/docs/en/oss-deployment/ +- API keys (auth model) — https://www.daytona.io/docs/en/api-keys/ +- SECURITY.md (secrets management mention) — https://github.com/daytonaio/daytona/blob/main/SECURITY.md +- OpenClaw-in-sandbox guide (agent + secrets + preview pattern) — https://www.daytona.io/docs/en/guides/openclaw/openclaw-sdk-sandbox/ +- pi.dev landing — https://pi.dev , https://pi.dev/docs/latest +- pi coding-agent README (install, modes, AGENTS.md, skills) — 
https://github.com/earendil-works/pi/blob/main/packages/coding-agent/README.md +- pi RPC protocol doc (JSONL events, streaming) — https://github.com/badlogic/pi-mono/blob/main/packages/coding-agent/docs/rpc.md +- pi npm package — https://www.npmjs.com/package/@earendil-works/pi-coding-agent diff --git a/docs/design/agent-workflows/research/diskless-in-memory-config.md b/docs/design/agent-workflows/research/diskless-in-memory-config.md new file mode 100644 index 0000000000..a4f13732ca --- /dev/null +++ b/docs/design/agent-workflows/research/diskless-in-memory-config.md @@ -0,0 +1,460 @@ +# Pi agent harness: diskless / in-memory config + +Research target: Pi coding agent (pi.dev, Earendil Inc.), npm +`@earendil-works/pi-coding-agent`, verified against version **0.79.4** (matches the +version installed by `npm view`). All signatures below are quoted from the published +package's TypeScript declaration files (`dist/**/*.d.ts`), the compiled JS +(`dist/**/*.js`), the bundled SDK examples (`examples/sdk/*.ts`), and the dependency +`@earendil-works/pi-ai@0.79.4`. Source URLs are in the Sources section. + +## Summary / net answer + +**Yes — Pi can run fully diskless with all invocation-specific data in process memory.** +Every invocation-specific input we care about has a confirmed in-memory path: + +- **System prompt / AGENTS.md**: pass as in-memory strings via `DefaultResourceLoader` + (`systemPrompt` / `systemPromptOverride`, `appendSystemPrompt` / + `appendSystemPromptOverride`, `agentsFilesOverride`). No file required. +- **Skills**: register in-memory `Skill` objects via `skillsOverride`, or point at an + arbitrary directory via `additionalSkillPaths`. No fixed disk convention required. +- **Provider auth**: `AuthStorage.inMemory()` + `setRuntimeApiKey(provider, key)` (not + persisted), or per-provider env vars. Both confirmed disk-free. 
+- **Custom tools**: defined in-process via `customTools: ToolDefinition[]` / + `defineTool(...)` or `pi.registerTool(...)` in an inline `extensionFactories` function. + No file. +- **Sessions/state**: `SessionManager.inMemory()` writes nothing. + `SettingsManager.inMemory()` and `ModelRegistry.inMemory()` likewise avoid disk. + +The one thing that is **not** purely in-memory is bash/tool **output spillover**: when a +bash command (or a tool using the output accumulator) exceeds an in-memory byte +threshold, Pi spills the tail to a temp file under `os.tmpdir()`. This is the only +unavoidable write in a headless run that uses the bash/grep/find tools. Point `TMPDIR` +at a tmpfs (or make `/tmp` tmpfs) and it never touches a persistent volume. + +If you drive Pi via the **SDK** (`createAgentSession`) rather than the CLI, you also avoid +startup migrations and the CLI's `agentDir` touches entirely. If you drive it via +`pi --mode rpc`/`--print` (the `main()` CLI entrypoint), redirect `agentDir` and +`sessionDir` to tmpfs and pass `--no-session`. + +--- + +## Per-question findings + +### 1. System prompt / AGENTS.md in memory — CONFIRMED in-memory + +The system prompt and AGENTS.md content are supplied through the `ResourceLoader`, not +through top-level `createAgentSession` options. `DefaultResourceLoaderOptions` exposes +both direct values and override callbacks (quoted from +`dist/core/resource-loader.d.ts`): + +```typescript +export interface DefaultResourceLoaderOptions { + cwd: string; + agentDir: string; + ... + noContextFiles?: boolean; // disable AGENTS.md discovery from disk + systemPrompt?: string; // in-memory base system prompt + appendSystemPrompt?: string[]; // in-memory appended instructions + ... 
+ agentsFilesOverride?: (base: { + agentsFiles: Array<{ path: string; content: string }>; + }) => { agentsFiles: Array<{ path: string; content: string }> }; + systemPromptOverride?: (base: string | undefined) => string | undefined; + appendSystemPromptOverride?: (base: string[]) => string[]; +} +``` + +The `ResourceLoader` interface returns these to the session via +`getSystemPrompt(): string | undefined`, `getAppendSystemPrompt(): string[]`, and +`getAgentsFiles(): { agentsFiles: Array<{ path: string; content: string }> }`. + +**Replace the entire system prompt (in memory)** — from `examples/sdk/03-custom-prompt.ts`: + +```typescript +const loader1 = new DefaultResourceLoader({ + cwd, agentDir, + systemPromptOverride: () => `You are a helpful assistant that speaks like a pirate. +Always end responses with "Arrr!"`, + // Needed to avoid DefaultResourceLoader appending APPEND_SYSTEM.md from ~/.pi/agent or /.pi. + appendSystemPromptOverride: () => [], +}); +await loader1.reload(); +const { session } = await createAgentSession({ + resourceLoader: loader1, + sessionManager: SessionManager.inMemory(), +}); +``` + +**Inject AGENTS.md content in memory** — from `examples/sdk/07-context-files.ts`: + +```typescript +const loader = new DefaultResourceLoader({ + cwd: process.cwd(), agentDir: getAgentDir(), + agentsFilesOverride: (current) => ({ + agentsFiles: [ + ...current.agentsFiles, + { path: "/virtual/AGENTS.md", content: `# Project Guidelines ...` }, + ], + }), +}); +``` + +Note the file comment: "Disable context files entirely by returning an empty list in +`agentsFilesOverride`." (return `{ agentsFiles: [] }`), or set `noContextFiles: true`. + +**Where Pi reads AGENTS.md from disk by default** (so it can be pointed at tmpfs or +disabled): `loadProjectContextFiles({ cwd, agentDir })` walks from `cwd` upward and reads +the `agentDir`. CLI flag to disable: `--no-context-files` (`Args.noContextFiles`). 
+The CLI also exposes `--system-prompt` and `--append-system-prompt` +(`Args.systemPrompt?: string`, `Args.appendSystemPrompt?: string[]` in +`dist/cli/args.d.ts`), so over RPC/print mode you can pass the prompt as a process arg +(in memory, no file). + +### 2. Skills in memory — CONFIRMED both in-memory registration and arbitrary path + +Skills are normally a **directory-of-files** convention. From `dist/core/skills.d.ts` +(`loadSkillsFromDir` doc comment): + +> Discovery rules: +> - if a directory contains SKILL.md, treat it as a skill root and do not recurse further +> - otherwise, load direct .md children in the root +> - recurse into subdirectories to find SKILL.md + +Default discovery locations (from the docs and `DefaultResourceLoader`): `.pi/skills/`, +`.agents/skills/` (walking up), `~/.agents/skills/`, `~/.pi/agent/skills/`. + +A `Skill` is a plain object, so it can be created **in memory** with no file: + +```typescript +export interface Skill { + name: string; + description: string; + filePath: string; + baseDir: string; + sourceInfo: SourceInfo; + disableModelInvocation: boolean; +} +``` + +**Register an in-memory skill** — from `examples/sdk/04-skills.ts`: + +```typescript +const customSkill: Skill = { + name: "my-skill", + description: "Custom project instructions", + filePath: "/virtual/SKILL.md", + baseDir: "/virtual", + sourceInfo: createSyntheticSourceInfo("/virtual/SKILL.md", { source: "sdk" }), + disableModelInvocation: false, +}; +const loader = new DefaultResourceLoader({ + cwd: process.cwd(), agentDir: getAgentDir(), + skillsOverride: (current) => ({ + skills: [...current.skills, customSkill], + diagnostics: current.diagnostics, + }), +}); +``` + +**Point skills at an arbitrary path**: `DefaultResourceLoaderOptions.additionalSkillPaths?: +string[]` (and `noSkills?: boolean` to disable default discovery). CLI equivalents: +`--skills ` (`Args.skills?: string[]`) and `--no-skills` (`Args.noSkills`). 
+The lower-level `loadSkills({ cwd, agentDir, skillPaths, includeDefaults })` confirms +`skillPaths` is an explicit list and `includeDefaults` can be turned off. + +Caveat: the skill's `filePath`/`baseDir` only matter if the skill body is read lazily on +invocation. For a fully synthetic in-memory skill you must ensure the content is provided +up front; if Pi reads `filePath` on `/skill:name` invocation it would need that path to +exist. For pure "inject instructions into the system prompt" use, `formatSkillsForPrompt` +uses `name`/`description` and the prompt formatting only. UNVERIFIED whether explicit +`/skill:name` expansion re-reads `filePath` from disk for an SDK-injected synthetic skill; +to be safe, point synthetic skills at a tmpfs path or set +`disableModelInvocation`/use systemPrompt injection instead. + +### 3. Provider / LLM auth in memory — CONFIRMED (three disk-free paths) + +**(a) Environment variables.** `@earendil-works/pi-ai@0.79.4` `dist/env-api-keys.js` +contains the canonical provider→env-var map (`getApiKeyEnvVars`). 
Exact names: + +- anthropic: `ANTHROPIC_OAUTH_TOKEN` (precedence) then `ANTHROPIC_API_KEY` +- openai: `OPENAI_API_KEY` +- google (Gemini): `GEMINI_API_KEY` +- google-vertex: `GOOGLE_CLOUD_API_KEY` (or ADC via `GOOGLE_APPLICATION_CREDENTIALS` + + `GOOGLE_CLOUD_PROJECT`/`GCLOUD_PROJECT` + `GOOGLE_CLOUD_LOCATION`) +- amazon-bedrock: `AWS_PROFILE` | `AWS_ACCESS_KEY_ID`+`AWS_SECRET_ACCESS_KEY` | + `AWS_BEARER_TOKEN_BEDROCK` | ECS/IRSA container creds +- azure-openai-responses: `AZURE_OPENAI_API_KEY` +- xai: `XAI_API_KEY`; groq: `GROQ_API_KEY`; cerebras: `CEREBRAS_API_KEY`; + deepseek: `DEEPSEEK_API_KEY`; mistral: `MISTRAL_API_KEY`; nvidia: `NVIDIA_API_KEY`; + openrouter: `OPENROUTER_API_KEY`; together: `TOGETHER_API_KEY`; + fireworks: `FIREWORKS_API_KEY`; vercel-ai-gateway: `AI_GATEWAY_API_KEY`; + github-copilot: `COPILOT_GITHUB_TOKEN`; huggingface: `HF_TOKEN`; + moonshotai / moonshotai-cn: `MOONSHOT_API_KEY`; kimi-coding: `KIMI_API_KEY`; + zai: `ZAI_API_KEY`; zai-coding-cn: `ZAI_CODING_CN_API_KEY`; + minimax: `MINIMAX_API_KEY`; minimax-cn: `MINIMAX_CN_API_KEY`; + opencode / opencode-go: `OPENCODE_API_KEY`; + cloudflare-workers-ai / cloudflare-ai-gateway: `CLOUDFLARE_API_KEY`; + xiaomi family: `XIAOMI_API_KEY`, `XIAOMI_TOKEN_PLAN_{CN,AMS,SGP}_API_KEY`; + ant-ling: `ANT_LING_API_KEY`. + +**(b) Runtime in-memory setter — CONFIRMED.** `dist/core/auth-storage.d.ts`: + +```typescript +export declare class AuthStorage { + static create(authPath?: string): AuthStorage; + static fromStorage(storage: AuthStorageBackend): AuthStorage; + static inMemory(data?: AuthStorageData): AuthStorage; + /** Set a runtime API key override (not persisted to disk). Used for CLI --api-key flag. */ + setRuntimeApiKey(provider: string, apiKey: string): void; + removeRuntimeApiKey(provider: string): void; + setFallbackResolver(resolver: (provider: string) => string | undefined): void; + ... +} +export declare class InMemoryAuthStorageBackend implements AuthStorageBackend { ... 
} +``` + +So `setRuntimeApiKey(provider: string, apiKey: string): void` is real (UNVERIFIED in the +original brief — now CONFIRMED). Resolution priority in `getApiKey()`: +1. runtime override (`--api-key` / `setRuntimeApiKey`), 2. `auth.json` API key, +3. `auth.json` OAuth (auto-refreshed), 4. environment variable, 5. fallback resolver. + +`AuthStorage.inMemory()` plus `InMemoryAuthStorageBackend` give a fully in-memory store. +Verified in the compiled `dist/core/auth-storage.js`: every `writeFileSync`/`mkdirSync`/ +`chmodSync` call lives inside `FileAuthStorageBackend` (class starts line 17); the +`InMemoryAuthStorageBackend` class (line 127) performs no filesystem writes. + +From `examples/sdk/09-api-keys-and-oauth.ts`: + +```typescript +// Runtime API key override (not persisted to disk) +authStorage.setRuntimeApiKey("anthropic", "sk-my-temp-key"); +// No models.json - only built-in models +const simpleRegistry = ModelRegistry.inMemory(authStorage); +``` + +**(c) RPC protocol credential message — NOT PRESENT.** The full `RpcCommand` union in +`dist/modes/rpc/rpc-types.d.ts` has no `set_api_key` / `set_credential` / auth message +(commands are: prompt, steer, follow_up, abort, new_session, get_state, set_model, +cycle_model, get_available_models, set_thinking_level, cycle_thinking_level, +set_steering_mode, set_follow_up_mode, compact, set_auto_compaction, set_auto_retry, +abort_retry, bash, abort_bash, get_session_stats, export_html, switch_session, fork, +clone, get_fork_messages, get_last_assistant_text, set_session_name, get_messages, +get_commands). **Implication:** in RPC mode, credentials must be supplied at process spawn +— via env vars or the `--api-key`/`--provider` CLI flags (`Args.apiKey`, `Args.provider`). +You cannot inject a key over the JSONL channel after spawn. If you need post-spawn, +in-memory key injection without env vars, drive Pi via the **SDK** and pass a custom +`AuthStorage` instead of RPC mode. + +### 4. 
Tool auth / custom tools in memory — CONFIRMED in-process, no file + +Custom tools are pure in-process definitions. Two confirmed paths: + +**Via `customTools` on `createAgentSession`** (`dist/core/sdk.d.ts`): + +```typescript +export interface CreateAgentSessionOptions { + ... + /** Custom tools to register (in addition to built-in tools). */ + customTools?: ToolDefinition[]; + ... +} +``` + +A `ToolDefinition` (`dist/core/extensions/types.d.ts`) carries its own `execute(...)` +function — so any auth/config the tool needs is closed over in code, no on-disk config: + +```typescript +export interface ToolDefinition<TParams, ...> { + name: string; label: string; description: string; + parameters: TParams; // TypeBox schema + execute(toolCallId, params, signal, onUpdate, ctx): Promise<...>; + ... +} +export declare function defineTool<...>(tool: ToolDefinition<...>): ...; +``` + +**Via inline extension factory + `pi.registerTool`** (`examples/sdk/06-extensions.ts`): + +```typescript +const resourceLoader = new DefaultResourceLoader({ + cwd: process.cwd(), agentDir: getAgentDir(), + extensionFactories: [ + (pi) => { pi.on("agent_start", () => { ... }); }, + ], +}); +// inside an extension: pi.registerTool({ name: "my_tool", label: "My Tool", ... }) +``` + +`ExtensionRunner.registerTool<...>(tool: ToolDefinition<...>): void` is in the type +surface. Both paths require no file: the extension can be an inline function passed in +`extensionFactories`, and tool auth is whatever the closure references (e.g. an HTTP +client back to your backend). Built-in tool selection is also code-only via +`tools`/`excludeTools`/`noTools` on `createAgentSession`. + +### 5. Working directory / cwd and state files — what Pi writes, and how to redirect + +**Path knobs (from `dist/config.js`):** + +- `getAgentDir()` returns `process.env.PI_CODING_AGENT_DIR` (expanded) if set, else + `~/.pi/agent`. The env var name is built as + `` `${APP_NAME.toUpperCase()}_CODING_AGENT_DIR` `` with `APP_NAME = "pi"`, i.e. 
+ **`PI_CODING_AGENT_DIR`**. +- Session dir env var **`PI_CODING_AGENT_SESSION_DIR`** (`ENV_SESSION_DIR`), read in + `main.js`. Resolution order in CLI: `--session-dir` flag → `PI_CODING_AGENT_SESSION_DIR` + → settings default. Default session dir: + `getDefaultSessionDir(cwd, agentDir)` = `<agentDir>/sessions/--<encoded-cwd>--/` + (it `mkdirSync`s the dir). +- All other config files hang off `agentDir`: `auth.json`, `models.json`, `settings.json`, + `tools/`, `bin/`, `prompts/`, `themes/`, `sessions/`, and the debug log + `<agentDir>/pi-debug.log`. Redirecting `PI_CODING_AGENT_DIR` moves all of them. + +**SDK-level in-memory replacements (no disk):** + +- `SessionManager.inMemory(cwd?)` — "Create an in-memory session (no file persistence)". + Verified: `SessionManager` only `writeFileSync`s when `this.persist` is true; `inMemory` + sets `persist=false`. +- `SettingsManager.inMemory(settings?)` — no `settings.json` read/write. +- `ModelRegistry.inMemory(authStorage)` — built-in models only, no `models.json`. +- `AuthStorage.inMemory()` / custom `AuthStorageBackend` — no `auth.json`. + +**What Pi writes on its own during a run (headless), and how to neutralize it:** + +| Writer (dist file) | Path | When | Redirect / avoid | +| --- | --- | --- | --- | +| `core/session-manager.js` | `<agentDir>/sessions/...*.jsonl` | every persisted session | `SessionManager.inMemory()` (SDK) or `--no-session` (CLI). Else `PI_CODING_AGENT_SESSION_DIR`→tmpfs. 
| +| `core/bash-executor.js` | `os.tmpdir()/pi-bash-.log` | only when bash output exceeds `DEFAULT_MAX_BYTES` (spillover) | set `TMPDIR` to tmpfs / make `/tmp` tmpfs | +| `core/tools/output-accumulator.js` | `os.tmpdir()/-.log` | tool output spillover above threshold | same (`TMPDIR`→tmpfs) | +| `core/settings-manager.js` | `/settings.json`, `/.pi/settings.json` | only on settings change with persistence | `SettingsManager.inMemory()` | +| `core/auth-storage.js` (`FileAuthStorageBackend`) | `/auth.json` | only with file-backed AuthStorage | `AuthStorage.inMemory()` / `setRuntimeApiKey` | +| `core/trust-manager.js` | project trust file under `/.pi` / agentDir | only when project-trust resolution runs | avoid project `.pi` resources; SDK path skips trust prompts | +| `core/package-manager.js` | `/tmp/extensions/` | only when installing/loading extension packages | use inline `extensionFactories` (no package install) | +| `core/agent-session-runtime.js` | `/` | only when attaching files + persistence | in-memory session; don't attach files | +| `core/agent-session.js` | export path | only on explicit `exportToHtml`/`exportToJsonl` | don't call exports | +| `utils/tools-manager.js` | `/bin/{rg,fd}` | only if `rg`/`fd` not found in PATH | pre-install ripgrep + fd in the sandbox image (it prefers system binaries in PATH) | +| `migrations.js` (CLI only) | `/auth.json`, `settings.json` | `main()` startup, only if legacy files present | SDK path doesn't call it; or point `PI_CODING_AGENT_DIR` at an empty tmpfs | + +The interactive TUI also writes `pi-debug.log` and reads more of `agentDir`, but those +code paths (`modes/interactive/*`) do not run in `--mode rpc`, `--print`, or the SDK. + +### 6. Net answer — concrete diskless recipe + +**Recommended: drive Pi via the SDK (`createAgentSession`), not the RPC CLI**, because the +SDK lets you inject `AuthStorage`, system prompt, skills, AGENTS.md, and custom tools as +in-memory objects, and skips CLI startup migrations. 
Run many sessions in one shared +sandbox, one `createAgentSession` per invocation, each with its own in-memory loader and +auth. + +Per invocation, in code (all in memory): + +```typescript +const auth = AuthStorage.inMemory(); +auth.setRuntimeApiKey("anthropic", perRunKey); // never persisted + +const loader = new DefaultResourceLoader({ + cwd: perRunWorkdir, // a per-run tmpfs subdir + agentDir: perRunAgentDir, // a per-run tmpfs subdir (or unused) + noContextFiles: true, // ignore on-disk AGENTS.md + systemPrompt: baseSystemPrompt, // in memory + appendSystemPromptOverride: () => [extraInstructions], + agentsFilesOverride: () => ({ agentsFiles: [{ path: "/virtual/AGENTS.md", content: agentsMd }] }), + skillsOverride: (cur) => ({ skills: [...inMemorySkills], diagnostics: cur.diagnostics }), + extensionFactories: [(pi) => { pi.registerTool(myProxyTool); }], +}); +await loader.reload(); + +const { session } = await createAgentSession({ + cwd: perRunWorkdir, + authStorage: auth, + modelRegistry: ModelRegistry.inMemory(auth), + settingsManager: SettingsManager.inMemory(), + sessionManager: SessionManager.inMemory(perRunWorkdir), + resourceLoader: loader, + model: getModel("anthropic", "claude-..."), + customTools: [/* or here instead of via extensionFactories */], +}); +``` + +Environment for the sandbox process: + +- `TMPDIR=/dev/shm/pi-tmp` (or any tmpfs) — captures bash/tool output spillover. +- Optionally `PI_CODING_AGENT_DIR=/dev/shm/pi-agent` and + `PI_CODING_AGENT_SESSION_DIR=/dev/shm/pi-sessions` as a belt-and-suspenders redirect for + any code path that still resolves `agentDir`/`sessionDir`. +- `PI_OFFLINE=1` to suppress version-check network/file activity (optional). +- Provider key via env var (e.g. `ANTHROPIC_API_KEY`) **only if** you use env-var auth + instead of `setRuntimeApiKey`. +- Pre-install `ripgrep` (`rg`) and `fd` in the sandbox image so the `grep`/`find` tools + never trigger a download to `/bin`. 
+ +**What must be a file (therefore tmpfs):** nothing strictly required for config. The only +forced writes are (a) bash/tool **output spillover** to `os.tmpdir()` (point `TMPDIR` at +tmpfs), and (b) any session/settings/auth persistence you opt into — all avoidable with +the `inMemory()` factories. If you instead use `pi --mode rpc`, sessions and `agentDir` +are file-based by default, so you must pass `--no-session` and redirect both env vars to +tmpfs, and you lose post-spawn in-memory key injection (RPC has no auth message). + +**Verdict:** fully diskless (process memory + a tmpfs `TMPDIR`) is achievable via the SDK. +No persistent-volume write is required for prompts, skills, AGENTS.md, auth, tools, or +session state. + +--- + +## Open questions / UNVERIFIED + +- **Synthetic skill body re-read.** Whether an SDK-injected `Skill` whose `filePath` points + at a non-existent `/virtual/SKILL.md` is safe when the model triggers `/skill:name` + expansion (which may re-read `filePath`). The system-prompt listing only needs + `name`/`description`, but explicit invocation might hit disk. Mitigation: put synthetic + skills' `filePath`/`baseDir` on tmpfs, or rely on systemPrompt injection. Confirm by + reading `_expandSkillCommand` in `dist/core/agent-session.js` or testing. +- **`os.tmpdir()` honoring `TMPDIR`.** Node's `os.tmpdir()` respects `TMPDIR` on Linux, so + setting `TMPDIR` to a tmpfs path redirects the spillover files. This is standard Node + behavior, not Pi-specific; verify the sandbox doesn't override `TMPDIR`. +- **OAuth refresh writes.** If you use OAuth credentials (not API keys), token refresh in + `FileAuthStorageBackend` writes back to `auth.json`. With `AuthStorage.inMemory()` / + `InMemoryAuthStorageBackend`, refreshed tokens stay in memory — confirm refresh path + uses the injected backend (it goes through `withLock`/`withLockAsync`, which the + in-memory backend implements). 
+- **`ModelRegistry` provider registration side effects.** `ModelRegistry.inMemory` avoids + `models.json`, but custom provider registration (Bedrock/Vertex) may read other on-disk + creds (`~/.aws`, ADC json). Out of scope if using API-key providers. +- Version drift: verified at 0.79.4. Re-check `rpc-types.d.ts` for an auth message and + `resource-loader.d.ts` option names if upgrading. + +--- + +## Sources + +Primary (package source / types — inspected from the published tarball; equivalent files +on GitHub): + +- `@earendil-works/pi-coding-agent@0.79.4` npm tarball, files: + `dist/core/sdk.d.ts` (`CreateAgentSessionOptions`, `customTools`, `createAgentSession`), + `dist/core/resource-loader.d.ts` (`DefaultResourceLoaderOptions`: `systemPrompt`, + `appendSystemPrompt`, `systemPromptOverride`, `agentsFilesOverride`, `skillsOverride`, + `additionalSkillPaths`, `noContextFiles`, `noSkills`), + `dist/core/auth-storage.d.ts` + `dist/core/auth-storage.js` (`AuthStorage`, + `setRuntimeApiKey`, `inMemory`, `InMemoryAuthStorageBackend`), + `dist/core/session-manager.d.ts` + `.js` (`SessionManager.inMemory`, `getDefaultSessionDir`), + `dist/core/settings-manager.js` (`inMemory`), `dist/core/model-registry.js` (`inMemory`), + `dist/core/skills.d.ts` (`Skill`, `loadSkills`, `loadSkillsFromDir`), + `dist/core/extensions/types.d.ts` (`ToolDefinition`, `defineTool`, `registerTool`), + `dist/config.js` (`getAgentDir`, `ENV_AGENT_DIR=PI_CODING_AGENT_DIR`, + `ENV_SESSION_DIR=PI_CODING_AGENT_SESSION_DIR`, session/auth/bin paths), + `dist/cli/args.d.ts` (`--api-key`, `--system-prompt`, `--append-system-prompt`, + `--no-session`, `--session-dir`, `--skills`, `--no-skills`, `--no-context-files`), + `dist/modes/rpc/rpc-types.d.ts` (full `RpcCommand` union — no auth message), + `dist/core/bash-executor.js` + `dist/core/tools/output-accumulator.js` (tmpdir spillover), + `dist/utils/tools-manager.js` (rg/fd download, prefers system PATH binaries), + `dist/main.js` (`runMigrations`, 
session-dir resolution), + `examples/sdk/03-custom-prompt.ts`, `04-skills.ts`, `05-tools.ts`, `06-extensions.ts`, + `07-context-files.ts`, `09-api-keys-and-oauth.ts`, `11-sessions.ts`. +- `@earendil-works/pi-ai@0.79.4` `dist/env-api-keys.js` — provider→env-var map + (`getApiKeyEnvVars`, `getEnvApiKey`). + +Docs / GitHub (corroborating): + +- SDK reference: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md +- npm: https://www.npmjs.com/package/@earendil-works/pi-coding-agent +- Docs site: https://pi.dev/docs/latest/sdk +- DeepWiki overview: https://deepwiki.com/earendil-works/pi/7.1-pi-coding-agent-sdk diff --git a/docs/design/agent-workflows/research/open-questions.md b/docs/design/agent-workflows/research/open-questions.md new file mode 100644 index 0000000000..dd9d37fd47 --- /dev/null +++ b/docs/design/agent-workflows/research/open-questions.md @@ -0,0 +1,312 @@ +# Agent Workflows: Daytona and pi.dev due-diligence + +Status: research only. Broad due-diligence to surface what the focused research topics +(interaction API, OTel instrumentation, sandbox creation, auth/secrets, sandbox-sharing) +might miss. Every claim is cited. Items I could not verify from a primary source are +marked UNVERIFIED. Researched 2026-06-15. + +## Summary + +- **pi.dev** is a young but very active open-source (MIT) agent harness from Earendil Inc., + authored by Mario Zechner (GitHub `badlogic`, creator of libGDX). The npm package + `@earendil-works/pi-coding-agent` first published 2026-05-07 and is on **0.79.4** (released + the day of this research), shipping roughly weekly with frequent **breaking changes** in + the 0.x line. It runs locally as a CLI/SDK/RPC server; **it does not depend on Daytona**. +- **Daytona** is a mature, well-funded ($5M, Upfront Ventures), SOC-2 open-source (AGPL-3.0) + sandbox platform for running AI-generated code. Sub-90ms container starts, usage-based + pricing, $200 free credits, US/EU regions. 
The managed cloud is the same codebase as the + OSS repo and can be self-hosted via Docker Compose. +- **Biggest risks for this project:** (1) pi's 0.x velocity and breaking changes mean we + pin a version and budget for upgrade churn; the RPC/SDK contract is pi-specific and + **not** a portable cross-harness standard, so "configurable harness" is an abstraction + *we* own. (2) pi has **no first-party OpenTelemetry**; the only OTel path today is a + third-party community extension. (3) Daytona uses shared-kernel containers (not microVMs), + a weaker isolation story for hostile code; (4) default **15-min auto-stop** can kill + long-running agents mid-run; (5) network egress is restricted by default below Tier 3. + +## Maturity & risk + +**pi.dev** +- Open source, **MIT** license; monorepo `earendil-works/pi` (mirror/origin also seen as + `badlogic/pi-mono`). Packages: `pi-coding-agent` (CLI), `pi-agent-core` (runtime, tool + calling, state), `pi-ai` (unified multi-provider LLM API), `pi-tui` (terminal UI). A + separate `pi-chat` repo does Slack/chat workflows. + [README](https://github.com/earendil-works/pi/blob/main/README.md), + [npm](https://www.npmjs.com/package/@earendil-works/pi-coding-agent) +- Author: **Mario Zechner** (`badlogic`), an experienced OSS developer (libGDX). Earendil Inc. + is the company. + [HN](https://news.ycombinator.com/item?id=46629341), + [GitHub badlogic](https://github.com/badlogic) +- **Very young, very active.** npm package created **2026-05-07**, latest **0.79.4** on + **2026-06-15**. Release cadence is ~weekly (0.75.0 2026-05-17 through 0.79.4 2026-06-15 = + ~15 releases in a month). Still firmly **pre-1.0**. + [npm metadata via `npm view`](https://www.npmjs.com/package/@earendil-works/pi-coding-agent), + [CHANGELOG](https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/CHANGELOG.md) +- **Breaking-change history is real and frequent** (0.x). 
Recent examples from the changelog: + 0.75.0 raised min Node to 22.19.0 and reworked tool selection from cwd-bound instances to + tool-name allowlists; 0.72.0 replaced `compat.reasoningEffortMap` with `thinkingLevelMap`; + 0.71.0 removed built-in Gemini/Antigravity providers; 0.69.0 migrated TypeBox and + invalidated captured session-bound extension objects. A `legacy-node20` dist-tag (0.74.2) + exists for older Node. + [CHANGELOG](https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/CHANGELOG.md) +- **Lock-in:** low at the model layer (15+ providers, MIT). But the integration surface + (RPC commands/events, extension API, session JSONL format) is **pi-specific** and changes + between minor versions, so coupling to pi is a real cost even though the code is open. +- Community size: hard to quantify; active HN presence, third-party extensions appearing + (otel, sandboxing, oh-my-pi fork). Smaller and newer than Claude Code / Codex ecosystems. + [HN](https://news.ycombinator.com/item?id=47634337) + +**Daytona** +- Open source, **AGPL-3.0**; repo `daytonaio/daytona` reports ~72k stars on the repo page + (other sources cite ~21k — figure is noisy, treat as "large, popular"). 200+ releases, + latest ~v0.187.0 (2026-06-11). Polyglot (TS/Go/Python/Ruby/Java SDKs). + [GitHub](https://github.com/daytonaio/daytona), + [stars/funding search](https://www.daytona.io/dotfiles/daytona-secures-5m-to-simplify-development-environments) +- Company: Ivan Burazin (CEO, ex-Codeanywhere/Infobip), raised **$5M** (Upfront Ventures, + 500 EE). **SOC-2** compliant. + [PRNewswire](https://www.prnewswire.com/news-releases/daytona-secures-5m-to-simplify-development-environments-302181407.html) +- **AGPL note:** the AGPL-3.0 license is copyleft and network-triggered. 
We consume Daytona + as a hosted service or via SDK over the network (not by linking/modifying its source), so + AGPL obligations should not reach Agenta's own code, but legal should confirm before any + self-host-and-modify path. The cloud and OSS share a codebase, so self-hosting is a real + fallback (Docker Compose stack + customer-managed compute/BYOC). + [GitHub](https://github.com/daytonaio/daytona) + +## Pricing & limits + +**Daytona** (managed cloud, pay-as-you-go, no minimum/commitment): +- vCPU **$0.0504/h**; RAM **$0.0162/h per GiB**; storage **$0.000108/h per GiB** (first 5 GiB + free). Billed per second. GPU: H100 $3.95/h, RTX PRO 6000 $3.03/h. Windows/Android OS + add-ons extra. **$200 free credits** at signup (no card for trial); startups up to $50k. + [Pricing](https://www.daytona.io/pricing), + [pricing search](https://www.morphllm.com/comparisons/daytona-alternative) +- **Cost intuition:** a 1 vCPU / 2 GiB sandbox ≈ $0.0504 + 2×$0.0162 = **~$0.083/h** of + active compute (storage extra). 10 such sandboxes running continuously ≈ **$0.83/h** ≈ + ~$600/mo if never stopped; auto-stop after idle cuts this sharply since CPU/RAM stop + billing while stopped (storage persists). Costs scale with concurrency × active runtime, + not request count. (Derived from the per-hour rates above — arithmetic ours.) +- **Rate limits (per minute, by tier):** Tier1 10k general / 300 create / 10k lifecycle; + Tier2 20k/400/20k; Tier3 40k/500/40k; Tier4 50k/600/50k; Enterprise custom. +- **Resource quotas (per tier):** Tier1 10 vCPU / 20 GiB RAM / 30 GiB disk; Tier2 + 100/200/300; Tier3 250/500/2000; Tier4 500/1000/5000. Concurrency is gated by these + pooled quotas (how many sandboxes run at once depends on each one's size). +- **Tier gating:** Tier1 email-verified; Tier2 card + $25 top-up; Tier3 $500 top-up; Tier4 + $2000 top-up / 30 days; Enterprise contact. 
+ [Limits](https://www.daytona.io/docs/en/limits/), + [DeepWiki quotas](https://deepwiki.com/daytonaio/daytona/6.3-resource-quotas-and-limits) + +**pi.dev** +- The harness itself is free/MIT. Cost is the **LLM provider tokens** (BYO key or OAuth to + Claude Pro/Max, ChatGPT/Codex, Copilot, plus API-key providers) plus whatever sandbox you + run it in. No pi-side metering. + [providers search](https://hochej.github.io/pi-mono/coding-agent/rpc/), + [pi.dev](https://pi.dev/) + +## Operational concerns + +**Daytona** +- **Cold start:** advertised sub-90ms sandbox creation (container-based). + [docs overview](https://www.daytona.io/docs), [vstorm](https://oss.vstorm.co/blog/daytona-sub-90ms-code-execution/) +- **Lifecycle/timeouts:** default **auto-stop after 15 min** of inactivity, **auto-archive + after 7 days** stopped; auto-delete configurable. Stopped = storage kept, CPU/RAM freed; + archived = no quota. **Sharp edge:** a long-running process (e.g. a >15-min agent run with + no external interaction) can be auto-stopped mid-run because the process itself does not + count as "activity" — set/extend auto-stop for long agents. + [lifecycle search](https://www.zenml.io/blog/e2b-vs-daytona), + [Northflank](https://northflank.com/blog/daytona-vs-modal) +- **Regions / residency:** shared regions **US** (`us`) and **EU** (`eu`); you can target a + region per sandbox. Custom Regions (BYO runners, full isolation, residency control) are + invite-only/experimental. Some sources note the **managed cloud is effectively single + primary region (us-east-1/iad1)** in practice — UNVERIFIED against official docs, treat + EU availability as "claimed, confirm before relying on it for residency". + [Regions](https://www.daytona.io/docs/en/regions/), + [single-region claim](https://www.zenml.io/blog/e2b-vs-daytona) +- **Networking egress:** per-sandbox network stack with firewall. 
**Tier 1 & 2: restricted + egress by default; Tier 3 & 4: full internet by default.** Controls: `networkAllowList` + (CIDR, max 10 /32 entries) and `networkBlockAll`. Only Tier 3/4 can change firewall after + creation. All tiers get allowlisted access to npm/PyPI, Docker/k8s registries, + GitHub/GitLab, CDNs, and AI providers (Anthropic/OpenAI/Google). **Implication:** to inject + an arbitrary secret endpoint or call a non-allowlisted internal service, plan for Tier 3+. + [Network limits](https://www.daytona.io/docs/en/network-limits/), + [egress issue](https://github.com/daytonaio/daytona/issues/3357) +- **Isolation:** sandboxes are containers. Daytona claims a dedicated kernel, but multiple + comparisons note it shares the host kernel (not a Firecracker microVM) — a weaker boundary + for genuinely hostile code than E2B/Fly. + [morphllm](https://www.morphllm.com/comparisons/daytona-alternative) + +**pi.dev** +- Runs as a local process; operational profile (cold start, scaling) is whatever sandbox/ + host we run it on. No managed pi runtime to scale or rate-limit. Reliability is a function + of (a) pi's own stability at 0.x and (b) the chosen LLM provider's limits. + +## Local parity + +- **Strong yes — pi is local-first and needs no Daytona.** pi is a CLI/SDK/RPC harness that + runs in any project directory. Four surfaces: interactive TUI, print/JSON event-stream + mode, **RPC mode** (JSONL over stdin/stdout), and a **Node SDK** (`AgentSession`). The same + binary/SDK runs locally or inside a sandbox. + [docs index](https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/docs/index.md), + [RPC docs](https://pi.dev/docs/latest/rpc) +- This makes "pull config from server, run the same harness locally" realistic: the agent + config (AGENTS.md, skills, model, tools, files) maps onto pi's own context model + (AGENTS.md/SYSTEM.md, skills, tool allowlists, presets/extensions). 
+ [overview](https://pi.dev/), [docs index](https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/docs/index.md) +- **What differs local vs sandboxed (the parity gaps we own):** + - **Sandbox/isolation layer.** Server runs pi inside Daytona; local runs pi on the host (or + pi's own local sandbox options: **Gondolin** QEMU micro-VM, plain Docker, OpenShell). + These are pi's *own* local isolation, not Daytona — so the file/secret startup hooks and + the FS/network surface differ between Daytona and a local run unless we replicate them. + [containerization search](https://github.com/pasky/pi-gondolin) + - **Secrets/auth injection.** Server injects secrets via startup hooks into the sandbox; + locally the user supplies keys/OAuth. Parity requires our wrapper to lay down the same + files/env both places. + - **Network egress.** Daytona's tiered firewall has no local equivalent; a tool that works + locally could be blocked in-sandbox below Tier 3. + - **Instrumentation.** OTel is an opt-in extension either way (see below); it is not on by + default, so parity depends on us loading the same extension/config in both modes. +- Net: pi gives genuine local parity for the *agent loop*; the *environment* (sandbox, + secrets, egress, telemetry) is the part Agenta must make identical across local and server. + +## Harness swappability + +- **Important framing:** in pi, "harness" means *the agent loop you customize within pi* + (tools, prompts, auth, event loop), not a pluggable adapter where you drop in Codex or + Claude Code behind a common interface. pi's own docs/talks define the harness as "the set + of abstractions which transforms [the] IO machine into an 'agent'" and emphasize + composition *within* pi, not interchangeable backends. 
+ [harness-engineering slides](https://dmg-egg.github.io/slides-harness-engineering-with-pi/) +- pi supports many **models/providers** (Anthropic, OpenAI, Google, Bedrock, Mistral, xAI, + Groq, Cerebras, OpenRouter, Ollama, etc.) and **subscription OAuth** to Claude Pro/Max, + ChatGPT/Codex, and Copilot. But these are *models behind pi's loop*, not separate harnesses + like the Claude Code CLI or Codex CLI. + [providers/RPC search](https://hochej.github.io/pi-mono/coding-agent/rpc/) +- The RPC protocol is rich (85+ commands, ~12 event types incl. `agent_start/end`, + `turn_start/end`, `message_*`, `tool_execution_*`, plus `get_state` exposing `sessionId`, + and `agent_end` carrying **all messages from the run** = the multi-message output). But it + is **pi-specific and unversioned** (no documented stability/deprecation policy), and pi's + own docs say to prefer `AgentSession` directly over the subprocess RPC when embedding in + Node. So it is a good integration surface for pi, **not** a neutral cross-harness standard. + [RPC docs](https://pi.dev/docs/latest/rpc) +- **Conclusion for the design:** "configurable/swappable harness" is **an abstraction Agenta + must own.** If we ever want to run Codex CLI or Claude Code as alternative harnesses, we + define our own port (config in -> sandbox setup -> run -> normalized multi-message output + + session_id + traces out) and write per-harness adapters. pi will be the first and + best-fitting adapter because of its RPC/SDK, but it does not hand us a ready-made + multi-harness interface. + +## Gotchas / sharp edges + +- **pi 0.x churn.** Weekly releases with breaking changes (Node-version bumps, tool-selection + model changes, provider removals, session-object invalidation). Pin an exact version, test + upgrades, watch the changelog. 
+ [CHANGELOG](https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/CHANGELOG.md) +- **No first-party OTel in pi.** The only OpenTelemetry path is a **third-party community + extension** (`mprokopov/pi-otel-telemetry`), which emits one trace tree per prompt (turns, + LLM requests, tool calls) over OTLP. It is unofficial and unversioned against pi; the + instrumentation research topic should treat first-party telemetry as absent today. + [pi-otel repo](https://github.com/mprokopov/pi-otel-telemetry), + [pi-otel writeup](https://nikiforovall.blog/ai/productivity/2026/05/16/pi-otel.html) +- **pi has no built-in permission system / MCP / sub-agents / plan mode** by design — they + are extension territory. Anything we assume "the agent will ask before X" must be added. + [README](https://github.com/earendil-works/pi/blob/main/README.md), + [docs index](https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/docs/index.md) +- **JSONL framing is strict** in RPC mode: split on `\n` only; do not use Node `readline` + (it splits on Unicode separators too) or records corrupt. + [RPC search](https://hochej.github.io/pi-mono/coding-agent/rpc/) +- **Daytona 15-min auto-stop** can kill long agent runs mid-flight (process activity does not + reset the idle timer) — set auto-stop explicitly for agents. + [lifecycle search](https://www.zenml.io/blog/e2b-vs-daytona) +- **Daytona egress is tiered**; below Tier 3 you cannot freely reach arbitrary endpoints and + cannot change the firewall post-creation. Budget for Tier 3 if agents call internal/custom + services. + [Network limits](https://www.daytona.io/docs/en/network-limits/) +- **Daytona shared-kernel isolation** is weaker than microVM competitors for untrusted code. 
+ [morphllm](https://www.morphllm.com/comparisons/daytona-alternative) +- **pi.dev's own sandbox examples (Gondolin/Docker/OpenShell) are local/host-side**, with no + first-party Daytona integration — the pi <-> Daytona glue is ours to build. + [containerization search](https://github.com/pasky/pi-gondolin) + +## Alternatives (fallback landscape — one line each) + +Sandbox providers (alternatives to Daytona): +- **E2B** — Firecracker microVM with a dedicated kernel per sandbox; strongest isolation for + untrusted code. + [morphllm](https://www.morphllm.com/comparisons/daytona-alternative) +- **Modal** — native GPU sandboxes; the pick when agents need inference/GPU in-sandbox. + [morphllm](https://www.morphllm.com/comparisons/daytona-alternative) +- **Fly.io (Machines / "Sprites")** — full filesystem persistence across sessions so agents + resume without rebuilding; Firecracker-based. + [morphllm](https://www.morphllm.com/comparisons/daytona-alternative) +- **Morph** — VM branching/fork in <250ms for parallel exploration of multiple solution paths. + [morphllm](https://www.morphllm.com/comparisons/daytona-alternative) +- **Freestyle** — full root + nested virtualization (Docker-in-VM) for heavy/custom envs. + [morphllm](https://www.morphllm.com/comparisons/daytona-alternative) +- **Vercel Sandbox / Northflank / Cloudflare / microsandbox** — other credible options that + show up in 2026 comparisons; differentiators not deeply verified here. UNVERIFIED specifics. + [comparison](https://northflank.com/blog/ai-sandbox-pricing), + [comparison](https://betterstack.com/community/comparisons/best-sandbox-runners/) + +Harnesses (alternatives to pi.dev): +- **Claude Code** (Anthropic) — the de-facto reference coding agent; more opinionated, larger + ecosystem, less "minimal/composable" than pi. Often cited by pi users as the thing they + came from. 
+ [HN](https://news.ycombinator.com/item?id=47634337) +- **Codex CLI** (OpenAI) — OpenAI's agent CLI; pi can use Codex *as a provider via OAuth*, but + as a *harness* it's a separate tool with its own loop. + [providers search](https://hochej.github.io/pi-mono/coding-agent/rpc/) +- **oh-my-pi** — a community fork of pi adding subagents/LSP/browser/optimized tool harness; + signal that pi's design invites forks, and a possible drop-in if pi mainline diverges. + [oh-my-pi](https://github.com/can1357/oh-my-pi) + +## Open questions (for the focused topics / before committing) + +1. Pin strategy for pi version (exact pin + upgrade cadence) given weekly breaking 0.x + releases. Who owns watching the changelog? +2. Telemetry: do we adopt/fork `pi-otel-telemetry`, or write our own pi extension to emit the + spans Agenta tracing expects? (No first-party OTel exists.) → instrumentation topic. +3. Confirm Daytona EU region + data-residency guarantees against official docs/sales; the + "single-region us-east-1" claim needs verification before we promise EU residency. +4. Decide the default auto-stop / max-run-duration for agent sandboxes so long runs aren't + killed at 15 min. → sandbox-creation topic. +5. Which Daytona tier do we operate on? Egress + post-creation firewall + concurrency quotas + all hinge on Tier 3+. → auth/secrets + sandbox-creation topics. +6. Define Agenta's own harness port (config -> setup -> run -> normalized output + session_id + + traces) since pi gives no neutral multi-harness interface; validate it against pi first, + then sketch a Codex/Claude-Code adapter to prove the abstraction. → pi.dev harness topic. +7. Local-parity contract: which startup hooks (files, secrets, egress, telemetry) must be + replicated locally, and do we reuse pi's Gondolin/Docker locally or run bare on host? + → local-execution topic. +8. AGPL review for any self-hosted-and-modified Daytona path (network copyleft). 
+ +## Sources + +- pi.dev overview — https://pi.dev/ +- pi README — https://github.com/earendil-works/pi/blob/main/README.md +- pi docs index — https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/docs/index.md +- pi coding-agent CHANGELOG — https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/CHANGELOG.md +- pi npm package — https://www.npmjs.com/package/@earendil-works/pi-coding-agent +- pi RPC docs — https://pi.dev/docs/latest/rpc +- pi RPC (mirror) — https://hochej.github.io/pi-mono/coding-agent/rpc/ +- Harness engineering with pi (slides) — https://dmg-egg.github.io/slides-harness-engineering-with-pi/ +- Mario Zechner GitHub — https://github.com/badlogic +- HN discussion on pi — https://news.ycombinator.com/item?id=47634337 and https://news.ycombinator.com/item?id=46629341 +- pi-otel telemetry extension — https://github.com/mprokopov/pi-otel-telemetry +- pi-otel writeup — https://nikiforovall.blog/ai/productivity/2026/05/16/pi-otel.html +- pi-gondolin sandbox extension — https://github.com/pasky/pi-gondolin +- oh-my-pi fork — https://github.com/can1357/oh-my-pi +- Daytona docs overview — https://www.daytona.io/docs +- Daytona limits — https://www.daytona.io/docs/en/limits/ +- Daytona resource quotas (DeepWiki) — https://deepwiki.com/daytonaio/daytona/6.3-resource-quotas-and-limits +- Daytona regions — https://www.daytona.io/docs/en/regions/ +- Daytona network limits — https://www.daytona.io/docs/en/network-limits/ +- Daytona dynamic egress issue — https://github.com/daytonaio/daytona/issues/3357 +- Daytona pricing — https://www.daytona.io/pricing +- Daytona GitHub — https://github.com/daytonaio/daytona +- Daytona funding (PRNewswire) — https://www.prnewswire.com/news-releases/daytona-secures-5m-to-simplify-development-environments-302181407.html +- Daytona funding (blog) — https://www.daytona.io/dotfiles/daytona-secures-5m-to-simplify-development-environments +- E2B vs Daytona — 
https://www.zenml.io/blog/e2b-vs-daytona +- Daytona vs Modal — https://northflank.com/blog/daytona-vs-modal +- AI sandbox pricing comparison — https://northflank.com/blog/ai-sandbox-pricing +- Daytona alternatives — https://www.morphllm.com/comparisons/daytona-alternative +- Sandbox runners comparison — https://betterstack.com/community/comparisons/best-sandbox-runners/ +- Daytona sub-90ms (vstorm) — https://oss.vstorm.co/blog/daytona-sub-90ms-code-execution/ diff --git a/docs/design/agent-workflows/research/otel-instrumentation.md b/docs/design/agent-workflows/research/otel-instrumentation.md new file mode 100644 index 0000000000..5f632e8ca6 --- /dev/null +++ b/docs/design/agent-workflows/research/otel-instrumentation.md @@ -0,0 +1,379 @@ +# OTel Instrumentation for the pi.dev Agent Harness + +Status: research only. No code changed. Research date: 2026-06-15. + +This file answers the five research questions in the agent-workflows brief: +how to instrument the pi.dev harness with OpenTelemetry (OTel), what already +exists, what span conventions to use, how spans get out of a sandbox, and how +all of that lands in Agenta's existing OTel ingestion. + +## Summary + +- **pi.dev is "Pi", a minimal agent harness by Earendil Inc.** (the company is + "earendil-works" on GitHub, repo `earendil-works/pi`). It is a coding-agent + toolkit: a unified multi-provider LLM API, an agent loop with tool calling, + a TUI, and a CLI. It ships as npm packages `@earendil-works/pi-ai`, + `@earendil-works/pi-agent-core`, `@earendil-works/pi-coding-agent`, + `@earendil-works/pi-tui`. MIT licensed. +- **"pi instruments" is not a built-in OTel exporter.** Pi has no native OTel + emitter in its docs. 
What it has is an **extension event system**: an + extension registers handlers with `pi.on(<event>, handler)` and gets + lifecycle events for the agent loop (session, agent_start/agent_end, + turn_start/turn_end, tool_execution_start/end, before_provider_request / + after_provider_response, message_start/message_end). "Instrumentation" = + writing (or installing) an extension that listens to those events and turns + them into OTel spans. There is no first-party Pi telemetry dashboard to + reuse. +- **Three community OTel extensions for Pi already exist** and all emit OTLP: + `maxmalkin/pi-OTEL`, `mprokopov/pi-otel-telemetry`, and the `pi-otel` covered + by the nikiforovall blog. They all use **OTel GenAI semantic conventions** + (`gen_ai.*`), not OpenInference. They are TypeScript Pi extensions. +- **Agenta already ingests exactly this.** Agenta exposes an OTLP/HTTP + protobuf endpoint at `POST /otlp/v1/traces` and normalizes incoming spans + through an adapter registry that already understands **OTel GenAI semconv**, + **OpenLLMetry (Traceloop)**, **OpenInference (Arize)**, **Logfire**, and + **Vercel AI**. A Pi extension that emits `gen_ai.*` spans over OTLP/HTTP to + Agenta's endpoint would flow through the existing pipeline with little or no + new backend code. +- **Recommended path:** emit OTel GenAI-semconv spans from a Pi extension + (fork/reuse one of the three), export OTLP/HTTP to Agenta's + `/otlp/v1/traces` with `Authorization: ApiKey <api-key>` and `?project_id=<id>`, + and let the existing GenAI-semconv adapter map them. Add a thin Agenta-side + adapter only if we want richer agent/turn structure than `gen_ai.*` carries. + +## What "pi instruments" is + +**Product.** pi.dev = "Pi", "a minimal agent harness" by Earendil Inc. Tagline +"Adapt Pi to your workflows, not the other way around." Four operating modes: +interactive TUI, print/JSON output, RPC (stdin/stdout JSONL), and an SDK for +embedding in Node.js.
It deliberately omits MCP, sub-agents, permission popups, +and plan mode from the core, expecting you to add them via extensions. +Source: https://pi.dev/ , https://github.com/earendil-works/pi/blob/main/README.md + +**Packages** (npm, scope `@earendil-works`): +- `pi-ai` — unified multi-provider LLM API (OpenAI, Anthropic, Google, etc.) +- `pi-agent-core` — agent runtime: tool calling + state management +- `pi-coding-agent` — interactive coding-agent CLI +- `pi-tui` — terminal UI library +Source: https://github.com/earendil-works/pi/blob/main/README.md + +**The instrumentation mechanism is the extension event bus, not a built-in +exporter.** Pi's official docs have an "Extensions" page but **no telemetry / +OTel / observability page**. Extensions are TypeScript modules that subscribe +to lifecycle events: + +```ts +pi.on(eventName, async (event, ctx) => { + // ctx is an ExtensionContext: ctx.sessionManager (read-only session), + // ctx.signal (abort-aware), ctx.ui (interaction) +}); +``` + +Events relevant to telemetry (exact names from the Extensions doc): +- Session lifecycle: `session_start` (reasons: startup/reload/new/resume/fork), + `session_shutdown`, `project_trust`, `resources_discover`. +- Agent loop: `before_agent_start`, `agent_start` (once per user prompt), + `agent_end` (has `event.messages`), `turn_start`, `turn_end` (per LLM + response cycle). +- Messages: `message_start`, `message_update`, `message_end` (user, assistant, + tool-result messages). +- Tools: `tool_execution_start` (has `toolCallId`, `toolName`, `args`), + `tool_execution_update`, `tool_execution_end`; plus `tool_call` (pre-exec, + can block) and `tool_result` (post-exec, can modify). +- Provider/model: `before_provider_request` (built payload, before HTTP), + `after_provider_response` (HTTP status/headers, before stream consumed), + `model_select`, `thinking_level_select`. +- Input: `input`, `user_bash`. 
+Source: https://pi.dev/docs/latest/extensions + +So when the agent-workflows README says runs are "instrumented through pi +instruments," concretely that means: **a Pi extension hooks these events and +produces spans/metrics.** There is no proprietary "instruments" object to +adopt; it is the standard extension API. (UNVERIFIED: whether "pi instruments" +is an internal Agenta shorthand for a specific bundled extension vs. the +generic extension mechanism. The public Pi docs only expose `pi.on` + tools.) + +Installation pattern for an extension (from pi-OTEL): +`pi install git:github.com/<owner>/<repo>` (or `pi install npm:<package>`), then +`/reload`. Source: https://github.com/maxmalkin/pi-OTEL + +## Existing libraries + +### Pi-specific OTel extensions (closest fit — reuse candidates) + +All three are TypeScript Pi extensions emitting OTLP and using OTel GenAI +semconv. They differ mainly in span tree shape and whether they also emit +metrics. + +1. **`maxmalkin/pi-OTEL`** — "OpenTelemetry harness for the Pi coding agent." + - Span tree: `pi.session` -> `pi.agent_turn` -> (`gen_ai.chat <model>`, + `tool.<name>`). + - Attributes follow OTel GenAI semconv. Honors standard OTLP env vars: + `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`, `OTEL_EXPORTER_OTLP_ENDPOINT` + (appends `/v1/traces`), `OTEL_EXPORTER_OTLP_HEADERS` (`k=v,k=v`), + `OTEL_SERVICE_NAME` (default `pi`), `OTEL_RESOURCE_ATTRIBUTES`. + Pi-specific: `PI_OTEL_DISABLED` (default `0`), + `PI_OTEL_CAPTURE_CONTENT` (default `0`, gates prompt/completion/tool I/O). + Same keys accepted in `settings.json` under `otel`. Falls back to + `http://localhost:4318/v1/traces` (OTLP/HTTP). + - Runtime commands: `/otel-status`, `/otel-flush`. + - Source: https://github.com/maxmalkin/pi-OTEL + +2. **`pi-otel` (nikiforovall)** — emits one trace tree per user prompt. + - Span tree: `pi.interaction` (root, per prompt) -> `pi.turn` -> + (`pi.llm_request`, `pi.tool.<name>`).
Deliberately **does not** make the + session a span ("a pi session can run for hours; long-running root spans + are an OTel anti-pattern") — it correlates via `gen_ai.conversation.id`. + - Attributes: GenAI semconv — `gen_ai.system`, `gen_ai.request.model`, + `gen_ai.usage.input_tokens`, `gen_ai.usage.output_tokens`, finish reason, + tool call ids, `gen_ai.conversation.id`. + - Config: default endpoint `http://localhost:4317` (OTLP **gRPC**), + `settings.json` `otel` block `{enabled, endpoint, protocol:"grpc"}`, + `OTEL_*` env overrides, `PI_OTEL_DISABLED=1` to disable. Default backend + is a local .NET Aspire dashboard (auto-spawned via `/otel start`); any + OTLP backend works (Grafana LGTM, Jaeger, Honeycomb). + - Sources: https://nikiforovall.blog/ai/productivity/2026/05/16/pi-otel.html + +3. **`mprokopov/pi-otel-telemetry`** — traces **and metrics**. + - Span tree: `session` (root) -> `agent.prompt` (per user message) -> + `agent.turn` (LLM call + tool cycle) -> `tool.` (e.g. `tool.bash`, + `tool.read`, `tool.edit`). Span events: `llm.request`, `model.changed`, + `session.compacted`. + - Metrics: `pi.tokens.input`, `pi.tokens.output` (counters); `pi.tool.calls`, + `pi.tool.errors` (counters, labelled `tool.name`); `pi.tool.duration` + (histogram ms); `pi.turns`, `pi.prompts` (counters); + `pi.session.duration` (histogram s). + - Attributes: `session.id`, `session.cwd`, token counts, user identity; + turn spans `turn.index`, `llm.usage.input_tokens`, + `llm.usage.output_tokens`; tool spans `tool.name`, `tool.call_id`, + `tool.duration_ms`. + - Config: `OTEL_EXPORTER_OTLP_ENDPOINT` default `http://localhost:4318` + (OTLP/HTTP), `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` / + `..._METRICS_ENDPOINT` overrides, `PI_OTEL_DEBUG=true`. + - Source: https://github.com/mprokopov/pi-otel-telemetry + +**Takeaway:** there is no single canonical Pi OTel package; the three diverge on +span-tree shape and span names (`pi.session` vs `pi.interaction` vs `session`). 
+What they agree on is **GenAI semconv `gen_ai.*` attributes over OTLP** +(UNVERIFIED for `mprokopov/pi-otel-telemetry`: the attributes listed for it +above are `llm.usage.*` / `tool.*`, not `gen_ai.*`). For +Agenta we should pick/fork one and pin the span tree we want; don't assume a +stable upstream contract. + +### Framework instrumentations (not Pi-specific) + +- **OpenInference (Arize)** — OTel-based semantic conventions + auto-instrumentors + for LangChain, LlamaIndex, OpenAI SDK, etc. Defines 10 span kinds via the + required `openinference.span.kind` attribute: `LLM`, `EMBEDDING`, + `RETRIEVER`, `RERANKER`, `TOOL`, `CHAIN`, `AGENT`, `GUARDRAIL`, `EVALUATOR`, + `PROMPT`. It does **not** ship a Pi instrumentor — Pi isn't one of its + supported frameworks — so using OpenInference for Pi means writing the span + kinds by hand in a Pi extension. Fit: good vocabulary for agent/tool/chain + structure, but no off-the-shelf Pi support. + Sources: https://github.com/Arize-ai/openinference/blob/main/spec/semantic_conventions.md , + https://arize.com/docs/ax/observe/tracing-concepts/openinference-semantic-conventions + +- **OpenLLMetry (Traceloop)** — OTel SDK + instrumentations that emit `gen_ai.*` + (plus `traceloop.*`, `llm.*`) attributes. Auto-instruments LLM providers and + some frameworks. No Pi instrumentor; same story as OpenInference — you'd + hand-write the spans in a Pi extension or rely on its provider-level auto-instrumentation of + the underlying LLM HTTP client (possible but indirect, and Pi's `pi-ai` may + not match a provider Traceloop patches). + (UNVERIFIED whether Traceloop's provider instrumentation intercepts + `@earendil-works/pi-ai`'s HTTP calls automatically.) + +- **OTel GenAI semantic conventions (official)** — the upstream spec the Pi + extensions follow. Operation names: `create_agent`, `invoke_agent`, + `execute_tool`, plus the chat/inference spans. Span naming guidance: + `invoke_agent {gen_ai.agent.name}` (or just `invoke_agent`), and + `execute_tool {gen_ai.tool.name}` for tool calls (used for MCP tool calls + too).
Key attributes: `gen_ai.operation.name`, `gen_ai.agent.name`, + `gen_ai.agent.id`, `gen_ai.conversation.id`, `gen_ai.tool.name`, + `gen_ai.tool.call.id`, `gen_ai.request.model`, `gen_ai.usage.input_tokens`, + `gen_ai.usage.output_tokens`. This is the most "standard" and the most + future-proof target. + Sources: https://opentelemetry.io/docs/specs/semconv/gen-ai/ , + https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-agent-spans/ + (NOTE: the gen-ai pages now redirect to the + `open-telemetry/semantic-conventions` repo; the agent-spans operation + names above come from the indexed spec text, lightly UNVERIFIED against the + latest repo revision.) + +## Span / attribute conventions and how well they map to agent runs + +A multi-turn agent run = one logical conversation -> N user prompts -> +per-prompt agent invocation -> M turns (each an LLM call) -> per-turn 0..K tool +calls. All three conventions can express this; they differ in vocabulary. + +| Layer in a Pi run | OTel GenAI semconv | OpenInference span kind | Pi extension span (varies) | +|---|---|---|---| +| Whole conversation | `gen_ai.conversation.id` (correlation, not a span) | `session.id` attr / CHAIN root | `pi.session` / `session` (or skipped) | +| Per-prompt agent invocation | `invoke_agent` op | `AGENT` | `pi.interaction` / `agent.prompt` / `pi.agent_turn` | +| Per-turn LLM call | chat/inference span, `gen_ai.request.model` | `LLM` | `gen_ai.chat ` / `pi.turn` / `pi.llm_request` | +| Tool call | `execute_tool`, `gen_ai.tool.name`, `gen_ai.tool.call.id` | `TOOL` | `tool.` | +| Glue/orchestration | (no dedicated kind) | `CHAIN` | n/a | +| Retrieval / rerank / embeddings | embeddings spans | `RETRIEVER` / `RERANKER` / `EMBEDDING` | n/a | + +Assessment: +- **GenAI semconv** maps cleanly to LLM calls and tool calls and has explicit + agent + tool operation names. 
Its weak spot is the multi-turn *tree*: it + leans on `gen_ai.conversation.id` for correlation rather than mandating a + session/turn span hierarchy, which is why the Pi extensions invent their own + parent spans (`pi.session`, `pi.interaction`, `pi.turn`). Good attribute + vocabulary; you still design the tree. +- **OpenInference span kinds** (AGENT/CHAIN/LLM/TOOL/RETRIEVER) map *very* + cleanly to a nested agent run and are what Agenta's UI already keys off (see + "How it maps to Agenta's existing OTel ingestion" below). The cost: no Pi auto-instrumentor, so you set + `openinference.span.kind` yourself. +- A pragmatic hybrid works: emit GenAI `gen_ai.*` attributes (what the Pi + extensions already produce) **and** set `openinference.span.kind` per span so + Agenta types the node correctly. Agenta's adapters read both. + +## Export-from-sandbox path + +Inside a Daytona (or other) sandbox the Pi extension runs the OTel SDK and +exports OTLP. To reach Agenta's collector across the sandbox boundary: + +1. **Endpoint.** Agenta accepts OTLP/HTTP **protobuf** at `POST /otlp/v1/traces` + (mounted in `api/entrypoints/routers.py` with prefix `/otlp/v1`). Binary + protobuf only (`Content-Type: application/x-protobuf`); JSON OTLP is **not** + accepted. Batch size limit default 10 MB (`AGENTA_OTLP_MAX_BATCH_BYTES`, + env `OTLPConfig.max_batch_bytes`); over-limit -> 413. (The router docstring + says "default 4 MB"; the actual env default in `env.py` is 10 MB — doc/code + drift worth noting.) + Files: `api/oss/src/apis/fastapi/otlp/router.py`, + `api/oss/src/utils/env.py` (`OTLPConfig`, line ~326), + `api/entrypoints/routers.py` (~line 770). + - So set `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=https://<agenta-host>/otlp/v1/traces` + and use the **OTLP/HTTP protobuf** exporter. The gRPC-default extension + (nikiforovall) would need reconfiguring to HTTP/protobuf, or a collector + sidecar to translate. +2.
**Auth + tenant scope.** Agenta's auth middleware expects + `Authorization: ApiKey <api-key>` (prefix `ApiKey `) and resolves + organization/workspace/project/user from it; `project_id` can also come + from a `?project_id=<id>` query param. So the exporter needs + `OTEL_EXPORTER_OTLP_HEADERS=Authorization=ApiKey <api-key>` and the project id + either in the key's scope or the URL query string. In EE the ingest path + also checks `EDIT_SPANS` permission and `TRACES_INGESTED` quota. + Files: `api/oss/src/middlewares/auth.py` (`_APIKEY_TOKEN_PREFIX = "ApiKey "`, + query `project_id` handling), `api/oss/src/apis/fastapi/otlp/router.py` + (EE permission + entitlement checks). +3. **Secret delivery.** The Agenta API key is a secret; per the agent-workflows + README, secrets are injected into the sandbox by the startup hook. The key + and the OTLP endpoint should be injected the same way (env vars consumed by + the OTel SDK), so the harness running locally vs server-side only differs in + endpoint/key values — preserving the local/server parity requirement. +4. **Trace-context propagation across the boundary.** Two cases: + - If the agent run is *initiated by* an Agenta backend request, propagate + W3C `traceparent`/`tracestate` into the sandbox (env or RPC metadata) so + the in-sandbox root span is a child of the backend span and the run shows + as one trace. (UNVERIFIED: whether Agenta currently sets/forwards + `traceparent` to invocations — needs a check of the invocation service.) + - If the run is standalone, the extension creates its own root and relies on + `gen_ai.conversation.id` / `session.id` for correlation; Agenta's + OpenInference + Logfire adapters map `session.id` / + `gen_ai.conversation.id` -> `ag.session.id`, which lines up with the + agent-workflows `session_id` concept. +5. **Network egress.** The sandbox must be allowed outbound HTTPS to the Agenta + host. With Daytona this is a sandbox network-policy concern (UNVERIFIED for + our port).
A collector/agent sidecar in the sandbox is an alternative that + also lets us batch, retry, and strip content centrally. + +## How it maps to Agenta's existing OTel ingestion + +Agenta already has the whole receive-and-normalize pipeline; a Pi agent is just +another OTLP producer. + +- **Ingest.** `OTLPRouter.otlp_ingest` parses the protobuf + (`parse_otlp_stream`), converts each OTel span to an internal DTO + (`parse_from_otel_span_dto`), runs an EE quota soft-check, then queues spans + on a Redis stream for async persistence (same path as native ingest). + File: `api/oss/src/apis/fastapi/otlp/router.py`. +- **Normalization via adapter registry.** `AdapterRegistry` runs, in order: + `OpenLLMmetryAdapter`, `OpenInferenceAdapter`, `LogfireAdapter`, + `VercelAIAdapter`, `DefaultAgentaAdapter`. Each maps its vendor attributes to + Agenta's canonical `ag.*` namespace. + File: `api/oss/src/apis/fastapi/otlp/extractors/adapter_registry.py`. +- **GenAI semconv is already mapped.** `api/.../otlp/opentelemetry/semconv.py` + and the OpenLLMetry adapter map `gen_ai.system`, `gen_ai.request.model`, + `gen_ai.usage.prompt_tokens|completion_tokens|total_tokens`, + `gen_ai.prompt.*`, `gen_ai.completion.*`, etc. -> `ag.meta.*` / + `ag.data.*` / `ag.metrics.unit.tokens.*`. **This is precisely what the Pi + OTel extensions emit**, so Pi `gen_ai.*` spans largely normalize today. + - Caveat: the existing map uses the older `gen_ai.usage.prompt_tokens` / + `completion_tokens` names. The Pi extensions emit the newer + `gen_ai.usage.input_tokens` / `output_tokens`. Those newer keys are **not** + in `semconv.py` yet, so token metrics from Pi would be dropped until we add + the two aliases. (Verified by reading `semconv.py` — only `prompt_tokens` / + `completion_tokens` / `total_tokens` are present.) 
+- **Span typing / agent structure.** `OpenInferenceAdapter` maps + `openinference.span.kind` -> `ag.type.node` with + `OPENINFERENCE_TO_AGENTA_SPAN_KIND_MAP`: `CHAIN->chain`, `RETRIEVER->query`, + `RERANKER->rerank`, `LLM->chat`, `EMBEDDING->embedding`, `AGENT->agent`, + `TOOL->tool`, `GUARDRAIL->task`, `EVALUATOR->task`. It also normalizes tool + definitions (`llm.tools.{i}.tool.json_schema`), tool calls, and + input/output messages into the canonical OpenAI shape Agenta's UI expects. + File: `api/oss/src/apis/fastapi/otlp/extractors/adapters/openinference_adapter.py`. +- **Session correlation.** `session.id` (OpenInference) and + `gen_ai.conversation.id` (Logfire adapter) both map to `ag.session.id`, + which aligns with the agent-workflows `session_id`. + +**Net:** the lowest-effort integration is a Pi extension emitting GenAI-semconv +spans **and** `openinference.span.kind` over OTLP/HTTP protobuf to +`/otlp/v1/traces`. To get full fidelity we'd add a small amount of backend +mapping (token-name aliases; optionally a dedicated "Pi/agent" adapter if we +want first-class agent/turn nodes instead of generic chat/tool). No new ingest +infrastructure is needed. + +## Open questions + +1. **Which span tree do we standardize on?** The three Pi extensions disagree + (`pi.session` vs `pi.interaction` vs `session`; whether the session is a + span at all). We must pin one to get a stable Agenta UI. The + "no long-running session root" argument (nikiforovall) matters if Pi + sessions can run for hours. +2. **Build vs fork.** Fork `maxmalkin/pi-OTEL` (OTLP/HTTP, content gate) or + `mprokopov/pi-otel-telemetry` (also metrics) vs write our own minimal + extension? Need to read their actual source for license/quality and to see + the exact `pi.on(...)` wiring (the READMEs describe spans, not code). +3. 
**Token attribute drift.** Add `gen_ai.usage.input_tokens` / + `output_tokens` (and `gen_ai.usage.*` newer keys) to Agenta's `semconv.py` + so Pi token metrics aren't silently dropped. Confirm against the live + GenAI semconv revision. +4. **Trace-context propagation.** Does Agenta forward W3C `traceparent` into an + invocation today? If we want the in-sandbox spans stitched under the + originating backend span, we need to propagate context across the + harness/sandbox boundary (env var or RPC metadata). Needs a code check of + the invocation/workflow run path. +5. **Content capture policy.** Pi extensions gate prompt/completion/tool I/O + behind `PI_OTEL_CAPTURE_CONTENT`. Decide default (privacy vs. eval + usefulness) and whether to enforce it server-side too. +6. **Transport mismatch.** Agenta is OTLP/HTTP **protobuf only**. The + gRPC-default extension and any JSON-OTLP setup need reconfiguration or a + collector sidecar in the sandbox. +7. **"pi instruments" terminology.** Confirm with whoever wrote the + agent-workflows README whether it refers to the generic `pi.on` extension + API or a specific Earendil/Agenta-internal "instruments" bundle. The public + Pi docs only expose `pi.on` + tool registration; no "instruments" object. +8. **Doc/code drift.** OTLP router docstring says 4 MB max batch; `env.py` + default is 10 MB. Worth fixing when this work lands. 
+ +## Sources + +- Pi product site: https://pi.dev/ +- Pi repo README: https://github.com/earendil-works/pi/blob/main/README.md +- Pi extensions doc (event system / `pi.on`): https://pi.dev/docs/latest/extensions +- Pi docs index: https://pi.dev/docs/latest +- pi-OTEL extension (maxmalkin): https://github.com/maxmalkin/pi-OTEL +- pi-otel-telemetry (mprokopov): https://github.com/mprokopov/pi-otel-telemetry +- pi-otel blog (nikiforovall): https://nikiforovall.blog/ai/productivity/2026/05/16/pi-otel.html +- Pi as customer-hosted agent runtime discussion: https://github.com/earendil-works/pi/discussions/3337 +- OTel GenAI semconv (index): https://opentelemetry.io/docs/specs/semconv/gen-ai/ +- OTel GenAI agent spans: https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-agent-spans/ +- OpenInference semantic conventions spec: https://github.com/Arize-ai/openinference/blob/main/spec/semantic_conventions.md +- OpenInference conventions (Arize docs): https://arize.com/docs/ax/observe/tracing-concepts/openinference-semantic-conventions +- Agenta OTLP ingest router: api/oss/src/apis/fastapi/otlp/router.py +- Agenta adapter registry: api/oss/src/apis/fastapi/otlp/extractors/adapter_registry.py +- Agenta GenAI/OpenLLMetry semconv map: api/oss/src/apis/fastapi/otlp/opentelemetry/semconv.py +- Agenta OpenInference adapter: api/oss/src/apis/fastapi/otlp/extractors/adapters/openinference_adapter.py +- Agenta auth middleware: api/oss/src/middlewares/auth.py +- Agenta OTLP config: api/oss/src/utils/env.py (OTLPConfig) +- Router mounting: api/entrypoints/routers.py diff --git a/docs/design/agent-workflows/research/pi-interaction.md b/docs/design/agent-workflows/research/pi-interaction.md new file mode 100644 index 0000000000..c5a1fee83c --- /dev/null +++ b/docs/design/agent-workflows/research/pi-interaction.md @@ -0,0 +1,584 @@ +# Research: Programmatically driving the pi.dev agent harness + +Status: research only. No code changed outside this file. 
+Scope: how the Agenta backend can drive a "pi.dev" harness for the new `agents` +workflow type. Answers questions 1-7 from the research brief, with sources. + +## Summary + +- **pi.dev is the Pi coding agent** by Earendil Inc.: "a minimal, extensible agent + harness." It is a TypeScript/Node monorepo, MIT-licensed, distributed on npm. + Latest published version at time of research: **0.79.4**. The CLI binary is `pi`. +- Three layers matter to us, smallest to largest: + - `@earendil-works/pi-ai` - unified multi-provider LLM API (`getModel`, `stream`, + `complete`, content blocks incl. images, image generation). + - `@earendil-works/pi-agent-core` - the agent loop: stateful `Agent` class, tool + calling, event stream, `sessionId`, before/after tool hooks, transport abstraction. + - `@earendil-works/pi-coding-agent` - the full harness + CLI: `createAgentSession`, + built-in tools (read/bash/edit/write/...), extensions/hooks, skills, AGENTS.md + loading, session persistence (JSONL), and four run surfaces (TUI, print/JSON, RPC, + SDK). +- **Four ways to drive it programmatically.** For a Python backend driving pi inside a + sandbox, the realistic options are (a) **RPC mode** (`pi --mode rpc`, JSONL over + stdin/stdout, bidirectional, supports follow-ups/steering/abort), or (b) **print/JSON + mode** (`pi --mode json "prompt"`, one-shot, JSON-lines events on stdout). The + **SDK** (`createAgentSession`) is the in-process TypeScript path and gives the richest + control; it is what you would use if any part of the harness is itself Node. +- **Multi-message output, sessions, streaming, hooks, tools, model selection** are all + first-class and map cleanly onto the design doc's requirements. The one soft spot is + **"pi instruments"**: pi itself ships no built-in "instruments" product. The + observability story is OpenTelemetry via the community `pi-otel` extension (built on + pi's hooks), plus an in-house extensions/hooks API you can instrument against. 
See + Question 3 and the Open questions section. +- **Swappable harness + local parity** are supported by design: the harness is the thing + behind a thin run surface (RPC/JSON/SDK), so a different harness (e.g. OpenAI Codex) + that speaks the same surface can be slotted in; and the same `pi` binary/SDK runs + locally and in the sandbox, which is exactly the parity the design wants. + +## What pi.dev is (with sources) + +"Pi is a minimal, extensible agent harness... Adapt Pi to your workflows, not the other +way around." It deliberately omits things like sub-agents and plan mode so you compose +them yourself via extensions. +Source: https://pi.dev/ and https://github.com/earendil-works/pi + +Packages (all MIT, all `0.79.4` at research time; confirmed via the npm registry API): +- `@earendil-works/pi-coding-agent` - "Coding agent CLI with read, bash, edit, write + tools and session management." Bin: `{"pi": "dist/cli.js"}`. Depends on + `pi-agent-core`, `pi-ai`, `pi-tui` (all `^0.79.4`), `typebox@1.x`, `undici`, etc. +- `@earendil-works/pi-agent-core` - "General-purpose agent with transport abstraction, + state management, and attachment support." +- `@earendil-works/pi-ai` - "Unified LLM API with automatic model discovery and provider + configuration." +Source: `https://registry.npmjs.org/@earendil-works/pi-coding-agent` (and `/pi-ai`, +`/pi-agent-core`), GitHub repo root README. + +Repository layout (monorepo): +``` +packages/ + coding-agent/ # CLI + harness (SDK lives here) + agent/ # @earendil-works/pi-agent-core + ai/ # @earendil-works/pi-ai + tui/ # @earendil-works/pi-tui +``` +Key docs in-repo: `packages/coding-agent/docs/{sdk,extensions,json,rpc,models,settings, +containerization}.md`. 
+Source: https://github.com/earendil-works/pi/tree/main/packages + +Why this matches the design doc's "agent harness with tools, hooks, instruments, +sessions, runs in sandboxes": pi provides tools (built-in + custom via TypeBox), +25+ TypeScript hooks, JSONL sessions with a `sessionId`, a documented containerization +story, and a community OTel instrumentation extension. The name "pi.dev" in the design +doc is unambiguously this product. + +Install (host or inside sandbox image): +```bash +npm install @earendil-works/pi-coding-agent # SDK + CLI +# CLI is also installable via curl / PowerShell / pnpm / bun per pi.dev +``` +Source: https://github.com/earendil-works/pi, https://pi.dev/ + +--- + +## Question 1 - How do you programmatically interact with pi.dev (API/SDK/CLI surface)? + +**Language:** TypeScript/Node. There is no first-party Python SDK; a Python backend +drives pi over a process boundary (RPC or print/JSON mode) or shells out to the `pi` CLI. + +**Four run surfaces** (pi's own term): +1. **Interactive TUI** - `pi` (not relevant to us). +2. **Print / JSON mode** - `pi -p "query"` or `pi --mode json "query"`. One-shot; + emits results (text or JSON-lines events) to stdout. Good for stateless single runs. +3. **RPC mode** - `pi --mode rpc`. JSON protocol over stdin/stdout; bidirectional and + long-lived. This is the canonical "drive it from another process/language" surface. +4. **SDK** - `import { createAgentSession } from "@earendil-works/pi-coding-agent"`. + In-process, richest control. This is what you embed if your harness runner is Node. 
+Sources: https://pi.dev/, https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/rpc.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/json.md + +**SDK entrypoints** (from `docs/sdk.md`): +```typescript +import { + createAgentSession, + createAgentSessionRuntime, + SessionManager, + AuthStorage, + ModelRegistry, + DefaultResourceLoader, + defineTool, +} from "@earendil-works/pi-coding-agent"; + +const { session, extensionsResult, modelFallbackMessage } = + await createAgentSession({ + cwd: process.cwd(), + model: myModel, + thinkingLevel: "medium", + tools: ["read", "bash", "edit"], + sessionManager: SessionManager.inMemory(), + }); +``` +`createAgentSessionRuntime(factory, options)` is the multi-session variant +(`newSession()`, `switchSession()`, `fork()`, `importFromJsonl()`). + +The returned `AgentSession` interface (verbatim from docs): +```typescript +interface AgentSession { + prompt(text: string, options?: PromptOptions): Promise; + steer(text: string): Promise; + followUp(text: string): Promise; + subscribe(listener: (event: AgentSessionEvent) => void): () => void; + setModel(model: Model): Promise; + setThinkingLevel(level: ThinkingLevel): void; + cycleModel(): Promise; + navigateTree(targetId: string, options?: NavigateOptions): Promise; + compact(customInstructions?: string): Promise; + abort(): Promise; + dispose(): void; + sessionFile: string | undefined; + sessionId: string; // <-- session id, see Q7 + agent: Agent; + model: Model | undefined; + thinkingLevel: ThinkingLevel; + messages: AgentMessage[]; // <-- multi-message output, see Q4 + isStreaming: boolean; +} +``` +Source: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md + +**Low-level loop** (in `pi-agent-core`) if you want to drive turns yourself: +```typescript +import { agentLoop, agentLoopContinue } from 
"@earendil-works/pi-agent-core"; +for await (const event of agentLoop([userMessage], context, config)) { /* ... */ } +``` +Source: https://github.com/earendil-works/pi/blob/main/packages/agent/README.md + +**Recommendation for Agenta:** drive pi over **RPC mode** from the Python backend +process that owns the sandbox (long-lived, supports follow-ups/steering/abort and a +stable JSONL contract), and reserve print/JSON mode for stateless single-shot runs. Use +the SDK only if the in-sandbox runner is itself Node. RPC/JSON give the cleanest swappable +boundary for a non-pi harness (Codex) later (Question 7). + +--- + +## Question 2 - Sending messages and getting responses; streaming + +**SDK:** `await session.prompt(text, options?)` sends a user message and resolves when the +agent turn completes. Mid-stream you can `steer()` (replace current op) or `followUp()` +(queue after the turn). Streaming is via `subscribe()` callbacks (push-based observer, +not an async generator at the session level): +```typescript +const unsubscribe = session.subscribe((event) => { + switch (event.type) { + case "message_update": + if (event.assistantMessageEvent.type === "text_delta") { + process.stdout.write(event.assistantMessageEvent.delta); // streaming text + } + break; + case "tool_execution_start": /* event.toolName */ break; + case "tool_execution_end": /* event.isError */ break; + case "turn_end": /* event.message */ break; + case "agent_end": /* event.messages = full multi-message output */ break; + } +}); +``` +Full event set: `agent_start`, `agent_end`, `turn_start`, `turn_end`, `message_start`, +`message_update`, `message_end`, `tool_execution_start`, `tool_execution_update`, +`tool_execution_end`, `queue_update`, `compaction_start/end`, `auto_retry_start/end`. 
+Source: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md + +**pi-agent-core** is where the async-generator streaming lives: `agentLoop()` / +`agentLoopContinue()` are `for await` async generators; the `Agent` class wraps them with +`subscribe()`. The low-level `pi-ai` `stream()` emits `text_start/delta/end`, +`thinking_*`, `toolcall_*`, `done`, `error`. +Sources: https://github.com/earendil-works/pi/blob/main/packages/agent/README.md, +https://github.com/earendil-works/pi/blob/main/packages/ai/README.md + +**RPC mode (cross-process / cross-language):** JSONL over stdin/stdout. +- Framing: strict LF (`\n`)-delimited JSON. Strip a trailing `\r`. **Do not** use + Node `readline` or other readers that split on Unicode separators (e.g. `U+2028`), + because those characters appear inside JSON payloads. +- Send a prompt (client -> pi stdin): + ```json + {"id": "req-1", "type": "prompt", "message": "Hello"} + ``` + Ack (pi stdout): `{"id": "req-1", "type": "response", "command": "prompt", "success": true}` +- Other commands: `steer`, `follow_up`, `abort`, `new_session`, `set_model`, + `cycle_model`, `get_state`, `get_messages`, `set_thinking_level`, `bash`, + `get_session_stats`, `switch_session`, `fork`, `clone`, `compact`, etc. +- Events stream back as JSON lines **without** an `id` (same event names as the SDK): + ```json + {"type":"message_update","assistantMessageEvent":{"type":"text_delta","delta":"Hello"}} + {"type":"message_update","assistantMessageEvent":{"type":"text_end"}} + {"type":"agent_end","messages":[...]} + ``` +- The optional `id` on a command is echoed back on its `response` for correlation. There + is **no handshake** - the protocol starts immediately; the first client command begins + interaction. +- Extension UI is also over the wire: `extension_ui_request` (stdout) / + `extension_ui_response` (stdin) for `select`/`confirm`/`input`/`editor`, plus + fire-and-forget `notify`/`setStatus`/`setWidget`. 
+Source: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/rpc.md + +**Streaming summary:** SDK = observer callbacks; agent-core/pi-ai = async generators; +RPC/JSON modes = JSON-lines event stream over stdout. No SSE or websockets in pi itself; +if Agenta needs SSE to a frontend, the backend wraps the JSONL/observer stream and +re-emits SSE. + +--- + +## Question 3 - Startup hooks (file setup, secret injection, env prep) + +pi has a rich **extension hook system**, plus an **app-level startup ordering** for the +sandbox that Agenta controls itself. Two layers: + +### 3a. pi extension hooks (in-process, TypeScript) +Extensions are default-exported factory functions auto-discovered from: +- Global: `~/.pi/agent/extensions/*.ts` (or `.../*/index.ts`) +- Project: `.pi/extensions/*.ts` (or `.../*/index.ts`) +- CLI: `pi -e ./path.ts` +```typescript +import type { ExtensionAPI } from "@earendil-works/pi-coding-agent"; +export default function (pi: ExtensionAPI) { + pi.on("session_start", async (event, ctx) => { /* file setup / state restore */ }); + pi.registerTool({ /* ... */ }); +} +``` +Factory functions may be **async**, which is the supported way to do startup +initialization (e.g. fetch remote config) before the session begins. +Source: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md + +**Relevant hook points (25+ total) for startup/setup:** +- `project_trust` -> `{ trusted: "yes"|"no"|"undecided", remember? }` (gate before + loading dynamic configs). +- `session_start` -> reason `"startup"|"reload"|"new"|"resume"|"fork"`. The documented + place for one-time per-session setup and state restoration. This is the natural + **file-setup hook**. +- `session_shutdown` -> cleanup / persist state (`pi.appendEntry(...)`). +- `resources_discover` -> contribute `skillPaths`/`promptPaths`/`themePaths` (how skills + get injected). 
+- `before_agent_start` -> inject messages or modify the system prompt before the LLM turn. +- `context` / `before_provider_request` / `after_provider_response` -> mutate the + messages/payload around each LLM call (good instrumentation points). +- `tool_call` -> can **block** a tool (`{ block: true, reason }`); `tool_result` can + rewrite results. +Source: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md + +**Secret injection at the pi layer** is via provider registration with env interpolation: +```typescript +pi.registerProvider("provider-name", { + name: "Display Name", + baseUrl: "https://api.example.com", + apiKey: "$ENV_VAR", // "$VAR" / "${VAR}" interpolated; "$$" -> literal "$" + api: "anthropic-messages", + models: [/* ... */], +}); +``` +And/or `AuthStorage` (SDK): resolution order is runtime overrides -> `auth.json` -> +environment variables -> fallback resolver: +```typescript +const authStorage = AuthStorage.create(); +authStorage.setRuntimeApiKey("anthropic", process.env.MY_KEY); // not persisted +``` +Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md + +### 3b. App-level (sandbox) startup ordering - Agenta's own hooks +The design doc's "startup hooks set up files then secrets" is the **sandbox boot +sequence**, which Agenta owns, not a pi API. pi's containerization doc shows secrets are +injected as env vars at container start and files via bind mounts: +```bash +docker run --rm -it \ + -e ANTHROPIC_API_KEY \ + -v "$PWD:/workspace" \ + -v pi-agent-home:/root/.pi/agent \ + pi-sandbox +``` +Three documented isolation modes: **Gondolin** (local micro-VM, tools run in VM, auth +stays on host), **plain Docker** (whole pi process containerized), and **OpenShell** +(policy-controlled gateway that can inject provider creds upstream so raw keys never +enter the sandbox). 
For Agenta's Daytona target, the equivalent is: lay files into the +workspace, then set secret env vars / write `auth.json`, then start `pi --mode rpc`. +Source: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/containerization.md + +So "file setup then secrets" maps to: (1) sandbox provisioning lays config files +(AGENTS.md, skills, files) into the workspace and `~/.pi/agent`; (2) secrets are set as +env vars / `auth.json`; (3) pi boots and its own `session_start` extension hook can do any +remaining in-process setup. Note: pi's own hooks fire **inside** pi after it starts, so +they cannot themselves be the mechanism that installs pi's secrets before pi starts - +that ordering belongs to the sandbox layer (the `$ENV_VAR`/`auth.json` is read by pi at +boot). + +--- + +## Question 4 - Returns as TEXT + +- **Streaming:** `message_update` events carry `assistantMessageEvent.type === + "text_delta"` with `.delta`. Concatenate deltas for live text. (RPC/JSON modes emit the + same shape on stdout.) +- **Final / multi-message:** the run produces an array of messages, not one completion. + - SDK: `session.messages` (all) and the `agent_end` event's `messages` array; per-turn + text is on `turn_end`'s `message`. + - The `agent_end` event is the canonical "full multi-message output" the design doc + wants. Each assistant message's `content` is an array of content blocks; text blocks + are `{ type: "text", text }`. +- **print mode:** `pi -p "query"` prints assistant text to stdout directly (simplest text + path for a one-shot run). 
+- **JSON mode filtering example** (text via `message_end`): + ```bash + pi --mode json "List files" 2>/dev/null | jq -c 'select(.type == "message_end")' + ``` +Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/json.md, +https://github.com/earendil-works/pi/blob/main/packages/ai/README.md + +--- + +## Question 5 - Returns as IMAGES and other binary/file artifacts + +pi-ai content blocks include an explicit image block; images are base64 + MIME type: +```typescript +type ContentBlock = + | { type: 'text'; text: string } + | { type: 'image'; data: string; mimeType: string } // base64-encoded + | { type: 'toolCall'; id: string; name: string; arguments: Record<string, any> } + | { type: 'thinking'; thinking: string }; +``` +Tool results carry their own `content: ContentBlock[]`, so a tool can return an image +block: +```typescript +{ + role: 'toolResult'; + toolCallId: string; + toolName: string; + content: ContentBlock[]; // may include { type: 'image', data, mimeType } + isError: boolean; + timestamp: number; +} +``` +- **Input images** (multimodal prompts): SDK `prompt(text, { images: [...] })` with + `ImageContent` = `{ type: "image", source: { type: "base64", mediaType, data } }` + (SDK shape). pi-agent-core's `prompt()` also accepts + `[{ type: "image", data, mimeType }]`. +- **Generated images:** pi-ai exposes `getImageModel(provider, modelId)` and + `generateImages(model, input, options)` (one-shot image generation). +- **Binary/file artifacts:** there is no dedicated "artifact" return channel. The two + practical paths are (a) tools return an `image` content block (base64), or (b) the + agent writes files to the sandbox workspace (write/bash tools) and Agenta collects them + from the filesystem after the run. pi-agent-core's package description explicitly + mentions "attachment support," which is worth confirming in source for non-image + binaries.
+Sources: https://github.com/earendil-works/pi/blob/main/packages/ai/README.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md, +`https://registry.npmjs.org/@earendil-works/pi-agent-core` (description). The +attachment/binary specifics are **UNVERIFIED** beyond the image block - confirm in +`packages/agent` source / `packages/ai` source. + +--- + +## Question 6 - STRUCTURED OUTPUTS (JSON / schema-constrained) + +pi's idiomatic structured-output pattern is **a terminating tool**, not a provider-level +`response_format`/`json_schema`. You define a tool whose TypeBox parameters are your +output schema and return `terminate: true` so the agent stops without an extra LLM turn; +the validated arguments are your structured object. See +`packages/coding-agent/examples/extensions/structured-output.ts`: +```typescript +defineTool({ + name: "save_structured_output", + parameters: Type.Object({ + headline: Type.String({ description: "Short title for the result" }), + summary: Type.String({ description: "One-paragraph summary" }), + actionItems: Type.Array(Type.String(), { description: "Concrete next steps" }), + }), + async execute(_toolCallId, params) { + return { + content: [{ type: "text", text: `Saved structured output: ${params.headline}` }], + details: { // <-- machine-readable structured result + headline: params.headline, + summary: params.summary, + actionItems: params.actionItems, + } satisfies StructuredOutputDetails, + terminate: true, // <-- ends agent without follow-up turn + }; + }, +}); +``` +You then read the structured object from that tool call's arguments / the tool result's +`details`. TypeBox is the schema system throughout pi (`Type`, `Static`, `TSchema` are +re-exported from `@earendil-works/pi-ai`), and `validateToolCall(tools, toolCall)` +validates arguments against the schema before execution. 
+Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/examples/extensions/structured-output.ts, +https://github.com/earendil-works/pi/blob/main/packages/ai/README.md + +**UNVERIFIED:** whether `pi-ai`'s `complete()`/`stream()` accept a provider-native +`responseFormat`/`jsonSchema` option (OpenAI/xAI-style strict JSON schema). The README +did not document one; the documented, portable pattern is the terminating-tool approach +above. Confirm by reading `packages/ai` source (`complete`/`stream` option types). + +--- + +## Question 7 - Tools, model selection, and the session_id + +### Tools +**Built-in:** enable per session: `tools: ["read", "bash", "edit", "write", "grep", +"find", "ls"]`. Read-only mode = `["read","grep","find","ls"]`. `excludeTools: [...]` +removes specific ones. + +**Custom (SDK):** +```typescript +import { Type } from "typebox"; +import { defineTool } from "@earendil-works/pi-coding-agent"; +const myTool = defineTool({ + name: "my_tool", + label: "My Tool", + description: "Does something useful", + parameters: Type.Object({ input: Type.String({ description: "Input value" }) }), + execute: async (_toolCallId, params) => ({ + content: [{ type: "text", text: `Result: ${params.input}` }], + details: {}, + }), +}); +await createAgentSession({ customTools: [myTool], tools: ["read", "bash", "my_tool"] }); +``` +**Custom (extension):** `pi.registerTool({...})` with the same shape plus TUI hooks +(`renderCall`, `renderResult`), `promptSnippet`, `promptGuidelines`, and optional +`onUpdate` streaming. `pi.getAllTools()`, `pi.getActiveTools()`, `pi.setActiveTools()` +manage the active set at runtime. `tool_call` hooks can block tools; MCP is composed via +extensions (not core). 
+Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md + +### Model selection +```typescript +import { getModel } from "@earendil-works/pi-ai"; +const opus = getModel("anthropic", "claude-opus-4-5"); // built-in +const custom = modelRegistry.find("my-provider", "my-model"); // from models.json +const available = await modelRegistry.getAvailable(); // those with valid keys +await createAgentSession({ + model: opus, + thinkingLevel: "high", // off | minimal | low | medium | high | xhigh + scopedModels: [ { model: opus, thinkingLevel: "high" }, { model: haiku, thinkingLevel: "off" } ], + authStorage, modelRegistry, +}); +await session.setModel(newModel); // runtime switch +``` +If no model is provided: restore from session -> settings default -> first available. +15+ providers (Anthropic, OpenAI, Google, Bedrock, Ollama, ...). RPC equivalent: +`set_model`/`cycle_model`; CLI flags `--provider`, `--model`. Custom providers are added +via `pi.registerProvider(...)`. This is the swap point for "run on OpenAI/Codex models." +Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/models.md, +https://pi.dev/ + +### session_id +- **Creation:** a session has a `sessionId`. In JSON mode the run opens with a header + line: `{"type":"session","version":3,"id":"<uuid>","timestamp":"...","cwd":"/path"}`. + The `id` is the session id (UUID). The SDK exposes it as `session.sessionId`; the + `Agent` constructor accepts an explicit `sessionId` (so Agenta can supply its own and + thread it through). +- **Threading:** sessions persist as JSONL files (`SessionManager.create(cwd)` for + on-disk, `SessionManager.inMemory()` for none). `createAgentSessionRuntime` supports + `newSession`/`switchSession`/`fork`/`importFromJsonl`, i.e. resume and branch by + session.
In RPC mode, `new_session`/`switch_session`/`fork`/`clone` manage sessions; the + client correlates its own requests with the optional `id` field on each command. +- This matches the design doc's "carry a `session_id`... later have its state stored": + pi already persists session state to JSONL, and you can pass your own `sessionId`. +Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/json.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md, +https://github.com/earendil-works/pi/blob/main/packages/agent/README.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/rpc.md + +--- + +## Instrumentation ("pi instruments") - important nuance + +The design doc says runs are "instrumented with pi instruments." Findings: +- pi core ships **no product literally called "instruments."** Observability is delivered + through the **extension/hooks API** (you can instrument any of `context`, + `before_provider_request`, `after_provider_response`, `tool_call`, `tool_result`, + `agent_start/end`, `turn_start/end`, etc.). +- The mature path is **`pi-otel`**, a community OpenTelemetry extension: + - Install: `pi install npm:pi-otel`; activate `/otel start`. + - Span tree per prompt: `pi.interaction` -> `pi.turn` -> `pi.llm_request` / + `pi.tool.<name>`, with GenAI semantic-convention attributes (model, token counts, + finish reason). + - Metrics: histograms for LLM request latency, token usage (input/output/cache), tool + execution time. + - Structured log events: `pi.session.start`, `pi.session.end`, `pi.tool.error`. + - Config via standard OTel env vars (`OTEL_EXPORTER_OTLP_ENDPOINT`, + `OTEL_EXPORTER_OTLP_HEADERS`) or `.pi/settings.json` `{ "otel": { endpoint, protocol } }`; + `PI_OTEL_DISABLED=1` disables it. +- There is also a proposed (issue-stage) session usage stats sink via `PI_USAGE_DIR`.
+Sources: https://nikiforovall.blog/ai/productivity/2026/05/16/pi-otel.html, +https://github.com/earendil-works/pi/issues/2054, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md + +**Implication for Agenta:** "pi instruments" most likely means "instrument pi via its +hooks (OTel-style)," and Agenta's existing OTel-based tracing/observability can ingest +`pi-otel` OTLP output directly, or Agenta can write its own thin extension that emits +spans on the same hook points. Confirm with the design owner whether "pi instruments" +refers to `pi-otel`, a private Earendil "instruments" API, or just "instrumented via +hooks" - this wording is **UNVERIFIED**. + +--- + +## Local execution parity & swappable harness (design requirements) + +- **Parity:** the same `pi` binary / SDK that runs in the sandbox runs locally; pulling + the agent config (AGENTS.md, skills, model, tools, files, secrets) and starting pi + locally yields the same behavior. The four run surfaces are identical local vs sandbox. + Containerization doc shows host vs container are the same pi. +- **Swappable harness:** because the contract is a thin run surface (RPC JSONL / JSON + events / SDK events), a non-pi harness (e.g. OpenAI Codex) can be slotted behind the + same surface if Agenta defines its harness port against the RPC/event shapes. Within pi, + model/provider swapping (incl. OpenAI) is `getModel`/`registerProvider`/`set_model` - + but "swap the whole harness" is an Agenta-side abstraction over the run surface, not a + pi feature. +Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/containerization.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/rpc.md, https://pi.dev/ + +--- + +## Open questions / unknowns + +1. **"pi instruments" exact meaning** - is it `pi-otel`, a private Earendil API, or + "instrument via hooks"? UNVERIFIED. 
Resolve with the design owner; if OTel, wire + `pi-otel` OTLP into Agenta's existing tracing. +2. **Provider-native structured output** - does `pi-ai` `complete()`/`stream()` accept a + `responseFormat`/`jsonSchema` option, or is the terminating-tool pattern the only + supported route? UNVERIFIED; confirm in `packages/ai` source. +3. **Non-image binary artifacts** - `pi-agent-core` advertises "attachment support," but + only the `image` content block is documented. How are arbitrary file/binary artifacts + returned (vs. written to the workspace and collected from disk)? UNVERIFIED; confirm in + `packages/agent`/`packages/ai` source. +4. **Daytona specifically** - pi documents Gondolin / Docker / OpenShell, not Daytona. The + Daytona port is Agenta's to build (lay files -> set secrets -> `pi --mode rpc`); no pi + Daytona integration exists today. +5. **Skills config -> pi** - how Agenta's stored "skills" map to pi skills (loaded via + `resources_discover` skillPaths and `~/.pi/agent` layout) needs a concrete mapping; + read `docs/settings.md` and the skills section of the SDK/extensions docs. +6. **Exact `agent_end.messages` schema** for storing multi-message output - capture the + precise `AgentMessage`/content-block JSON (read `packages/agent` types) before + designing Agenta's storage shape. +7. **Version pinning** - researched against `0.79.4`. The API is pre-1.0 and moving (RPC + command names, event names, hook names may change between minors); pin a version and + re-verify against that tag's docs before implementing. 
+ +## Sources + +- https://pi.dev/ (and https://pi.dev/docs/latest) +- https://github.com/earendil-works/pi (repo root, package layout) +- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md +- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md +- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/rpc.md +- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/json.md +- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/models.md +- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/containerization.md +- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/examples/extensions/structured-output.ts +- https://github.com/earendil-works/pi/blob/main/packages/agent/README.md +- https://github.com/earendil-works/pi/blob/main/packages/ai/README.md +- https://registry.npmjs.org/@earendil-works/pi-coding-agent (and /pi-ai, /pi-agent-core) - version, license, bin, deps +- https://nikiforovall.blog/ai/productivity/2026/05/16/pi-otel.html (pi-otel OTel extension) +- https://github.com/earendil-works/pi/issues/2054 (PI_USAGE_DIR usage stats proposal) +- https://deepwiki.com/earendil-works/pi (and /7.1-pi-coding-agent-sdk, /6.3-extension-examples-and-patterns) diff --git a/docs/design/agent-workflows/research/sandbox-sharing.md b/docs/design/agent-workflows/research/sandbox-sharing.md new file mode 100644 index 0000000000..9c8ffbaded --- /dev/null +++ b/docs/design/agent-workflows/research/sandbox-sharing.md @@ -0,0 +1,359 @@ +# Sandbox sharing: one sandbox for all agents, or one per agent? + +Status: research. Source of the question: the product owner wants v1 to mirror today's +prompt-style workflows, which run against one shared runtime/service rather than one per +workflow. The proposed shortcut is "reuse the same sandbox but connect it to a different +volume at each execution." 
+ +This file answers: can we reuse one Daytona sandbox across many agent executions, can the +mounted volume change per execution, how do we isolate executions in a shared sandbox, +what is the concurrency model, how pi.dev views sessions, and what v1 should actually do. + +## Summary + +- **Reusing one long-lived sandbox: yes, supported.** A Daytona sandbox is designed for + long-lived reuse across many tasks, and the Process API provides both stateless one-off + `exec()` / `code_run()` and stateful named **Sessions** (`create_session` / + `execute_session_command` / `delete_session`) for running many independent command + streams in one sandbox. [daytona-sandboxes][daytona-process] +- **Swapping a different volume per execution: NO.** Daytona volumes are mounted **only at + sandbox creation** via `CreateSandboxFromSnapshotParams(volumes=[...])`. They cannot be + attached, detached, or changed on a running sandbox. Changing the mount requires + recreating the sandbox. The canonical docs say so explicitly. So the literal + "reuse the sandbox, attach a different volume each run" idea is **not feasible in + Daytona today.** [daytona-volumes][daytona-volumes-src] +- **Closest workable equivalent to "a volume per execution" without recreating the + sandbox:** give each execution its own **working directory** (e.g. + `/runs/<run_id>/`) and lay its config/files/secrets there per run, optionally with a + per-run OS user. That is the per-exec isolation lever in a shared sandbox, not volumes. + If you genuinely need a persistent named volume per agent, that belongs to the + sandbox-per-agent model, where `subpath` on one shared volume gives per-agent isolation + at create time. [daytona-process][daytona-volumes] +- **Isolation in a shared sandbox is weak by default.** All sessions and execs in one + sandbox share one kernel, one filesystem, one process table, one network stack, and one + set of OS env vars.
Filesystem bleed, leftover processes, and secret bleed are real and + must be managed by convention (per-run dirs, per-command `env`, cleanup), not by the + platform. Daytona's own positioning is "isolated sandbox **per execution**" for safety. + [daytona-sandboxes][daytona-blog-best] +- **Concurrency is bounded and shares resources.** One sandbox defaults to 1 vCPU / 1 GiB + RAM (max 4 vCPU / 8 GiB), and an org's *total* active-sandbox budget is 4 vCPU / 8 GiB / + 10 GiB. Many agent runs can be launched as concurrent sessions in one sandbox, but they + contend for that single sandbox's CPU/RAM/disk and can step on each other's files. + Daytona has an open issue to add a Parallel Sandbox Execution API precisely because one + sandbox is not a clean unit for parallel independent workflows today. + [daytona-sandboxes][daytona-parallel-issue] +- **pi.dev does not need a dedicated machine per session, only a distinct session file and + working dir.** pi stores each session as a JSONL tree file; the SDK lets you point each + session at its own `cwd`, its own session file (`SessionManager.open(path)`), or its own + `agentDir`, and run in `--mode rpc --no-session`. So multiple pi sessions can coexist in + one environment as long as each gets its own directory/session file. This maps cleanly + onto "per-run working directory inside one shared sandbox." [pi-sdk][pi-docs] +- **Recommendation for v1:** one shared, long-lived sandbox for all agents, isolation by + **per-run working directory + per-command env + cleanup**, NOT by per-run volumes. + Treat the volume-per-execution idea as not feasible and substitute per-run dirs. + Serialize or cap concurrency on the shared sandbox. Keep the sandbox-provider port + abstraction so the migration to **sandbox-per-agent / sandbox-per-run** (with a + per-agent volume via `subpath` at create time) is a config swap, not a rewrite. 
+ +## Reusing one sandbox (sessions / exec model) + +Daytona explicitly designs sandboxes for long-lived reuse: they keep filesystem state +across stop/start, can be archived and restored, and resized without recreation. +[daytona-sandboxes] Agenta already has the integration scaffolding: `DaytonaConfig` in +`api/oss/src/utils/env.py` carries `DAYTONA_API_KEY`, `DAYTONA_API_URL`, +`DAYTONA_SNAPSHOT`, `DAYTONA_TARGET`, which tells us the plan is snapshot-based sandbox +creation. + +The Process API gives two execution modes inside one sandbox: + +- **One-off, stateless:** `exec(command, cwd=None, env=None, timeout=None)` and + `code_run(code, params=None, timeout=None)`. Each invocation starts fresh; good for + isolated commands. Both accept per-call `cwd` and `env`. [daytona-process] +- **Stateful Sessions:** named background sessions that persist state across commands. + [daytona-process] + +Python session example (verbatim shape from the docs): [daytona-process-src] + +```python +session_id = "interactive-session" +sandbox.process.create_session(session_id) + +command = sandbox.process.execute_session_command( + session_id, + SessionExecuteRequest( + command="pip uninstall requests", + run_async=True, + ), +) +# later +sandbox.process.get_session(session_id) # status + command history +sandbox.process.delete_session(session_id) # cleanup +``` + +`SessionExecuteRequest` fields: `command` and `run_async` (Python) / `runAsync` (TS). +[daytona-process-src] Sessions are the natural home for one agent run: create a session +per run keyed by `session_id`, fire the harness command, monitor it, delete the session +when done. Many sessions can live in one sandbox at once. + +**Keeping the shared sandbox alive.** A running sandbox auto-stops after +`autoStopInterval` (default 15 min). Critically, **internal/background processes do NOT +reset the timer** — only lifecycle changes, preview network requests, active SSH, and +Toolbox SDK calls do. 
For an always-on shared sandbox, set `autoStopInterval: 0` or call +`sandbox.refreshActivity()` periodically. [daytona-sandboxes] + +## Volumes — can they change per execution? + +**No.** This is the central finding and it kills the literal proposal. + +> "Once a volume is created, it can be mounted to a sandbox by specifying it in the +> `CreateSandboxFromSnapshotParams` object." [daytona-volumes-src] + +Volumes mount **only at sandbox creation**. There is no API to attach/detach or swap a +volume on a running sandbox; the docs describe mounting exclusively through the create +params, and contain no running-sandbox mount operation. Changing what is mounted requires +**recreating** the sandbox. [daytona-volumes][daytona-volumes-src] + +Mounting example (Python): [daytona-volumes] + +```python +from daytona import CreateSandboxFromSnapshotParams, Daytona, VolumeMount + +daytona = Daytona() +volume = daytona.volume.get("my-volume", create=True) + +params = CreateSandboxFromSnapshotParams( + language="python", + volumes=[ + VolumeMount( + volume_id=volume.id, + mount_path="/home/daytona/volume", + subpath="users/alice", # optional per-tenant prefix + ) + ], +) +sandbox = daytona.create(params) +``` + +`VolumeMount` fields: `volume_id`, `mount_path` (absolute, not `/`, not a system dir like +`/proc`, `/etc`, `/bin`...), and optional `subpath`. [daytona-volumes][daytona-volumes-src] + +Other volume facts that matter: + +- **Persistence:** "The volume will persist even after the sandbox is removed." Good for + producer/consumer state across sandbox lifecycles. [daytona-volumes-src] +- **`subpath` isolation:** a sandbox mounted at `users/alice` cannot reach `users/bob` via + `../bob`; isolation is at the FUSE mount boundary. This is the supported way to give each + *sandbox* (created per agent/run) its own slice of one shared volume — but again, only at + create time. 
[daytona-volumes][daytona-volumes-src] +- **FUSE limits:** volumes are FUSE mounts — slower than local disk, not usable for block + storage (e.g. DB files), and "not transactional": concurrent writes to the same path are + last-write-wins. [daytona-volumes-src] +- **FUSE permission bugs:** an open issue reports `mv`, repeated `touch`, `stat`, and + `shutil.copystat()` failing with permission errors inside FUSE volumes. This makes + volumes a poor surface for frequent per-run file manipulation even where they do apply. + [daytona-fuse-issue] + +**Conclusion for the question as posed:** "reuse one sandbox, connect a different volume +each execution" is not achievable in Daytona. Volumes are a create-time-only mount. + +### Alternatives to per-execution volumes (in one shared sandbox) + +1. **Per-run working directory (recommended).** Lay each run's config/files/secrets under + `/runs//` (or a temp dir) and run the harness with that as `cwd`. Clean it + up on completion. This is the direct in-sandbox analog of "a different volume per run" + and avoids the FUSE limits entirely. `exec`/`execute_session_command` already take + `cwd`. [daytona-process] +2. **Copy files in/out per run** via the filesystem/Toolbox API, scoped to the per-run dir. +3. **Per-run OS user** for stronger separation (file ownership, home dir) if root isn't + required by the harness. (Standard Linux; UNVERIFIED whether Daytona's default image + permits adding users without extra config.) +4. **Recreate-per-run with a volume** (this is sandbox-per-run, not sandbox-sharing): if a + *persistent* per-agent volume is a hard requirement, create a fresh sandbox per run with + `volumes=[VolumeMount(volume_id, mount_path, subpath="agents/")]`. This is the + migration target, not v1. + +## Isolation in a shared sandbox + +A single Daytona sandbox is "isolated" from *other sandboxes and the host* — it gets a +dedicated kernel, filesystem, network stack, and resource allocation. 
[daytona-sandboxes] +But **within** one sandbox there is no isolation between executions. All sessions and execs +share: + +- **One filesystem** — files written by run A are visible to run B unless you scope each + run to its own directory and clean up. Filesystem bleed is the default. +- **One process table** — a leftover/background process from a prior run keeps running + (and does not even reset the auto-stop timer). You must track and kill per-run PIDs. + [daytona-sandboxes] +- **One set of OS environment variables** — sandbox-level env is global. Secret bleed is a + real risk if you `export` a secret. Mitigate by passing secrets per command via the `env` + parameter of `exec` / `execute_session_command` rather than setting them globally, and by + scoping secret files to the per-run dir. [daytona-process] +- **One network stack** — ports and outbound identity are shared. + +Practical isolation recipe for a shared sandbox: + +- Unique `session_id` per run; one Daytona Session per run. +- Per-run working dir `/runs//`; never write run state outside it. +- Pass secrets via per-command `env`, not global exports; keep secret files inside the + per-run dir with tight permissions; delete on completion. +- Explicit cleanup: kill the run's process group, remove the run dir, `delete_session`. +- Optional per-run OS user for ownership separation. + +Even with all of this, one sandbox is a **soft** isolation boundary (shared kernel, Docker +by default). For untrusted agent code or cross-tenant separation, this is weaker than +sandbox-per-run. Daytona's own marketing leans on "isolated sandbox **per execution**" for +exactly this reason, and notes the default Docker isolation is weaker than microVMs. +[daytona-blog-best] + +## Concurrency + +- **Resource budget.** One sandbox defaults to 1 vCPU / 1 GiB / 3 GiB disk, max + 4 vCPU / 8 GiB / 10 GiB. The whole org's active-sandbox budget is also 4 vCPU / 8 GiB / + 10 GiB. 
So a single shared sandbox is a small box, and packing many concurrent agent runs + into it means they contend for that fixed slice. [daytona-sandboxes] +- **Mechanically parallel, practically contended.** You *can* open multiple sessions and + run them concurrently in one sandbox, but they share CPU/RAM/disk and the filesystem, so + heavy or untrusted runs can starve or corrupt each other. There is no per-session cgroup + isolation documented. (UNVERIFIED: no documented per-session CPU/memory quota.) +- **Daytona itself flags this gap.** Open issue "Design and Implement Parallel Sandbox + Execution API" states that "developers working on AI agents or multi-threaded workflows + face limitations when trying to run multiple tasks concurrently," and that the current + workaround is "running multiple independent sandboxes manually (inefficient and + resource-heavy)." The proposed fix is forking sandbox state (filesystem + memory) — i.e. + Daytona's answer to parallel independent runs is *more sandboxes*, not more sessions in + one. [daytona-parallel-issue] + +Realistic v1 concurrency model for a shared sandbox: **serialize, or cap to a small N** of +concurrent sessions, each in its own working dir, sized to fit the sandbox's CPU/RAM. If +throughput needs to scale, that is the trigger to move to sandbox-per-run. + +## pi.dev session / workspace model + +pi (by Earendil Inc.) is a minimal, extensible agent harness — the harness Agenta's agent +workflow defaults to. It runs as an interactive TUI, a print/JSON one-shot, an RPC process +(stdin/stdout JSONL), or embedded via a Node SDK. [pi-home][pi-docs] + +Key points for sharing one sandbox: + +- **Sessions are files, not machines.** pi stores each session as a JSONL tree file + (branchable history). It does not require a dedicated host per session. 
[pi-docs] +- **Per-session isolation is by path.** The SDK's `SessionManager` controls where state + lives: `SessionManager.create(cwd)` (new session in a directory), + `SessionManager.continueRecent(cwd)`, `SessionManager.open("/path/to/session.jsonl")` + (explicit file), and `SessionManager.inMemory()` (ephemeral). You can also point at a + different global config via `agentDir`. [pi-sdk] +- **Multiple pi sessions coexist** in one environment by giving each a distinct `cwd`, + distinct session file, and/or distinct `agentDir` — "each combination isolates session + state, credentials, and settings files." [pi-sdk] +- **Context comes from the working dir.** pi loads `AGENTS.md` / `SYSTEM.md` from + `~/.pi/agent/`, parent dirs, and the cwd, so the per-run working dir naturally carries + per-run agent config. [pi-home] +- **Non-interactive runs:** `pi --mode rpc --no-session` (or `runRpcMode(runtime)`) for a + programmatic, sessionless subprocess driven over JSONL on stdin/stdout. [pi-sdk] + +Implication: pi's design is fully compatible with "one shared sandbox, many runs." Each +agent run = one pi process pointed at its own per-run `cwd` (carrying that run's +`AGENTS.md`, skills, files) and its own session file. pi gives Agenta the per-run state +isolation that Daytona volumes do **not**. Agenta's `session_id` should map to (a) the pi +session file name, (b) the per-run working directory, and (c) the Daytona Session id — +one id threading all three layers. + +## Recommendation for v1 + migration path + +### v1: one shared sandbox, isolation by directory (not by volume) + +1. **One long-lived shared Daytona sandbox** created from `DAYTONA_SNAPSHOT`, with + `autoStopInterval: 0` (or periodic `refreshActivity()`), reused across all agents. + Matches the PO's "one runtime for all" goal and the existing prompt-runtime shared model. +2.
**Per-run isolation by working directory, not volume.** For each run, create + `/runs//`, lay down that agent's config (`AGENTS.md`, skills, files) and + secrets there via startup hooks, and run pi with that dir as `cwd` and its own session + file. The "different volume per execution" intent is satisfied by a different *directory* + per execution. This sidesteps Daytona's create-time-only volume limit and the FUSE + permission/perf problems. [daytona-process][daytona-volumes][daytona-fuse-issue] +3. **One Daytona Session per run**, keyed by `session_id`; secrets passed via per-command + `env`, never global exports. [daytona-process] +4. **Mandatory cleanup** after each run: kill the run's process group, delete the run dir, + `delete_session`. This is what contains filesystem/process/secret bleed in a shared box. +5. **Bounded concurrency:** serialize, or cap to a small N sized to the sandbox's 1–4 vCPU. + [daytona-sandboxes] +6. **Keep the sandbox-provider port thin** so the unit of isolation (shared vs per-run) is + a config choice behind the same interface, as the design doc already anticipates. + +Honest framing for the PO: "one sandbox for all agents" is achievable, but **not by +swapping volumes** — by swapping working directories. The volume idea is the right +*instinct* (per-run isolated storage) attached to the wrong Daytona primitive. Use +directories in v1; use volumes only when you move to per-run/per-agent sandboxes. + +### Migration path to per-agent / per-run sandboxes + +When isolation, security (untrusted code), or concurrency throughput outgrow the shared +box: + +- Flip the provider port from "reuse shared sandbox" to "create sandbox per run." +- At creation, mount a per-agent persistent volume slice with + `VolumeMount(volume_id, mount_path, subpath="agents/")` — this is where the + "volume per agent" idea finally becomes native and correct. [daytona-volumes] +- Optionally enable stronger isolation (Kata/Sysbox) for untrusted code. 
+ [daytona-blog-best] +- Lean on snapshot warm-starts to keep per-run create latency low. [daytona-sandboxes] + +Because pi already isolates by `cwd`/session file and `session_id` threads all layers, the +run-orchestration code barely changes between the two models; only the +"get-a-sandbox" step swaps. + +## Open questions + +- **Per-session resource quotas.** Can Daytona cap CPU/RAM/disk per Session (cgroups) + inside one sandbox, or is the only quota the whole-sandbox allocation? Not found in docs + — UNVERIFIED. If none, concurrent runs cannot be resource-isolated within one sandbox. +- **Default image users/permissions.** Does the snapshot image allow adding/switching OS + users per run without root issues? UNVERIFIED. +- **Toolbox filesystem API surface** for laying down per-run files/secrets and reading + outputs (upload/download/permissions) — needs confirmation against the Daytona Toolbox + SDK docs; sibling research on the sandbox port should pin this down. +- **pi `--no-session` vs Agenta `session_id`.** Agenta wants a `session_id` per run for + future state storage; pi can run sessionless (`--no-session`) or with an explicit session + file. Decide whether Agenta persists the pi JSONL session file (per the design doc's + "future session storage") or treats runs as sessionless and stores its own trace. The + design doc's session-storage goal points to keeping pi session files. +- **Concurrency ceiling.** Exact safe N of parallel pi runs in one 1–4 vCPU sandbox needs + empirical testing; treat as serialize-first until measured. +- **Daytona Parallel Sandbox Execution API status.** Issue #4001 is a proposal; if/when it + ships (fork filesystem+memory), it could change the cheapest path for parallel runs. 
+ [daytona-parallel-issue] + +## Sources + +- [daytona-sandboxes] Daytona — Sandboxes (lifecycle, states, auto-stop/archive/delete, + refreshActivity, resource limits, per-sandbox isolation): + https://www.daytona.io/docs/en/sandboxes/ +- [daytona-process] Daytona — Process and Code Execution (exec/code_run vs Sessions, cwd, + env, create/execute/get/delete session): https://www.daytona.io/docs/en/process-code-execution/ +- [daytona-process-src] Daytona docs source — process-code-execution.mdx (verbatim session + example, SessionExecuteRequest fields): + https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/process-code-execution.mdx +- [daytona-volumes] Daytona — Volumes (creation, VolumeMount, mount_path/subpath, FUSE, + mounting via CreateSandboxFromSnapshotParams): https://www.daytona.io/docs/en/volumes/ +- [daytona-volumes-src] Daytona docs source — volumes.mdx (verbatim "mounted at creation", + persistence, FUSE not transactional, last-write-wins): + https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/volumes.mdx +- [daytona-fuse-issue] Daytona GitHub issue #3331 — FUSE volume permission limitations + (mv/touch/stat/copystat failures): https://github.com/daytonaio/daytona/issues/3331 +- [daytona-parallel-issue] Daytona GitHub issue #4001 — Design and Implement Parallel + Sandbox Execution API (fork filesystem+memory; current workaround = many sandboxes): + https://github.com/daytonaio/daytona/issues/4001 +- [daytona-blog-best] Northflank — "Best code execution sandbox for AI agents 2026" + (isolated sandbox per execution; Docker-default isolation weaker than microVMs): + https://northflank.com/blog/best-code-execution-sandbox-for-ai-agents +- [pi-home] pi.dev — product overview (harness, modes, AGENTS.md/SYSTEM.md context): + https://pi.dev +- [pi-docs] pi.dev — docs index (session tree, JSONL session format, RPC/SDK modes): + https://pi.dev/docs/latest +- [pi-sdk] pi.dev — SDK/RPC 
(SessionManager.create/continueRecent/open/inMemory, cwd, + agentDir, runRpcMode, `--mode rpc --no-session`): https://pi.dev/docs/latest/sdk +- Agenta repo — `api/oss/src/utils/env.py` `DaytonaConfig` (DAYTONA_API_KEY, + DAYTONA_API_URL, DAYTONA_SNAPSHOT, DAYTONA_TARGET). +- Agenta repo — `docs/design/agent-workflows/README.md` (agent workflow context, sandbox + + pi harness + session_id) and `docs/design/prompt-runtime-unification/README.md` (existing + shared prompt runtime model). diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/README.md b/docs/design/agent-workflows/wp-1-pi-tracing/README.md new file mode 100644 index 0000000000..0e0d1ee46a --- /dev/null +++ b/docs/design/agent-workflows/wp-1-pi-tracing/README.md @@ -0,0 +1,73 @@ +# WP-1: Tracing Pi in Agenta + +Status: done. Working code in [`poc/`](poc/). To embed it in the agent runtime, follow +[`integrating-the-tracing-extension.md`](integrating-the-tracing-extension.md). + +## Goal + +Install Pi locally, run an agent, and get its telemetry into Agenta as a clean, structured +trace. Success looks like: a local Pi run shows up in Agenta observability as a sensible +span tree (session at the root, turns under it, LLM calls and tool calls as child spans) +with token usage and timings intact. + +## Scope + +In: + +- Run Pi locally (`@earendil-works/pi-coding-agent`), pin an exact version. +- A Pi extension on the `pi.on(...)` event bus that converts lifecycle events + (`session_start`, `turn_*`, `before_provider_request`/`after_provider_response`, + `tool_execution_*`, `message_*`) into OTel spans. +- Export OTLP/HTTP protobuf to Agenta's `POST /otlp/v1/traces`. +- Make the span tree read well in Agenta's UI. + +Out (later work packages): + +- Running inside Daytona. Local only here. +- The agent service itself (that is WP-2). This WP produces the tracing extension that + WP-2 later embeds. 
+ +## Approach (grounded in research) + +See [`../research/otel-instrumentation.md`](../research/otel-instrumentation.md) and +[`../research/pi-interaction.md`](../research/pi-interaction.md). + +- Pi emits no OTel on its own. Either adopt/fork a community extension (`pi-otel*`) or write + our own on the event bus. Writing our own is likely cleaner since we control the span + shape. +- Emit OTel GenAI semantic conventions (`gen_ai.*`) plus `openinference.span.kind` + (AGENT / CHAIN / LLM / TOOL) so Agenta types the nodes correctly. Agenta's adapter + registry already understands both. +- Export over OTLP/HTTP protobuf with `Authorization: ApiKey <api_key>` and `?project_id=<project_id>`. + +## Known gotchas to handle + +- **Token attribute drift.** Pi-style extensions emit `gen_ai.usage.input_tokens` / + `output_tokens`, but Agenta's `semconv.py` maps the older + `prompt_tokens` / `completion_tokens` / `total_tokens`. Either normalize in the extension + or add aliases in Agenta; otherwise token metrics drop silently. +- **Transport.** Agenta accepts OTLP/HTTP protobuf only. It does not accept OTLP over gRPC + or JSON-OTLP, so do not rely on an exporter's default transport. + Configure the exporter accordingly. +- **Trace-context propagation.** Whether a W3C `traceparent` is threaded into the run so + in-sandbox spans nest under an originating backend span is UNVERIFIED. Confirm during this + WP. + +## Definition of done + +- A local Pi run produces one trace in Agenta with a coherent span tree. +- LLM and tool spans are typed correctly and carry model, latency, and token usage. +- No silently dropped attributes (token usage in particular is present). +- The exporter config (endpoint, auth, project) is injected, not hard-coded, so it carries + over to the sandboxed and service contexts later. + +## Open questions + +- Adopt a community `pi-otel` extension or write our own? Lean: write our own. +- Final span-tree shape to standardize on (session vs interaction root naming). +- Does Agenta forward `traceparent` into an invocation for nesting?
+ +## Links + +- [`../research/otel-instrumentation.md`](../research/otel-instrumentation.md) +- [`../research/pi-interaction.md`](../research/pi-interaction.md) +- [Project README](../README.md) diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/integrating-the-tracing-extension.md b/docs/design/agent-workflows/wp-1-pi-tracing/integrating-the-tracing-extension.md new file mode 100644 index 0000000000..0be59d585c --- /dev/null +++ b/docs/design/agent-workflows/wp-1-pi-tracing/integrating-the-tracing-extension.md @@ -0,0 +1,186 @@ +# Integrating the Pi tracing extension into the agent runtime + +Status: ready to integrate. Audience: whoever builds the Dockerized Pi agent runtime +(WP-2 service, WP-3 sandbox). Source of the working code: [`poc/`](poc/). + +## What this gives you + +A Pi extension that turns Pi's `pi.on(...)` lifecycle events into OpenTelemetry spans and +ships them to Agenta over OTLP/HTTP protobuf. Once it is loaded, every agent run shows up +in Agenta observability as a clean span tree with inputs, outputs, token usage, cost, and +latency, and runs in the same session are grouped by `session.id`. + +It is one self-contained file, `poc/agenta-otel.ts`. Copy it into the runtime as is. It is +written to be embedded, not just demoed. `poc/run.ts` is only an example driver; you will +write your own runner, but you can copy its wiring. + +This was verified end to end against the dev box: complex multi-tool runs, parallel tool +calls, structured returns, and multi-prompt sessions all trace correctly, and the agent +root reports the correct whole-run token and cost totals. 
+ +## The span tree it produces + +``` +invoke_agent openinference.span.kind = AGENT (root, one per user prompt) + turn N CHAIN + chat LLM model, latency, token usage, finish reason, messages + execute_tool TOOL args in, result out +``` + +Agenta types nodes from `openinference.span.kind` (AGENT to agent, CHAIN to chain, LLM to +chat, TOOL to tool) and groups sessions from `session.id`. No backend change is needed. + +## How to wire it in + +The runtime is Node embedding Pi through the SDK, so use the SDK path. It is the one the +extension is built for, and it is the only path where session id and model name reach the +spans. + +```ts +import { + AuthStorage, createAgentSession, DefaultResourceLoader, + getAgentDir, ModelRegistry, SessionManager, +} from "@earendil-works/pi-coding-agent"; +import agentaOtel, { runConfig, shutdownTracing } from "./agenta-otel"; + +const loader = new DefaultResourceLoader({ + cwd, + agentDir: getAgentDir(), + extensionFactories: [agentaOtel], // <-- register the extension in-process +}); +await loader.reload(); + +const { session } = await createAgentSession({ + cwd, model, authStorage, modelRegistry, + tools: ["read", "bash", "edit", "write", "ls"], + sessionManager: SessionManager.inMemory(cwd), + resourceLoader: loader, +}); + +// Hand the session id and model to the extension so spans carry them. +runConfig.sessionId = session.sessionId; +runConfig.provider = model.provider; +runConfig.requestModel = model.id; + +await session.prompt(userPrompt); // run one or more prompts in the session +// ... +await shutdownTracing(); // flush before the process or container exits +``` + +If you instead run Pi from the CLI (`pi -e ./agenta-otel.ts ...`), the extension still +emits spans and flushes on `session_shutdown`, but `runConfig` is never set, so spans lose +`session.id` and the model name in the span title. Prefer the SDK path. + +## What you must not change, and why + +These five choices are load bearing. 
They were each found by reading how Agenta ingests +and normalizes spans. Changing them silently drops data. + +1. **Atomic, parent-first export per trace.** The extension uses a small custom + `TraceBatchProcessor`, not the OTel `BatchSpanProcessor`. It buffers a trace and exports + all of its spans in one OTLP request when the root span ends, ordered parent before + child. Agenta rolls token and cost totals up the tree by sorting spans on + millisecond-resolution `start_time` and attaching a span only once its parent is + present. The default batch processor splits long runs on its 5 second timer, and + same-millisecond siblings (`agent_start` and `turn_start` fire in the same millisecond) + tie and drop a subtree. Either one makes the agent root undercount, showing only the + last turn instead of the whole run. Keep the custom processor. + +2. **`ag.data.inputs` must be a JSON object.** Agenta moves any non-object input to + `ag.unsupported`. The agent and tool spans emit `input.value` as a JSON object. The chat + span emits OpenInference `llm.input_messages.*` and `llm.output_messages.*` so it renders + as a real message thread. Do not emit a raw string as `input.value`. + +3. **Both token naming conventions.** The extension writes token usage under the current + GenAI names (`gen_ai.usage.input_tokens` / `output_tokens`) and the legacy names + (`prompt_tokens` / `completion_tokens`). Agenta's default `semconv.py` only maps the + legacy names today. Emit both or token metrics drop. + +4. **`openinference.span.kind` on every span.** This is what types the node in the UI. + +5. **`session.id` and `gen_ai.conversation.id` on the root.** Both map to `ag.session.id`, + which groups runs into a session. Set them from the Pi `sessionId`. + +## Configuration + +All config is read from the environment at first use, so set it before the first run. + +| Env var | Meaning | +|---|---| +| `AGENTA_HOST` | Agenta base URL, for example `http://144.76.237.122:8280`. 
A trailing slash is stripped. | +| `AGENTA_API_KEY` | Agenta project API key. The project is resolved from the key, so no `project_id` is needed. | +| `PI_OTEL_CAPTURE_CONTENT` | Set to `0` to drop prompts, completions, and tool I/O from spans. Default is on. | +| `OTEL_SERVICE_NAME` | Resource `service.name`, default `pi-agent`. | + +The exporter posts to `${AGENTA_HOST}/api/otlp/v1/traces`. Note the `/api` prefix. The +transport is OTLP/HTTP protobuf only (`@opentelemetry/exporter-trace-otlp-proto`), with +header `Authorization: ApiKey `. JSON OTLP and gRPC are rejected. + +These are the same env vars whether the runtime runs locally or in a container, which keeps +local and server behavior identical. + +## Dockerized runtime notes + +- **Inject the two Agenta env vars** (`AGENTA_HOST`, `AGENTA_API_KEY`) into the container as + secrets at start. They are separate from the LLM provider credentials. +- **Allow outbound network** from the sandbox to the Agenta host over HTTP or HTTPS. +- **Flush before the container exits.** Call `shutdownTracing()` at the end of the run. The + per-trace processor already exports each trace when its root span ends, so a completed + trace is usually shipped mid-run, but a final flush guards the last trace. If the + container is killed before the flush, the last trace can be lost. If you cannot call + `shutdownTracing()`, make sure `SIGTERM` triggers Pi's `session_shutdown`, which the + extension also flushes on. +- **Node 22 or newer** is required by Pi 0.79.4. +- **LLM auth in the sandbox is your concern, not the tracing.** The interactive ChatGPT + Codex login used in the POC is local only. In the container use a non-interactive + credential (an API key or a transplanted token). +- **Trace context across the boundary is done for the WP-2 service.** The agent service + threads a W3C `traceparent` into the run and starts the agent span as a child of the + Agenta `/invoke` span, so the whole agent run is part of the response trace. 
See + [tracing-in-the-agent-service.md](tracing-in-the-agent-service.md). Standalone runs (no + `traceparent`) still create their own root and correlate by `session.id`. + +## Dependencies + +Pin these in the runtime image (the OTel versions are a known-compatible set): + +``` +@earendil-works/pi-coding-agent 0.79.4 +@opentelemetry/api 1.9.0 +@opentelemetry/exporter-trace-otlp-proto 0.54.0 +@opentelemetry/resources 1.28.0 +@opentelemetry/sdk-trace-base 1.28.0 +@opentelemetry/sdk-trace-node 1.28.0 +@opentelemetry/semantic-conventions 1.28.0 +``` + +## How to verify it works + +1. On startup you should see `[agenta-otel] exporting spans to .../api/otlp/v1/traces`. +2. After a run, fetch the trace and check the tree and totals: + ``` + curl -s "${AGENTA_HOST}/api/spans/?trace_id=<trace_id>" -H "Authorization: ApiKey ${AGENTA_API_KEY}" + ``` + Expect `invoke_agent` (agent) over `turn N` (chain) over `chat` (chat) and + `execute_tool` (tool). Expect `ag.data.inputs` and `ag.data.outputs` on the agent, chat, + and tool spans, and nothing under `ag.unsupported`. Expect the agent root's + `ag.metrics.tokens.cumulative` to equal the sum of the chat spans' incrementals. +3. Or open Agenta observability and confirm the trace reads well and the root shows the + full-run token count and cost.
+ +## Reference: attributes per span + +| Span | Key attributes the extension sets | +|---|---| +| `invoke_agent` (AGENT) | `openinference.span.kind=AGENT`, `gen_ai.operation.name=invoke_agent`, `session.id`, `gen_ai.conversation.id`, `input.value` as `{prompt}`, `output.value` final text | +| `turn N` (CHAIN) | `openinference.span.kind=CHAIN`, `pi.turn.index` | +| `chat ` (LLM) | `openinference.span.kind=LLM`, `gen_ai.system`, `gen_ai.request.model`, `gen_ai.response.model`, `gen_ai.response.finish_reasons`, `gen_ai.usage.{input,output,prompt,completion,total}_tokens`, `llm.input_messages.*`, `llm.output_messages.*` | +| `execute_tool ` (TOOL) | `openinference.span.kind=TOOL`, `gen_ai.tool.name`, `gen_ai.tool.call.id`, `input.value` as the args object, `output.value` the result | + +## One known gap, not on the agent side + +The Agenta Sessions tab groups our `session.id` correctly, and the per-session API +(`POST /api/traces/query` filtering `ag.session.id`) returns the right traces with costs, +but the Sessions table's aggregate columns render empty on the current dev build. The data +is correct and queryable. This is a frontend rendering gap, not something the instrumentation +or the runtime can fix. diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/poc/.env.example b/docs/design/agent-workflows/wp-1-pi-tracing/poc/.env.example new file mode 100644 index 0000000000..a1ca16a17b --- /dev/null +++ b/docs/design/agent-workflows/wp-1-pi-tracing/poc/.env.example @@ -0,0 +1,7 @@ +# Agenta collector (the runner also falls back to the repo-root .env.test.local). 
+AGENTA_HOST=http://144.76.237.122:8280/ +AGENTA_API_KEY=your-agenta-project-api-key + +# Optional: +# PI_OTEL_CAPTURE_CONTENT=0 # drop prompt/response/tool I/O from spans +# OTEL_SERVICE_NAME=pi-agent diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/poc/README.md b/docs/design/agent-workflows/wp-1-pi-tracing/poc/README.md new file mode 100644 index 0000000000..8d78fc4532 --- /dev/null +++ b/docs/design/agent-workflows/wp-1-pi-tracing/poc/README.md @@ -0,0 +1,86 @@ +# WP-1 POC: trace the Pi agent harness into Agenta + +Installs [Pi](https://pi.dev) locally, runs a small tool-using agent, and exports the +run to Agenta observability as a clean OpenTelemetry trace. + +## What's here + +- `agenta-otel.ts` — the deliverable: a Pi extension that turns `pi.on(...)` lifecycle + events into OTel spans and exports them (OTLP/HTTP protobuf) to Agenta. WP-2 embeds + this file as-is. +- `run.ts` — a runner that registers the extension in-process and drives one prompt. + +## Span tree + +``` +invoke_agent (openinference.span.kind = AGENT, carries session.id) + turn N (CHAIN) + chat (LLM — model, latency, token usage, finish reason) + execute_tool (TOOL — args + result) +``` + +Token usage is emitted under both the current (`input_tokens`/`output_tokens`) and +legacy (`prompt_tokens`/`completion_tokens`) GenAI names, so Agenta maps it regardless +of which adapter claims the span. + +## Setup + +```bash +pnpm install --ignore-workspace +``` + +### Authenticate Pi (one time) + +The runner uses `~/.pi/agent/auth.json`. Log in with your ChatGPT subscription — no API +key, no per-token billing: + +```bash +pnpm exec pi # opens the TUI +/login # choose "ChatGPT Plus/Pro (Codex)", finish the browser OAuth +# then quit the TUI +``` + +Alternatively, export `OPENAI_API_KEY` or `ANTHROPIC_API_KEY`. + +### Credentials for Agenta + +The runner reads `AGENTA_HOST` / `AGENTA_API_KEY` from a local `.env` (see `.env.example`) +or, failing that, from the repo-root `.env.test.local`. 
+ +## Run + +```bash +pnpm start # uses gpt-5.5 by default +PI_MODEL=gpt-5.4 pnpm start # pick another available model +``` + +The runner prints the `trace_id` and a `/api/spans/?trace_id=...` fetch URL on exit. +Then open Agenta observability and find the `invoke_agent` trace. + +> Note: `gpt-5.3-codex-spark` is **not** usable on a ChatGPT (Codex) login — it 400s. +> Use `gpt-5.5` / `gpt-5.4`. + +## Verified mapping (Agenta conventional semantics) + +A run produces a coherent tree that types and maps correctly: + +``` +invoke_agent (agent) ag.data.inputs={prompt}, ag.data.outputs=text, ag.session.id, cumulative tokens + turn N (chain) + chat (chat) ag.data.inputs.prompt[] + ag.data.outputs.completion[] (OpenInference + messages), ag.meta.request.model, incremental token usage + execute_tool (tool) ag.data.inputs={args}, ag.data.outputs=result +``` + +Two things make the data land in `ag.data` instead of `ag.unsupported`: +`ag.data.inputs` must be a **JSON object** (Agenta exiles non-dict inputs), so the agent and +tool spans emit `input.value` as JSON; the chat span emits OpenInference +`llm.input_messages.*` / `llm.output_messages.*` so it renders as a message thread. Token +usage is emitted under both the new (`input_tokens`) and legacy (`prompt_tokens`) names. + +A third thing makes the **agent-root token/cost totals correct**: Agenta rolls metrics up +its span tree by sorting on millisecond-resolution `start_time` and attaching a span only +once its parent is present. Same-millisecond siblings (e.g. `agent_start`/`turn_start`) +tie and can drop a subtree from the roll-up. So the extension buffers each trace and +exports it in one OTLP batch when the root span ends, ordered **parent-first** — without +this, a multi-turn agent root undercounts (shows only the last turn's tokens/cost). 
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/poc/agenta-otel.ts b/docs/design/agent-workflows/wp-1-pi-tracing/poc/agenta-otel.ts new file mode 100644 index 0000000000..a11d959d36 --- /dev/null +++ b/docs/design/agent-workflows/wp-1-pi-tracing/poc/agenta-otel.ts @@ -0,0 +1,414 @@ +/** + * agenta-otel — a Pi extension that turns Pi's `pi.on(...)` lifecycle events into + * OpenTelemetry spans and exports them (OTLP/HTTP protobuf) to Agenta. + * + * Span tree (one per user prompt): + * invoke_agent (openinference.span.kind = AGENT) + * turn N (CHAIN) + * chat (LLM) — the provider request for that turn + * execute_tool (TOOL) — each tool the turn ran + * + * Agenta's OpenInference adapter types nodes off `openinference.span.kind` + * (AGENT->agent, CHAIN->chain, LLM->chat, TOOL->tool) and `session.id` -> + * `ag.session.id`. Token usage is emitted under BOTH the legacy + * (`prompt_tokens`/`completion_tokens`) and current + * (`input_tokens`/`output_tokens`) GenAI names so it maps regardless of which + * Agenta adapter claims the span. + * + * Works two ways with the same file: + * - SDK: pass the default export to DefaultResourceLoader.extensionFactories, + * then call shutdownTracing() after the run to flush (see run.ts). + * - CLI: `pi -e ./agenta-otel.ts`; the session_shutdown handler flushes on exit. 
+ *
+ * Config (read lazily so the runner can load .env first):
+ *   AGENTA_HOST, AGENTA_API_KEY — exporter endpoint + auth (required)
+ *   PI_OTEL_CAPTURE_CONTENT=0   — disable prompt/response/tool I/O capture
+ *   OTEL_SERVICE_NAME           — resource service.name (default "pi-agent")
+ */
+import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
+import {
+  context,
+  trace,
+  SpanStatusCode,
+  type Context,
+  type Span,
+} from "@opentelemetry/api";
+import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-proto";
+import { Resource } from "@opentelemetry/resources";
+import type {
+  ReadableSpan,
+  SpanExporter,
+  SpanProcessor,
+} from "@opentelemetry/sdk-trace-base";
+import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
+import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
+
+/**
+ * Export callback that surfaces a failed export instead of discarding it.
+ * ExportResultCode.SUCCESS is 0; anything else is a failure (e.g. the 401 the
+ * collector returns when AGENTA_API_KEY is missing).
+ */
+function warnOnExportFailure(result: { code: number; error?: Error }): void {
+  if (result.code !== 0) {
+    console.warn(
+      `[agenta-otel] span export failed: ${result.error?.message ?? "unknown error"}`,
+    );
+  }
+}
+
+/**
+ * Buffer a trace's spans and export them in ONE OTLP batch when the root span
+ * ends. Agenta computes cumulative (rolled-up) token/cost metrics per ingest
+ * batch, so a trace split across batches (which BatchSpanProcessor does on its
+ * timer for long runs) loses the root aggregation — the agent node would show
+ * only the last turn's tokens/cost instead of the whole-run total.
+ */
+class TraceBatchProcessor implements SpanProcessor {
+  /** Ended spans, grouped by trace id, waiting for their root span to end. */
+  private readonly buffers = new Map<string, ReadableSpan[]>();
+  constructor(private readonly exporter: SpanExporter) {}
+  onStart(): void {}
+  /** Buffer a finished span; a span without a parent flushes its whole trace. */
+  onEnd(span: ReadableSpan): void {
+    const traceId = span.spanContext().traceId;
+    const spans = this.buffers.get(traceId) ?? [];
+    spans.push(span);
+    if (span.parentSpanId) {
+      this.buffers.set(traceId, spans);
+    } else {
+      // Root span ended: all descendants ended earlier, so the trace is complete.
+      this.buffers.delete(traceId);
+      this.exporter.export(orderParentFirst(spans), warnOnExportFailure);
+    }
+  }
+  /** Export whatever is still buffered (traces whose root never ended). */
+  forceFlush(): Promise<void> {
+    const leftovers = [...this.buffers.values()].flat();
+    this.buffers.clear();
+    if (leftovers.length === 0) return Promise.resolve();
+    return new Promise<void>((resolve) =>
+      this.exporter.export(orderParentFirst(leftovers), (result) => {
+        warnOnExportFailure(result);
+        // Best effort: resolve even on failure so shutdown is never blocked.
+        resolve();
+      }),
+    );
+  }
+  /** Flush the leftovers, then shut the wrapped exporter down. */
+  shutdown(): Promise<void> {
+    return this.forceFlush().then(() => this.exporter.shutdown());
+  }
+}
+
+/**
+ * Order spans parent-before-child (preorder DFS). Agenta stores timestamps at
+ * millisecond resolution and builds its roll-up tree by sorting on start_time,
+ * attaching a span only if its parent is already seen. Sibling events fired in
+ * the same millisecond (agent_start/turn_start) would otherwise tie, and a
+ * child sorted before its parent gets dropped from the cumulative tree. A
+ * parent-first request order makes the backend's stable sort keep parents ahead
+ * of children on ties.
+ */
+function orderParentFirst(spans: ReadableSpan[]): ReadableSpan[] {
+  const byId = new Map(spans.map((s) => [s.spanContext().spanId, s]));
+  const childrenOf = new Map<string, ReadableSpan[]>();
+  const roots: ReadableSpan[] = [];
+  for (const s of spans) {
+    const parentId = s.parentSpanId;
+    // A span whose parent is not in this batch is treated as a root.
+    if (parentId && byId.has(parentId)) {
+      const list = childrenOf.get(parentId) ?? [];
+      list.push(s);
+      childrenOf.set(parentId, list);
+    } else {
+      roots.push(s);
+    }
+  }
+  const ordered: ReadableSpan[] = [];
+  const visit = (s: ReadableSpan) => {
+    ordered.push(s);
+    for (const child of childrenOf.get(s.spanContext().spanId) ?? []) visit(child);
+  };
+  roots.forEach(visit);
+  // Any spans not reached (defensive) get appended so nothing is dropped.
+  if (ordered.length !== spans.length) {
+    const seen = new Set(ordered);
+    for (const s of spans) if (!seen.has(s)) ordered.push(s);
+  }
+  return ordered;
+}
+
+/** Set by the runner before prompting so spans can carry session + model.
*/
+export const runConfig: {
+  sessionId?: string;
+  provider?: string;
+  requestModel?: string;
+  /** Filled by the extension on agent_start so the runner can print/fetch the trace. */
+  traceId?: string;
+} = {};
+
+let provider: NodeTracerProvider | undefined;
+let captureContent = true;
+
+/**
+ * Build and register the tracer provider once. Reads AGENTA_HOST (default
+ * https://cloud.agenta.ai, trailing slashes stripped), AGENTA_API_KEY and
+ * PI_OTEL_CAPTURE_CONTENT at call time, not at import time. A missing API key
+ * only warns; the exporter is still created.
+ */
+function initTracing(): void {
+  if (provider) return;
+
+  const host = (process.env.AGENTA_HOST || "https://cloud.agenta.ai").replace(
+    /\/+$/,
+    "",
+  );
+  const apiKey = process.env.AGENTA_API_KEY || "";
+  const url = `${host}/api/otlp/v1/traces`;
+  captureContent = process.env.PI_OTEL_CAPTURE_CONTENT !== "0";
+
+  if (!apiKey) {
+    console.warn(
+      "[agenta-otel] AGENTA_API_KEY is not set — the collector will reject spans with 401.",
+    );
+  }
+  console.log(`[agenta-otel] exporting spans to ${url} (content capture: ${captureContent})`);
+
+  const exporter = new OTLPTraceExporter({
+    url,
+    headers: { Authorization: `ApiKey ${apiKey}` },
+    timeoutMillis: 10_000,
+  });
+
+  provider = new NodeTracerProvider({
+    resource: new Resource({
+      [ATTR_SERVICE_NAME]: process.env.OTEL_SERVICE_NAME || "pi-agent",
+    }),
+  });
+  provider.addSpanProcessor(new TraceBatchProcessor(exporter));
+  provider.register();
+}
+
+/** Flush and shut down the exporter. Call from the runner after a run completes. */
+export async function shutdownTracing(): Promise<void> {
+  if (!provider) return;
+  try {
+    await provider.forceFlush();
+    await provider.shutdown();
+  } finally {
+    // Cleared even when flush/shutdown throws, so a later call is a no-op.
+    provider = undefined;
+  }
+}
+
+const tracer = () => trace.getTracer("agenta-pi-otel", "0.1.0");
+
+// --- per-run span state (the POC runs one prompt at a time) ---
+let agentSpan: Span | undefined;
+let agentCtx: Context | undefined;
+let pendingPrompt: string | undefined;
+let currentTurn: { span: Span; ctx: Context; index?: number } | undefined;
+let llmSpan: Span | undefined;
+let lastContextMessages: any[] | undefined;
+// Open tool spans keyed by toolCallId (start and end arrive as separate events).
+const toolSpans = new Map<string, Span>();
+
+/** A string output → ag.data.outputs (any type is valid there).
*/
+function setOutput(span: Span, value: unknown): void {
+  if (!captureContent || value == null) return;
+  // JSON.stringify yields undefined for functions/symbols; treat that like an
+  // empty output instead of throwing on `.length` inside a tracing hook.
+  const text: string | undefined =
+    typeof value === "string" ? value : JSON.stringify(value);
+  if (text) span.setAttribute("output.value", text);
+}
+
+/**
+ * ag.data.inputs must be a dict, so emit input.value as a JSON object string.
+ * A non-object (raw string) would be relocated to ag.unsupported by Agenta.
+ */
+function setInputs(span: Span, obj: Record<string, unknown>): void {
+  if (!captureContent) return;
+  span.setAttribute("input.value", JSON.stringify(obj));
+  span.setAttribute("input.mime_type", "application/json");
+}
+
+/** Map Pi's "toolResult" role to "tool"; every other role passes through. */
+function oiRole(role: string): string {
+  return role === "toolResult" ? "tool" : role; // user | assistant | system | tool
+}
+
+/**
+ * Plain text of a message: string content as-is, or the concatenated `text`
+ * of the `type === "text"` blocks when content is an array; "" otherwise.
+ */
+function messageText(msg: any): string {
+  const c = msg?.content;
+  if (typeof c === "string") return c;
+  if (Array.isArray(c)) {
+    return c
+      .filter((b: any) => b?.type === "text")
+      .map((b: any) => b.text)
+      .join("");
+  }
+  return "";
+}
+
+/**
+ * Emit OpenInference structured messages so Agenta renders a proper message
+ * thread. `llm.input_messages.*` -> ag.data.inputs.prompt.*,
+ * `llm.output_messages.*` -> ag.data.outputs.completion.*.
+ */
+function emitMessages(span: Span, prefix: string, messages: any[]): void {
+  if (!captureContent || !Array.isArray(messages)) return;
+  messages.forEach((m, i) => {
+    const base = `${prefix}.${i}.message`;
+    span.setAttribute(`${base}.role`, oiRole(m.role));
+    const text = messageText(m);
+    if (text) span.setAttribute(`${base}.content`, text);
+    if (m.role === "toolResult" && m.toolCallId)
+      span.setAttribute(`${base}.tool_call_id`, m.toolCallId);
+    if (Array.isArray(m.content)) {
+      // Tool calls live as `toolCall` blocks inside the message content.
+      m.content
+        .filter((b: any) => b?.type === "toolCall")
+        .forEach((call: any, j: number) => {
+          const tc = `${base}.tool_calls.${j}.tool_call`;
+          if (call.id) span.setAttribute(`${tc}.id`, call.id);
+          span.setAttribute(`${tc}.function.name`, call.name);
+          span.setAttribute(
+            `${tc}.function.arguments`,
+            JSON.stringify(call.arguments ?? {}),
+          );
+        });
+    }
+  });
+}
+
+/**
+ * Flatten a tool result to text: strings pass through, arrays contribute their
+ * `text` blocks, objects with `content` are unwrapped recursively, and any
+ * other truthy value is JSON-encoded. Falsy results become "".
+ */
+function toolResultText(result: any): string {
+  if (!result) return "";
+  if (typeof result === "string") return result;
+  if (Array.isArray(result)) {
+    return result
+      .filter((c: any) => c?.type === "text")
+      .map((c: any) => c.text)
+      .join("");
+  }
+  if (result.content) return toolResultText(result.content);
+  return JSON.stringify(result);
+}
+
+/** Text of the last assistant message in `messages`, or "" when there is none. */
+function lastAssistantText(messages: any): string {
+  if (!Array.isArray(messages)) return "";
+  for (let i = messages.length - 1; i >= 0; i--) {
+    if (messages[i]?.role === "assistant") return messageText(messages[i]);
+  }
+  return "";
+}
+
+/** Fill an LLM span from a finished assistant message (model, tokens, finish, output). */
+function applyAssistant(span: Span, msg: any): void {
+  if (msg.provider) span.setAttribute("gen_ai.system", msg.provider);
+  if (msg.model) span.setAttribute("gen_ai.request.model", msg.model);
+  if (msg.responseModel || msg.model)
+    span.setAttribute("gen_ai.response.model", msg.responseModel ?? msg.model);
+  if (msg.responseId) span.setAttribute("gen_ai.response.id", msg.responseId);
+  if (msg.stopReason)
+    span.setAttribute("gen_ai.response.finish_reasons", [String(msg.stopReason)]);
+
+  const u = msg.usage;
+  if (u) {
+    // Current GenAI names (mapped by Agenta's logfire adapter) ...
+    span.setAttribute("gen_ai.usage.input_tokens", u.input ?? 0);
+    span.setAttribute("gen_ai.usage.output_tokens", u.output ?? 0);
+    // ... and legacy names (mapped by Agenta's semconv.py). Emit both so token
+    // usage is never silently dropped regardless of which adapter wins.
+    span.setAttribute("gen_ai.usage.prompt_tokens", u.input ?? 0);
+    span.setAttribute("gen_ai.usage.completion_tokens", u.output ?? 0);
+    span.setAttribute(
+      "gen_ai.usage.total_tokens",
+      u.totalTokens ?? (u.input ?? 0) + (u.output ?? 0),
+    );
+    if (u.cacheRead)
+      span.setAttribute("gen_ai.usage.cache_read_input_tokens", u.cacheRead);
+    if (u.cacheWrite)
+      span.setAttribute("gen_ai.usage.cache_creation_input_tokens", u.cacheWrite);
+    if (u.cost?.total != null) span.setAttribute("gen_ai.usage.cost", u.cost.total);
+  }
+
+  emitMessages(span, "llm.output_messages", [msg]);
+  if (msg.stopReason === "error" || msg.errorMessage) {
+    span.setStatus({ code: SpanStatusCode.ERROR, message: msg.errorMessage });
+  }
+}
+
+/**
+ * The Pi extension entry point: initialises tracing and subscribes to Pi's
+ * lifecycle events, opening/closing the agent, turn, chat and tool spans.
+ * Span state is module-level, so one prompt is traced at a time.
+ */
+export default function agentaOtel(pi: ExtensionAPI): void {
+  initTracing();
+  const t = tracer();
+
+  pi.on("before_agent_start", async (event: any) => {
+    pendingPrompt = event?.prompt;
+  });
+
+  pi.on("agent_start", async () => {
+    agentSpan = t.startSpan("invoke_agent");
+    agentSpan.setAttribute("openinference.span.kind", "AGENT");
+    agentSpan.setAttribute("gen_ai.operation.name", "invoke_agent");
+    agentSpan.setAttribute("gen_ai.agent.name", "pi");
+    if (runConfig.sessionId) {
+      agentSpan.setAttribute("session.id", runConfig.sessionId);
+      agentSpan.setAttribute("gen_ai.conversation.id", runConfig.sessionId);
+    }
+    setInputs(agentSpan, { prompt: pendingPrompt ?? "" });
+    runConfig.traceId = agentSpan.spanContext().traceId;
+    agentCtx = trace.setSpan(context.active(), agentSpan);
+  });
+
+  // The messages handed to the next LLM call — the chat span's input.
+  pi.on("context", async (event: any) => {
+    if (Array.isArray(event?.messages)) lastContextMessages = event.messages;
+  });
+
+  pi.on("turn_start", async (event: any) => {
+    const parent = agentCtx ?? context.active();
+    const name = event?.turnIndex != null ? `turn ${event.turnIndex}` : "turn";
+    const span = t.startSpan(name, undefined, parent);
+    span.setAttribute("openinference.span.kind", "CHAIN");
+    if (event?.turnIndex != null) span.setAttribute("pi.turn.index", event.turnIndex);
+    currentTurn = { span, ctx: trace.setSpan(parent, span), index: event?.turnIndex };
+  });
+
+  pi.on("before_provider_request", async (_event: any, ctx: any) => {
+    const parent = currentTurn?.ctx ?? agentCtx ?? context.active();
+    const modelId = runConfig.requestModel ?? ctx?.model?.id;
+    const providerName = runConfig.provider ?? ctx?.model?.provider;
+    llmSpan = t.startSpan(modelId ? `chat ${modelId}` : "chat", undefined, parent);
+    llmSpan.setAttribute("openinference.span.kind", "LLM");
+    llmSpan.setAttribute("gen_ai.operation.name", "chat");
+    if (providerName) llmSpan.setAttribute("gen_ai.system", providerName);
+    if (modelId) llmSpan.setAttribute("gen_ai.request.model", modelId);
+    if (lastContextMessages) emitMessages(llmSpan, "llm.input_messages", lastContextMessages);
+  });
+
+  pi.on("message_end", async (event: any) => {
+    const msg = event?.message;
+    if (!msg || msg.role !== "assistant" || !llmSpan) return;
+    applyAssistant(llmSpan, msg);
+    llmSpan.end();
+    llmSpan = undefined;
+  });
+
+  pi.on("tool_execution_start", async (event: any) => {
+    const parent = currentTurn?.ctx ?? agentCtx ?? context.active();
+    const name = event?.toolName ? `execute_tool ${event.toolName}` : "execute_tool";
+    const span = t.startSpan(name, undefined, parent);
+    span.setAttribute("openinference.span.kind", "TOOL");
+    span.setAttribute("gen_ai.operation.name", "execute_tool");
+    if (event?.toolName) span.setAttribute("gen_ai.tool.name", event.toolName);
+    if (event?.toolCallId) span.setAttribute("gen_ai.tool.call.id", event.toolCallId);
+    setInputs(span, (event?.args as Record<string, unknown>) ?? {});
+    if (event?.toolCallId) toolSpans.set(event.toolCallId, span);
+  });
+
+  pi.on("tool_execution_end", async (event: any) => {
+    const span = event?.toolCallId ? toolSpans.get(event.toolCallId) : undefined;
+    if (!span) return;
+    setOutput(span, toolResultText(event?.result));
+    if (event?.isError) span.setStatus({ code: SpanStatusCode.ERROR });
+    span.end();
+    toolSpans.delete(event.toolCallId);
+  });
+
+  pi.on("turn_end", async (event: any) => {
+    // Safety net: if the LLM span is still open (no assistant message_end seen),
+    // close it from the turn's assistant message.
+    if (llmSpan && event?.message) {
+      applyAssistant(llmSpan, event.message);
+      llmSpan.end();
+      llmSpan = undefined;
+    }
+    if (currentTurn) {
+      currentTurn.span.end();
+      currentTurn = undefined;
+    }
+  });
+
+  pi.on("agent_end", async (event: any) => {
+    if (!agentSpan) return;
+    // Safety net for aborted/failed runs: a span still open when the root ends
+    // is never handed to the span processor, so it would be missing from the
+    // exported trace and its state would leak into the next prompt. On a
+    // normal run these are already closed and this is a no-op.
+    if (llmSpan) {
+      llmSpan.end();
+      llmSpan = undefined;
+    }
+    for (const span of toolSpans.values()) span.end();
+    toolSpans.clear();
+    if (currentTurn) {
+      currentTurn.span.end();
+      currentTurn = undefined;
+    }
+    setOutput(agentSpan, lastAssistantText(event?.messages));
+    agentSpan.end();
+    agentSpan = undefined;
+    agentCtx = undefined;
+    pendingPrompt = undefined;
+    lastContextMessages = undefined;
+  });
+
+  // CLI (`pi -e`) flush path. The SDK runner additionally calls shutdownTracing().
+  pi.on("session_shutdown", async () => {
+    try {
+      await provider?.forceFlush();
+    } catch {
+      /* best effort */
+    }
+  });
+}
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/poc/package.json b/docs/design/agent-workflows/wp-1-pi-tracing/poc/package.json new file mode 100644 index 0000000000..e3d23ae603 --- /dev/null +++ b/docs/design/agent-workflows/wp-1-pi-tracing/poc/package.json @@ -0,0 +1,25 @@ +{ + "name": "wp-1-pi-tracing-poc", + "version": "0.0.0", + "private": true, + "type": "module", + "description": "WP-1 POC: trace the Pi agent harness into Agenta via an OTel extension.", + "scripts": { + "start": "tsx run.ts", + "login": "pi" + }, + "dependencies": { + "@earendil-works/pi-coding-agent": "0.79.4", + "@opentelemetry/api": "1.9.0", + "@opentelemetry/exporter-trace-otlp-proto": "0.54.0", + "@opentelemetry/resources": "1.28.0", + "@opentelemetry/sdk-trace-base": "1.28.0", + "@opentelemetry/sdk-trace-node": "1.28.0", + "@opentelemetry/semantic-conventions": "1.28.0", + "dotenv": "17.2.3" + }, + "devDependencies": { + "tsx": "4.19.2", + "@types/node": "22.10.2" + } +} diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/poc/pnpm-lock.yaml b/docs/design/agent-workflows/wp-1-pi-tracing/poc/pnpm-lock.yaml new file mode 100644 index 0000000000..54c94564b7 --- /dev/null +++ b/docs/design/agent-workflows/wp-1-pi-tracing/poc/pnpm-lock.yaml @@ -0,0 +1,1842 @@ +lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +importers: + + .: + dependencies: + '@earendil-works/pi-coding-agent': + specifier: 0.79.4 + version: 0.79.4(ws@8.21.0)(zod@4.4.3) + '@opentelemetry/api': + specifier: 1.9.0 + version: 1.9.0 + '@opentelemetry/exporter-trace-otlp-proto': + specifier: 0.54.0 + version: 0.54.0(@opentelemetry/api@1.9.0) + '@opentelemetry/resources': + specifier: 1.28.0 + version: 1.28.0(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-trace-base': + specifier: 1.28.0 + version: 1.28.0(@opentelemetry/api@1.9.0) + 
'@opentelemetry/sdk-trace-node': + specifier: 1.28.0 + version: 1.28.0(@opentelemetry/api@1.9.0) + '@opentelemetry/semantic-conventions': + specifier: 1.28.0 + version: 1.28.0 + dotenv: + specifier: 17.2.3 + version: 17.2.3 + devDependencies: + '@types/node': + specifier: 22.10.2 + version: 22.10.2 + tsx: + specifier: 4.19.2 + version: 4.19.2 + +packages: + + '@anthropic-ai/sdk@0.91.1': + resolution: {integrity: sha512-LAmu761tSN9r66ixvmciswUj/ZC+1Q4iAfpedTfSVLeswRwnY3n2Nb6Tsk+cLPP28aLOPWeMgIuTuCcMC6W/iw==} + hasBin: true + peerDependencies: + zod: ^3.25.0 || ^4.0.0 + peerDependenciesMeta: + zod: + optional: true + + '@aws-crypto/crc32@5.2.0': + resolution: {integrity: sha512-nLbCWqQNgUiwwtFsen1AdzAtvuLRsQS8rYgMuxCrdKf9kOssamGLuPwyTY9wyYblNr9+1XM8v6zoDTPPSIeANg==} + engines: {node: '>=16.0.0'} + + '@aws-crypto/sha256-browser@5.2.0': + resolution: {integrity: sha512-AXfN/lGotSQwu6HNcEsIASo7kWXZ5HYWvfOmSNKDsEqC4OashTp8alTmaz+F7TC2L083SFv5RdB+qU3Vs1kZqw==} + + '@aws-crypto/sha256-js@5.2.0': + resolution: {integrity: sha512-FFQQyu7edu4ufvIZ+OadFpHHOt+eSTBaYaki44c+akjg7qZg9oOQeLlk77F6tSYqjDAFClrHJk9tMf0HdVyOvA==} + engines: {node: '>=16.0.0'} + + '@aws-crypto/supports-web-crypto@5.2.0': + resolution: {integrity: sha512-iAvUotm021kM33eCdNfwIN//F77/IADDSs58i+MDaOqFrVjZo9bAal0NK7HurRuWLLpF1iLX7gbWrjHjeo+YFg==} + + '@aws-crypto/util@5.2.0': + resolution: {integrity: sha512-4RkU9EsI6ZpBve5fseQlGNUWKMa1RLPQ1dnjnQoe07ldfIzcsGb5hC5W0Dm7u423KWzawlrpbjXBrXCEv9zazQ==} + + '@aws-sdk/client-bedrock-runtime@3.1048.0': + resolution: {integrity: sha512-u+NT61JZEkRFtpL0CAw1N1dwxnaLgwVXQl/zjJxTGgLyS/jTIdg2SdoEoCTHxgDyCnqa1HEi9QOoE9/pYRNpOQ==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/core@3.974.20': + resolution: {integrity: sha512-7sDi2B2N3mc3nf1nz6FyEx/FCrJ1N1QnBmraHHQNabFaeAh2IaOOLml48/rHOD1bICHgTRkbBgNTvUzEr5Z35g==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/credential-provider-env@3.972.46': + resolution: {integrity: 
sha512-+GPXVS2srMOlH74S+SmC1gVuP2TvUZ0siuC0onKO93q+udP+M72dmY8wJfVQ5CX9z/9X5A1HHwz5yRIGBtskvQ==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/credential-provider-http@3.972.48': + resolution: {integrity: sha512-fA5loSdlocacRxyUXtpoHSMuk5rsIKRDzQYVMnMxjcmFeZshaJlJ8lymy/hYKji6sne/UmNGj5pxuEs6kq/Qcg==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/credential-provider-ini@3.972.53': + resolution: {integrity: sha512-ZfdhIOR41q8TcWEnUac+gCOb+O2LBWdHLmjedXpXz4IEFW2ppNuFcm6p0sMTavpM+zD5TYfpH5Gp7guRyqSgsQ==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/credential-provider-login@3.972.52': + resolution: {integrity: sha512-9hu2oR0qH7Fst5Tzdx+UWxm+w5zCXtErTLtOOW5hwwQc170CLwOeniRxyFY6s9mHfGEfC5zFukNBdKBwJR8mhQ==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/credential-provider-node@3.972.55': + resolution: {integrity: sha512-zMGLa/dhESVqmCD7mmIFFKSwSFrJGScvCXcjvBZEVOOMauFS5JRQvLTMukFpMEFWiV6dTAlsen2ATDBulLPtbg==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/credential-provider-process@3.972.46': + resolution: {integrity: sha512-VUoNFBIjWrUN8NbFiQiuxQEgFjvziAlBRPK+ddh27aj65gk0BYu6bLZnrdrNZwpW6vAihtSUtEMQ1PUJ32QRPA==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/credential-provider-sso@3.972.52': + resolution: {integrity: sha512-nb2/n4o/HQf+FVpVbZe9vCTFngmuDoIsltMgLAtjixaKzvzhB4J8WSDFyWgnErgLHk55ctWH+I4PU+LIHhyffg==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/credential-provider-web-identity@3.972.52': + resolution: {integrity: sha512-lKj6aRSGbqLmpYmM24bY7a1Xmfcq2vkE3hv8CSPYfc1yCu0BPu/XEJ1L4Fm61MsU6ULLNSG8UGsffNoFUBjESA==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/eventstream-handler-node@3.972.21': + resolution: {integrity: sha512-mVC0hOmwGJmNFezZ+wM8Sqfap/LjsMavEf2Evl0YWrLAcrdZOEdjnY8nRvgakVViWJSGm2eJxLuPVHGdeV06kA==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/middleware-eventstream@3.972.17': + resolution: {integrity: sha512-tdbnXbw73ww62ABWP0G0Z/euvFowEEvAoi/zG4NaZo7HJFpfGho/Z65HyVzkJLT1cMsUregr4pTyxljlarT0wA==} + engines: {node: '>=20.0.0'} + + 
'@aws-sdk/middleware-websocket@3.972.28': + resolution: {integrity: sha512-SCW06Zjugn86pq7+dxGnFcyWJuEWHT753HTU/Vj/OzVxP+NoShwdAr4ynxAcvWL883OgRVbSqW3ohnjIxwXjjw==} + engines: {node: '>= 14.0.0'} + + '@aws-sdk/nested-clients@3.997.20': + resolution: {integrity: sha512-IYJuLpXp2DEILVQpQOy0PMpkftv0AHEOCn52o0atyOaumA0CdWQ3klPyXdViGYLbNpESsVFMVybvHUeZAuiGxA==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/signature-v4-multi-region@3.996.34': + resolution: {integrity: sha512-mx1L5qlumSOt/nKM3BFaHE2HVkWwz0i4Bw0pyYO42FfX/FeLlo8YI6csC0gSPprEk6fTIqI+CZN9RwUwKd5krQ==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/token-providers@3.1048.0': + resolution: {integrity: sha512-k0y/GcuesuSfWyUM0WamrGyeZmltRYaPbHO82UDA6mZ/doB+FOHKutikPAtSXMn/hDz970cF+iRuuiYO9VEbAA==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/token-providers@3.1066.0': + resolution: {integrity: sha512-UqEUJq7dqa44hneLDUcX7UJy95cg8YqEWyakRpvIPnrNS3Mq+UlQHgCDGu5pvwAPtlIW4qcYbvW6reG6++FyvA==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/types@3.973.12': + resolution: {integrity: sha512-43ajd1NF0RMgX5k0hxCNUyEdrtFUsb2aHT2QvpktSC/2Eyb2Jr/JPVqdp0XIoaHWikZJq5tNWSLO6kB5q2eMCA==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/util-locate-window@3.965.7': + resolution: {integrity: sha512-M0D6oIpohdNHjc7udzTHEQyot0+0iuA36jc2I9Hps+f/GtKi2HO/pyijQnCnNcwZqLB5+rtn81z3eZK/GyjAmA==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/xml-builder@3.972.29': + resolution: {integrity: sha512-fk0niuGFxfi8yIJuMVM4mhwObkiQSuwZFj3tAPrLVx64Pk3BkrEIpqjzHKY4hKoEBUD6Jg/S74Zj9jy+5F3DnQ==} + engines: {node: '>=20.0.0'} + + '@aws/lambda-invoke-store@0.2.4': + resolution: {integrity: sha512-iY8yvjE0y651BixKNPgmv1WrQc+GZ142sb0z4gYnChDDY2YqI4P/jsSopBWrKfAt7LOJAkOXt7rC/hms+WclQQ==} + engines: {node: '>=18.0.0'} + + '@babel/runtime@7.29.7': + resolution: {integrity: sha512-Nq8OhGWiZIZGV6hLHoyAKLLcJihP/xFeBMGJoUrxTX2psI8dCifzLhZISFb+VWS3wFMRDmCGw5R+dOySCqPLhw==} + engines: {node: '>=6.9.0'} + + '@earendil-works/pi-agent-core@0.79.4': + resolution: {integrity: 
sha512-xkaZ3yK2XbP9HYdHrrdj/6HqZPM0o/mwbjMSU4RTJyR3HjDG0ZrPz76Hg6s0W+G4u6PpJr1mGx/srCG+3eQA8A==} + engines: {node: '>=22.19.0'} + + '@earendil-works/pi-ai@0.79.4': + resolution: {integrity: sha512-Z1j+YP+6ZyPBKDUoc5m0GO/o1hPK17fWeErtDgegCTpm2dcKzuFvL/7GTqHeJkVkfpeXRwO37xOfgozQbK6EUw==} + engines: {node: '>=22.19.0'} + hasBin: true + + '@earendil-works/pi-coding-agent@0.79.4': + resolution: {integrity: sha512-PthzVzM5m4XH/hrU+2fVjuwuH5M4eMFWbd0NCRScH14XKpwlPc8/Fh6JDz0jQb5kTBT9oQT183YLTHVVulFL9A==} + engines: {node: '>=22.19.0'} + hasBin: true + + '@earendil-works/pi-tui@0.79.4': + resolution: {integrity: sha512-/ZhfFiHSBMH7AbDrBQIN+UWlJnl9tSEpLYICRGGMzmNfyCqX+30NYacIhyOEaD8R5rS6wJZysAOPU0yNwigbXw==} + engines: {node: '>=22.19.0'} + + '@esbuild/aix-ppc64@0.23.1': + resolution: {integrity: sha512-6VhYk1diRqrhBAqpJEdjASR/+WVRtfjpqKuNw11cLiaWpAT/Uu+nokB+UJnevzy/P9C/ty6AOe0dwueMrGh/iQ==} + engines: {node: '>=18'} + cpu: [ppc64] + os: [aix] + + '@esbuild/android-arm64@0.23.1': + resolution: {integrity: sha512-xw50ipykXcLstLeWH7WRdQuysJqejuAGPd30vd1i5zSyKK3WE+ijzHmLKxdiCMtH1pHz78rOg0BKSYOSB/2Khw==} + engines: {node: '>=18'} + cpu: [arm64] + os: [android] + + '@esbuild/android-arm@0.23.1': + resolution: {integrity: sha512-uz6/tEy2IFm9RYOyvKl88zdzZfwEfKZmnX9Cj1BHjeSGNuGLuMD1kR8y5bteYmwqKm1tj8m4cb/aKEorr6fHWQ==} + engines: {node: '>=18'} + cpu: [arm] + os: [android] + + '@esbuild/android-x64@0.23.1': + resolution: {integrity: sha512-nlN9B69St9BwUoB+jkyU090bru8L0NA3yFvAd7k8dNsVH8bi9a8cUAUSEcEEgTp2z3dbEDGJGfP6VUnkQnlReg==} + engines: {node: '>=18'} + cpu: [x64] + os: [android] + + '@esbuild/darwin-arm64@0.23.1': + resolution: {integrity: sha512-YsS2e3Wtgnw7Wq53XXBLcV6JhRsEq8hkfg91ESVadIrzr9wO6jJDMZnCQbHm1Guc5t/CdDiFSSfWP58FNuvT3Q==} + engines: {node: '>=18'} + cpu: [arm64] + os: [darwin] + + '@esbuild/darwin-x64@0.23.1': + resolution: {integrity: sha512-aClqdgTDVPSEGgoCS8QDG37Gu8yc9lTHNAQlsztQ6ENetKEO//b8y31MMu2ZaPbn4kVsIABzVLXYLhCGekGDqw==} + engines: {node: '>=18'} + cpu: 
[x64] + os: [darwin] + + '@esbuild/freebsd-arm64@0.23.1': + resolution: {integrity: sha512-h1k6yS8/pN/NHlMl5+v4XPfikhJulk4G+tKGFIOwURBSFzE8bixw1ebjluLOjfwtLqY0kewfjLSrO6tN2MgIhA==} + engines: {node: '>=18'} + cpu: [arm64] + os: [freebsd] + + '@esbuild/freebsd-x64@0.23.1': + resolution: {integrity: sha512-lK1eJeyk1ZX8UklqFd/3A60UuZ/6UVfGT2LuGo3Wp4/z7eRTRYY+0xOu2kpClP+vMTi9wKOfXi2vjUpO1Ro76g==} + engines: {node: '>=18'} + cpu: [x64] + os: [freebsd] + + '@esbuild/linux-arm64@0.23.1': + resolution: {integrity: sha512-/93bf2yxencYDnItMYV/v116zff6UyTjo4EtEQjUBeGiVpMmffDNUyD9UN2zV+V3LRV3/on4xdZ26NKzn6754g==} + engines: {node: '>=18'} + cpu: [arm64] + os: [linux] + + '@esbuild/linux-arm@0.23.1': + resolution: {integrity: sha512-CXXkzgn+dXAPs3WBwE+Kvnrf4WECwBdfjfeYHpMeVxWE0EceB6vhWGShs6wi0IYEqMSIzdOF1XjQ/Mkm5d7ZdQ==} + engines: {node: '>=18'} + cpu: [arm] + os: [linux] + + '@esbuild/linux-ia32@0.23.1': + resolution: {integrity: sha512-VTN4EuOHwXEkXzX5nTvVY4s7E/Krz7COC8xkftbbKRYAl96vPiUssGkeMELQMOnLOJ8k3BY1+ZY52tttZnHcXQ==} + engines: {node: '>=18'} + cpu: [ia32] + os: [linux] + + '@esbuild/linux-loong64@0.23.1': + resolution: {integrity: sha512-Vx09LzEoBa5zDnieH8LSMRToj7ir/Jeq0Gu6qJ/1GcBq9GkfoEAoXvLiW1U9J1qE/Y/Oyaq33w5p2ZWrNNHNEw==} + engines: {node: '>=18'} + cpu: [loong64] + os: [linux] + + '@esbuild/linux-mips64el@0.23.1': + resolution: {integrity: sha512-nrFzzMQ7W4WRLNUOU5dlWAqa6yVeI0P78WKGUo7lg2HShq/yx+UYkeNSE0SSfSure0SqgnsxPvmAUu/vu0E+3Q==} + engines: {node: '>=18'} + cpu: [mips64el] + os: [linux] + + '@esbuild/linux-ppc64@0.23.1': + resolution: {integrity: sha512-dKN8fgVqd0vUIjxuJI6P/9SSSe/mB9rvA98CSH2sJnlZ/OCZWO1DJvxj8jvKTfYUdGfcq2dDxoKaC6bHuTlgcw==} + engines: {node: '>=18'} + cpu: [ppc64] + os: [linux] + + '@esbuild/linux-riscv64@0.23.1': + resolution: {integrity: sha512-5AV4Pzp80fhHL83JM6LoA6pTQVWgB1HovMBsLQ9OZWLDqVY8MVobBXNSmAJi//Csh6tcY7e7Lny2Hg1tElMjIA==} + engines: {node: '>=18'} + cpu: [riscv64] + os: [linux] + + '@esbuild/linux-s390x@0.23.1': + resolution: 
{integrity: sha512-9ygs73tuFCe6f6m/Tb+9LtYxWR4c9yg7zjt2cYkjDbDpV/xVn+68cQxMXCjUpYwEkze2RcU/rMnfIXNRFmSoDw==} + engines: {node: '>=18'} + cpu: [s390x] + os: [linux] + + '@esbuild/linux-x64@0.23.1': + resolution: {integrity: sha512-EV6+ovTsEXCPAp58g2dD68LxoP/wK5pRvgy0J/HxPGB009omFPv3Yet0HiaqvrIrgPTBuC6wCH1LTOY91EO5hQ==} + engines: {node: '>=18'} + cpu: [x64] + os: [linux] + + '@esbuild/netbsd-x64@0.23.1': + resolution: {integrity: sha512-aevEkCNu7KlPRpYLjwmdcuNz6bDFiE7Z8XC4CPqExjTvrHugh28QzUXVOZtiYghciKUacNktqxdpymplil1beA==} + engines: {node: '>=18'} + cpu: [x64] + os: [netbsd] + + '@esbuild/openbsd-arm64@0.23.1': + resolution: {integrity: sha512-3x37szhLexNA4bXhLrCC/LImN/YtWis6WXr1VESlfVtVeoFJBRINPJ3f0a/6LV8zpikqoUg4hyXw0sFBt5Cr+Q==} + engines: {node: '>=18'} + cpu: [arm64] + os: [openbsd] + + '@esbuild/openbsd-x64@0.23.1': + resolution: {integrity: sha512-aY2gMmKmPhxfU+0EdnN+XNtGbjfQgwZj43k8G3fyrDM/UdZww6xrWxmDkuz2eCZchqVeABjV5BpildOrUbBTqA==} + engines: {node: '>=18'} + cpu: [x64] + os: [openbsd] + + '@esbuild/sunos-x64@0.23.1': + resolution: {integrity: sha512-RBRT2gqEl0IKQABT4XTj78tpk9v7ehp+mazn2HbUeZl1YMdaGAQqhapjGTCe7uw7y0frDi4gS0uHzhvpFuI1sA==} + engines: {node: '>=18'} + cpu: [x64] + os: [sunos] + + '@esbuild/win32-arm64@0.23.1': + resolution: {integrity: sha512-4O+gPR5rEBe2FpKOVyiJ7wNDPA8nGzDuJ6gN4okSA1gEOYZ67N8JPk58tkWtdtPeLz7lBnY6I5L3jdsr3S+A6A==} + engines: {node: '>=18'} + cpu: [arm64] + os: [win32] + + '@esbuild/win32-ia32@0.23.1': + resolution: {integrity: sha512-BcaL0Vn6QwCwre3Y717nVHZbAa4UBEigzFm6VdsVdT/MbZ38xoj1X9HPkZhbmaBGUD1W8vxAfffbDe8bA6AKnQ==} + engines: {node: '>=18'} + cpu: [ia32] + os: [win32] + + '@esbuild/win32-x64@0.23.1': + resolution: {integrity: sha512-BHpFFeslkWrXWyUPnbKm+xYYVYruCinGcftSBaa8zoF9hZO4BcSCFUvHVTtzpIY6YzUnYtuEhZ+C9iEXjxnasg==} + engines: {node: '>=18'} + cpu: [x64] + os: [win32] + + '@google/genai@1.52.0': + resolution: {integrity: 
sha512-gwSvbpiN/17O9TbsqSsE/OzZcpv5Fo4RQjdngGgogtuB9RsyJ8ZHhX5KjHj1bp5N9snN2eK8LDGXSaWW2hof8Q==} + engines: {node: '>=20.0.0'} + peerDependencies: + '@modelcontextprotocol/sdk': ^1.25.2 + peerDependenciesMeta: + '@modelcontextprotocol/sdk': + optional: true + + '@mariozechner/clipboard-darwin-arm64@0.3.9': + resolution: {integrity: sha512-BfgV7vCEWZwJwZJw03r6bP5+tf0iI/ANuQYCxi9RNn7FrWB3yzGuMKCrNLRl6V761vXRdL8+OqZ0wd4TqlsNOQ==} + engines: {node: '>= 10'} + cpu: [arm64] + os: [darwin] + + '@mariozechner/clipboard-darwin-universal@0.3.9': + resolution: {integrity: sha512-BGGR4iA9Z2shAjI65eI5xtyb3LYNlDW9X3gxKxDbqtbnREohsrqznov6zpKoIrsRWpzlYVEdKphS7ksJ0/ndSQ==} + engines: {node: '>= 10'} + os: [darwin] + + '@mariozechner/clipboard-darwin-x64@0.3.9': + resolution: {integrity: sha512-4kURmCbS6nt8uYhtmWpUcJWyPHfmAr5dTpXD1nO3pIfa+TSQ9DbrGOYCKH+aEFW47XhQ4Vp8ZTszie+wfFvDKg==} + engines: {node: '>= 10'} + cpu: [x64] + os: [darwin] + + '@mariozechner/clipboard-linux-arm64-gnu@0.3.9': + resolution: {integrity: sha512-g59OkUGP2DDfCOIKypHeYgv2M55u/cKvXa5dSxFbEJ34XvIQMdcVmpKCkGUro3ZgefXiGVdwguvTMQGpHWzIXw==} + engines: {node: '>= 10'} + cpu: [arm64] + os: [linux] + libc: [glibc] + + '@mariozechner/clipboard-linux-arm64-musl@0.3.9': + resolution: {integrity: sha512-AGuJdgKsmJdm4Pych7kv3sqe591ERRaAHW3xjLooiFzn8J+PxUyof++7YZrB5Y5tpnTO+K18Og3taj2NpluCRQ==} + engines: {node: '>= 10'} + cpu: [arm64] + os: [linux] + libc: [musl] + + '@mariozechner/clipboard-linux-riscv64-gnu@0.3.9': + resolution: {integrity: sha512-DXBEAiuMpk7dhS1a9NzNxVAFi1vaKoPu7rQNgY8LIDLGrK3lnIp3nT10DUum+PKVJoJppIP+NAA8IZe4DMNDPw==} + engines: {node: '>= 10'} + cpu: [riscv64] + os: [linux] + libc: [glibc] + + '@mariozechner/clipboard-linux-x64-gnu@0.3.9': + resolution: {integrity: sha512-WORrMLd6EpElEME7JRKfSaY34nW1P5LbdgK5YNCS1ncG2LqmITsSMEJ8nh2mpvxb3TxqbOOKgY7k9eMJYlW9Mw==} + engines: {node: '>= 10'} + cpu: [x64] + os: [linux] + libc: [glibc] + + '@mariozechner/clipboard-linux-x64-musl@0.3.9': + resolution: 
{integrity: sha512-/DHn+1DrfL6oRaPPWXaOKvonFFrni666fxd+zFqiQEfvBH0tsHVWjq9iqBk0oDp0qaPA72lIMy5BptxISBEhZQ==} + engines: {node: '>= 10'} + cpu: [x64] + os: [linux] + libc: [musl] + + '@mariozechner/clipboard-win32-arm64-msvc@0.3.9': + resolution: {integrity: sha512-O5FHD3ErkMwMhNzAfu3ggy0ug4z7btZuoQgwwxlzPrwV2bxlD6WDpqBY4NCgICAgZdDKdp+loUEKVAVt8aYnhQ==} + engines: {node: '>= 10'} + cpu: [arm64] + os: [win32] + + '@mariozechner/clipboard-win32-x64-msvc@0.3.9': + resolution: {integrity: sha512-ihQC3EufqEY81vhXBgVBtK4prL+wc62zJsSvxrgz7K1hsdt6OObz6v9p3Rn1OG3GJksTTKMJF0u/guMISHPhSA==} + engines: {node: '>= 10'} + cpu: [x64] + os: [win32] + + '@mariozechner/clipboard@0.3.9': + resolution: {integrity: sha512-ABnA53mdfkGZwOFUdZNv2S0CWGO/EIuPj8Vv9xmBFmSYg/qFc7ihO6q5FcQjvoE67kZpWkEc4AhD6B/os04yuA==} + engines: {node: '>= 10'} + + '@mistralai/mistralai@2.2.1': + resolution: {integrity: sha512-uKU8CZmL2RzYKmplsU01hii4p3pe4HqJefpWNRWXm1Tcm0Sm4xXfwSLIy4k7ZCPlbETCGcp69E7hZs+WOJ5itQ==} + + '@nodable/entities@2.2.0': + resolution: {integrity: sha512-9uGyhaQavEUMC8AIddIjau4NsnsXhou+j5sBAGojCM1oxmQpVKTWR/9JxABD6UAv12vpIms55fPZKFQEhG6uBg==} + + '@opentelemetry/api-logs@0.54.0': + resolution: {integrity: sha512-9HhEh5GqFrassUndqJsyW7a0PzfyWr2eV2xwzHLIS+wX3125+9HE9FMRAKmJRwxZhgZGwH3HNQQjoMGZqmOeVA==} + engines: {node: '>=14'} + + '@opentelemetry/api@1.9.0': + resolution: {integrity: sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==} + engines: {node: '>=8.0.0'} + + '@opentelemetry/context-async-hooks@1.28.0': + resolution: {integrity: sha512-igcl4Ve+F1N2063PJUkesk/GkYyuGIWinYkSyAFTnIj3gzrOgvOA4k747XNdL47HRRL1w/qh7UW8NDuxOLvKFA==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/core@1.27.0': + resolution: {integrity: sha512-yQPKnK5e+76XuiqUH/gKyS8wv/7qITd5ln56QkBTf3uggr0VkXOXfcaAuG330UfdYu83wsyoBwqwxigpIG+Jkg==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': 
'>=1.0.0 <1.10.0' + + '@opentelemetry/core@1.28.0': + resolution: {integrity: sha512-ZLwRMV+fNDpVmF2WYUdBHlq0eOWtEaUJSusrzjGnBt7iSRvfjFE3RXYUZJrqou/wIDWV0DwQ5KIfYe9WXg9Xqw==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/exporter-trace-otlp-proto@0.54.0': + resolution: {integrity: sha512-cpDQj5wl7G8pLu3lW94SnMpn0C85A9Ehe7+JBow2IL5DGPWXTkynFngMtCC3PpQzQgzlyOVe0MVZfoBB3M5ECA==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': ^1.3.0 + + '@opentelemetry/otlp-exporter-base@0.54.0': + resolution: {integrity: sha512-g+H7+QleVF/9lz4zhaR9Dt4VwApjqG5WWupy5CTMpWJfHB/nLxBbX73GBZDgdiNfh08nO3rNa6AS7fK8OhgF5g==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': ^1.3.0 + + '@opentelemetry/otlp-transformer@0.54.0': + resolution: {integrity: sha512-jRexIASQQzdK4AjfNIBfn94itAq4Q8EXR9d3b/OVbhd3kKQKvMr7GkxYDjbeTbY7hHCOLcLfJ3dpYQYGOe8qOQ==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': ^1.3.0 + + '@opentelemetry/propagator-b3@1.28.0': + resolution: {integrity: sha512-Q7HVDIMwhN5RxL4bECMT4BdbyYSAKkC6U/RGn4NpO/cbqP6ZRg+BS7fPo/pGZi2w8AHfpIGQFXQmE8d2PC5xxQ==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/propagator-jaeger@1.28.0': + resolution: {integrity: sha512-wKJ94+s8467CnIRgoSRh0yXm/te0QMOwTq9J01PfG/RzYZvlvN8aRisN2oZ9SznB45dDGnMj3BhUlchSA9cEKA==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/resources@1.27.0': + resolution: {integrity: sha512-jOwt2VJ/lUD5BLc+PMNymDrUCpm5PKi1E9oSVYAvz01U/VdndGmrtV3DU1pG4AwlYhJRHbHfOUIlpBeXCPw6QQ==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/resources@1.28.0': + resolution: {integrity: sha512-cIyXSVJjGeTICENN40YSvLDAq4Y2502hGK3iN7tfdynQLKWb3XWZQEkPc+eSx47kiy11YeFAlYkEfXwR1w8kfw==} + engines: {node: '>=14'} + peerDependencies: + 
'@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/sdk-logs@0.54.0': + resolution: {integrity: sha512-HeWvOPiWhEw6lWvg+lCIi1WhJnIPbI4/OFZgHq9tKfpwF3LX6/kk3+GR8sGUGAEZfbjPElkkngzvd2s03zbD7Q==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.4.0 <1.10.0' + + '@opentelemetry/sdk-metrics@1.27.0': + resolution: {integrity: sha512-JzWgzlutoXCydhHWIbLg+r76m+m3ncqvkCcsswXAQ4gqKS+LOHKhq+t6fx1zNytvLuaOUBur7EvWxECc4jPQKg==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.3.0 <1.10.0' + + '@opentelemetry/sdk-trace-base@1.27.0': + resolution: {integrity: sha512-btz6XTQzwsyJjombpeqCX6LhiMQYpzt2pIYNPnw0IPO/3AhT6yjnf8Mnv3ZC2A4eRYOjqrg+bfaXg9XHDRJDWQ==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/sdk-trace-base@1.28.0': + resolution: {integrity: sha512-ceUVWuCpIao7Y5xE02Xs3nQi0tOGmMea17ecBdwtCvdo9ekmO+ijc9RFDgfifMl7XCBf41zne/1POM3LqSTZDA==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/sdk-trace-node@1.28.0': + resolution: {integrity: sha512-N0sYfYXvHpP0FNIyc+UfhLnLSTOuZLytV0qQVrDWIlABeD/DWJIGttS7nYeR14gQLXch0M1DW8zm3VeN6Opwtg==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/semantic-conventions@1.27.0': + resolution: {integrity: sha512-sAay1RrB+ONOem0OZanAR1ZI/k7yDpnOQSQmTMuGImUQb2y8EbSaCJ94FQluM74xoU03vlb2d2U90hZluL6nQg==} + engines: {node: '>=14'} + + '@opentelemetry/semantic-conventions@1.28.0': + resolution: {integrity: sha512-lp4qAiMTD4sNWW4DbKLBkfiMZ4jbAboJIGOQr5DvciMRI494OapieI9qiODpOt0XBr1LjIDy1xAGAnVs5supTA==} + engines: {node: '>=14'} + + '@protobufjs/aspromise@1.1.2': + resolution: {integrity: sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==} + + '@protobufjs/base64@1.1.2': + resolution: {integrity: 
sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==} + + '@protobufjs/codegen@2.0.5': + resolution: {integrity: sha512-zgXFLzW3Ap33e6d0Wlj4MGIm6Ce8O89n/apUaGNB/jx+hw+ruWEp7EwGUshdLKVRCxZW12fp9r40E1mQrf/34g==} + + '@protobufjs/eventemitter@1.1.1': + resolution: {integrity: sha512-vW1GmwMZNnL+gMRaovlh9yZX74kc+TTU3FObkkurpMaRtBfLP3ldjS9KQWlwZgraRE0+dheEEoAxdzcJQ8eXZg==} + + '@protobufjs/fetch@1.1.1': + resolution: {integrity: sha512-GpptLrs57adMSuHi3VNj0mAF8dwh36LMaYF6XyJ6JMWlVsc+t42tm1HSEDmOs3A8fC9yyeisgLhsTVQokOZ0zw==} + + '@protobufjs/float@1.0.2': + resolution: {integrity: sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==} + + '@protobufjs/path@1.1.2': + resolution: {integrity: sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==} + + '@protobufjs/pool@1.1.0': + resolution: {integrity: sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==} + + '@protobufjs/utf8@1.1.1': + resolution: {integrity: sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg==} + + '@silvia-odwyer/photon-node@0.3.4': + resolution: {integrity: sha512-bnly4BKB3KDTFxrUIcgCLbaeVVS8lrAkri1pEzskpmxu9MdfGQTy8b8EgcD83ywD3RPMsIulY8xJH5Awa+t9fA==} + + '@smithy/core@3.24.7': + resolution: {integrity: sha512-KoUi4M1f3BG6kzN1FnCwL7oyFptTbyBJKjR6yhSib+JHRdUmM1o+VwsFtJ66NZCkCzVfJMWRHJNo0R0jznp0Pg==} + engines: {node: '>=18.0.0'} + + '@smithy/credential-provider-imds@4.3.9': + resolution: {integrity: sha512-ZlfJ/4Fa3jYb+3eaohPfG9utX9HmdhFNcFtpoGAhUhdynAOmGXtmigbi7eEiONKM+ykHw8RwKuDEb85Lx7t7fA==} + engines: {node: '>=18.0.0'} + + '@smithy/fetch-http-handler@5.4.7': + resolution: {integrity: sha512-NslaM2ir0N2hisDmzXLstPaVINZheh8SokyOC++kzFPloZucL2R7Y7bS57mSzx/1Fc/fqmn7twjkeezTTrV0EA==} + engines: {node: '>=18.0.0'} + + '@smithy/is-array-buffer@2.2.0': + resolution: {integrity: 
sha512-GGP3O9QFD24uGeAXYUjwSTXARoqpZykHadOmA8G5vfJPK0/DC67qa//0qvqrJzL1xc8WQWX7/yc7fwudjPHPhA==} + engines: {node: '>=14.0.0'} + + '@smithy/node-http-handler@4.7.3': + resolution: {integrity: sha512-/jPhevcTFPMVl6KNjbaI47iOg1zxC7IsnX4PQDGVZKMFceOXtB8IEYaB7a9VvkP/3oC60WzTeKocvSI7vLT0vA==} + engines: {node: '>=18.0.0'} + + '@smithy/node-http-handler@4.7.8': + resolution: {integrity: sha512-f+DbsWUwSbtMu1a/j8Y93KiU1SRg9nyzfjereqn1BJ33QOTUXxdlYvVXMhAYl1vuR1Kmna5aIJe09KSIfyFNYw==} + engines: {node: '>=18.0.0'} + + '@smithy/signature-v4@5.4.7': + resolution: {integrity: sha512-LwQZazFayImv+IOm0S0enoLeUJwmAlhGC5O6YCcLWezyu08dF46GOxPOq35OpBIHkgd7OvNvBStIFwVNyrvoBw==} + engines: {node: '>=18.0.0'} + + '@smithy/types@4.14.4': + resolution: {integrity: sha512-B2S9+UGm1+/pHkcx3ZoLVX1a+pmSk8rqxRR+ZsNqZaJ5q9FWX9AFGQVM4qG5+OBeQUZVy99HY8HqW8gK/wgXzQ==} + engines: {node: '>=18.0.0'} + + '@smithy/util-buffer-from@2.2.0': + resolution: {integrity: sha512-IJdWBbTcMQ6DA0gdNhh/BwrLkDR+ADW5Kr1aZmd4k3DIF6ezMV4R2NIAmT08wQJ3yUK82thHWmC/TnK/wpMMIA==} + engines: {node: '>=14.0.0'} + + '@smithy/util-utf8@2.3.0': + resolution: {integrity: sha512-R8Rdn8Hy72KKcebgLiv8jQcQkXoLMOGGv5uI1/k0l+snqkOzQ1R0ChUBCxWMlBsFMekWjq0wRudIweFs7sKT5A==} + engines: {node: '>=14.0.0'} + + '@types/node@22.10.2': + resolution: {integrity: sha512-Xxr6BBRCAOQixvonOye19wnzyDiUtTeqldOOmj3CkeblonbccA12PFwlufvRdrpjXxqnmUaeiU5EOA+7s5diUQ==} + + '@types/retry@0.12.0': + resolution: {integrity: sha512-wWKOClTTiizcZhXnPY4wikVAwmdYHp8q6DmC+EJUzAMsycb7HB32Kh9RN4+0gExjmPmZSAQjgURXIGATPegAvA==} + + agent-base@7.1.4: + resolution: {integrity: sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==} + engines: {node: '>= 14'} + + anynum@1.0.0: + resolution: {integrity: sha512-xjR9/zBVnUOP6ztMIIgShjsxui80nQUQH+5xJnvrYLs+90bF25/KJqaAi8mk+B4RDtX1Nspi6fmp4YTEts8SfA==} + + balanced-match@4.0.4: + resolution: {integrity: 
sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA==} + engines: {node: 18 || 20 || >=22} + + base64-js@1.5.1: + resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==} + + bignumber.js@9.3.1: + resolution: {integrity: sha512-Ko0uX15oIUS7wJ3Rb30Fs6SkVbLmPBAKdlm7q9+ak9bbIeFf0MwuBsQV6z7+X768/cHsfg+WlysDWJcmthjsjQ==} + + bowser@2.14.1: + resolution: {integrity: sha512-tzPjzCxygAKWFOJP011oxFHs57HzIhOEracIgAePE4pqB3LikALKnSzUyU4MGs9/iCEUuHlAJTjTc5M+u7YEGg==} + + brace-expansion@5.0.6: + resolution: {integrity: sha512-kLpxurY4Z4r9sgMsyG0Z9uzsBlgiU/EFKhj/h91/8yHu0edo7XuixOIH3VcJ8kkxs6/jPzoI6U9Vj3WqbMQ94g==} + engines: {node: 18 || 20 || >=22} + + buffer-equal-constant-time@1.0.1: + resolution: {integrity: sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==} + + chalk@5.6.2: + resolution: {integrity: sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA==} + engines: {node: ^12.17.0 || ^14.13 || >=16.0.0} + + cross-spawn@7.0.6: + resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==} + engines: {node: '>= 8'} + + data-uri-to-buffer@4.0.1: + resolution: {integrity: sha512-0R9ikRb668HB7QDxT1vkpuUBtqc53YyAwMwGeUFKRojY/NWKvdZ+9UYtRfGmhqNbRkTSVpMbmyhXipFFv2cb/A==} + engines: {node: '>= 12'} + + debug@4.4.3: + resolution: {integrity: sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==} + engines: {node: '>=6.0'} + peerDependencies: + supports-color: '*' + peerDependenciesMeta: + supports-color: + optional: true + + diff@8.0.4: + resolution: {integrity: sha512-DPi0FmjiSU5EvQV0++GFDOJ9ASQUVFh5kD+OzOnYdi7n3Wpm9hWWGfB/O2blfHcMVTL5WkQXSnRiK9makhrcnw==} + engines: {node: '>=0.3.1'} + + dotenv@17.2.3: + resolution: {integrity: 
sha512-JVUnt+DUIzu87TABbhPmNfVdBDt18BLOWjMUFJMSi/Qqg7NTYtabbvSNJGOJ7afbRuv9D/lngizHtP7QyLQ+9w==} + engines: {node: '>=12'} + + ecdsa-sig-formatter@1.0.11: + resolution: {integrity: sha512-nagl3RYrbNv6kQkeJIpt6NJZy8twLB/2vtz6yN9Z4vRKHN4/QZJIEbqohALSgwKdnksuY3k5Addp5lg8sVoVcQ==} + + esbuild@0.23.1: + resolution: {integrity: sha512-VVNz/9Sa0bs5SELtn3f7qhJCDPCF5oMEl5cO9/SSinpE9hbPVvxbd572HH5AKiP7WD8INO53GgfDDhRjkylHEg==} + engines: {node: '>=18'} + hasBin: true + + extend@3.0.2: + resolution: {integrity: sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==} + + fast-xml-builder@1.2.0: + resolution: {integrity: sha512-00aAWieqff+ZJhsXA4g1g7M8k+7AYoMUUHF+/zFb5U6Uv/P0Vl4QZo84/IcufzYalLuEj9928bXN9PbbFzMF0Q==} + + fast-xml-parser@5.7.3: + resolution: {integrity: sha512-C0AaNuC+mscy6vrAQKAc/rMq+zAPHodfHGZu4sGVehvAQt/JLG1O5zEcYcXSY5zSqr4YVgxsB+pHXTq0i7eDlg==} + hasBin: true + + fetch-blob@3.2.0: + resolution: {integrity: sha512-7yAQpD2UMJzLi1Dqv7qFYnPbaPx7ZfFK6PiIxQ4PfkGPyNyl2Ugx+a/umUonmKqjhM4DnfbMvdX6otXq83soQQ==} + engines: {node: ^12.20 || >= 14.13} + + formdata-polyfill@4.0.10: + resolution: {integrity: sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g==} + engines: {node: '>=12.20.0'} + + fsevents@2.3.3: + resolution: {integrity: sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==} + engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0} + os: [darwin] + + gaxios@7.1.5: + resolution: {integrity: sha512-5FZy72Rh8LhtjmvDrKkI+lVhrsQrVKVsItxMoDm5mNQE+xR0WVIIs+jzPSJgBvKVsLi24fZhXJIsNI0bihDzFg==} + engines: {node: '>=18'} + + gcp-metadata@8.1.2: + resolution: {integrity: sha512-zV/5HKTfCeKWnxG0Dmrw51hEWFGfcF2xiXqcA3+J90WDuP0SvoiSO5ORvcBsifmx/FoIjgQN3oNOGaQ5PhLFkg==} + engines: {node: '>=18'} + + get-east-asian-width@1.6.0: + resolution: {integrity: sha512-QRbvDIbx6YklUe6RxeTeleMR0yv3cYH6PsPZHcnVn7xv7zO1BHN8r0XETu8n6Ye3Q+ahtSarc3WgtNWmehIBfA==} + engines: 
{node: '>=18'} + + get-tsconfig@4.14.0: + resolution: {integrity: sha512-yTb+8DXzDREzgvYmh6s9vHsSVCHeC0G3PI5bEXNBHtmshPnO+S5O7qgLEOn0I5QvMy6kpZN8K1NKGyilLb93wA==} + + glob@13.0.6: + resolution: {integrity: sha512-Wjlyrolmm8uDpm/ogGyXZXb1Z+Ca2B8NbJwqBVg0axK9GbBeoS7yGV6vjXnYdGm6X53iehEuxxbyiKp8QmN4Vw==} + engines: {node: 18 || 20 || >=22} + + google-auth-library@10.7.0: + resolution: {integrity: sha512-QpTAbNJ36TliZLx3TTtahR8HG0hN9RllL1e3FymOvQSIKK8JmgV58H924ub2wa2DsS3ANjjP1Aw1N+Ramc8hqQ==} + engines: {node: '>=18'} + + google-logging-utils@1.1.3: + resolution: {integrity: sha512-eAmLkjDjAFCVXg7A1unxHsLf961m6y17QFqXqAXGj/gVkKFrEICfStRfwUlGNfeCEjNRa32JEWOUTlYXPyyKvA==} + engines: {node: '>=14'} + + graceful-fs@4.2.11: + resolution: {integrity: sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==} + + highlight.js@10.7.3: + resolution: {integrity: sha512-tzcUFauisWKNHaRkN4Wjl/ZA07gENAjFl3J/c480dprkGTg5EQstgaNFqBfUqCq54kZRIEcreTsAgF/m2quD7A==} + + hosted-git-info@9.0.3: + resolution: {integrity: sha512-Hc+ghLoSt6QaYZUv0WBiIvmMDZuZZ7oaDvdH8MbfOO4lOsxdXLEvuC6ePoGs9H1X9oCLyq6+NVN0MKqD+ydxyg==} + engines: {node: ^20.17.0 || >=22.9.0} + + http-proxy-agent@7.0.2: + resolution: {integrity: sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==} + engines: {node: '>= 14'} + + https-proxy-agent@7.0.6: + resolution: {integrity: sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==} + engines: {node: '>= 14'} + + ignore@7.0.5: + resolution: {integrity: sha512-Hs59xBNfUIunMFgWAbGX5cq6893IbWg4KnrjbYwX3tx0ztorVgTDA6B2sxf8ejHJ4wz8BqGUMYlnzNBer5NvGg==} + engines: {node: '>= 4'} + + isexe@2.0.0: + resolution: {integrity: sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==} + + jiti@2.7.0: + resolution: {integrity: sha512-AC/7JofJvZGrrneWNaEnJeOLUx+JlGt7tNa0wZiRPT4MY1wmfKjt2+6O2p2uz2+skll8OZZmJMNqeke7kKbNgQ==} + hasBin: 
true + + json-bigint@1.0.0: + resolution: {integrity: sha512-SiPv/8VpZuWbvLSMtTDU8hEfrZWg/mH/nV/b4o0CYbSxu1UIQPLdwKOCIyLQX+VIPO5vrLX3i8qtqFyhdPSUSQ==} + + json-schema-to-ts@3.1.1: + resolution: {integrity: sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g==} + engines: {node: '>=16'} + + jwa@2.0.1: + resolution: {integrity: sha512-hRF04fqJIP8Abbkq5NKGN0Bbr3JxlQ+qhZufXVr0DvujKy93ZCbXZMHDL4EOtodSbCWxOqR8MS1tXA5hwqCXDg==} + + jws@4.0.1: + resolution: {integrity: sha512-EKI/M/yqPncGUUh44xz0PxSidXFr/+r0pA70+gIYhjv+et7yxM+s29Y+VGDkovRofQem0fs7Uvf4+YmAdyRduA==} + + long@5.3.2: + resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==} + + lru-cache@11.5.1: + resolution: {integrity: sha512-RPimw/7aMdv2oqRrxKwvZXcPfwBrn/JZ2xYcY9Hus/6LaS3VOAKVWKWgNLCFSiOm1ESXinjsDlidVU7JlnCN2A==} + engines: {node: 20 || >=22} + + marked@15.0.12: + resolution: {integrity: sha512-8dD6FusOQSrpv9Z1rdNMdlSgQOIP880DHqnohobOmYLElGEqAL/JvxvuxZO16r4HtjTlfPRDC1hbvxC9dPN2nA==} + engines: {node: '>= 18'} + hasBin: true + + minimatch@10.2.5: + resolution: {integrity: sha512-MULkVLfKGYDFYejP07QOurDLLQpcjk7Fw+7jXS2R2czRQzR56yHRveU5NDJEOviH+hETZKSkIk5c+T23GjFUMg==} + engines: {node: 18 || 20 || >=22} + + minipass@7.1.3: + resolution: {integrity: sha512-tEBHqDnIoM/1rXME1zgka9g6Q2lcoCkxHLuc7ODJ5BxbP5d4c2Z5cGgtXAku59200Cx7diuHTOYfSBD8n6mm8A==} + engines: {node: '>=16 || 14 >=14.17'} + + ms@2.1.3: + resolution: {integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==} + + node-domexception@1.0.0: + resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==} + engines: {node: '>=10.5.0'} + deprecated: Use your platform's native DOMException instead + + node-fetch@3.3.2: + resolution: {integrity: sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA==} + 
engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} + + openai@6.26.0: + resolution: {integrity: sha512-zd23dbWTjiJ6sSAX6s0HrCZi41JwTA1bQVs0wLQPZ2/5o2gxOJA5wh7yOAUgwYybfhDXyhwlpeQf7Mlgx8EOCA==} + hasBin: true + peerDependencies: + ws: ^8.18.0 + zod: ^3.25 || ^4.0 + peerDependenciesMeta: + ws: + optional: true + zod: + optional: true + + p-retry@4.6.2: + resolution: {integrity: sha512-312Id396EbJdvRONlngUx0NydfrIQ5lsYu0znKVUzVvArzEIt08V1qhtyESbGVd1FGX7UKtiFp5uwKZdM8wIuQ==} + engines: {node: '>=8'} + + partial-json@0.1.7: + resolution: {integrity: sha512-Njv/59hHaokb/hRUjce3Hdv12wd60MtM9Z5Olmn+nehe0QDAsRtRbJPvJ0Z91TusF0SuZRIvnM+S4l6EIP8leA==} + + path-expression-matcher@1.5.0: + resolution: {integrity: sha512-cbrerZV+6rvdQrrD+iGMcZFEiiSrbv9Tfdkvnusy6y0x0GKBXREFg/Y65GhIfm0tnLntThhzCnfKwp1WRjeCyQ==} + engines: {node: '>=14.0.0'} + + path-key@3.1.1: + resolution: {integrity: sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==} + engines: {node: '>=8'} + + path-scurry@2.0.2: + resolution: {integrity: sha512-3O/iVVsJAPsOnpwWIeD+d6z/7PmqApyQePUtCndjatj/9I5LylHvt5qluFaBT3I5h3r1ejfR056c+FCv+NnNXg==} + engines: {node: 18 || 20 || >=22} + + proper-lockfile@4.1.2: + resolution: {integrity: sha512-TjNPblN4BwAWMXU8s9AEz4JmQxnD1NNL7bNOY/AKUzyamc379FWASUhc/K1pL2noVb+XmZKLL68cjzLsiOAMaA==} + + protobufjs@7.6.4: + resolution: {integrity: sha512-RJJPTTpvFfHcWLkIa2JFWK4XvtSzS0yEWDmunqHXli1h3JlkbcQZXDZdcWxv+JK3Xsl5/UFDPZ0iGm7DAengYw==} + engines: {node: '>=12.0.0'} + + resolve-pkg-maps@1.0.0: + resolution: {integrity: sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==} + + retry@0.12.0: + resolution: {integrity: sha512-9LkiTwjUh6rT555DtE9rTX+BKByPfrMzEAtnlEtdEwr3Nkffwiihqe2bWADg+OQRjt9gl6ICdmB/ZFDCGAtSow==} + engines: {node: '>= 4'} + + retry@0.13.1: + resolution: {integrity: sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==} + engines: {node: '>= 4'} + + 
safe-buffer@5.2.1: + resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==} + + semver@7.8.0: + resolution: {integrity: sha512-AcM7dV/5ul4EekoQ29Agm5vri8JNqRyj39o0qpX6vDF2GZrtutZl5RwgD1XnZjiTAfncsJhMI48QQH3sN87YNA==} + engines: {node: '>=10'} + hasBin: true + + semver@7.8.4: + resolution: {integrity: sha512-rUCObTnP32Q08R2uuIrt7r9PlEonuTmtuXYcW6s5kjdlj3xbnwe+21yXptAUYcMAABLkYYTtnmzb3w3EDZfueA==} + engines: {node: '>=10'} + hasBin: true + + shebang-command@2.0.0: + resolution: {integrity: sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==} + engines: {node: '>=8'} + + shebang-regex@3.0.0: + resolution: {integrity: sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==} + engines: {node: '>=8'} + + signal-exit@3.0.7: + resolution: {integrity: sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==} + + strnum@2.4.0: + resolution: {integrity: sha512-sHrVyWWdq28RbhjuJdZsA1SnGRJV6NiXbk6AXBxDOsgAcA+lmpUZCYjOdLBxkXMwis6RRe7dlZt4VlIWFVzkmg==} + + ts-algebra@2.0.0: + resolution: {integrity: sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw==} + + tslib@2.8.1: + resolution: {integrity: sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==} + + tsx@4.19.2: + resolution: {integrity: sha512-pOUl6Vo2LUq/bSa8S5q7b91cgNSjctn9ugq/+Mvow99qW6x/UZYwzxy/3NmqoT66eHYfCVvFvACC58UBPFf28g==} + engines: {node: '>=18.0.0'} + hasBin: true + + typebox@1.1.38: + resolution: {integrity: sha512-pZ0aQPmMmXoUvSbeuWf/Hzsc+avNw/Zd6VeE8CFgkVGWyuHPJvqeJJDeJqLve+K70LvjYIoleGcoJHPT17cWoA==} + + undici-types@6.20.0: + resolution: {integrity: sha512-Ny6QZ2Nju20vw1SRHe3d9jVu6gJ+4e3+MMpqu7pqE5HT6WsTSlce++GQmK5UXS8mzV8DSYHrQH+Xrf2jVcuKNg==} + + undici@8.3.0: + resolution: {integrity: 
sha512-TkUDgb6tl7KOGZ+7e8E3d2FYgUQgF6z5YypqjWmixVQSQERFcVrVg0ySADm2LVLRh5ljAaHTCR5Fmz3Q34rB7Q==} + engines: {node: '>=22.19.0'} + + web-streams-polyfill@3.3.3: + resolution: {integrity: sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==} + engines: {node: '>= 8'} + + which@2.0.2: + resolution: {integrity: sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==} + engines: {node: '>= 8'} + hasBin: true + + ws@8.21.0: + resolution: {integrity: sha512-Vsp28b7DRcimFQvrqu2Wek3z1iYxDCWqHYB8Qsnk/S4RfaCQzPGPyBNuVjJV3cd6UiKtUtp6sNM77gWvzcCH+g==} + engines: {node: '>=10.0.0'} + peerDependencies: + bufferutil: ^4.0.1 + utf-8-validate: '>=5.0.2' + peerDependenciesMeta: + bufferutil: + optional: true + utf-8-validate: + optional: true + + xml-naming@0.1.0: + resolution: {integrity: sha512-k8KO9hrMyNk6tUWqUfkTEZbezRRpONVOzUTnc97VnCvyj6Tf9lyUR9EDAIeiVLv56jsMcoXEwjW8Kv5yPY52lw==} + engines: {node: '>=16.0.0'} + + yaml@2.9.0: + resolution: {integrity: sha512-2AvhNX3mb8zd6Zy7INTtSpl1F15HW6Wnqj0srWlkKLcpYl/gMIMJiyuGq2KeI2YFxUPjdlB+3Lc10seMLtL4cA==} + engines: {node: '>= 14.6'} + hasBin: true + + zod-to-json-schema@3.25.2: + resolution: {integrity: sha512-O/PgfnpT1xKSDeQYSCfRI5Gy3hPf91mKVDuYLUHZJMiDFptvP41MSnWofm8dnCm0256ZNfZIM7DSzuSMAFnjHA==} + peerDependencies: + zod: ^3.25.28 || ^4 + + zod@4.4.3: + resolution: {integrity: sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ==} + +snapshots: + + '@anthropic-ai/sdk@0.91.1(zod@4.4.3)': + dependencies: + json-schema-to-ts: 3.1.1 + optionalDependencies: + zod: 4.4.3 + + '@aws-crypto/crc32@5.2.0': + dependencies: + '@aws-crypto/util': 5.2.0 + '@aws-sdk/types': 3.973.12 + tslib: 2.8.1 + + '@aws-crypto/sha256-browser@5.2.0': + dependencies: + '@aws-crypto/sha256-js': 5.2.0 + '@aws-crypto/supports-web-crypto': 5.2.0 + '@aws-crypto/util': 5.2.0 + '@aws-sdk/types': 3.973.12 + '@aws-sdk/util-locate-window': 3.965.7 + 
'@smithy/util-utf8': 2.3.0 + tslib: 2.8.1 + + '@aws-crypto/sha256-js@5.2.0': + dependencies: + '@aws-crypto/util': 5.2.0 + '@aws-sdk/types': 3.973.12 + tslib: 2.8.1 + + '@aws-crypto/supports-web-crypto@5.2.0': + dependencies: + tslib: 2.8.1 + + '@aws-crypto/util@5.2.0': + dependencies: + '@aws-sdk/types': 3.973.12 + '@smithy/util-utf8': 2.3.0 + tslib: 2.8.1 + + '@aws-sdk/client-bedrock-runtime@3.1048.0': + dependencies: + '@aws-crypto/sha256-browser': 5.2.0 + '@aws-crypto/sha256-js': 5.2.0 + '@aws-sdk/core': 3.974.20 + '@aws-sdk/credential-provider-node': 3.972.55 + '@aws-sdk/eventstream-handler-node': 3.972.21 + '@aws-sdk/middleware-eventstream': 3.972.17 + '@aws-sdk/middleware-websocket': 3.972.28 + '@aws-sdk/token-providers': 3.1048.0 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/fetch-http-handler': 5.4.7 + '@smithy/node-http-handler': 4.7.3 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/core@3.974.20': + dependencies: + '@aws-sdk/types': 3.973.12 + '@aws-sdk/xml-builder': 3.972.29 + '@aws/lambda-invoke-store': 0.2.4 + '@smithy/core': 3.24.7 + '@smithy/signature-v4': 5.4.7 + '@smithy/types': 4.14.4 + bowser: 2.14.1 + tslib: 2.8.1 + + '@aws-sdk/credential-provider-env@3.972.46': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/credential-provider-http@3.972.48': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/fetch-http-handler': 5.4.7 + '@smithy/node-http-handler': 4.7.8 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/credential-provider-ini@3.972.53': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/credential-provider-env': 3.972.46 + '@aws-sdk/credential-provider-http': 3.972.48 + '@aws-sdk/credential-provider-login': 3.972.52 + '@aws-sdk/credential-provider-process': 3.972.46 + '@aws-sdk/credential-provider-sso': 3.972.52 + 
'@aws-sdk/credential-provider-web-identity': 3.972.52 + '@aws-sdk/nested-clients': 3.997.20 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/credential-provider-imds': 4.3.9 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/credential-provider-login@3.972.52': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/nested-clients': 3.997.20 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/credential-provider-node@3.972.55': + dependencies: + '@aws-sdk/credential-provider-env': 3.972.46 + '@aws-sdk/credential-provider-http': 3.972.48 + '@aws-sdk/credential-provider-ini': 3.972.53 + '@aws-sdk/credential-provider-process': 3.972.46 + '@aws-sdk/credential-provider-sso': 3.972.52 + '@aws-sdk/credential-provider-web-identity': 3.972.52 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/credential-provider-imds': 4.3.9 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/credential-provider-process@3.972.46': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/credential-provider-sso@3.972.52': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/nested-clients': 3.997.20 + '@aws-sdk/token-providers': 3.1066.0 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/credential-provider-web-identity@3.972.52': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/nested-clients': 3.997.20 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/eventstream-handler-node@3.972.21': + dependencies: + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/middleware-eventstream@3.972.17': + dependencies: + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + 
'@aws-sdk/middleware-websocket@3.972.28': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/fetch-http-handler': 5.4.7 + '@smithy/signature-v4': 5.4.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/nested-clients@3.997.20': + dependencies: + '@aws-crypto/sha256-browser': 5.2.0 + '@aws-crypto/sha256-js': 5.2.0 + '@aws-sdk/core': 3.974.20 + '@aws-sdk/signature-v4-multi-region': 3.996.34 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/fetch-http-handler': 5.4.7 + '@smithy/node-http-handler': 4.7.8 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/signature-v4-multi-region@3.996.34': + dependencies: + '@aws-sdk/types': 3.973.12 + '@smithy/signature-v4': 5.4.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/token-providers@3.1048.0': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/nested-clients': 3.997.20 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/token-providers@3.1066.0': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/nested-clients': 3.997.20 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/types@3.973.12': + dependencies: + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/util-locate-window@3.965.7': + dependencies: + tslib: 2.8.1 + + '@aws-sdk/xml-builder@3.972.29': + dependencies: + '@smithy/types': 4.14.4 + fast-xml-parser: 5.7.3 + tslib: 2.8.1 + + '@aws/lambda-invoke-store@0.2.4': {} + + '@babel/runtime@7.29.7': {} + + '@earendil-works/pi-agent-core@0.79.4(ws@8.21.0)(zod@4.4.3)': + dependencies: + '@earendil-works/pi-ai': 0.79.4(ws@8.21.0)(zod@4.4.3) + ignore: 7.0.5 + typebox: 1.1.38 + yaml: 2.9.0 + transitivePeerDependencies: + - '@modelcontextprotocol/sdk' + - bufferutil + - supports-color + - utf-8-validate + - ws + - zod + + '@earendil-works/pi-ai@0.79.4(ws@8.21.0)(zod@4.4.3)': + dependencies: + '@anthropic-ai/sdk': 
0.91.1(zod@4.4.3) + '@aws-sdk/client-bedrock-runtime': 3.1048.0 + '@google/genai': 1.52.0 + '@mistralai/mistralai': 2.2.1 + '@smithy/node-http-handler': 4.7.3 + http-proxy-agent: 7.0.2 + https-proxy-agent: 7.0.6 + openai: 6.26.0(ws@8.21.0)(zod@4.4.3) + partial-json: 0.1.7 + typebox: 1.1.38 + transitivePeerDependencies: + - '@modelcontextprotocol/sdk' + - bufferutil + - supports-color + - utf-8-validate + - ws + - zod + + '@earendil-works/pi-coding-agent@0.79.4(ws@8.21.0)(zod@4.4.3)': + dependencies: + '@earendil-works/pi-agent-core': 0.79.4(ws@8.21.0)(zod@4.4.3) + '@earendil-works/pi-ai': 0.79.4(ws@8.21.0)(zod@4.4.3) + '@earendil-works/pi-tui': 0.79.4 + '@silvia-odwyer/photon-node': 0.3.4 + chalk: 5.6.2 + cross-spawn: 7.0.6 + diff: 8.0.4 + glob: 13.0.6 + highlight.js: 10.7.3 + hosted-git-info: 9.0.3 + ignore: 7.0.5 + jiti: 2.7.0 + minimatch: 10.2.5 + proper-lockfile: 4.1.2 + semver: 7.8.0 + typebox: 1.1.38 + undici: 8.3.0 + yaml: 2.9.0 + optionalDependencies: + '@mariozechner/clipboard': 0.3.9 + transitivePeerDependencies: + - '@modelcontextprotocol/sdk' + - bufferutil + - supports-color + - utf-8-validate + - ws + - zod + + '@earendil-works/pi-tui@0.79.4': + dependencies: + get-east-asian-width: 1.6.0 + marked: 15.0.12 + + '@esbuild/aix-ppc64@0.23.1': + optional: true + + '@esbuild/android-arm64@0.23.1': + optional: true + + '@esbuild/android-arm@0.23.1': + optional: true + + '@esbuild/android-x64@0.23.1': + optional: true + + '@esbuild/darwin-arm64@0.23.1': + optional: true + + '@esbuild/darwin-x64@0.23.1': + optional: true + + '@esbuild/freebsd-arm64@0.23.1': + optional: true + + '@esbuild/freebsd-x64@0.23.1': + optional: true + + '@esbuild/linux-arm64@0.23.1': + optional: true + + '@esbuild/linux-arm@0.23.1': + optional: true + + '@esbuild/linux-ia32@0.23.1': + optional: true + + '@esbuild/linux-loong64@0.23.1': + optional: true + + '@esbuild/linux-mips64el@0.23.1': + optional: true + + '@esbuild/linux-ppc64@0.23.1': + optional: true + + 
'@esbuild/linux-riscv64@0.23.1': + optional: true + + '@esbuild/linux-s390x@0.23.1': + optional: true + + '@esbuild/linux-x64@0.23.1': + optional: true + + '@esbuild/netbsd-x64@0.23.1': + optional: true + + '@esbuild/openbsd-arm64@0.23.1': + optional: true + + '@esbuild/openbsd-x64@0.23.1': + optional: true + + '@esbuild/sunos-x64@0.23.1': + optional: true + + '@esbuild/win32-arm64@0.23.1': + optional: true + + '@esbuild/win32-ia32@0.23.1': + optional: true + + '@esbuild/win32-x64@0.23.1': + optional: true + + '@google/genai@1.52.0': + dependencies: + google-auth-library: 10.7.0 + p-retry: 4.6.2 + protobufjs: 7.6.4 + ws: 8.21.0 + transitivePeerDependencies: + - bufferutil + - supports-color + - utf-8-validate + + '@mariozechner/clipboard-darwin-arm64@0.3.9': + optional: true + + '@mariozechner/clipboard-darwin-universal@0.3.9': + optional: true + + '@mariozechner/clipboard-darwin-x64@0.3.9': + optional: true + + '@mariozechner/clipboard-linux-arm64-gnu@0.3.9': + optional: true + + '@mariozechner/clipboard-linux-arm64-musl@0.3.9': + optional: true + + '@mariozechner/clipboard-linux-riscv64-gnu@0.3.9': + optional: true + + '@mariozechner/clipboard-linux-x64-gnu@0.3.9': + optional: true + + '@mariozechner/clipboard-linux-x64-musl@0.3.9': + optional: true + + '@mariozechner/clipboard-win32-arm64-msvc@0.3.9': + optional: true + + '@mariozechner/clipboard-win32-x64-msvc@0.3.9': + optional: true + + '@mariozechner/clipboard@0.3.9': + optionalDependencies: + '@mariozechner/clipboard-darwin-arm64': 0.3.9 + '@mariozechner/clipboard-darwin-universal': 0.3.9 + '@mariozechner/clipboard-darwin-x64': 0.3.9 + '@mariozechner/clipboard-linux-arm64-gnu': 0.3.9 + '@mariozechner/clipboard-linux-arm64-musl': 0.3.9 + '@mariozechner/clipboard-linux-riscv64-gnu': 0.3.9 + '@mariozechner/clipboard-linux-x64-gnu': 0.3.9 + '@mariozechner/clipboard-linux-x64-musl': 0.3.9 + '@mariozechner/clipboard-win32-arm64-msvc': 0.3.9 + '@mariozechner/clipboard-win32-x64-msvc': 0.3.9 + optional: true + + 
'@mistralai/mistralai@2.2.1': + dependencies: + ws: 8.21.0 + zod: 4.4.3 + zod-to-json-schema: 3.25.2(zod@4.4.3) + transitivePeerDependencies: + - bufferutil + - utf-8-validate + + '@nodable/entities@2.2.0': {} + + '@opentelemetry/api-logs@0.54.0': + dependencies: + '@opentelemetry/api': 1.9.0 + + '@opentelemetry/api@1.9.0': {} + + '@opentelemetry/context-async-hooks@1.28.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + + '@opentelemetry/core@1.27.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/semantic-conventions': 1.27.0 + + '@opentelemetry/core@1.28.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/semantic-conventions': 1.27.0 + + '@opentelemetry/exporter-trace-otlp-proto@0.54.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/otlp-exporter-base': 0.54.0(@opentelemetry/api@1.9.0) + '@opentelemetry/otlp-transformer': 0.54.0(@opentelemetry/api@1.9.0) + '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-trace-base': 1.27.0(@opentelemetry/api@1.9.0) + + '@opentelemetry/otlp-exporter-base@0.54.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/otlp-transformer': 0.54.0(@opentelemetry/api@1.9.0) + + '@opentelemetry/otlp-transformer@0.54.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/api-logs': 0.54.0 + '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-logs': 0.54.0(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-metrics': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-trace-base': 1.27.0(@opentelemetry/api@1.9.0) + protobufjs: 7.6.4 + + 
'@opentelemetry/propagator-b3@1.28.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0) + + '@opentelemetry/propagator-jaeger@1.28.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0) + + '@opentelemetry/resources@1.27.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/semantic-conventions': 1.27.0 + + '@opentelemetry/resources@1.28.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0) + '@opentelemetry/semantic-conventions': 1.27.0 + + '@opentelemetry/sdk-logs@0.54.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/api-logs': 0.54.0 + '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0) + + '@opentelemetry/sdk-metrics@1.27.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0) + + '@opentelemetry/sdk-trace-base@1.27.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/semantic-conventions': 1.27.0 + + '@opentelemetry/sdk-trace-base@1.28.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0) + '@opentelemetry/resources': 1.28.0(@opentelemetry/api@1.9.0) + '@opentelemetry/semantic-conventions': 1.27.0 + + '@opentelemetry/sdk-trace-node@1.28.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + 
'@opentelemetry/context-async-hooks': 1.28.0(@opentelemetry/api@1.9.0) + '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0) + '@opentelemetry/propagator-b3': 1.28.0(@opentelemetry/api@1.9.0) + '@opentelemetry/propagator-jaeger': 1.28.0(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-trace-base': 1.28.0(@opentelemetry/api@1.9.0) + semver: 7.8.4 + + '@opentelemetry/semantic-conventions@1.27.0': {} + + '@opentelemetry/semantic-conventions@1.28.0': {} + + '@protobufjs/aspromise@1.1.2': {} + + '@protobufjs/base64@1.1.2': {} + + '@protobufjs/codegen@2.0.5': {} + + '@protobufjs/eventemitter@1.1.1': {} + + '@protobufjs/fetch@1.1.1': + dependencies: + '@protobufjs/aspromise': 1.1.2 + + '@protobufjs/float@1.0.2': {} + + '@protobufjs/path@1.1.2': {} + + '@protobufjs/pool@1.1.0': {} + + '@protobufjs/utf8@1.1.1': {} + + '@silvia-odwyer/photon-node@0.3.4': {} + + '@smithy/core@3.24.7': + dependencies: + '@aws-crypto/crc32': 5.2.0 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@smithy/credential-provider-imds@4.3.9': + dependencies: + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@smithy/fetch-http-handler@5.4.7': + dependencies: + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@smithy/is-array-buffer@2.2.0': + dependencies: + tslib: 2.8.1 + + '@smithy/node-http-handler@4.7.3': + dependencies: + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@smithy/node-http-handler@4.7.8': + dependencies: + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@smithy/signature-v4@5.4.7': + dependencies: + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@smithy/types@4.14.4': + dependencies: + tslib: 2.8.1 + + '@smithy/util-buffer-from@2.2.0': + dependencies: + '@smithy/is-array-buffer': 2.2.0 + tslib: 2.8.1 + + '@smithy/util-utf8@2.3.0': + dependencies: + '@smithy/util-buffer-from': 2.2.0 + tslib: 2.8.1 + + '@types/node@22.10.2': + dependencies: + undici-types: 6.20.0 + + 
'@types/retry@0.12.0': {} + + agent-base@7.1.4: {} + + anynum@1.0.0: {} + + balanced-match@4.0.4: {} + + base64-js@1.5.1: {} + + bignumber.js@9.3.1: {} + + bowser@2.14.1: {} + + brace-expansion@5.0.6: + dependencies: + balanced-match: 4.0.4 + + buffer-equal-constant-time@1.0.1: {} + + chalk@5.6.2: {} + + cross-spawn@7.0.6: + dependencies: + path-key: 3.1.1 + shebang-command: 2.0.0 + which: 2.0.2 + + data-uri-to-buffer@4.0.1: {} + + debug@4.4.3: + dependencies: + ms: 2.1.3 + + diff@8.0.4: {} + + dotenv@17.2.3: {} + + ecdsa-sig-formatter@1.0.11: + dependencies: + safe-buffer: 5.2.1 + + esbuild@0.23.1: + optionalDependencies: + '@esbuild/aix-ppc64': 0.23.1 + '@esbuild/android-arm': 0.23.1 + '@esbuild/android-arm64': 0.23.1 + '@esbuild/android-x64': 0.23.1 + '@esbuild/darwin-arm64': 0.23.1 + '@esbuild/darwin-x64': 0.23.1 + '@esbuild/freebsd-arm64': 0.23.1 + '@esbuild/freebsd-x64': 0.23.1 + '@esbuild/linux-arm': 0.23.1 + '@esbuild/linux-arm64': 0.23.1 + '@esbuild/linux-ia32': 0.23.1 + '@esbuild/linux-loong64': 0.23.1 + '@esbuild/linux-mips64el': 0.23.1 + '@esbuild/linux-ppc64': 0.23.1 + '@esbuild/linux-riscv64': 0.23.1 + '@esbuild/linux-s390x': 0.23.1 + '@esbuild/linux-x64': 0.23.1 + '@esbuild/netbsd-x64': 0.23.1 + '@esbuild/openbsd-arm64': 0.23.1 + '@esbuild/openbsd-x64': 0.23.1 + '@esbuild/sunos-x64': 0.23.1 + '@esbuild/win32-arm64': 0.23.1 + '@esbuild/win32-ia32': 0.23.1 + '@esbuild/win32-x64': 0.23.1 + + extend@3.0.2: {} + + fast-xml-builder@1.2.0: + dependencies: + path-expression-matcher: 1.5.0 + xml-naming: 0.1.0 + + fast-xml-parser@5.7.3: + dependencies: + '@nodable/entities': 2.2.0 + fast-xml-builder: 1.2.0 + path-expression-matcher: 1.5.0 + strnum: 2.4.0 + + fetch-blob@3.2.0: + dependencies: + node-domexception: 1.0.0 + web-streams-polyfill: 3.3.3 + + formdata-polyfill@4.0.10: + dependencies: + fetch-blob: 3.2.0 + + fsevents@2.3.3: + optional: true + + gaxios@7.1.5: + dependencies: + extend: 3.0.2 + https-proxy-agent: 7.0.6 + node-fetch: 3.3.2 + 
transitivePeerDependencies: + - supports-color + + gcp-metadata@8.1.2: + dependencies: + gaxios: 7.1.5 + google-logging-utils: 1.1.3 + json-bigint: 1.0.0 + transitivePeerDependencies: + - supports-color + + get-east-asian-width@1.6.0: {} + + get-tsconfig@4.14.0: + dependencies: + resolve-pkg-maps: 1.0.0 + + glob@13.0.6: + dependencies: + minimatch: 10.2.5 + minipass: 7.1.3 + path-scurry: 2.0.2 + + google-auth-library@10.7.0: + dependencies: + base64-js: 1.5.1 + ecdsa-sig-formatter: 1.0.11 + gaxios: 7.1.5 + gcp-metadata: 8.1.2 + google-logging-utils: 1.1.3 + jws: 4.0.1 + transitivePeerDependencies: + - supports-color + + google-logging-utils@1.1.3: {} + + graceful-fs@4.2.11: {} + + highlight.js@10.7.3: {} + + hosted-git-info@9.0.3: + dependencies: + lru-cache: 11.5.1 + + http-proxy-agent@7.0.2: + dependencies: + agent-base: 7.1.4 + debug: 4.4.3 + transitivePeerDependencies: + - supports-color + + https-proxy-agent@7.0.6: + dependencies: + agent-base: 7.1.4 + debug: 4.4.3 + transitivePeerDependencies: + - supports-color + + ignore@7.0.5: {} + + isexe@2.0.0: {} + + jiti@2.7.0: {} + + json-bigint@1.0.0: + dependencies: + bignumber.js: 9.3.1 + + json-schema-to-ts@3.1.1: + dependencies: + '@babel/runtime': 7.29.7 + ts-algebra: 2.0.0 + + jwa@2.0.1: + dependencies: + buffer-equal-constant-time: 1.0.1 + ecdsa-sig-formatter: 1.0.11 + safe-buffer: 5.2.1 + + jws@4.0.1: + dependencies: + jwa: 2.0.1 + safe-buffer: 5.2.1 + + long@5.3.2: {} + + lru-cache@11.5.1: {} + + marked@15.0.12: {} + + minimatch@10.2.5: + dependencies: + brace-expansion: 5.0.6 + + minipass@7.1.3: {} + + ms@2.1.3: {} + + node-domexception@1.0.0: {} + + node-fetch@3.3.2: + dependencies: + data-uri-to-buffer: 4.0.1 + fetch-blob: 3.2.0 + formdata-polyfill: 4.0.10 + + openai@6.26.0(ws@8.21.0)(zod@4.4.3): + optionalDependencies: + ws: 8.21.0 + zod: 4.4.3 + + p-retry@4.6.2: + dependencies: + '@types/retry': 0.12.0 + retry: 0.13.1 + + partial-json@0.1.7: {} + + path-expression-matcher@1.5.0: {} + + path-key@3.1.1: 
{} + + path-scurry@2.0.2: + dependencies: + lru-cache: 11.5.1 + minipass: 7.1.3 + + proper-lockfile@4.1.2: + dependencies: + graceful-fs: 4.2.11 + retry: 0.12.0 + signal-exit: 3.0.7 + + protobufjs@7.6.4: + dependencies: + '@protobufjs/aspromise': 1.1.2 + '@protobufjs/base64': 1.1.2 + '@protobufjs/codegen': 2.0.5 + '@protobufjs/eventemitter': 1.1.1 + '@protobufjs/fetch': 1.1.1 + '@protobufjs/float': 1.0.2 + '@protobufjs/path': 1.1.2 + '@protobufjs/pool': 1.1.0 + '@protobufjs/utf8': 1.1.1 + '@types/node': 22.10.2 + long: 5.3.2 + + resolve-pkg-maps@1.0.0: {} + + retry@0.12.0: {} + + retry@0.13.1: {} + + safe-buffer@5.2.1: {} + + semver@7.8.0: {} + + semver@7.8.4: {} + + shebang-command@2.0.0: + dependencies: + shebang-regex: 3.0.0 + + shebang-regex@3.0.0: {} + + signal-exit@3.0.7: {} + + strnum@2.4.0: + dependencies: + anynum: 1.0.0 + + ts-algebra@2.0.0: {} + + tslib@2.8.1: {} + + tsx@4.19.2: + dependencies: + esbuild: 0.23.1 + get-tsconfig: 4.14.0 + optionalDependencies: + fsevents: 2.3.3 + + typebox@1.1.38: {} + + undici-types@6.20.0: {} + + undici@8.3.0: {} + + web-streams-polyfill@3.3.3: {} + + which@2.0.2: + dependencies: + isexe: 2.0.0 + + ws@8.21.0: {} + + xml-naming@0.1.0: {} + + yaml@2.9.0: {} + + zod-to-json-schema@3.25.2(zod@4.4.3): + dependencies: + zod: 4.4.3 + + zod@4.4.3: {} diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/poc/run.ts b/docs/design/agent-workflows/wp-1-pi-tracing/poc/run.ts new file mode 100644 index 0000000000..03164e6311 --- /dev/null +++ b/docs/design/agent-workflows/wp-1-pi-tracing/poc/run.ts @@ -0,0 +1,197 @@ +/** + * WP-1 runner: install Pi, run a small tool-using agent task, and export the run + * to Agenta as OpenTelemetry traces via the agenta-otel extension. + * + * Auth: uses AuthStorage.create(), which reads ~/.pi/agent/auth.json. Log in once + * with `pnpm exec pi` -> `/login` -> "ChatGPT Plus/Pro (Codex)" (no API key needed), + * or set OPENAI_API_KEY / ANTHROPIC_API_KEY in the environment. 
+ * + * Run: `pnpm start` + */ +import dotenv from "dotenv"; +import { existsSync, mkdtempSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; + +import { + AuthStorage, + createAgentSession, + DefaultResourceLoader, + getAgentDir, + ModelRegistry, + SessionManager, +} from "@earendil-works/pi-coding-agent"; + +import agentaOtel, { runConfig, shutdownTracing } from "./agenta-otel.ts"; + +// Load env before anything reads it: poc-local .env first, then walk up to the +// repo-root .env.test.local for the shared dev-box Agenta credentials. +function loadEnv(): void { + dotenv.config(); + let dir = dirname(fileURLToPath(import.meta.url)); + for (let i = 0; i < 8; i++) { + const candidate = join(dir, ".env.test.local"); + if (existsSync(candidate)) { + dotenv.config({ path: candidate }); + break; + } + const parent = dirname(dir); + if (parent === dir) break; + dir = parent; + } +} + +type Scenario = { name: string; seed: (dir: string) => void; prompts: string[] }; + +const SCENARIOS: Record<string, Scenario> = { + simple: { + name: "simple", + seed: (dir) => + writeFileSync( + join(dir, "notes.txt"), + "TODO: greet the user by name (use 'Mahmoud')\n" + + "TODO: add a two-line haiku about tracing\n", + ), + prompts: [ + "Read notes.txt in the current directory, then create greeting.txt that " + + "addresses each TODO. Keep it short.", + ], + }, + // Many tool calls across several turns, ending in a structured return. 
+ complex: { + name: "complex", + seed: (dir) => { + writeFileSync( + join(dir, "alpha.py"), + "def add(a, b):\n return a + b\n\n\ndef sub(a, b):\n return a - b\n", + ); + writeFileSync( + join(dir, "beta.py"), + "import math\n\n\ndef area(r):\n return math.pi * r * r\n", + ); + writeFileSync(join(dir, "README.md"), "# demo\n\nA tiny demo package.\n"); + }, + prompts: [ + "Explore this directory: list the files, read every .py file, and use bash " + + "(wc -l) to count the total number of lines across the .py files. Then write " + + "REPORT.md describing what each .py file does and the total line count. " + + "Finally, reply with ONLY a JSON object: " + + '{"files": ["..."], "total_py_lines": <int>, "report": "REPORT.md"}.', + ], + }, + // A longer, multi-prompt session: each prompt is its own trace, all sharing one session.id. + session: { + name: "session", + seed: () => {}, + prompts: [ + "Create todo.md with exactly 3 short tasks about adding distributed tracing to a service.", + "Append 2 more tasks to todo.md, then show me the full file with the bash 'cat' command.", + 'Read todo.md and reply with ONLY a JSON object: {"count": <int>, "tasks": ["..."]}.', + ], + }, +}; + +function pickScenario(cliPrompts: string[]): Scenario { + if (cliPrompts.length > 0) { + return { name: "custom", seed: SCENARIOS.complex.seed, prompts: cliPrompts }; + } + const key = process.env.PI_SCENARIO || "complex"; + return SCENARIOS[key] ?? SCENARIOS.complex; +} + +async function main(): Promise<void> { + loadEnv(); + + // A throwaway working dir seeded per scenario so the agent actually uses tools. + const cwd = mkdtempSync(join(tmpdir(), "pi-poc-")); + const scenario = pickScenario(process.argv.slice(2)); + scenario.seed(cwd); + + const authStorage = AuthStorage.create(); + const modelRegistry = ModelRegistry.create(authStorage); + const available = await modelRegistry.getAvailable(); + if (available.length === 0) { + console.error( + "\nNo model is available. 
Authenticate Pi first:\n" + + " pnpm exec pi then /login -> \"ChatGPT Plus/Pro (Codex)\"\n" + + "or export OPENAI_API_KEY / ANTHROPIC_API_KEY.\n", + ); + process.exit(1); + } + + const wanted = process.env.PI_MODEL; // "gpt-5.5" or "openai-codex/gpt-5.5" + const model = + (wanted && + available.find( + (m: any) => m.id === wanted || `${m.provider}/${m.id}` === wanted, + )) || + available.find((m: any) => m.id === "gpt-5.5") || + available.find((m: any) => !/spark|mini/i.test(m.id)) || + available[0]; + if (wanted && model.id !== wanted && `${model.provider}/${model.id}` !== wanted) { + console.warn(`[run] PI_MODEL="${wanted}" not available; using ${model.id}`); + } + console.log(`[run] scenario: ${scenario.name} (${scenario.prompts.length} prompt(s))`); + console.log(`[run] model: ${model.provider}/${model.id}`); + console.log(`[run] cwd: ${cwd}`); + + const loader = new DefaultResourceLoader({ + cwd, + agentDir: getAgentDir(), + extensionFactories: [agentaOtel], + }); + await loader.reload(); + + const { session } = await createAgentSession({ + cwd, + model, + authStorage, + modelRegistry, + tools: ["read", "bash", "edit", "write", "ls"], + sessionManager: SessionManager.inMemory(cwd), + resourceLoader: loader, + }); + + // Hand the session id + model to the extension so spans carry them. 
+ runConfig.sessionId = session.sessionId; + runConfig.provider = model.provider; + runConfig.requestModel = model.id; + + session.subscribe((event: any) => { + if ( + event.type === "message_update" && + event.assistantMessageEvent?.type === "text_delta" + ) { + process.stdout.write(event.assistantMessageEvent.delta); + } else if (event.type === "tool_execution_start") { + process.stdout.write(`\n[tool] ${event.toolName}\n`); + } + }); + + const traceIds: string[] = []; + for (let i = 0; i < scenario.prompts.length; i++) { + const p = scenario.prompts[i]; + console.log(`\n[run] prompt ${i + 1}/${scenario.prompts.length}: ${p}\n`); + await session.prompt(p); + if (runConfig.traceId) traceIds.push(runConfig.traceId); + } + + console.log("\n\n[run] flushing spans to Agenta..."); + session.dispose(); + await shutdownTracing(); + + const host = (process.env.AGENTA_HOST || "").replace(/\/+$/, ""); + console.log("[run] flushed."); + console.log(`[run] session_id=${session.sessionId}`); + traceIds.forEach((tid, i) => { + console.log(`[run] trace ${i + 1}: ${tid}`); + console.log(` ${host}/api/spans/?trace_id=${tid}`); + }); + process.exit(0); +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/tracing-in-the-agent-service.md b/docs/design/agent-workflows/wp-1-pi-tracing/tracing-in-the-agent-service.md new file mode 100644 index 0000000000..0bb4b12777 --- /dev/null +++ b/docs/design/agent-workflows/wp-1-pi-tracing/tracing-in-the-agent-service.md @@ -0,0 +1,113 @@ +# Tracing the agent run into the response, like completion and chat + +Status: built and verified end to end against the dev box (2026-06-15). Audience: +whoever works on the agent service (WP-2) and its tracing. + +This is the follow-on to [integrating-the-tracing-extension.md](integrating-the-tracing-extension.md). +That doc made a standalone Pi run show up in Agenta as its own trace. 
This one wires +the same extension into the WP-2 agent service so the agent's whole run becomes part +of the `/invoke` trace, the way completion and chat nest their LLM spans under the +workflow span. + +## What changed and why + +Completion and chat are traced as one tree: the SDK opens a workflow span for the +`/invoke` request, the LLM call nests under it, and the response carries that +`trace_id`. Open the trace and you see the whole call. + +The agent service runs the model work in a separate Node process (the Pi wrapper), so +its spans were not part of that tree. The WP-1 doc flagged the fix as future work: +thread a W3C `traceparent` across the boundary and start the agent span as its child. +That is what this change does. + +The result is one tree under the response's `trace_id`: + +``` +_agent workflow (the Python /invoke span, root) + invoke_agent AGENT (the Pi run, now a child of _agent) + turn N CHAIN + chat LLM model, tokens, cost, message thread + execute_tool ... TOOL +``` + +Verified shape from a live run (trace `0f47e5f5...`): four spans, one trace, the +`chat` span carrying `ag.data.inputs`/`outputs` as a message thread, token usage +(598/21/619), and cost, with nothing in `ag.unsupported`. + +## How it works + +Three seams carry the context from the Python service to the Pi spans. + +1. **Capture (Python, `services/oss/src/agent.py`).** Inside the instrumented + `_agent` handler the current OpenTelemetry span is the workflow span. `_trace_context()` + reads it with the SDK's `propagation.inject()`, which yields the `traceparent`, + `baggage`, and the request `Authorization`. It also reads the OTLP endpoint from + `ag.tracing.otlp_url`, the exact URL the Python spans use. This is best effort: if + capture fails the run still works, just without cross-trace linking. + +2. **Carry (`services/oss/src/agent_pi`).** `HarnessRequest` gains a `TraceContext` + (`ports.py`). 
`TraceContext.to_wire()` serializes it to the camelCase shape the + wrapper expects, and both harness adapters send it: the local subprocess one + (`pi_harness.py`) and the HTTP sidecar one (`pi_http_harness.py`). + +3. **Consume (Node, `services/agent/src/agenta-otel.ts`).** When a `traceparent` is + present the extension starts `invoke_agent` as a child of that remote span, so the + whole Pi subtree shares the caller's `trace_id`. It exports each trace to the + endpoint and with the `Authorization` the caller passed, falling back to env. The + runner (`runPi.ts`) flushes the trace before it returns the result. + +Because the Python span and the Pi spans share one `trace_id` and the Pi root points +at the Python span, Agenta merges them into one tree at ingest. No backend change. + +## What is different from the POC extension + +The service build keeps the POC's span tree and every load-bearing attribute choice +(read the [five rules](integrating-the-tracing-extension.md#what-you-must-not-change-and-why) +again before touching attributes). It adds three things the service needs: + +- **Per-run state, not module globals.** The POC ran one prompt at a time. The HTTP + sidecar can drive several runs in one process, so all span state lives in the + closure `createAgentaOtel()` returns. Only the tracer, provider, and exporter cache + stay process wide. +- **A remote parent.** `invoke_agent` nests under the incoming `traceparent` instead + of starting a fresh root. The parent has no end event in this process, so the + per-trace batch flushes by trace id after the run rather than only on root-end. +- **Per-trace export target.** The OTLP endpoint and `Authorization` come from the run + config, so one shared process can serve more than one project. They fall back to + `AGENTA_HOST` / `AGENTA_API_KEY` when the caller passes nothing. + +## Auth and endpoint + +The Node side ships spans to the same place and with the same credentials as the +Python span. 
When the request carries `Authorization` (the project key or service +secret) the wrapper uses it verbatim, matching how the SDK exporter authorizes per +trace. With auth disabled locally there is no request credential, so the wrapper falls +back to the container's `AGENTA_API_KEY`. Set `AGENTA_AGENT_CAPTURE_CONTENT=0` on the +Python service to drop prompts, completions, and tool I/O from the spans. + +For the HTTP sidecar the endpoint passed from Python is the URL the Python container +uses to reach Agenta. The sidecar must be able to reach the same host. On one Docker +network the internal hostname resolves from both; if it does not, the sidecar's +`AGENTA_HOST` fallback applies. + +## How to verify + +1. Start the service (`entrypoints.agent_main:app`) with `AGENTA_HOST` and + `AGENTA_API_KEY` set and a Pi login or provider key available. +2. POST a chat-style body to `/agent/v0/invoke` and read `x-ag-trace-id` from the + response headers (it equals `trace_id` in the body). +3. Fetch the trace and confirm the merged tree and the totals: + ``` + curl -s "${AGENTA_HOST}/api/spans/?trace_id=<trace_id>" -H "Authorization: ApiKey ${AGENTA_API_KEY}" + ``` + Expect `_agent` (workflow) over `invoke_agent` (agent) over `turn N` (chain) over + `chat` (chat), all sharing one `trace_id`, with token usage and cost on the `chat` + span and nothing under `ag.unsupported`. + +## Files + +- `services/oss/src/agent.py` — `_trace_context()` captures the workflow span context. +- `services/oss/src/agent_pi/ports.py` — `TraceContext` and `HarnessRequest.trace`. +- `services/oss/src/agent_pi/pi_harness.py`, `pi_http_harness.py` — forward the context. +- `services/agent/src/agenta-otel.ts` — the service build of the extension. +- `services/agent/src/runPi.ts` — registers the extension, sets run config, flushes. 
diff --git a/docs/design/agent-workflows/wp-2-agent-service/README.md b/docs/design/agent-workflows/wp-2-agent-service/README.md new file mode 100644 index 0000000000..c0a5731f6a --- /dev/null +++ b/docs/design/agent-workflows/wp-2-agent-service/README.md @@ -0,0 +1,124 @@ +# WP-2: Agent service wrapping Pi + +Status: not started. + +## Goal + +Stand up a new service that wraps Pi and exposes an interface like Agenta's completion/chat +services, so we can talk to an agent: set it up (auth, AGENTS.md), send a message, and get the response streamed back. Local only for the POC. No Daytona yet. + +Basically we want: + +- A new docker service that has the same structure as completion and chat +- that opens endpoints for the same interface as chat +- that you can send a message history and context to and get back a response + + + + +## Scope + +In: + +- A thin TypeScript harness-wrapper that drives Pi's SDK (`createAgentSession`). +- Configure the agent fully in memory: AGENTS.md, LLM auth, model. Skills and custom tools + can be stubbed for the first cut. +- Expose our own protocol on a port: a send-message / get-response surface that mirrors the + shape of the existing completion/chat services. + +Out (later work packages): + +- Daytona sandbox. The wrapper runs as a local process for the POC. +- Swapping in other harnesses (Codex, Claude Code). Design the protocol so it is possible, + but only implement Pi here. +- Persisting sessions or storing config server-side. Use a config passed in at startup. +- Stream the multi-message output back to the caller. +- multimessages +- tools + +In step 1 we will hard code the auth for pi.dev (the openai api key for instance or codex). We won't have any configuration, just the ability to run things. The docker compose will reload automatically on change, which means we can simply change the files in the volume locally and change things there. + +We will make sure in the implementation to first think about the ports and adapters. 
So that even if the first MVP is very simple, it has the right ports and adapters. + +First, between our agent implementation and calling pi.dev and setting it up, there is a clear port. Pi is an implementation of this. + +There is also another port for setting up the run environment. So it's not just setting up the agent but also the run environment. + +That is because you might run pi.dev or Claude Code locally, just as you might run each in Daytona or something else. + +We need to set these up. Each with an adapter. starting env - shutting down - pausing - connecting volume - + +then set up pi.dev setting up - invoking - stopping? (all the rpc interactions) - shutting down + +For pi.dev it might make sense to have two adapters, one for RPC and the other for json + +Success for this WP is: +- I go to the UI +- Create a new agent (with some hard-coded config, say hello world) +- I run it in the playground and I see the output. + +Note that instrumentation might be needed here; we are working in parallel on the research for that + + +As soon as we have that we can start working on adding a config to the playground first, which includes agents.md, then authentication (model used), then setting up tools. Then we can talk about streaming, multi messages, intermediate messages. + + + + +--- The rest of the article might be out of date for some parts. The main requirements are above --- + + +## Approach (grounded in research) + +See [`../research/pi-interaction.md`](../research/pi-interaction.md), +[`../research/auth-secrets.md`](../research/auth-secrets.md), and +[`../research/diskless-in-memory-config.md`](../research/diskless-in-memory-config.md). + +- Use the **SDK**, not RPC. The SDK is what exposes the in-memory overrides and runtime + credential injection; RPC mode cannot inject credentials post-spawn. +- Inject everything in memory: + - AGENTS.md via `systemPromptOverride` / `appendSystemPrompt` / `agentsFilesOverride`, + with `noContextFiles` so no on-disk AGENTS.md leaks in. 
+ - LLM auth via `setRuntimeApiKey(provider, key)` or `AuthStorage.inMemory()` (env at + spawn also works). + - State via `SessionManager.inMemory()`, `SettingsManager.inMemory()`, + `ModelRegistry.inMemory()`. +- Diskless: set `TMPDIR` to a per-run tmpfs for bash output spillover; pre-install `rg`/`fd` + so search tools do not write binaries to disk. +- Stream output via `session.subscribe()` callbacks (`message_update` -> `text_delta`), + mapping Pi events onto the service's streamed response. +- This wrapper is the "works with our port" contract and the swappable-harness seam. Keep + the protocol harness-agnostic. + +## Interface to mirror + +Match the existing Agenta completion/chat service surface so callers and the playground can +treat an agent like the other workflow types. Reconcile the single-output completion/chat +shape with Pi's multi-message output (the response is a list of messages, not one +completion). + +## Definition of done + +- The service starts locally with a passed-in config (AGENTS.md text, model, provider key). +- A caller can send a message and receive the streamed multi-message response. +- Auth and AGENTS.md are applied in memory, with nothing invocation-specific written to a + persistent disk. +- The same wrapper binary runs as a plain local process (parity baseline for later sandbox + and pull-config-and-run-locally work). + +## Open questions + +- Where the service lives in the repo (a new entry under `services/`, or alongside `api/`), + and how a Node service fits the Python backend. Decide before writing code. +- The exact protocol on the port (JSON-lines over stdio, a small HTTP/SSE server, or + websockets). Pick the one that matches how Agenta calls completion/chat today. +- How the multi-message output maps to the completion/chat response contract. +- Whether WP-1's tracing extension is embedded here from the start or added after. 
+ +## Links + +- [`../research/pi-interaction.md`](../research/pi-interaction.md) +- [`../research/auth-secrets.md`](../research/auth-secrets.md) +- [`../research/diskless-in-memory-config.md`](../research/diskless-in-memory-config.md) +- [`../research/sandbox-sharing.md`](../research/sandbox-sharing.md) +- [Project README](../README.md) diff --git a/docs/design/agent-workflows/wp-2-agent-service/implementation-plan.md b/docs/design/agent-workflows/wp-2-agent-service/implementation-plan.md new file mode 100644 index 0000000000..81f8cb6e88 --- /dev/null +++ b/docs/design/agent-workflows/wp-2-agent-service/implementation-plan.md @@ -0,0 +1,273 @@ +# WP-2 implementation plan: agent service wrapping Pi + +Status: MVP built and verified by curl (2026-06-15). Decisions below were taken; the +"Implemented" section records what shipped. Original decision points are kept marked +**[DECISION]** for history. + +## Implemented (MVP, verified by curl) + +Per the decisions: a Python service exposes the Agenta `/invoke` contract (auth, +middleware, CORS via `ag.create_app`) and calls a thin TypeScript Pi wrapper. Standalone, +verified with curl. Pi runs on the local login (`openai-codex` / `gpt-5.5`). + +What shipped: + +- TypeScript Pi wrapper: `services/agent/` (`src/runPi.ts`, `src/cli.ts`). One-shot + JSON-over-stdio: read a request on stdin, drive Pi's SDK (`createAgentSession`) with + AGENTS.md injected in memory, write the reply as JSON on stdout. Pinned + `@earendil-works/pi-coding-agent@0.79.4`. Editable config in `services/agent/config/` + (`AGENTS.md`, `agent.json`), read per request so edits need no restart. +- Python service: `services/oss/src/agent.py` mirrors `chat.py` (`ag.create_app` + + `ag.workflow` + `ag.route`, `is_chat` flag). Ports and adapters in + `services/oss/src/agent_pi/`: `Harness` port + `PiHarness` (spawns the wrapper over the + JSON transport), `Runtime` port + `LocalRuntime` (local subprocess; Daytona slots in + here later). 
+- Standalone entrypoint: `services/entrypoints/agent_main.py` mounts only the agent app + + `/health` for isolated local runs. + +How to run and verify locally: + +```bash +cd services/agent && pnpm install # once +cd ../ && set -a && source ../.env.test.local && set +a +AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED=false \ + uv run uvicorn entrypoints.agent_main:app --host 0.0.0.0 --port 8090 + +curl -s -X POST http://localhost:8090/agent/v0/invoke -H "Content-Type: application/json" \ + -d '{"data":{"inputs":{"messages":[{"role":"user","content":"Hi, who are you?"}]}}}' +# -> {"data":{"outputs":{"role":"assistant","content":"Hi! I'm your friendly hello-world AI assistant."}}, "status":{"code":200}, ...} +``` + +## Dockerized (verified by curl) + +The agent now runs fully in Docker via a dedicated, self-contained compose that does not +touch other stacks. Two containers: + +- `agent-pi`: the TypeScript Pi wrapper as an HTTP sidecar + (`services/agent/src/server.ts`, `docker/Dockerfile.dev`). It copies the read-only + mounted `~/.pi/agent` login into a writable container path at startup, so OAuth refresh + never writes back to the host. `node_modules` is baked into the image; `src` is + bind-mounted so `tsx watch` hot-reloads code edits. Adding npm deps needs a rebuild. +- `agent-api`: the Python agent service, built from the current services dev Dockerfile + (`agenta-agent-api:dev`, a dedicated tag). Selects the HTTP harness via + `AGENTA_AGENT_PI_URL` and calls the sidecar in-network. Published on host port 8092. + +The Python -> Pi seam is now two adapters behind the same Harness port: `PiHarness` +(subprocess, local) and `PiHttpHarness` (HTTP, docker). `agent.py` picks by env. 
+ +Run and verify: + +```bash +docker compose -f services/agent/docker-compose.agent.yml up --build -d +curl localhost:8092/health +curl -s -X POST localhost:8092/agent/v0/invoke -H 'Content-Type: application/json' \ + -d '{"data":{"inputs":{"messages":[{"role":"user","content":"Hi, who are you?"}]}}}' +# -> 200, {"data":{"outputs":{"role":"assistant","content":"Hello from your friendly Docker agent!"}}, ...} +docker compose -f services/agent/docker-compose.agent.yml down # tear down +``` + +Note: do not reuse the stale `agenta-oss-dev-services:latest` image (Python 3.11, old SDK +without `route(app=...)`); the compose builds a fresh `agenta-agent-api:dev` from the +current Dockerfile instead. + +Known gaps / next steps: auth header is bypassed for local curl; streaming, multi-message +output, and tools; tracing across the boundary is being wired in (OTel deps + `agenta-otel.ts` +in the wrapper, `TraceContext` in the ports) and the HTTP path / OTLP target still need +finishing; registering `agenta:builtin:agent:v0` as a real workflow type + template (WP-6) +and pointing a real dev stack at the sidecar so it runs from the playground. + +--- + +Status: draft for review. Add inline comments anywhere. Decision points are marked +**[DECISION]** and have a recommended default. + +## Context + +Agenta runs prompt-style workflows today (completion, chat, LLM-as-a-judge). Each is a +Python FastAPI app exposing `/invoke` and `/inspect`, all mounted in one `services` +container (`services/entrypoints/main.py`). The backend and playground call a service by +POSTing a `WorkflowInvokeRequest` to `{serviceUrl}/invoke` and reading +`WorkflowBatchResponse.data.outputs` back. + +WP-2 adds a new kind of workflow: an agent. An agent runs a harness (Pi by default) that +drives a model over multiple turns. Pi is a TypeScript/Node SDK +(`@earendil-works/pi-coding-agent`, pinned `0.79.4`). It has no Python SDK. 
So the agent +service is a Node service, the first non-Python service in the dev stack. + +This work package builds only the service. It runs Pi locally (no Daytona), with hardcoded +config (AGENTS.md text, model, provider key from env). The goal is to stand up the right +ports and adapters even for the simplest MVP, so Daytona and other harnesses slot in later +without reshaping the service. + +Source: `wp-2-agent-service/README.md` and the research it links +(`research/pi-interaction.md`, `research/diskless-in-memory-config.md`). + +## What I confirmed in the codebase + +- All Python services run in one `services` container, each mounted at its own path and + exposing `/invoke` + `/inspect` (`services/entrypoints/main.py:135`). +- The chat handler takes `inputs`, `messages`, and `parameters` + (`services/oss/src/chat.py:18`). The routing decorator pulls these from the + `WorkflowInvokeRequest` envelope. +- The playground resolves `serviceUrl` from the workflow's `data.url` (or builds it from + `data.uri`) and POSTs directly from the browser to `{serviceUrl}/invoke` + (`web/packages/agenta-entities/src/workflow/state/runnableSetup.ts:246`). So the service + needs the same request/response shapes and CORS as the Python services + (`services/entrypoints/main.py:115`). +- The dev stack hot-reloads via bind mounts plus uvicorn `--reload`, and traefik routes + `PathPrefix(/services/)` after stripping the prefix + (`hosting/docker-compose/oss/docker-compose.dev.yml:351`). +- Research confirms Pi runs fully diskless through its SDK: in-memory auth, AGENTS.md, + model, and sessions (`research/diskless-in-memory-config.md`). + +## Scope + +In: +- A new Node/TypeScript service that exposes the Agenta `/invoke` contract directly. +- Drives Pi through its SDK (`createAgentSession`) in-process, config in memory. +- Hardcoded config: AGENTS.md text, model id, provider key from env. Config read from a + mounted file so it is editable and hot-reloads. 
+- Ports and adapters wired from the start (see Architecture). +- Dockerized with hot-reload, wired into the OSS dev compose and traefik. + +Out (later WPs, per the design doc): +- Daytona sandbox. The runtime adapter is the local process for now. +- Streaming and multi-message output. This cut returns the final assistant text as a + single `data.outputs`. +- Custom tools and skills. Stubbed for the first cut. +- Server-side config persistence. Config is passed in at startup. +- Other harnesses (Codex, Claude Code). Design the port for them, implement only Pi. + +## Architecture: ports and adapters + +The service is harness-agnostic at its core, with the two ports the design doc calls out. + +``` +HTTP layer (Fastify or Express): POST /invoke, POST /inspect, GET /health, CORS + | +Core (no Pi, no Daytona): + AgentRunner.run(config, messages, inputs) -> { output } + | | + Port: Harness Port: Runtime (environment) + setup(config) start() / shutdown() + invoke(messages, inputs) pause() / connectVolume() + stop() / shutdown() + | | + Adapter: PiSdkHarness Adapter: LocalRuntime + (createAgentSession, (in-process; the Node process + in-memory auth + AGENTS.md itself is the run environment) + + model, SessionManager + .inMemory()) [later: DaytonaRuntime in WP-3] + [later: PiRpcHarness] +``` + +- Harness port: the seam between our service and the agent engine. Pi is one + implementation. The MVP ships one adapter, `PiSdkHarness`. The doc also floats RPC and + JSON adapters; the port shape leaves room for `PiRpcHarness` later. + **[DECISION]** Drive Pi via the SDK in-process for the MVP (recommended: simplest for a + Node service, gives in-memory auth + AGENTS.md + model), rather than spawning `pi --mode + rpc`. +- Runtime port: the seam for the run environment (start, shutdown, pause, connect volume). + The MVP adapter is `LocalRuntime` (the Node process). `DaytonaRuntime` lands in WP-3 + behind the same port. 
+ +### PiSdkHarness (the MVP adapter) + +Per `research/diskless-in-memory-config.md`: +- `AuthStorage.inMemory()` + `setRuntimeApiKey(provider, key)` for the LLM key. +- `DefaultResourceLoader` with `noContextFiles: true` and `agentsFilesOverride` (or + `systemPromptOverride`) to inject AGENTS.md text in memory. +- `SessionManager.inMemory()`, `SettingsManager.inMemory()`, + `ModelRegistry.inMemory(auth)` so nothing persists. +- `model: getModel(provider, modelId)`. +- `TMPDIR` set to a tmpfs for Pi's bash output spillover (the one forced write). +- MVP run: `await session.prompt(text)`, then read the final assistant text from + `session.messages` (or the `agent_end` event). Return it as `data.outputs`. No + streaming. + +## HTTP contract (mirror chat) + +- `POST /invoke`: accept `{ data: { parameters, inputs }, references?, ... }`. Pull the + user message from `inputs`/`messages` the way chat does + (`services/oss/src/chat.py:18`). Return + `{ version, data: { outputs }, status: { code: 200 }, trace_id, span_id }`. +- `POST /inspect`: return the parameters/inputs schema. The MVP can return a minimal + static schema, enough for the backend inspect path. +- `GET /health`: `{ status: "ok" }`. +- CORS: allow the same origins as the Python services so the browser can call it directly. + +Auth note: the Python services verify an `Authorization: Secret {token}` header via SDK +middleware. The local MVP can accept the header without verifying it. Real verification is +a later concern. Flagging this as a known gap. + +## Repo placement and Docker + +- New Node project at `services/agent/`: own `package.json`, `tsconfig.json`, `src/` (with + `http/`, `core/`, `adapters/pi/`, `adapters/runtime/`), `config/` (the editable + AGENTS.md and model config), and `docker/Dockerfile.dev` + `docker/Dockerfile.gh`. +- Pin `@earendil-works/pi-coding-agent@0.79.4` and `@earendil-works/pi-ai@0.79.4`. +- Hot-reload: run with `tsx watch` (or `node --watch`). 
Bind-mount `services/agent/src` and + `services/agent/config`; keep `node_modules` in the image via an anonymous volume so the + host/container split does not break it. +- New compose service block in `hosting/docker-compose/oss/docker-compose.dev.yml` (model + the existing `services` block at line 351). Own port (for example 8090), traefik router + `PathPrefix(/agent/)` that strips the prefix, env_file for the provider key. +- The provider key (for example `OPENAI_API_KEY`) goes in the dev env file the compose + service reads. + +## Verification + +1. Bring up the OSS dev stack with the new service: + `./hosting/docker-compose/run.sh --oss --dev --build`. +2. `curl http://localhost/agent/health` returns ok. +3. `curl -X POST http://localhost/agent/invoke` with a chat-style body and a message; + confirm the response carries the agent reply in `data.outputs`. This is the core WP-2 + definition of done. +4. Edit `services/agent/config/AGENTS.md`; confirm the change is picked up without a + rebuild. +5. End-to-end demo (only if decided in scope below): register an agent workflow whose + `data.url` points at the agent service, open it in the playground, send a message, see + the output. + +## Decisions to confirm + +**[DECISION 1] Service shape.** Recommended: a pure Node service that speaks `/invoke` +directly (matches the doc, fewest moving parts). Alternative: a Python shim in the existing +services container that bridges to a Node Pi sidecar (reuses Agenta auth/tracing +middleware, adds a hop). +> Your call: We should use python then call ts for the moment. The Py provides authentication, middleware, and a bunch of things. + +**[DECISION 2] How far this iteration goes.** Option A: standalone service, verified by +curl (the true WP-2 definition of done). Option B: also wire the minimal end-to-end so you +can create an agent and run it in the playground (overlaps WP-6's workflow-type +registration). 
+> Your call: Let's start with the standalone service verified by curl + +**[DECISION 3] LLM key for Pi.** `.env.test.local` only has Agenta cloud creds, not a model +key. Pi needs a real provider key to run. Which provider and model for the hardcoded +"hello world" agent (for example OpenAI `gpt-4o-mini`)? Can you supply the key as an env +var for a live verification, or should I build without live verification for now? +> Your call: I have set up + +**[DECISION 4] Pi driving mode.** Recommended: SDK in-process. Alternative: `pi --mode rpc` +subprocess. SDK is simpler here and supports in-memory auth and AGENTS.md. +> Your call: +I have set up auth What's left — your one-time Pi login +`~/.pi/agent` doesn't exist yet, so no model is available. Pi can't reuse the `~/.codex` token directly; it needs its own login (same ChatGPT account, browser OAuth — I can't drive that for you): + +```bash +cd docs/design/agent-workflows/wp-1-pi-tracing/poc +pnpm exec pi # TUI opens +# type: /login → choose "ChatGPT Plus/Pro (Codex)" → finish browser OAuth → quit +pnpm start # runs the agent, exports the trace +``` + +(Or `export OPENAI_API_KEY=...` / `ANTHROPIC_API_KEY=...` instead of logging in.) + +After `pnpm start`, watch for `[agenta-otel] exporting spans to .../api/otlp/v1/traces` and `[run] flushed`, then open Agenta observability on the dev box and find the `invoke_agent` trace — verify the tree types correctly and the `chat` span carries model, latency, and token usage. + +Want me to wait while you log in, then I'll run it and verify the trace in Agenta together — or would you rather I add the Pi-native model-usage cost (`gen_ai.usage.cost`) display check to the verification while you do that? + + + Logged in to ChatGPT Plus/Pro (Codex Subscription). Selected gpt-5.5. 
Credentials saved + to /home/mahmoud/.pi/agent/auth.json diff --git a/docs/design/agent-workflows/wp-3-daytona-sandbox/README.md b/docs/design/agent-workflows/wp-3-daytona-sandbox/README.md new file mode 100644 index 0000000000..89a775f7e0 --- /dev/null +++ b/docs/design/agent-workflows/wp-3-daytona-sandbox/README.md @@ -0,0 +1,99 @@ +# WP-3: Daytona sandbox running Pi + +Status: **POC complete** against Daytona cloud (`target=eu`). See +[`poc/`](poc/README.md). Ran in parallel with WP-1 and WP-2. + +## Goal + +Prove the sandbox track end to end: create a Daytona sandbox with Pi installed, inject the +agent's files and secrets, run an agent, stream the output back, and tear down. This takes +the local Pi wrapper (WP-2) and shows it running inside a sandbox. The two can be developed +in parallel, since the Daytona lifecycle and image work do not depend on the wrapper being +finished. + +## What the POC established + +The POC ([`poc/`](poc/README.md)) does the full loop against Daytona cloud and answers the +key unknowns: + +- **Bake Pi into a snapshot.** `build_snapshot.py` builds `agenta-pi-harness` from + `node:22-bookworm` + Pi `0.79.4` + ripgrep/fd in ~26s. Daytona injects its toolbox daemon + into the custom image, so `process.exec` / `fs` / sessions work on a plain node base (no + need to layer on `daytonaio/sandbox`). +- **Cold start is sub-second warm.** Creating a sandbox from the prebuilt snapshot is + ~0.7-1.1s on a warm runner, with an occasional few-second spike when a runner pulls the + custom image cold. That beats installing Pi per run (npm install alone is ~3s). +- **Inject config + secret, run, stream, tear down.** `run_agent.py` lays an `AGENTS.md` + and a task file into a per-run dir, injects the provider credential (env var or uploaded + credential file), runs Pi headless in `--mode json`, streams the typed event lines, and + deletes the sandbox. The agent honored the injected `AGENTS.md` and used tools + (`read`, `read`, `write`). 
+- **Gotcha: Pi blocks on a trust prompt.** With an `AGENTS.md` in cwd, Pi asks to trust + project-local files and hangs in a non-interactive session. Pass `--approve` and run with + stdin from `/dev/null`. This was the main trap. + +Full findings, the measured numbers, and how to run it: [`poc/README.md`](poc/README.md). + +## Scope + +In: + +- Create a Daytona sandbox from the Python SDK (`pip install daytona`, + `Daytona` / `AsyncDaytona`): `create` -> `process.exec` / sessions -> `stop` -> `delete`. +- Bake Pi into a Daytona snapshot (declarative `Image` builder or Dockerfile) so runs skip + per-run `npm install`. Pre-install `rg` / `fd`. +- Inject files (`fs.upload_file` / `upload_files`) and secrets (`env_vars` at create, or + per-exec `env`). +- Run Pi headless and stream stdout/stderr back (session with `run_async=True`, + `get_session_command_logs_async`). +- Expose and use the port via `get_preview_link(port)` (the "works with our port" contract). +- One shared long-lived sandbox (`auto_stop_interval: 0`), per-run working directory plus a + per-run tmpfs for `TMPDIR`, bounded concurrency. + +Out: + +- Volume-per-execution. Not feasible in Daytona (volumes mount at create time only); use the + per-run dir + tmpfs approach instead. +- The provider abstraction for non-Daytona sandboxes. Keep the seam thin, but only implement + Daytona here. + +## Approach (grounded in research) + +See [`../research/daytona-sandbox.md`](../research/daytona-sandbox.md) and +[`../research/sandbox-sharing.md`](../research/sandbox-sharing.md). + +## Definition of done + +- [x] A script creates a sandbox from a Pi snapshot, injects an AGENTS.md and a provider + key, runs an agent, streams the multi-message output, and tears down cleanly. +- [x] Nothing invocation-specific is written to a persistent volume. No volume is mounted; + each run uses a per-run dir plus a `TMPDIR` inside it, and the sandbox is deleted at the + end. 
+- [x] Cold-start with the custom snapshot is measured and recorded (`poc/README.md`). + +## Open questions + +Answered by the POC: + +- Daytona cloud works end to end with the provided `eu` credentials; the node-base snapshot + gets a working toolbox; cold start from the prebuilt snapshot is sub-second warm. +- Secret injection has two working paths: `env_vars` at create (secret-as-env) and an + uploaded credential file via `fs.upload_file` (secret-as-file). + +Still open: + +- Self-hosted Daytona vs Daytona cloud (AGPL review if self-host-and-modify). POC used + cloud only. +- Whether an actively streaming session resets the auto-stop idle timer. Sidestepped with + `auto_stop_interval=0` and owning the lifecycle; not independently confirmed. +- Realistic safe parallel-run count for one small sandbox (needs load testing). +- The snapshot build/version pipeline: who builds and pins `agenta-pi-harness` per agent + revision, and where that runs (CI or config-publish time). + +## Links + +- [`poc/`](poc/README.md) — the working POC (build snapshot, run agent, bench cold start) +- [`../research/daytona-sandbox.md`](../research/daytona-sandbox.md) +- [`../research/sandbox-sharing.md`](../research/sandbox-sharing.md) +- [`../research/diskless-in-memory-config.md`](../research/diskless-in-memory-config.md) +- [Project README](../README.md) diff --git a/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/README.md b/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/README.md new file mode 100644 index 0000000000..452322d858 --- /dev/null +++ b/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/README.md @@ -0,0 +1,118 @@ +# WP-3 POC: run a Pi agent in a Daytona cloud sandbox + +Bakes Pi into a Daytona snapshot, then creates a sandbox from it, injects the agent's +credential and config, runs the agent headless, streams its multi-message output back, +and tears the sandbox down. Runs against **Daytona cloud** (`target=eu`). 
+ +This is the sandbox half of the agent runtime. It validates the `DaytonaRuntime` adapter +that WP-2 leaves behind its `Runtime` port (`start` -> create sandbox, inject config -> +lay down the per-run dir, `invoke` -> run Pi and stream, `shutdown` -> delete). + +## What's here + +- `build_snapshot.py` — bake Pi (+ ripgrep, fd) into the reusable `agenta-pi-harness` + snapshot so per-run cold start skips `npm install`. Run once. +- `run_agent.py` — the deliverable. Create -> inject -> run -> stream -> tear down. +- `bench_coldstart.py` — measure cold start, Pi snapshot vs the default image. +- `cleanup.py` — list sandboxes and delete leaked WP-3 ones. + +## Setup + +Needs `uv` and Daytona cloud credentials. Export them (the dev values live in +`hosting/docker-compose/ee/.env.ee.dev.local`): + +```bash +export DAYTONA_API_KEY=dtn_... +export DAYTONA_API_URL=https://app.daytona.io/api +export DAYTONA_TARGET=eu +``` + +Each script declares its own deps inline, so `uv run