From 689861535c56fea13b88b79b4053acbbd9cd7134 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Mon, 15 Jun 2026 21:59:24 +0200
Subject: [PATCH 01/10] feat(agent): Pi-backed agent workflow service,
 template, tracing, and docs

- New agent workflow service wrapping the Pi harness, served same-origin like
  chat/completion at /services/agent/v0 (Python service + Node Pi sidecar, ports/adapters).
- Builtin 'agent' app type + create-app template; config is model + AGENTS.md.
- /inspect chat schema and OpenTelemetry tracing into Agenta.
- EE dev compose agent-pi sidecar; design docs under docs/design/agent-workflows.
---
 docs/design/agent-workflows/README.md         |  123 ++
 .../agent-workflows/research/auth-secrets.md  |  441 ++++
 .../research/daytona-sandbox.md               |  482 +++++
 .../research/diskless-in-memory-config.md     |  460 ++++
 .../research/open-questions.md                |  312 +++
 .../research/otel-instrumentation.md          |  379 ++++
 .../research/pi-interaction.md                |  584 ++++++
 .../research/sandbox-sharing.md               |  359 ++++
 .../agent-workflows/wp-1-pi-tracing/README.md |   73 +
 .../integrating-the-tracing-extension.md      |  186 ++
 .../wp-1-pi-tracing/poc/.env.example          |    7 +
 .../wp-1-pi-tracing/poc/README.md             |   86 +
 .../wp-1-pi-tracing/poc/agenta-otel.ts        |  414 ++++
 .../wp-1-pi-tracing/poc/package.json          |   25 +
 .../wp-1-pi-tracing/poc/pnpm-lock.yaml        | 1842 +++++++++++++++++
 .../wp-1-pi-tracing/poc/run.ts                |  197 ++
 .../tracing-in-the-agent-service.md           |  113 +
 .../wp-2-agent-service/README.md              |  124 ++
 .../wp-2-agent-service/implementation-plan.md |  273 +++
 .../wp-3-daytona-sandbox/README.md            |   99 +
 .../wp-3-daytona-sandbox/poc/README.md        |  118 ++
 .../poc/bench_coldstart.py                    |   49 +
 .../poc/build_snapshot.py                     |   95 +
 .../wp-3-daytona-sandbox/poc/cleanup.py       |   43 +
 .../wp-3-daytona-sandbox/poc/run_agent.py     |  325 +++
 .../wp-4-multi-message-output/README.md       |   55 +
 .../wp-5-chat-vs-completion/README.md         |   51 +
 .../wp-6-workflow-type-and-template/README.md |   84 +
 .../agent-workflows/wp-7-tools/README.md      |  214 ++
 .../docker-compose/ee/docker-compose.dev.yml  |   36 +
 .../agenta/sdk/engines/running/interfaces.py  |   36 +
 .../agenta/sdk/engines/running/utils.py       |   25 +-
 services/agent/.dockerignore                  |    3 +
 services/agent/README.md                      |   73 +
 services/agent/config/AGENTS.md               |    7 +
 services/agent/config/agent.json              |    4 +
 services/agent/docker-compose.agent.yml       |   98 +
 services/agent/docker-compose.stack.yml       |   86 +
 services/agent/docker/Dockerfile.dev          |   28 +
 services/agent/package.json                   |   27 +
 services/agent/pnpm-lock.yaml                 | 1826 ++++++++++++++++
 services/agent/scripts/register_agent_app.py  |  166 ++
 services/agent/src/agenta-otel.ts             |  551 +++++
 services/agent/src/cli.ts                     |   44 +
 services/agent/src/runPi.ts                   |  231 +++
 services/agent/src/server.ts                  |   64 +
 services/agent/tsconfig.json                  |   16 +
 services/entrypoints/agent_main.py            |   47 +
 services/entrypoints/main.py                  |    2 +
 services/oss/src/agent.py                     |  140 ++
 services/oss/src/agent_pi/__init__.py         |   11 +
 services/oss/src/agent_pi/config.py           |   68 +
 services/oss/src/agent_pi/local_runtime.py    |   59 +
 services/oss/src/agent_pi/pi_harness.py       |   84 +
 services/oss/src/agent_pi/pi_http_harness.py  |   64 +
 services/oss/src/agent_pi/ports.py            |  121 ++
 services/oss/src/agent_pi/schemas.py          |   71 +
 .../components/CreateAppDropdown/index.tsx    |    6 +
 .../modals/CreateAppTypeModal/index.tsx       |    6 +
 .../pages/prompts/assets/iconHelpers.tsx      |    4 +-
 .../src/workflow/state/appUtils.ts            |    6 +-
 61 files changed, 11619 insertions(+), 4 deletions(-)
 create mode 100644 docs/design/agent-workflows/README.md
 create mode 100644 docs/design/agent-workflows/research/auth-secrets.md
 create mode 100644 docs/design/agent-workflows/research/daytona-sandbox.md
 create mode 100644 docs/design/agent-workflows/research/diskless-in-memory-config.md
 create mode 100644 docs/design/agent-workflows/research/open-questions.md
 create mode 100644 docs/design/agent-workflows/research/otel-instrumentation.md
 create mode 100644 docs/design/agent-workflows/research/pi-interaction.md
 create mode 100644 docs/design/agent-workflows/research/sandbox-sharing.md
 create mode 100644 docs/design/agent-workflows/wp-1-pi-tracing/README.md
 create mode 100644 docs/design/agent-workflows/wp-1-pi-tracing/integrating-the-tracing-extension.md
 create mode 100644 docs/design/agent-workflows/wp-1-pi-tracing/poc/.env.example
 create mode 100644 docs/design/agent-workflows/wp-1-pi-tracing/poc/README.md
 create mode 100644 docs/design/agent-workflows/wp-1-pi-tracing/poc/agenta-otel.ts
 create mode 100644 docs/design/agent-workflows/wp-1-pi-tracing/poc/package.json
 create mode 100644 docs/design/agent-workflows/wp-1-pi-tracing/poc/pnpm-lock.yaml
 create mode 100644 docs/design/agent-workflows/wp-1-pi-tracing/poc/run.ts
 create mode 100644 docs/design/agent-workflows/wp-1-pi-tracing/tracing-in-the-agent-service.md
 create mode 100644 docs/design/agent-workflows/wp-2-agent-service/README.md
 create mode 100644 docs/design/agent-workflows/wp-2-agent-service/implementation-plan.md
 create mode 100644 docs/design/agent-workflows/wp-3-daytona-sandbox/README.md
 create mode 100644 docs/design/agent-workflows/wp-3-daytona-sandbox/poc/README.md
 create mode 100644 docs/design/agent-workflows/wp-3-daytona-sandbox/poc/bench_coldstart.py
 create mode 100644 docs/design/agent-workflows/wp-3-daytona-sandbox/poc/build_snapshot.py
 create mode 100644 docs/design/agent-workflows/wp-3-daytona-sandbox/poc/cleanup.py
 create mode 100644 docs/design/agent-workflows/wp-3-daytona-sandbox/poc/run_agent.py
 create mode 100644 docs/design/agent-workflows/wp-4-multi-message-output/README.md
 create mode 100644 docs/design/agent-workflows/wp-5-chat-vs-completion/README.md
 create mode 100644 docs/design/agent-workflows/wp-6-workflow-type-and-template/README.md
 create mode 100644 docs/design/agent-workflows/wp-7-tools/README.md
 create mode 100644 services/agent/.dockerignore
 create mode 100644 services/agent/README.md
 create mode 100644 services/agent/config/AGENTS.md
 create mode 100644 services/agent/config/agent.json
 create mode 100644 services/agent/docker-compose.agent.yml
 create mode 100644 services/agent/docker-compose.stack.yml
 create mode 100644 services/agent/docker/Dockerfile.dev
 create mode 100644 services/agent/package.json
 create mode 100644 services/agent/pnpm-lock.yaml
 create mode 100644 services/agent/scripts/register_agent_app.py
 create mode 100644 services/agent/src/agenta-otel.ts
 create mode 100644 services/agent/src/cli.ts
 create mode 100644 services/agent/src/runPi.ts
 create mode 100644 services/agent/src/server.ts
 create mode 100644 services/agent/tsconfig.json
 create mode 100644 services/entrypoints/agent_main.py
 create mode 100644 services/oss/src/agent.py
 create mode 100644 services/oss/src/agent_pi/__init__.py
 create mode 100644 services/oss/src/agent_pi/config.py
 create mode 100644 services/oss/src/agent_pi/local_runtime.py
 create mode 100644 services/oss/src/agent_pi/pi_harness.py
 create mode 100644 services/oss/src/agent_pi/pi_http_harness.py
 create mode 100644 services/oss/src/agent_pi/ports.py
 create mode 100644 services/oss/src/agent_pi/schemas.py

diff --git a/docs/design/agent-workflows/README.md b/docs/design/agent-workflows/README.md
new file mode 100644
index 0000000000..7d5784dfc2
--- /dev/null
+++ b/docs/design/agent-workflows/README.md
@@ -0,0 +1,123 @@
+# Agent Workflows
+
+Status: context draft. Research and design to follow.
+
+## Summary
+
+Add a new workflow type to the backend: **agents**. Today the backend runs
+prompt-style workflows (completion, chat, LLM-as-a-judge). Agents are different. An
+agent runs inside a sandbox, executes tools over multiple turns, returns a multi-message
+output, and is instrumented end to end. Agents run on a **pi.dev** harness by default,
+and the same harness can run locally so a configuration pulled from the server behaves
+the same on a developer machine.
+
+This document only captures context. It does not propose a solution yet. The research
+topics have been written up in [`research/`](research/); their conclusions are summarized
+in [What the research established](#what-the-research-established).
+
+## What an agent is
+
+An agent is a configured, sandboxed, instrumented runtime that:
+
+- Boots a sandbox through startup hooks that lay down files and inject secrets.
+- Runs a harness (pi by default, configurable) that drives the model and its tools.
+- Produces a multi-message output rather than a single completion.
+- Carries a `session_id` so a run can be identified and, later, have its state stored.
+- Emits instrumentation through pi instruments for tracing and observability.
+
+## Agent configuration
+
+The agent configuration is what gets stored on the server, versioned as a workflow
+revision, and pulled down to run locally. It includes:
+
+- **`AGENTS.md`** — the agent's instructions.
+- **Skills** — the skills available to the agent.
+- **Model** — the model the agent runs on.
+- **Tools** — the tools the agent has access to.
+- **Files** — files that are part of the config and are laid into the sandbox by the
+  startup hook.
+- **Secrets** — for example an OpenAI key, injected into the sandbox by the startup
+  hook.
+- **Harness** — which harness runs the agent. Defaults to pi; configurable.
+
+## Runtime model
+
+- **Sandbox.** Agents run in a Daytona sandbox, or in a sandbox from any provider that works
+  with our port. The sandbox is initialized by startup hooks: file setup, then secrets setup.
+- **Harness.** The harness (pi by default) is the layer that exposes tools and drives the
+  agent loop. It is configurable per agent.
+- **Output.** A run returns multiple messages, not one completion.
+- **Instrumentation.** Runs are instrumented with pi instruments.
+- **Sessions.** Each run has a `session_id`. Future work adds session storage alongside
+  global storage so session state can persist across runs.
+
+## Local execution parity
+
+The same harness that runs server-side must run locally on pi.dev abstractions (tools and
+the rest). A user can pull an agent's configuration from the server and run it locally
+with the same behavior. Local-server parity is a first-class requirement, not an
+afterthought.
+
+## What the research established
+
+Full write-ups live in [`research/`](research/). The load-bearing conclusions:
+
+- **pi.dev is "Pi"**, an open-source TypeScript/Node agent harness by Earendil Inc. (MIT,
+  ~v0.79.4). It is local-first (a CLI/SDK/RPC, not a hosted service) and moves fast (0.x,
+  roughly weekly releases). There is no Python SDK.
+  See [`research/pi-interaction.md`](research/pi-interaction.md),
+  [`research/open-questions.md`](research/open-questions.md).
+- **Pi can run fully diskless.** Via the SDK's `createAgentSession`, AGENTS.md
+  (`systemPromptOverride`/`agentsFilesOverride`), skills (`skillsOverride`), tools
+  (`customTools`), LLM auth (`setRuntimeApiKey` / `AuthStorage.inMemory()` / env), and
+  session/settings/model state (`*.inMemory()`) are all in-memory. The only forced disk
+  write is bash output spillover to `os.tmpdir()`, redirected with `TMPDIR` to a per-run
+  tmpfs. See [`research/diskless-in-memory-config.md`](research/diskless-in-memory-config.md).
+- **"pi instruments" is not a product.** Pi emits no OTel by itself. Instrumentation is a
+  Pi extension on the `pi.on(...)` event bus that turns lifecycle events into OTLP spans.
+  Agenta already ingests OTLP at `POST /otlp/v1/traces` with adapters for GenAI semconv
+  and OpenInference, so `gen_ai.*` spans flow with little new backend code. Watch the
+  token-attribute drift (`input_tokens`/`output_tokens` vs the mapped
+  `prompt_tokens`/`completion_tokens`). See
+  [`research/otel-instrumentation.md`](research/otel-instrumentation.md).
+- **The harness seam is ours to build.** Pi's own "harness" concept is not a swap point
+  for Codex or Claude Code. The recommended shape is a thin TypeScript wrapper that drives
+  Pi's SDK with the in-memory overrides above and exposes our own protocol on a port. That
+  wrapper is the "works with our port" contract, the swappable-harness boundary, and the
+  local/server parity point. See [`research/auth-secrets.md`](research/auth-secrets.md).
+- **One shared sandbox is viable for v1.** Daytona supports one long-lived sandbox reused
+  across runs. It does not support swapping a volume per execution (volumes mount at create
+  time only). Per-run isolation comes from process memory plus a per-run tmpfs, not a
+  volume, which the diskless finding makes clean. Concurrency is contended, so bound it.
+  See [`research/sandbox-sharing.md`](research/sandbox-sharing.md),
+  [`research/daytona-sandbox.md`](research/daytona-sandbox.md).
+
+## POC work packages
+
+The POC runs as parallel tracks. Each has its own folder with scope and a definition of
+done. WP-1 and WP-2 run against a local Pi install first (no Daytona). WP-3 takes the
+sandbox path in parallel. WP-4 and WP-5 are design tasks that feed the WP-2 interface.
+WP-6 registers the agent as a backend workflow type and template, and defines its
+configuration and connection to the running agent. WP-7 adds runnable tools to the config.
+
+- [`wp-1-pi-tracing/`](wp-1-pi-tracing/README.md) — install Pi locally and send its agent
+  telemetry to Agenta as clean, structured traces.
+- [`wp-2-agent-service/`](wp-2-agent-service/README.md) — a new service that wraps Pi and
+  exposes a completion/chat-style interface, with auth and AGENTS.md set up in memory.
+- [`wp-3-daytona-sandbox/`](wp-3-daytona-sandbox/README.md) — create a Daytona sandbox with
+  Pi installed, inject files and secrets, run an agent, and stream output back.
+- [`wp-4-multi-message-output/`](wp-4-multi-message-output/README.md) — define how an
+  agent's multi-message output is shaped, streamed, stored, and surfaced.
+- [`wp-5-chat-vs-completion/`](wp-5-chat-vs-completion/README.md) — decide the interface
+  contract; start with chat that takes a single input.
+- [`wp-6-workflow-type-and-template/`](wp-6-workflow-type-and-template/README.md) — register
+  the agent as a new backend workflow type and template; define its config (model) and the
+  connection to the running agent.
+- [`wp-7-tools/`](wp-7-tools/README.md) — make runnable tools part of the agent config; resolve
+  Composio actions into Pi tools and route tool calls back through the existing
+  `POST /tools/call`, with MCP and workflow-as-tool as future adapters.
+
+## Related work
+
+- [`../prompt-runtime-unification/`](../prompt-runtime-unification/README.md) — the
+  prompt-side runtime, whose design already anticipated "future agent-style services".
diff --git a/docs/design/agent-workflows/research/auth-secrets.md b/docs/design/agent-workflows/research/auth-secrets.md
new file mode 100644
index 0000000000..b90af4ace5
--- /dev/null
+++ b/docs/design/agent-workflows/research/auth-secrets.md
@@ -0,0 +1,441 @@
+# Research: Auth and Secrets for the pi.dev Agent Harness
+
+Status: research only. No code changes. This file answers the five auth/secrets
+questions for the agent-workflows feature (see
+[`../README.md`](../README.md)). Every claim is cited. Items I could not verify
+from a primary source are marked **UNVERIFIED**.
+
+## Summary
+
+- **pi is a local CLI/SDK, not a hosted service.** "pi.dev" is the marketing and
+  docs site plus a package registry. There is no pi.dev account, no pi-issued API
+  key, and no pi-managed model gateway. You authenticate to *model providers*, not
+  to pi. ("Pi is a local coding agent. It runs with the permissions of the user
+  account that starts it." — `security.md`.)
+- **Provider auth is bring-your-own-key (BYOK) or provider OAuth.** pi reaches
+  OpenAI/Anthropic/etc. with the user's own provider keys, or with a provider's
+  subscription OAuth (Claude Pro/Max, ChatGPT Plus/Pro (Codex), GitHub Copilot).
+  Keys live in env vars or `~/.pi/agent/auth.json`. There is no pi gateway in the
+  middle, though pi can be *pointed at* a gateway you run (Cloudflare AI Gateway,
+  OpenShell inference routing, a corporate proxy).
+- **There is no first-class "secrets vault" in pi core.** pi has an *auth*
+  concept (provider credentials) and a flexible key-resolution syntax
+  (`$ENV`, `${ENV}`, `!shell-command`, literal). Anything beyond provider creds is
+  just environment variables / files the host process already has. The "named
+  secrets, scoped, agent-never-sees-the-value" feature surfaced in searches is a
+  set of **third-party community extensions** (e.g. `pi-secret-guard`,
+  `pi-secured-setup`, `pi-heimdall`, "Greywall"), not pi core.
+- **The Codex secret has two shapes.** (a) Keep pi as the harness and use pi's
+  native `openai-codex-responses` API + the built-in "ChatGPT Plus/Pro (Codex)"
+  OAuth login — the credential is a pi `OAuthCredentials` object in
+  `~/.pi/agent/auth.json`. (b) Swap the harness to the real **OpenAI Codex CLI**
+  (`codex exec`), in which case the "codex secret" is either an
+  `OPENAI_API_KEY`/`CODEX_API_KEY` value or a ChatGPT access token, materialized into
+  `~/.codex/auth.json` (or `$CODEX_HOME/auth.json`) before the headless run.
+- **For the Agenta feature: manage secrets in Agenta and inject them.** pi has no
+  vault to delegate to. Agenta should store secrets at rest (encrypted), then the
+  startup/secrets hook lays them into the sandbox as env vars and/or the right
+  auth file. pi's observability layer is already designed to keep keys/headers/
+  payloads out of traces by default — lean on that and verify it.
+
+## 1. pi.dev auth model
+
+### Authenticating to pi itself
+
+There is nothing to authenticate to. pi is installed locally (npm/pnpm/bun/curl)
+and runs as the local user. The only network calls pi makes on its own behalf are
+version/telemetry pings to `pi.dev`, which are opt-out:
+
+- `enableInstallTelemetry` -> `https://pi.dev/api/report-install`
+- version check -> `https://pi.dev/api/latest-version`
+- `PI_OFFLINE=1` / `--offline` disables all startup network ops;
+  `PI_SKIP_VERSION_CHECK=1` disables the version check; `PI_TELEMETRY=0` disables
+  the ping. (Source: `settings.md`, `usage.md`.)
+
+So "auth to pi.dev" is not a concept we need to model. There is no pi account,
+no pi org, no pi-issued token. (Source: `security.md`; `pi.dev` landing page.)
+
+### How pi authenticates to model providers
+
+Four mechanisms, with a defined precedence. From `sdk.md` (AuthStorage) and
+`providers.md`:
+
+1. CLI `--api-key <key>` flag (or SDK runtime override `setRuntimeApiKey`, not
+   persisted).
+2. `~/.pi/agent/auth.json` entry (API key **or** OAuth tokens). Stored with `0600`
+   perms. Auth-file entries take priority over env vars.
+3. Provider env var (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `GEMINI_API_KEY`, ...).
+4. Fallback resolver for custom-provider keys from `models.json`.
+
+`auth.json` is a flat object keyed by provider name. API-key shape
+(`providers.md`):
+
+```json
+{
+  "anthropic": { "type": "api_key", "key": "sk-ant-..." },
+  "openai":    { "type": "api_key", "key": "sk-..." }
+}
+```
+
+Provider **OAuth / subscription login** is also first-class. `/login` (interactive)
+supports Claude Pro/Max, **ChatGPT Plus/Pro (Codex)**, and GitHub Copilot. OAuth
+tokens auto-refresh and persist in the same `auth.json` as an `OAuthCredentials`
+object (`providers.md`, `custom-provider.md`):
+
+```ts
+interface OAuthCredentials {
+  refresh: string;   // refresh token
+  access: string;    // access token (what getApiKey() returns)
+  expires: number;    // ms epoch expiry
+}
+```
+
+So the answer to "pass-through provider keys, a pi-managed gateway, or both?" is:
+**pass-through only.** No pi-managed gateway exists. pi *can* be pointed at a
+gateway you operate — Cloudflare AI Gateway as a unified-billing/observability
+proxy ([issue #3850](https://github.com/earendil-works/pi/issues/3850)), a
+corporate proxy via `pi.registerProvider("openai", { baseUrl, headers })`
+(`custom-provider.md`), or OpenShell inference routing where the gateway injects
+upstream provider creds and the sandbox only sees `https://inference.local`
+(`containerization.md`). Those are *your* gateways, not pi's.
+
+## 2. Provider-key handling and the key-resolution syntax
+
+This matters because it is how a secret gets indirected instead of pasted as a
+literal. `apiKey`, custom header values, and `auth.json` `key` values share one
+resolution syntax (`providers.md`, `custom-provider.md`):
+
+- `!command` at the **start** of the value runs a shell command and uses its
+  output (e.g. `"!security find-generic-password -ws 'anthropic'"`, or
+  `"!op read 'op://vault/item/secret'"` for 1Password).
+- `$ENV_VAR` and `${ENV_VAR}` interpolate environment variables.
+- `$$` -> literal `$`; `$!` -> literal `!`.
+- Otherwise the value is a literal.
+
+Custom providers/proxies can carry secrets in headers using the same syntax:
+
+```ts
+pi.registerProvider("google", {
+  baseUrl: "https://ai-gateway.corp.com/google",
+  headers: { "X-Corp-Auth": "$CORP_AUTH_TOKEN" } // env var or literal
+});
+```
+
+Implication for Agenta: we do **not** have to write raw secrets into pi config
+files. We can inject env vars into the sandbox and reference them as `$VAR` in
+pi's `auth.json`/provider config, or reference a secrets manager via `!command`.
+
+## 3. Secrets concept + injection
+
+### Is there a first-class "secrets" feature in pi core? No.
+
+pi core has an **auth** concept (provider credentials, above) and project
+**trust** (an input-loading guard for `.pi/` resources, not a secret store —
+`security.md`). It does **not** ship a named-secret/vault/scoped-secret feature.
+The "secrets with a value + allowed host patterns, where the agent never sees the
+real value" model that searches surface is from **third-party extensions**, not
+Earendil:
+
+- `pi-secret-guard` — author **acarerdinc**, third-party. Scans `git commit`/
+  `git push` via the `tool_call` event and blocks if secrets are detected;
+  regex + LLM review. (Source: `https://pi.dev/packages/pi-secret-guard` package
+  page.) This is a *leak-prevention* tool, not a secret *store*.
+- `pi-secured-setup`, `pi-heimdall`, "Greywall" — third-party permission/redaction
+  layers (community blogs; **UNVERIFIED** beyond existence — treat as ecosystem
+  examples, not core).
+
+Conclusion: if Agenta wants named, scoped secrets, Agenta owns that. pi gives us
+the *injection surface* (env vars, files, `$ENV`/`!cmd` references), not a vault.
+
+### How secrets reach a pi run and the tools inside it
+
+Because pi runs as the local user with the local environment, **every secret a
+tool sees is whatever is in the process environment / filesystem of the pi
+process**. There is no per-tool secret broker in core. Built-in tools
+(`read`, `write`, `edit`, `bash`, `grep`, `find`, `ls`) and extension tools run
+"with the permissions of the pi process" (`security.md`). So a `bash` tool can
+read any env var or file the process can. Scope is the *process/sandbox boundary*,
+not a pi ACL.
+
+This is exactly why the Agenta design runs pi in a **sandbox** (Daytona) and uses
+**startup hooks** to lay down files then inject secrets — that sandbox *is* the
+secret-scoping boundary. pi's own docs say the same: for unattended/untrusted
+work, "run pi in a contained environment ... with only the files and credentials
+required for the task" and "pass the minimum required API keys or use short-lived
+credentials" (`security.md`, `containerization.md`).
+
+### Where to inject (three concrete options, all supported by pi)
+
+1. **Env vars in the sandbox** (simplest; matches pi's BYOK model). Set
+   `OPENAI_API_KEY` etc. in the sandbox env; pi resolves them via precedence rule
+   #3. The Docker example does exactly this: `docker run -e ANTHROPIC_API_KEY ...`
+   (`containerization.md`).
+2. **`~/.pi/agent/auth.json` file** laid into the sandbox (precedence #2, beats
+   env). Either literal keys or `$ENV`/`!cmd` indirection. Note the doc warning:
+   "Mounting your host `~/.pi/agent` exposes host auth and session files to the
+   container." For a sandbox we generate a fresh `auth.json`, we do not mount the
+   host's.
+3. **Gateway / inference routing** (strongest isolation): the sandbox calls
+   `https://inference.local` and a gateway injects the real provider key upstream,
+   so "OpenShell providers can keep raw model API keys outside the sandbox"
+   (`containerization.md`). This keeps the model key out of the sandbox entirely.
+
+### Scoping per-agent / per-session
+
+- **Per-agent**: each agent revision's secrets become that sandbox's env/auth
+  files. Different agent => different sandbox => different secret set. pi's
+  precedence model means a per-sandbox `auth.json` or per-sandbox env fully
+  determines what that agent can use.
+- **Per-session**: the SDK exposes `authStorage.setRuntimeApiKey(provider, key)`
+  (runtime override, **not persisted**) and a "custom auth storage location"
+  (`sdk.md`). A session can be given a short-lived key in memory without writing
+  it to disk — useful for per-`session_id` credentials that should not outlive the
+  run. **UNVERIFIED**: exact API for a fully custom per-session AuthStorage path
+  beyond `setRuntimeApiKey` and the "custom auth storage location" mention.
+
+## 4. The Codex secret (the swappable-harness question)
+
+The README says the harness is swappable and could run OpenAI Codex instead of
+pi's own loop. There are two genuinely different ways to do this, and the "codex
+secret" means something different in each.
+
+### Option A — keep pi as the harness, talk to the Codex backend through pi
+
+pi already speaks Codex natively. `custom-provider.md` lists an API type
+**`openai-codex-responses`** ("OpenAI Codex Responses API"), and `/login` offers
+**"ChatGPT Plus/Pro (Codex)"** OAuth login ("Officially endorsed by OpenAI: Codex
+for OSS", per `providers.md`). In this option:
+
+- The "codex secret" is just a pi credential: either an `OPENAI_API_KEY` (env or
+  `auth.json` `{"openai": {"type":"api_key","key":"..."}}`) for API-key access, or
+  a pi `OAuthCredentials` object for ChatGPT-subscription Codex access.
+- Injection is identical to any other pi provider (section 3). No separate Codex
+  install needed. This is the lowest-friction path and stays inside pi's
+  instrumentation/observability.
+
+### Option B — swap in the real OpenAI Codex CLI as the harness
+
+Here pi is replaced (or wrapped) by the `codex` CLI, run headless with
+`codex exec`. The "codex secret" is Codex's own credential. How Codex authenticates
+(OpenAI Codex docs):
+
+- **ChatGPT login (default)** when no valid session exists — interactive, browser
+  or device flow. Not suitable headless unless you transplant a token.
+- **API key** — recommended for "programmatic Codex CLI workflows, such as CI/CD
+  jobs" (`developers.openai.com/codex/auth`).
+- **Access token** — ChatGPT-workspace token for "trusted, non-interactive
+  workflows" (`developers.openai.com/codex/enterprise/access-tokens`).
+
+Credential storage: `~/.codex/auth.json` (plaintext) or an OS keyring, controlled
+by `cli_auth_credentials_store` = `file` | `keyring` | `auto`; the file lives
+under `CODEX_HOME` (default `~/.codex`). Treat `auth.json` "like a password"
+(`developers.openai.com/codex/auth`).
+
+Headless injection patterns:
+
+1. **Per-invocation API key (no persisted login):**
+   ```bash
+   CODEX_API_KEY=<api-key> codex exec --json "your task"
+   ```
+   Set it only for the single invocation, not as a job-level env var, "in workflows
+   that execute untrusted code" (`developers.openai.com/codex/noninteractive`).
+2. **Persisted API-key login (writes `auth.json`):**
+   ```bash
+   printenv OPENAI_API_KEY | codex login --with-api-key   # reads key from stdin
+   codex login status   # -> "Logged in using an API key - sk-proj-***ABCD1"
+   ```
+   (`developers.openai.com/codex/auth`, simplified.guide.) Note: setting the
+   `OPENAI_API_KEY` env var **alone does not persist a login** — you must run a
+   login command or use `CODEX_API_KEY` per invocation. A request to honor
+   `OPENAI_API_KEY` without writing `auth.json` was closed "not planned"
+   ([issue #5212](https://github.com/openai/codex/issues/5212)); the documented
+   workaround is a custom `[model_providers.*]` with `env_key = "OPENAI_API_KEY"`.
+3. **ChatGPT access token via stdin (subscription/workspace, headless):**
+   ```bash
+   printenv CODEX_ACCESS_TOKEN | codex login --with-access-token
+   ```
+   (`developers.openai.com/codex/auth`.)
+4. **Transplant a prepared `auth.json`** generated on a machine that did the
+   browser login, copied into `$CODEX_HOME/auth.json` in the sandbox (SSH/Docker
+   copy pattern; `developers.openai.com/codex/auth`).
+
+Custom-provider config (e.g. proxy/Azure) uses `config.toml` with `env_key` so the
+secret is never checked into the dotfile (`developers.openai.com/codex/config-advanced`):
+
+```toml
+model = "gpt-5.4"
+model_provider = "proxy"
+
+[model_providers.proxy]
+name = "OpenAI using LLM proxy"
+base_url = "http://proxy.example.com"
+env_key = "OPENAI_API_KEY"
+```
+
+Useful headless flags: `codex exec --json`, `--output-schema <path>`,
+`--ephemeral` (don't persist session files), `--skip-git-repo-check`,
+`--ignore-user-config`, `--sandbox <mode>` (`developers.openai.com/codex/noninteractive`,
+`/codex/cli/reference`).
+
+**Gotcha to design around:** Codex's API-key-via-env sign-in is blocked while a
+ChatGPT subscription login is active in the same `CODEX_HOME`
+([issue #3286](https://github.com/openai/codex/issues/3286)). For deterministic
+headless runs give each agent run a clean `CODEX_HOME` and exactly one credential
+mode.
+
+### Recommendation on the Codex secret
+
+Model a **harness-typed "codex secret"** in the agent config that can carry either
+(i) an OpenAI API key or (ii) a ChatGPT access token, plus a target mode. The
+startup/secrets hook then materializes it for whichever harness is selected:
+
+- pi harness, `openai-codex-responses` -> write to pi `auth.json` / env as the
+  `openai` credential.
+- Codex CLI harness -> either export `CODEX_API_KEY` for the single `codex exec`,
+  or render a fresh `$CODEX_HOME/auth.json`, or pipe a token to
+  `codex login --with-access-token`.
+
+This keeps the secret abstraction harness-agnostic and matches the README's
+"swappable harness" requirement.
+
+## 5. Security best practices
+
+### Keeping secrets out of logs / traces / instrumentation
+
+pi's observability design (`packages/agent/docs/observability.md`) already treats
+this as a first-class concern. pi emits structured lifecycle events
+(`pi.agent.prompt`, `pi.ai.provider.request`, `pi.agent.tool_call`, ...) that an
+adapter turns into OTel/Sentry spans. The doc defines an explicit allow/deny list:
+
+- **Safe by default** (emitted): provider, model, API id, session id, entry type,
+  tool name, status code, stop reason, token counts, costs, durations.
+- **Unsafe by default** (NOT emitted): prompts, completions, tool args, tool
+  results, shell output, file contents, provider request payloads, provider
+  response bodies, **API keys**, **headers**. "Content capture can be opt-in later
+  with explicit redaction hooks."
+
+So if Agenta maps pi observability events to its tracing/instrumentation, secrets
+in keys/headers/payloads are excluded by default. **Action for Agenta:** verify our
+adapter does not turn on content capture, and confirm we never log resolved
+`auth.json` values or the sandbox env. Also: the `before_provider_request` /
+`before_provider_payload` hooks can inspect/replace the outgoing payload, which is
+the right place to add redaction if we ever capture content
+(`packages/agent/docs/hooks.md`, `extensions.md`).
+
+Additional bleed paths to guard (pi-specific):
+
+- `!command` key resolution runs a shell; ensure the command itself does not echo
+  the secret to a place pi captures.
+- pi tools include `bash`; agent-run shell output is large and can contain secrets.
+  pi keeps tool/shell output out of traces by default, but if we surface the
+  multi-message agent output to users, scrub it.
+- Do not mount the host `~/.pi/agent` into the sandbox (would leak host
+  auth/sessions) — generate fresh files per sandbox (`containerization.md`).
+
+### Storage at rest
+
+pi stores provider creds in `~/.pi/agent/auth.json` at `0600`; pi core offers no OS
+keyring option (that is Codex's `cli_auth_credentials_store`, not pi).
+**For Agenta:** the agent config carries secrets that get versioned as a workflow
+revision, so they must be **encrypted at rest in Agenta's store**, not persisted in
+plaintext alongside the rest of the config, and decrypted only at injection time.
+pi gives no at-rest encryption beyond file perms, so this is Agenta's
+responsibility. Prefer short-lived/scoped credentials where the provider supports
+them (pi docs explicitly recommend this for sandboxed runs).
+
+### How secrets reach the sandbox: env vs file vs API
+
+Ranked by isolation:
+
+1. **Gateway / inference routing (best):** raw provider key stays *outside* the
+   sandbox; sandbox calls `inference.local`; gateway injects upstream
+   (`containerization.md`). Use when we don't want the model key in the sandbox at
+   all.
+2. **Mounted auth file** (`auth.json` / `$CODEX_HOME/auth.json`): file perms
+   `0600`, generated per run, removed on teardown. Can use `$ENV`/`!cmd`
+   indirection so the file holds a reference, not the literal.
+3. **Env vars (simplest, matches pi BYOK):** fine inside a per-run sandbox; avoid
+   job-level env in any context that runs untrusted code (Codex doc warning).
+
+In all cases the **sandbox is the scope**: one agent/session -> one sandbox -> one
+minimal credential set, torn down after the run.
+
+## Open questions
+
+- **Per-session custom AuthStorage in pi SDK.** `setRuntimeApiKey` (non-persisted)
+  and a "custom auth storage location" are documented in `sdk.md`, but the full
+  API for a per-`session_id` in-memory credential store is not spelled out.
+  Confirm against `@earendil-works/pi-agent-core` / `pi-coding-agent` types.
+- **Does Agenta want pi-harness Codex (`openai-codex-responses`) or the real Codex
+  CLI as the swappable harness?** They have different secret shapes and different
+  instrumentation stories (pi events vs Codex `--json` stream). Decide before
+  designing the "codex secret" type.
+- **Daytona secret primitives.** This file covers pi + Codex. Whether Daytona has
+  its own secret/env-injection API that the startup hook should use (vs writing
+  files/env ourselves) is out of scope here — covered by the Daytona research
+  topic in the README.
+- **Codex `CODEX_HOME` isolation per run.** Confirm we give each Codex-harness run
+  a clean `CODEX_HOME` to avoid the ChatGPT-vs-API-key conflict
+  ([issue #3286](https://github.com/openai/codex/issues/3286)).
+- **Third-party secret extensions.** `pi-secured-setup` / `pi-heimdall` /
+  "Greywall" exist but are **UNVERIFIED** as to maintenance and fit; do not depend
+  on them. If we want redaction, build it on the core `before_provider_*` hooks.
+- **pi's `enableAnalytics` / `trackingId`.** Opt-in analytics exists
+  (`PI_EXPERIMENTAL=1` setup). Confirm it is off in our sandbox image so nothing
+  leaves the box unexpectedly.
+
+## Sources
+
+pi.dev (Earendil) — primary:
+
+- pi.dev landing page — product overview, providers, modes: https://pi.dev
+- providers.md (auth.json, provider env vars, /login, OAuth, ChatGPT Plus/Pro
+  (Codex)): https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/providers.md
+- custom-provider.md (registerProvider, apiKey/header syntax,
+  `openai-codex-responses` API type, OAuthCredentials, authHeader):
+  https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/custom-provider.md
+- security.md (local trust boundary, no built-in sandbox, "minimum credentials"):
+  https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/security.md
+- containerization.md (Docker `-e` keys, Gondolin, OpenShell inference routing):
+  https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/containerization.md
+- settings.md (telemetry endpoints, PI_OFFLINE, analytics, sessionDir):
+  https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/settings.md
+- usage.md (env vars, /login, --api-key):
+  https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/usage.md
+- quickstart.md / index.md (subscription vs API-key first run):
+  https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/quickstart.md
+- extensions.md (events: session_start, tool_call, before_provider_request):
+  https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md
+- sdk.md (AuthStorage precedence, setRuntimeApiKey, custom auth storage):
+  https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md
+- packages/agent/docs/observability.md (safe/unsafe-by-default trace fields):
+  https://github.com/earendil-works/pi/blob/main/packages/agent/docs/observability.md
+- packages/agent/docs/hooks.md (before_provider_request/payload transform hooks):
+  https://github.com/earendil-works/pi/blob/main/packages/agent/docs/hooks.md
+- Cloudflare AI Gateway request (gateway is user-operated):
+  https://github.com/earendil-works/pi/issues/3850
+- pi-secret-guard package page (third-party, author acarerdinc):
+  https://pi.dev/packages/pi-secret-guard
+
+OpenAI Codex — primary:
+
+- Codex authentication (ChatGPT vs API key, auth.json, CODEX_HOME,
+  cli_auth_credentials_store, --with-api-key, --with-access-token):
+  https://developers.openai.com/codex/auth
+- Codex non-interactive (codex exec, CODEX_API_KEY, --ephemeral, --json, sandbox):
+  https://developers.openai.com/codex/noninteractive
+- Codex CLI reference (flags): https://developers.openai.com/codex/cli/reference
+- Codex advanced config (model_providers, env_key):
+  https://developers.openai.com/codex/config-advanced
+- Codex enterprise access tokens:
+  https://developers.openai.com/codex/enterprise/access-tokens
+- Issue #5212 (OPENAI_API_KEY without writing auth.json — closed not planned):
+  https://github.com/openai/codex/issues/5212
+- Issue #3286 (env API-key sign-in blocked when ChatGPT login active):
+  https://github.com/openai/codex/issues/3286
+
+Secondary / corroborating (not load-bearing):
+
+- simplified.guide Codex API-key login (codex login --with-api-key, login status):
+  https://www.simplified.guide/codex/api-key-login
+- Mario Zechner (pi author) build notes: https://mariozechner.at/posts/2025-11-30-pi-coding-agent/
diff --git a/docs/design/agent-workflows/research/daytona-sandbox.md b/docs/design/agent-workflows/research/daytona-sandbox.md
new file mode 100644
index 0000000000..df794d25c8
--- /dev/null
+++ b/docs/design/agent-workflows/research/daytona-sandbox.md
@@ -0,0 +1,482 @@
+# Daytona sandbox integration for agent workflows
+
+Research only. This file documents how the backend would programmatically create a
+Daytona sandbox, install and run the pi.dev harness inside it, lay down files, inject
+secrets, run the agent, stream output, and tear down. Every claim is cited. Items I could
+not confirm from a primary source are marked UNVERIFIED.
+
+Context: see [`../README.md`](../README.md). Agents run on a pi.dev harness inside a
+Daytona sandbox ("or any provider that works with our port"). Startup hooks lay down
+config files, then inject secrets.
+
+## Summary
+
+- Daytona is an open-source (AGPL 3.0) "secure and elastic infrastructure for running
+  AI-generated code." Sandboxes are isolated machines with their own kernel, filesystem,
+  and network. It advertises sandbox start "under 90ms from code to execution."
+  [README](https://github.com/daytonaio/daytona), [docs](https://www.daytona.io/docs/en/).
+- There is a first-class **Python SDK** (`pip install daytona`, package `daytona`, with
+  both sync `Daytona` and async `AsyncDaytona` clients), plus TypeScript, Go, Ruby, and
+  Java SDKs, a REST API, and a CLI.
+  [Python SDK](https://www.daytona.io/docs/en/python-sdk/),
+  [docs landing](https://www.daytona.io/docs/en/).
+- Lifecycle: `daytona.create(...)` → `sandbox.process.exec(...)` / sessions →
+  `sandbox.stop()` / `sandbox.delete()`. States are creating/started/stopping/stopped/
+  archiving/archived/deleting/deleted/error. Auto-stop (default 15 min), auto-archive
+  (default 7 days), and auto-delete (off by default) timers manage idle sandboxes.
+  [Sandboxes](https://www.daytona.io/docs/en/sandboxes/),
+  [SDK reference](https://www.daytona.io/docs/python-sdk/sync/daytona/).
+- **Installing pi**: best fit is to bake pi into a custom **snapshot** (reusable image
+  template) so cold start does not pay an `npm install`. Build the snapshot from a base
+  image plus install commands using the **declarative Image builder** or a Dockerfile, or
+  install pi at runtime via `npm i -g @earendil-works/pi-coding-agent` /
+  `curl -fsSL https://pi.dev/install.sh | sh`. pi runs headless in print/JSON/RPC modes.
+  [Snapshots](https://www.daytona.io/docs/en/snapshots/),
+  [Declarative builder](https://www.daytona.io/docs/en/declarative-builder/),
+  [pi README](https://github.com/earendil-works/pi/blob/main/packages/coding-agent/README.md).
+- **Files**: `sandbox.fs.upload_file` / `upload_files` (in-memory bytes → remote path),
+  plus `git` clone and mounted **volumes**. **Secrets/env**: `env_vars={...}` at create
+  time, `env={...}` per `exec`, baked `.env` in the image, or write a `.env`-style file
+  via the filesystem API. [File system](https://www.daytona.io/docs/en/file-system-operations/),
+  [SDK reference](https://www.daytona.io/docs/python-sdk/sync/daytona/).
+- **Streaming**: run the agent in a **session** with `run_async=True`, then stream
+  stdout/stderr through `get_session_command_logs_async(session_id, cmd_id, on_stdout,
+  on_stderr)`. This maps cleanly onto pi's multi-message output if pi runs in JSON/RPC
+  mode (each emitted JSON line is one log chunk). [Process execution](https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/process-code-execution.mdx).
+- **Ports / "works with our port"**: `sandbox.get_preview_link(port)` returns a public URL
+  `https://{port}-{sandboxId}.proxy.daytona.work` plus an auth `token` (sent as
+  `x-daytona-preview-token`). Any HTTP port 1–65535 can be previewed. This is the
+  provider-agnostic "port contract" the design alludes to.
+  [Preview](https://www.daytona.io/docs/en/preview/).
+- **Self-host**: yes, AGPL, via docker-compose (local) or a domain deployment behind
+  Caddy. Auth is API keys (`DAYTONA_API_KEY`, `X-Daytona-Organization-ID` for JWT) backed
+  by Dex/Auth0 OIDC. [OSS deployment](https://www.daytona.io/docs/en/oss-deployment/),
+  [API keys](https://www.daytona.io/docs/en/api-keys/).
+
+## Daytona SDK and lifecycle (Python, with code)
+
+### Install and client
+
+```bash
+pip install daytona     # package name: "daytona"; module import: "daytona"
+```
+
+```python
+from daytona import Daytona, DaytonaConfig
+
+# From env vars: DAYTONA_API_KEY, DAYTONA_API_URL, DAYTONA_TARGET
+daytona = Daytona()
+
+# Or explicit config
+daytona = Daytona(DaytonaConfig(
+    api_key="YOUR_API_KEY",
+    api_url="https://app.daytona.io/api",   # point at self-hosted URL for own infra
+    target="us",
+))
+```
+
+Async client (recommended for a FastAPI backend):
+
+```python
+from daytona import AsyncDaytona
+
+async with AsyncDaytona() as daytona:
+    sandbox = await daytona.create()
+```
+
+Source: [Python SDK](https://www.daytona.io/docs/en/python-sdk/),
+[API keys](https://www.daytona.io/docs/en/api-keys/).
+
+### Create / exec / stop / delete
+
+```python
+# Create (defaults: python language, 1 vCPU / 1GB RAM / 3GiB disk)
+sandbox = daytona.create()
+
+# Run a command
+resp = sandbox.process.exec("echo 'Hello, World!'")
+print(resp.result)
+
+# Stop, then delete (method names per SDK reference and sandboxes doc)
+sandbox.stop()
+sandbox.delete()
+```
+
+`Daytona.create()` signatures (note the default 60s creation timeout):
+
+```python
+create(params: CreateSandboxFromSnapshotParams | None = None,
+       *, timeout: float = 60) -> Sandbox
+
+create(params: CreateSandboxFromImageParams | None = None,
+       *, timeout: float = 60,
+       on_snapshot_create_logs: Callable[[str], None] | None = None) -> Sandbox
+```
+
+`Sandbox` exposes submodules: `process`, `fs` / `file_system`, `git`, `object_storage`,
+`volume`. Source: [SDK reference](https://www.daytona.io/docs/python-sdk/sync/daytona/),
+[Sandboxes](https://www.daytona.io/docs/en/sandboxes/).
+
+### Creation params (the important fields)
+
+`CreateSandboxFromSnapshotParams` and `CreateSandboxFromImageParams` both inherit
+`CreateSandboxBaseParams`:
+
+- `snapshot: str` (snapshot params) or `image: str | Image` (image params)
+- `resources: Resources | None` — only on the image params variant
+- `name`, `language` (default `"python"`), `os_user`
+- `env_vars: dict[str, str] | None` — **environment variables in the sandbox**
+- `labels: dict[str, str] | None`
+- `public: bool | None`
+- `timeout: float | None`
+- `auto_stop_interval: int | None` — minutes; default 15; `0` disables
+- `auto_archive_interval: int | None` — minutes; default 7 days; `0` = max
+- `auto_delete_interval: int | None` — minutes; off by default; `0` deletes immediately
+- `volumes: list[VolumeMount] | None`
+- `network_block_all: bool | None`, `network_allow_list: str | None` (CIDRs)
+- `ephemeral: bool | None` — sets `auto_delete_interval=0` when True
+- `linked_sandbox: str | None`
+
+Source: [SDK reference](https://www.daytona.io/docs/python-sdk/sync/daytona/).
+
+## Installing pi (image / snapshot strategy)
+
+pi.dev (the "pi coding agent") is a minimal, swappable agent harness. Install options
+([pi README](https://github.com/earendil-works/pi/blob/main/packages/coding-agent/README.md)):
+
+```bash
+npm install -g --ignore-scripts @earendil-works/pi-coding-agent
+# or
+curl -fsSL https://pi.dev/install.sh | sh
+```
+
+Three install strategies, in order of recommendation for the agent loop:
+
+### 1. Prebuilt snapshot (recommended)
+
+A **snapshot** is a reusable sandbox template built from a Docker/OCI image. Bake pi (and
+Node) into it once, reuse for every run, and you avoid paying `npm install` on each cold
+start. [Snapshots](https://www.daytona.io/docs/en/snapshots/).
+
+```python
+from daytona import Daytona, CreateSnapshotParams, Image, Resources
+
+daytona = Daytona()
+
+image = (
+    Image.base("node:22-bookworm")
+    .run_commands("npm install -g --ignore-scripts @earendil-works/pi-coding-agent")
+    .workdir("/home/daytona")
+)
+
+daytona.snapshot.create(
+    CreateSnapshotParams(
+        name="agenta-pi-harness",
+        image=image,
+        resources=Resources(cpu=2, memory=4, disk=8),
+    ),
+    on_logs=print,   # build logs
+)
+```
+
+Then create sandboxes from it (fast path):
+
+```python
+from daytona import CreateSandboxFromSnapshotParams
+
+sandbox = daytona.create(
+    CreateSandboxFromSnapshotParams(snapshot="agenta-pi-harness")
+)
+```
+
+CLI equivalents: `daytona snapshot create <name> --image <image>`,
+`daytona snapshot create <name> --dockerfile ./Dockerfile`,
+`daytona snapshot push <local-image> --name <name>`, `daytona snapshot list|activate|delete`.
+
+### 2. Declarative Image built on demand
+
+Pass an `Image` object straight to `create()` and Daytona builds it on the fly. Good for
+iteration, slower than a prebuilt snapshot on first use.
+[Declarative builder](https://www.daytona.io/docs/en/declarative-builder/).
+
+```python
+from daytona import CreateSandboxFromImageParams, Image
+
+image = (
+    Image.debian_slim("3.12")
+    .run_commands(
+        "apt-get update && apt-get install -y curl",
+        "curl -fsSL https://pi.dev/install.sh | sh",
+    )
+    .add_local_file("AGENTS.md", "/home/daytona/AGENTS.md")  # config files
+    .env({"PI_HOME": "/home/daytona/.pi"})
+    .workdir("/home/daytona")
+)
+
+sandbox = daytona.create(
+    CreateSandboxFromImageParams(image=image),
+    timeout=0,                      # 0 = no timeout while the image builds
+    on_snapshot_create_logs=print,  # stream build logs
+)
+```
+
+Builder methods available: `Image.debian_slim(py_ver)`, `Image.base(ref)`,
+`Image.from_dockerfile(path)`, `.pip_install([...])`,
+`.pip_install_from_requirements(path)`, `.pip_install_from_pyproject(path, ...)`,
+`.run_commands(...)`, `.env({...})`, `.workdir(path)`, `.add_local_file(src, dst)`,
+`.add_local_dir(src, dst)`, `.dockerfile_commands([...])`.
+
+### 3. Install at runtime
+
+Create a plain sandbox, then `sandbox.process.exec("npm i -g @earendil-works/pi-coding-agent")`.
+Simplest but pays install latency on every run; only sensible for prototyping.
+
+Note on local parity (design requirement): the same `@earendil-works/pi-coding-agent`
+package and `AGENTS.md` / skills layout work identically on a developer machine, so a
+config pulled from the server runs the same locally. pi resolves `AGENTS.md` from
+`~/.pi/agent/AGENTS.md` (global), parent dirs, and cwd; skills live in
+`~/.pi/agent/skills/`, `.pi/skills/`, or project dirs.
+[pi README](https://github.com/earendil-works/pi/blob/main/packages/coding-agent/README.md).
+
+## Files + secrets injection
+
+Order matches the design's startup hooks: files first, secrets second.
+
+### Files into the sandbox
+
+In-memory upload (no local temp file needed — good for config blobs pulled from the DB):
+
+```python
+# Single file: source bytes -> remote path
+sandbox.fs.upload_file(agents_md_bytes, "/home/daytona/AGENTS.md")
+
+# Bulk
+from daytona import FileUpload
+sandbox.fs.upload_files([
+    FileUpload(source=agents_md_bytes, destination="/home/daytona/AGENTS.md"),
+    FileUpload(source=skill_bytes,     destination="/home/daytona/.pi/agent/skills/x/SKILL.md"),
+])
+
+sandbox.fs.create_folder("/home/daytona/.pi/agent/skills", "755")
+sandbox.fs.set_file_permissions("/home/daytona/AGENTS.md", "644")
+```
+
+Source: [File system operations](https://www.daytona.io/docs/en/file-system-operations/).
+
+Other ways to get files in: `sandbox.git` clone; mounted **volumes** (`VolumeMount`,
+shared persistent storage); baking files into the image with `.add_local_file` /
+`.add_local_dir`. [Volumes](https://www.daytona.io/docs/en/volumes/) (UNVERIFIED on exact
+volume API surface; listed in SDK submodules and snapshots doc).
+
+### Secrets / env vars
+
+Several layers, pick by sensitivity and lifetime:
+
+```python
+# A) Whole-sandbox env at creation
+sandbox = daytona.create(CreateSandboxFromSnapshotParams(
+    snapshot="agenta-pi-harness",
+    env_vars={"OPENAI_API_KEY": "sk-...", "ANTHROPIC_API_KEY": "sk-ant-..."},
+))
+
+# B) Per-command env (scoped to one exec)
+sandbox.process.exec("echo $CUSTOM_SECRET", env={"CUSTOM_SECRET": "DAYTONA"})
+
+# C) Write a .env file via the filesystem API, then have pi/harness read it
+sandbox.fs.upload_file(b"ANTHROPIC_API_KEY=sk-ant-...\n", "/home/daytona/.env")
+```
+
+`env_vars` is a field on `CreateSandboxBaseParams`
+([SDK reference](https://www.daytona.io/docs/python-sdk/sync/daytona/)); per-exec `env`
+is shown in [process execution](https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/process-code-execution.mdx).
+pi reads provider keys from standard env vars (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`,
+etc.), so `env_vars` at create time is the cleanest secret injection path
+([pi README](https://github.com/earendil-works/pi/blob/main/packages/coding-agent/README.md)).
+The OpenClaw guide confirms the same pattern: extra keys (e.g. `ANTHROPIC_API_KEY`) added
+to `.env.sandbox` are loaded into the sandbox
+([OpenClaw guide](https://www.daytona.io/docs/en/guides/openclaw/openclaw-sdk-sandbox/)).
+
+Daytona also has a server-side **secrets** concept (scoped secret injection) referenced in
+its security program, but I did not find a dedicated public SDK method for an
+organization secret vault; treat that as UNVERIFIED and prefer `env_vars` for now.
+[SECURITY.md](https://github.com/daytonaio/daytona/blob/main/SECURITY.md).
+
+## Process exec + streaming + ports
+
+### One-shot exec
+
+```python
+resp = sandbox.process.exec("pi -p 'analyze repo'", cwd="/home/daytona", timeout=600)
+print(resp.result)   # buffered stdout; returned after the command finishes
+```
+
+`exec` supports `cwd`, `env`, and `timeout`.
+[process execution](https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/process-code-execution.mdx).
+
+### Long-running agent + live stdout/stderr streaming (the agent loop)
+
+Run the harness async inside a **session** and stream both streams via callbacks:
+
+```python
+import asyncio
+from daytona import SessionExecuteRequest
+
+session_id = "agent-run-<session_id>"
+sandbox.process.create_session(session_id)
+
+command = sandbox.process.execute_session_command(
+    session_id,
+    SessionExecuteRequest(
+        command="pi --mode json -p 'do the task'",
+        run_async=True,
+    ),
+)
+
+logs_task = asyncio.create_task(
+    sandbox.process.get_session_command_logs_async(
+        session_id,
+        command.cmd_id,
+        lambda chunk: handle_stdout(chunk),   # each chunk = pi JSON line(s)
+        lambda chunk: handle_stderr(chunk),
+    )
+)
+
+# Optional interactive input back into the process
+sandbox.process.send_session_command_input(session_id, command.cmd_id, "y")
+
+await logs_task
+```
+
+This is the recommended shape for the multi-message agent output: run pi in
+`--mode json` (or `--mode rpc`), and each emitted JSON line becomes a streamed log chunk
+the backend forwards to the client. pi's JSON/RPC event stream emits typed events
+(`agent_start`, `message_update` with `text_delta`, `tool_execution_start/update/end`,
+`agent_end`), so the backend can map each event to an agent message / tool span for
+tracing. RPC framing is strict LF-delimited JSONL — split on `\n` only.
+Sources: [process execution](https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/process-code-execution.mdx),
+[pi RPC](https://github.com/badlogic/pi-mono/blob/main/packages/coding-agent/docs/rpc.md),
+[pi README](https://github.com/earendil-works/pi/blob/main/packages/coding-agent/README.md).
+
+pi mode summary for headless use:
+- `pi -p "<prompt>"` — print mode, runs once and exits (buffered text).
+- `pi --mode json` — same as print but emits all events as JSON lines (best for parsing).
+- `pi --mode rpc` — bidirectional JSONL over stdin/stdout; send
+  `{"type":"prompt","message":"..."}`, receive `response` + streamed events; supports
+  `steer` / `followUp` mid-run, `get_state`, `fork`, `switch_session`.
+- Flags: `--provider`, `--model` (or `--model anthropic/claude-opus`), `--name`,
+  `--no-session`.
+[pi RPC](https://github.com/badlogic/pi-mono/blob/main/packages/coding-agent/docs/rpc.md).
+
+### Ports / preview ("works with our port")
+
+If the harness or a tool serves HTTP, expose it with a preview link:
+
+```python
+preview = sandbox.get_preview_link(3000)
+print(preview.url)    # https://3000-<sandboxId>.proxy.daytona.work
+print(preview.token)  # send as header: x-daytona-preview-token
+```
+
+Any HTTP port 1–65535 is previewable; the port opens automatically if closed. For private
+sandboxes the `token` is required (header `x-daytona-preview-token`), and the token resets
+when the sandbox restarts, so re-fetch after a restart. This preview/port mechanism is the
+provider-agnostic "port contract" the design refers to. A self-hosted deployment serves
+the equivalent under `*.proxy.<yourdomain>`.
+[Preview](https://www.daytona.io/docs/en/preview/),
+[Preview & auth](https://www.daytona.io/docs/en/preview-and-authentication/).
+
+## Cold start, lifecycle states, timeouts, limits
+
+- **Cold start:** advertised "under 90ms from code to execution"
+  ([README](https://github.com/daytonaio/daytona)). UNVERIFIED how that interacts with
+  on-demand image builds; a *prebuilt snapshot* should hit the fast path, whereas building
+  a declarative `Image` on first `create()` is a separate, slower one-time build.
+- **States:** creating, started, stopping, stopped, archiving, archived, deleting,
+  deleted, error. Archived preserves state cheaply (on object storage); restarting from
+  archived is slower than from stopped. [Sandboxes](https://www.daytona.io/docs/en/sandboxes/).
+- **Timeouts / timers:**
+  - `create(..., timeout=60)` default 60s creation timeout (use `timeout=0` for builds).
+  - `auto_stop_interval`: default **15 min** of inactivity → stop; `0` disables.
+  - `auto_archive_interval`: default **7 days** stopped → archive; `0` = max (30 days).
+  - `auto_delete_interval`: **disabled by default**; `0` = delete immediately on stop;
+    `-1` disables. `ephemeral=True` sets it to 0.
+  [SDK reference](https://www.daytona.io/docs/python-sdk/sync/daytona/),
+  [Sandboxes](https://www.daytona.io/docs/en/sandboxes/).
+- **Resources:** default **1 vCPU / 1GB RAM / 3GiB disk**; per-sandbox org max
+  **4 vCPU / 8GB RAM / 10GB disk**. Set via `Resources(cpu=2, memory=4, disk=8)` on the
+  from-image path. [Sandboxes](https://www.daytona.io/docs/en/sandboxes/).
+
+Implication for an agent loop: a long agent run can hit the 15-min auto-stop unless you
+raise `auto_stop_interval` or keep the session active; set it explicitly for runs expected
+to exceed 15 minutes, and call `delete()` or use `ephemeral=True` to guarantee teardown.
+
+## Self-host + auth
+
+- **Self-hostable:** yes. AGPL 3.0; "free to deploy and run in any environment,"
+  community-supported. If you modify it and expose it over a network, AGPL requires releasing
+  your modifications. [OSS deployment](https://www.daytona.io/docs/en/oss-deployment/).
+- **Deploy modes:** local docker-compose, or a domain deployment behind Caddy (TLS, DNS
+  provider token, ports 80/443/2222, 4GB+ RAM). Components: API (3000, dashboard + REST),
+  Proxy (4000, preview routing), SSH Gateway (2222), PostgreSQL, Redis, Dex (OIDC),
+  Registry, MinIO (S3-compatible storage).
+  ```bash
+  git clone https://github.com/daytonaio/daytona
+  docker compose -f docker/docker-compose.yaml up -d   # http://localhost:3000
+  # or: ./scripts/setup-domain-oss-deployment.sh        # guided domain + TLS setup
+  ```
+  Local default login: `dev@daytona.io` / `password` (Dex). Domain setup generates
+  `ENCRYPTION_KEY`, `ENCRYPTION_SALT`, `PROXY_API_KEY`, `RUNNER_API_KEY`,
+  `SSH_GATEWAY_API_KEY`. Auth0 OIDC is an optional alternative.
+  [OSS deployment](https://www.daytona.io/docs/en/oss-deployment/).
+- **Auth model (API):** API keys created in the Dashboard or via the API; SDK/CLI read
+  `DAYTONA_API_KEY` (and `DAYTONA_API_URL` to point at self-hosted). JWT-authenticated
+  requests additionally need `X-Daytona-Organization-ID`. For self-host, set
+  `api_url` / `DAYTONA_API_URL` to your deployment.
+  [API keys](https://www.daytona.io/docs/en/api-keys/).
+
+## Open questions
+
+- **Snapshot build pipeline ownership.** Who builds/owns the `agenta-pi-harness` snapshot
+  and how is it pinned/versioned per agent revision? Building a declarative `Image` on the
+  hot path is slow; we likely need a prebuild step in CI or at config-publish time.
+- **Cold start with custom image.** The "<90ms" figure is for sandbox start; the
+  first-time build of a custom image/snapshot is separate and unmeasured here. UNVERIFIED:
+  start time from a *prebuilt* pi snapshot vs. the default image.
+- **pi output → Agenta tracing mapping.** Which pi events (`message_update`,
+  `tool_execution_*`) map to Agenta's multi-message output and pi-instruments tracing, and
+  whether RPC mode (bidirectional, supports steering) or JSON print mode is the better fit
+  for our streaming endpoint. RPC's "bash output appears in context on the *next* prompt"
+  semantics need design attention.
+- **Secrets vault.** Whether Daytona exposes a real scoped-secret API beyond `env_vars`
+  (referenced in SECURITY.md but no public SDK method found). For now `env_vars` at
+  create time. UNVERIFIED.
+- **Provider abstraction.** The design says "any provider that works with our port." The
+  Daytona preview-URL/port + token model is concrete; a sandbox-provider interface would
+  need to abstract create/exec/stream/preview across providers (e.g. E2B, Modal). Out of
+  scope here but the port + streaming-logs contract is the seam.
+- **Volume API surface.** Exact `VolumeMount` / `daytona.volume` Python API not fully
+  confirmed here. UNVERIFIED.
+- **Long-run auto-stop.** Confirm whether an actively streaming session resets the
+  `auto_stop_interval` idle timer or whether we must raise it explicitly. UNVERIFIED.
+
+## Sources
+
+- Daytona docs landing — https://www.daytona.io/docs/en/
+- Daytona GitHub (README, license, "<90ms") — https://github.com/daytonaio/daytona
+- Python SDK overview — https://www.daytona.io/docs/en/python-sdk/
+- Python SDK reference (params, fields, create signatures) — https://www.daytona.io/docs/python-sdk/sync/daytona/
+- Sandboxes (lifecycle, states, resources, timers) — https://www.daytona.io/docs/en/sandboxes/
+- Snapshots (custom images, CLI) — https://www.daytona.io/docs/en/snapshots/
+- Declarative builder (Image API) — https://www.daytona.io/docs/en/declarative-builder/
+- Process & code execution (exec, sessions, async log streaming) — https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/process-code-execution.mdx
+- File system operations (upload/download/permissions) — https://www.daytona.io/docs/en/file-system-operations/
+- Preview / ports / token — https://www.daytona.io/docs/en/preview/
+- Preview & authentication — https://www.daytona.io/docs/en/preview-and-authentication/
+- OSS deployment (self-host, components, auth) — https://www.daytona.io/docs/en/oss-deployment/
+- API keys (auth model) — https://www.daytona.io/docs/en/api-keys/
+- SECURITY.md (secrets management mention) — https://github.com/daytonaio/daytona/blob/main/SECURITY.md
+- OpenClaw-in-sandbox guide (agent + secrets + preview pattern) — https://www.daytona.io/docs/en/guides/openclaw/openclaw-sdk-sandbox/
+- pi.dev landing — https://pi.dev , https://pi.dev/docs/latest
+- pi coding-agent README (install, modes, AGENTS.md, skills) — https://github.com/earendil-works/pi/blob/main/packages/coding-agent/README.md
+- pi RPC protocol doc (JSONL events, streaming) — https://github.com/badlogic/pi-mono/blob/main/packages/coding-agent/docs/rpc.md
+- pi npm package — https://www.npmjs.com/package/@earendil-works/pi-coding-agent
diff --git a/docs/design/agent-workflows/research/diskless-in-memory-config.md b/docs/design/agent-workflows/research/diskless-in-memory-config.md
new file mode 100644
index 0000000000..a4f13732ca
--- /dev/null
+++ b/docs/design/agent-workflows/research/diskless-in-memory-config.md
@@ -0,0 +1,460 @@
+# Pi agent harness: diskless / in-memory config
+
+Research target: Pi coding agent (pi.dev, Earendil Inc.), npm
+`@earendil-works/pi-coding-agent`, verified against version **0.79.4** (matches the
+latest version reported by `npm view`). All signatures below are quoted from the published
+package's TypeScript declaration files (`dist/**/*.d.ts`), the compiled JS
+(`dist/**/*.js`), the bundled SDK examples (`examples/sdk/*.ts`), and the dependency
+`@earendil-works/pi-ai@0.79.4`. Source URLs are in the Sources section.
+
+## Summary / net answer
+
+**Yes — Pi can run fully diskless with all invocation-specific data in process memory.**
+Every invocation-specific input we care about has a confirmed in-memory path:
+
+- **System prompt / AGENTS.md**: pass as in-memory strings via `DefaultResourceLoader`
+  (`systemPrompt` / `systemPromptOverride`, `appendSystemPrompt` /
+  `appendSystemPromptOverride`, `agentsFilesOverride`). No file required.
+- **Skills**: register in-memory `Skill` objects via `skillsOverride`, or point at an
+  arbitrary directory via `additionalSkillPaths`. No fixed disk convention required.
+- **Provider auth**: `AuthStorage.inMemory()` + `setRuntimeApiKey(provider, key)` (not
+  persisted), or per-provider env vars. Both confirmed disk-free.
+- **Custom tools**: defined in-process via `customTools: ToolDefinition[]` /
+  `defineTool(...)` or `pi.registerTool(...)` in an inline `extensionFactories` function.
+  No file.
+- **Sessions/state**: `SessionManager.inMemory()` writes nothing.
+  `SettingsManager.inMemory()` and `ModelRegistry.inMemory()` likewise avoid disk.
+
+The one thing that is **not** purely in-memory is bash/tool **output spillover**: when a
+bash command (or a tool using the output accumulator) exceeds an in-memory byte
+threshold, Pi spills the tail to a temp file under `os.tmpdir()`. This is the only
+unavoidable write in a headless run that uses the bash/grep/find tools. Point `TMPDIR`
+at a tmpfs (or make `/tmp` tmpfs) and it never touches a persistent volume.
+
+If you drive Pi via the **SDK** (`createAgentSession`) rather than the CLI, you also avoid
+startup migrations and the CLI's `agentDir` touches entirely. If you drive it via
+`pi --mode rpc`/`--print` (the `main()` CLI entrypoint), redirect `agentDir` and
+`sessionDir` to tmpfs and pass `--no-session`.
+
+---
+
+## Per-question findings
+
+### 1. System prompt / AGENTS.md in memory — CONFIRMED in-memory
+
+The system prompt and AGENTS.md content are supplied through the `ResourceLoader`, not
+through top-level `createAgentSession` options. `DefaultResourceLoaderOptions` exposes
+both direct values and override callbacks (quoted from
+`dist/core/resource-loader.d.ts`):
+
+```typescript
+export interface DefaultResourceLoaderOptions {
+    cwd: string;
+    agentDir: string;
+    ...
+    noContextFiles?: boolean;          // disable AGENTS.md discovery from disk
+    systemPrompt?: string;             // in-memory base system prompt
+    appendSystemPrompt?: string[];     // in-memory appended instructions
+    ...
+    agentsFilesOverride?: (base: {
+        agentsFiles: Array<{ path: string; content: string }>;
+    }) => { agentsFiles: Array<{ path: string; content: string }> };
+    systemPromptOverride?: (base: string | undefined) => string | undefined;
+    appendSystemPromptOverride?: (base: string[]) => string[];
+}
+```
+
+The `ResourceLoader` interface returns these to the session via
+`getSystemPrompt(): string | undefined`, `getAppendSystemPrompt(): string[]`, and
+`getAgentsFiles(): { agentsFiles: Array<{ path: string; content: string }> }`.
+
+**Replace the entire system prompt (in memory)** — from `examples/sdk/03-custom-prompt.ts`:
+
+```typescript
+const loader1 = new DefaultResourceLoader({
+    cwd, agentDir,
+    systemPromptOverride: () => `You are a helpful assistant that speaks like a pirate.
+Always end responses with "Arrr!"`,
+    // Needed to avoid DefaultResourceLoader appending APPEND_SYSTEM.md from ~/.pi/agent or <cwd>/.pi.
+    appendSystemPromptOverride: () => [],
+});
+await loader1.reload();
+const { session } = await createAgentSession({
+    resourceLoader: loader1,
+    sessionManager: SessionManager.inMemory(),
+});
+```
+
+**Inject AGENTS.md content in memory** — from `examples/sdk/07-context-files.ts`:
+
+```typescript
+const loader = new DefaultResourceLoader({
+    cwd: process.cwd(), agentDir: getAgentDir(),
+    agentsFilesOverride: (current) => ({
+        agentsFiles: [
+            ...current.agentsFiles,
+            { path: "/virtual/AGENTS.md", content: `# Project Guidelines ...` },
+        ],
+    }),
+});
+```
+
+Note the file comment: "Disable context files entirely by returning an empty list in
+`agentsFilesOverride`." (return `{ agentsFiles: [] }`), or set `noContextFiles: true`.
+
+**Where Pi reads AGENTS.md from disk by default** (so it can be pointed at tmpfs or
+disabled): `loadProjectContextFiles({ cwd, agentDir })` walks from `cwd` upward and reads
+the `agentDir`. CLI flag to disable: `--no-context-files` (`Args.noContextFiles`).
+The CLI also exposes `--system-prompt` and `--append-system-prompt`
+(`Args.systemPrompt?: string`, `Args.appendSystemPrompt?: string[]` in
+`dist/cli/args.d.ts`), so over RPC/print mode you can pass the prompt as a process arg
+(in memory, no file).
+
+### 2. Skills in memory — CONFIRMED both in-memory registration and arbitrary path
+
+Skills are normally a **directory-of-files** convention. From `dist/core/skills.d.ts`
+(`loadSkillsFromDir` doc comment):
+
+> Discovery rules:
+> - if a directory contains SKILL.md, treat it as a skill root and do not recurse further
+> - otherwise, load direct .md children in the root
+> - recurse into subdirectories to find SKILL.md
+
+Default discovery locations (from the docs and `DefaultResourceLoader`): `.pi/skills/`,
+`.agents/skills/` (walking up), `~/.agents/skills/`, `~/.pi/agent/skills/`.
+
+A `Skill` is a plain object, so it can be created **in memory** with no file:
+
+```typescript
+export interface Skill {
+    name: string;
+    description: string;
+    filePath: string;
+    baseDir: string;
+    sourceInfo: SourceInfo;
+    disableModelInvocation: boolean;
+}
+```
+
+**Register an in-memory skill** — from `examples/sdk/04-skills.ts`:
+
+```typescript
+const customSkill: Skill = {
+    name: "my-skill",
+    description: "Custom project instructions",
+    filePath: "/virtual/SKILL.md",
+    baseDir: "/virtual",
+    sourceInfo: createSyntheticSourceInfo("/virtual/SKILL.md", { source: "sdk" }),
+    disableModelInvocation: false,
+};
+const loader = new DefaultResourceLoader({
+    cwd: process.cwd(), agentDir: getAgentDir(),
+    skillsOverride: (current) => ({
+        skills: [...current.skills, customSkill],
+        diagnostics: current.diagnostics,
+    }),
+});
+```
+
+**Point skills at an arbitrary path**: `DefaultResourceLoaderOptions.additionalSkillPaths?:
+string[]` (and `noSkills?: boolean` to disable default discovery). CLI equivalents:
+`--skills <paths>` (`Args.skills?: string[]`) and `--no-skills` (`Args.noSkills`).
+The lower-level `loadSkills({ cwd, agentDir, skillPaths, includeDefaults })` confirms
+`skillPaths` is an explicit list and `includeDefaults` can be turned off.
+
+Caveat: the skill's `filePath`/`baseDir` only matter if the skill body is read lazily on
+invocation. For a fully synthetic in-memory skill you must ensure the content is provided
+up front; if Pi reads `filePath` on `/skill:name` invocation it would need that path to
+exist. For pure "inject instructions into the system prompt" use, `formatSkillsForPrompt`
+uses `name`/`description` and the prompt formatting only. UNVERIFIED whether explicit
+`/skill:name` expansion re-reads `filePath` from disk for an SDK-injected synthetic skill;
+to be safe, point synthetic skills at a tmpfs path or set
+`disableModelInvocation`/use systemPrompt injection instead.
+
+### 3. Provider / LLM auth in memory — CONFIRMED (two disk-free paths; no RPC auth message)
+
+**(a) Environment variables.** `@earendil-works/pi-ai@0.79.4` `dist/env-api-keys.js`
+contains the canonical provider→env-var map (`getApiKeyEnvVars`). Exact names:
+
+- anthropic: `ANTHROPIC_OAUTH_TOKEN` (precedence) then `ANTHROPIC_API_KEY`
+- openai: `OPENAI_API_KEY`
+- google (Gemini): `GEMINI_API_KEY`
+- google-vertex: `GOOGLE_CLOUD_API_KEY` (or ADC via `GOOGLE_APPLICATION_CREDENTIALS` +
+  `GOOGLE_CLOUD_PROJECT`/`GCLOUD_PROJECT` + `GOOGLE_CLOUD_LOCATION`)
+- amazon-bedrock: `AWS_PROFILE` | `AWS_ACCESS_KEY_ID`+`AWS_SECRET_ACCESS_KEY` |
+  `AWS_BEARER_TOKEN_BEDROCK` | ECS/IRSA container creds
+- azure-openai-responses: `AZURE_OPENAI_API_KEY`
+- xai: `XAI_API_KEY`; groq: `GROQ_API_KEY`; cerebras: `CEREBRAS_API_KEY`;
+  deepseek: `DEEPSEEK_API_KEY`; mistral: `MISTRAL_API_KEY`; nvidia: `NVIDIA_API_KEY`;
+  openrouter: `OPENROUTER_API_KEY`; together: `TOGETHER_API_KEY`;
+  fireworks: `FIREWORKS_API_KEY`; vercel-ai-gateway: `AI_GATEWAY_API_KEY`;
+  github-copilot: `COPILOT_GITHUB_TOKEN`; huggingface: `HF_TOKEN`;
+  moonshotai / moonshotai-cn: `MOONSHOT_API_KEY`; kimi-coding: `KIMI_API_KEY`;
+  zai: `ZAI_API_KEY`; zai-coding-cn: `ZAI_CODING_CN_API_KEY`;
+  minimax: `MINIMAX_API_KEY`; minimax-cn: `MINIMAX_CN_API_KEY`;
+  opencode / opencode-go: `OPENCODE_API_KEY`;
+  cloudflare-workers-ai / cloudflare-ai-gateway: `CLOUDFLARE_API_KEY`;
+  xiaomi family: `XIAOMI_API_KEY`, `XIAOMI_TOKEN_PLAN_{CN,AMS,SGP}_API_KEY`;
+  ant-ling: `ANT_LING_API_KEY`.
+
+**(b) Runtime in-memory setter — CONFIRMED.** `dist/core/auth-storage.d.ts`:
+
+```typescript
+export declare class AuthStorage {
+    static create(authPath?: string): AuthStorage;
+    static fromStorage(storage: AuthStorageBackend): AuthStorage;
+    static inMemory(data?: AuthStorageData): AuthStorage;
+    /** Set a runtime API key override (not persisted to disk). Used for CLI --api-key flag. */
+    setRuntimeApiKey(provider: string, apiKey: string): void;
+    removeRuntimeApiKey(provider: string): void;
+    setFallbackResolver(resolver: (provider: string) => string | undefined): void;
+    ...
+}
+export declare class InMemoryAuthStorageBackend implements AuthStorageBackend { ... }
+```
+
+So `setRuntimeApiKey(provider: string, apiKey: string): void` is real (UNVERIFIED in the
+original brief — now CONFIRMED). Resolution priority in `getApiKey()`:
+(1) runtime override (`--api-key` / `setRuntimeApiKey`), (2) `auth.json` API key,
+(3) `auth.json` OAuth (auto-refreshed), (4) environment variable, (5) fallback resolver.
+
+`AuthStorage.inMemory()` plus `InMemoryAuthStorageBackend` give a fully in-memory store.
+Verified in the compiled `dist/core/auth-storage.js`: every `writeFileSync`/`mkdirSync`/
+`chmodSync` call lives inside `FileAuthStorageBackend` (class starts line 17); the
+`InMemoryAuthStorageBackend` class (line 127) performs no filesystem writes.
+
+From `examples/sdk/09-api-keys-and-oauth.ts`:
+
+```typescript
+// Runtime API key override (not persisted to disk)
+authStorage.setRuntimeApiKey("anthropic", "sk-my-temp-key");
+// No models.json - only built-in models
+const simpleRegistry = ModelRegistry.inMemory(authStorage);
+```
+
+**(c) RPC protocol credential message — NOT PRESENT.** The full `RpcCommand` union in
+`dist/modes/rpc/rpc-types.d.ts` has no `set_api_key` / `set_credential` / auth message
+(commands are: prompt, steer, follow_up, abort, new_session, get_state, set_model,
+cycle_model, get_available_models, set_thinking_level, cycle_thinking_level,
+set_steering_mode, set_follow_up_mode, compact, set_auto_compaction, set_auto_retry,
+abort_retry, bash, abort_bash, get_session_stats, export_html, switch_session, fork,
+clone, get_fork_messages, get_last_assistant_text, set_session_name, get_messages,
+get_commands). **Implication:** in RPC mode, credentials must be supplied at process spawn
+— via env vars or the `--api-key`/`--provider` CLI flags (`Args.apiKey`, `Args.provider`).
+You cannot inject a key over the JSONL channel after spawn. If you need post-spawn,
+in-memory key injection without env vars, drive Pi via the **SDK** and pass a custom
+`AuthStorage` instead of RPC mode.
+
+### 4. Tool auth / custom tools in memory — CONFIRMED in-process, no file
+
+Custom tools are pure in-process definitions. Two confirmed paths:
+
+**Via `customTools` on `createAgentSession`** (`dist/core/sdk.d.ts`):
+
+```typescript
+export interface CreateAgentSessionOptions {
+    ...
+    /** Custom tools to register (in addition to built-in tools). */
+    customTools?: ToolDefinition[];
+    ...
+}
+```
+
+A `ToolDefinition` (`dist/core/extensions/types.d.ts`) carries its own `execute(...)`
+function — so any auth/config the tool needs is closed over in code, no on-disk config:
+
+```typescript
+export interface ToolDefinition<TParams extends TSchema = TSchema, ...> {
+    name: string; label: string; description: string;
+    parameters: TParams;  // TypeBox schema
+    execute(toolCallId, params, signal, onUpdate, ctx): Promise<AgentToolResult<TDetails>>;
+    ...
+}
+export declare function defineTool<...>(tool: ToolDefinition<...>): ...;
+```
+
+**Via inline extension factory + `pi.registerTool`** (`examples/sdk/06-extensions.ts`):
+
+```typescript
+const resourceLoader = new DefaultResourceLoader({
+    cwd: process.cwd(), agentDir: getAgentDir(),
+    extensionFactories: [
+        (pi) => { pi.on("agent_start", () => { ... }); },
+    ],
+});
+// inside an extension: pi.registerTool({ name: "my_tool", label: "My Tool", ... })
+```
+
+`ExtensionRunner.registerTool<...>(tool: ToolDefinition<...>): void` is in the type
+surface. Both paths require no file: the extension can be an inline function passed in
+`extensionFactories`, and tool auth is whatever the closure references (e.g. an HTTP
+client back to your backend). Built-in tool selection is also code-only via
+`tools`/`excludeTools`/`noTools` on `createAgentSession`.
+
+### 5. Working directory / cwd and state files — what Pi writes, and how to redirect
+
+**Path knobs (from `dist/config.js`):**
+
+- `getAgentDir()` returns `process.env.PI_CODING_AGENT_DIR` (expanded) if set, else
+  `~/.pi/agent`. The env var name is built as
+  `` `${APP_NAME.toUpperCase()}_CODING_AGENT_DIR` `` with `APP_NAME = "pi"`, i.e.
+  **`PI_CODING_AGENT_DIR`**.
+- Session dir env var **`PI_CODING_AGENT_SESSION_DIR`** (`ENV_SESSION_DIR`), read in
+  `main.js`. Resolution order in CLI: `--session-dir` flag → `PI_CODING_AGENT_SESSION_DIR`
+  → settings default. Default session dir:
+  `getDefaultSessionDir(cwd, agentDir)` = `<agentDir>/sessions/--<encoded-cwd>--/`
+  (it `mkdirSync`s the dir).
+- All other config files hang off `agentDir`: `auth.json`, `models.json`, `settings.json`,
+  `tools/`, `bin/`, `prompts/`, `themes/`, `sessions/`, and the debug log
+  `<agentDir>/pi-debug.log`. Redirecting `PI_CODING_AGENT_DIR` moves all of them.
+
+**SDK-level in-memory replacements (no disk):**
+
+- `SessionManager.inMemory(cwd?)` — "Create an in-memory session (no file persistence)".
+  Verified: `SessionManager` only `writeFileSync`s when `this.persist` is true; `inMemory`
+  sets `persist=false`.
+- `SettingsManager.inMemory(settings?)` — no `settings.json` read/write.
+- `ModelRegistry.inMemory(authStorage)` — built-in models only, no `models.json`.
+- `AuthStorage.inMemory()` / custom `AuthStorageBackend` — no `auth.json`.
+
+**What Pi writes on its own during a run (headless), and how to neutralize it:**
+
+| Writer (dist file) | Path | When | Redirect / avoid |
+| --- | --- | --- | --- |
+| `core/session-manager.js` | `<agentDir>/sessions/...*.jsonl` | every persisted session | `SessionManager.inMemory()` (SDK) or `--no-session` (CLI). Else `PI_CODING_AGENT_SESSION_DIR`→tmpfs. |
+| `core/bash-executor.js` | `os.tmpdir()/pi-bash-<id>.log` | only when bash output exceeds `DEFAULT_MAX_BYTES` (spillover) | set `TMPDIR` to tmpfs / make `/tmp` tmpfs |
+| `core/tools/output-accumulator.js` | `os.tmpdir()/<prefix>-<id>.log` | tool output spillover above threshold | same (`TMPDIR`→tmpfs) |
+| `core/settings-manager.js` | `<agentDir>/settings.json`, `<cwd>/.pi/settings.json` | only on settings change with persistence | `SettingsManager.inMemory()` |
+| `core/auth-storage.js` (`FileAuthStorageBackend`) | `<agentDir>/auth.json` | only with file-backed AuthStorage | `AuthStorage.inMemory()` / `setRuntimeApiKey` |
+| `core/trust-manager.js` | project trust file under `<cwd>/.pi` / agentDir | only when project-trust resolution runs | avoid project `.pi` resources; SDK path skips trust prompts |
+| `core/package-manager.js` | `<agentDir>/tmp/extensions/` | only when installing/loading extension packages | use inline `extensionFactories` (no package install) |
+| `core/agent-session-runtime.js` | `<sessionDir>/<attached-file>` | only when attaching files + persistence | in-memory session; don't attach files |
+| `core/agent-session.js` | export path | only on explicit `exportToHtml`/`exportToJsonl` | don't call exports |
+| `utils/tools-manager.js` | `<agentDir>/bin/{rg,fd}` | only if `rg`/`fd` not found in PATH | pre-install ripgrep + fd in the sandbox image (it prefers system binaries in PATH) |
+| `migrations.js` (CLI only) | `<agentDir>/auth.json`, `settings.json` | `main()` startup, only if legacy files present | SDK path doesn't call it; or point `PI_CODING_AGENT_DIR` at an empty tmpfs |
+
+The interactive TUI also writes `pi-debug.log` and reads more of `agentDir`, but those
+code paths (`modes/interactive/*`) do not run in `--mode rpc`, `--print`, or the SDK.
+
+### 6. Net answer — concrete diskless recipe
+
+**Recommended: drive Pi via the SDK (`createAgentSession`), not the RPC CLI**, because the
+SDK lets you inject `AuthStorage`, system prompt, skills, AGENTS.md, and custom tools as
+in-memory objects, and skips CLI startup migrations. Run many sessions in one shared
+sandbox, one `createAgentSession` per invocation, each with its own in-memory loader and
+auth.
+
+Per invocation, in code (all in memory):
+
+```typescript
+const auth = AuthStorage.inMemory();
+auth.setRuntimeApiKey("anthropic", perRunKey);     // never persisted
+
+const loader = new DefaultResourceLoader({
+  cwd: perRunWorkdir,                                 // a per-run tmpfs subdir
+  agentDir: perRunAgentDir,                            // a per-run tmpfs subdir (or unused)
+  noContextFiles: true,                                // ignore on-disk AGENTS.md
+  systemPrompt: baseSystemPrompt,                      // in memory
+  appendSystemPromptOverride: () => [extraInstructions],
+  agentsFilesOverride: () => ({ agentsFiles: [{ path: "/virtual/AGENTS.md", content: agentsMd }] }),
+  skillsOverride: (cur) => ({ skills: [...inMemorySkills], diagnostics: cur.diagnostics }),
+  extensionFactories: [(pi) => { pi.registerTool(myProxyTool); }],
+});
+await loader.reload();
+
+const { session } = await createAgentSession({
+  cwd: perRunWorkdir,
+  authStorage: auth,
+  modelRegistry: ModelRegistry.inMemory(auth),
+  settingsManager: SettingsManager.inMemory(),
+  sessionManager: SessionManager.inMemory(perRunWorkdir),
+  resourceLoader: loader,
+  model: getModel("anthropic", "claude-..."),
+  customTools: [/* or here instead of via extensionFactories */],
+});
+```
+
+Environment for the sandbox process:
+
+- `TMPDIR=/dev/shm/pi-tmp` (or any tmpfs) — captures bash/tool output spillover.
+- Optionally `PI_CODING_AGENT_DIR=/dev/shm/pi-agent` and
+  `PI_CODING_AGENT_SESSION_DIR=/dev/shm/pi-sessions` as a belt-and-suspenders redirect for
+  any code path that still resolves `agentDir`/`sessionDir`.
+- `PI_OFFLINE=1` to suppress version-check network/file activity (optional).
+- Provider key via env var (e.g. `ANTHROPIC_API_KEY`) **only if** you use env-var auth
+  instead of `setRuntimeApiKey`.
+- Pre-install `ripgrep` (`rg`) and `fd` in the sandbox image so the `grep`/`find` tools
+  never trigger a download to `<agentDir>/bin`.
+
+**What must be a file (therefore tmpfs):** nothing strictly required for config. The only
+forced write is bash/tool **output spillover** to `os.tmpdir()` (point `TMPDIR` at
+tmpfs); session/settings/auth persistence is opt-in and entirely avoidable with
+the `inMemory()` factories. If you instead use `pi --mode rpc`, sessions and `agentDir`
+are file-based by default, so you must pass `--no-session` and redirect both env vars to
+tmpfs, and you lose post-spawn in-memory key injection (RPC has no auth message).
+
+**Verdict:** fully diskless (process memory + a tmpfs `TMPDIR`) is achievable via the SDK.
+No persistent-volume write is required for prompts, skills, AGENTS.md, auth, tools, or
+session state.
+
+---
+
+## Open questions / UNVERIFIED
+
+- **Synthetic skill body re-read.** Whether an SDK-injected `Skill` whose `filePath` points
+  at a non-existent `/virtual/SKILL.md` is safe when the model triggers `/skill:name`
+  expansion (which may re-read `filePath`). The system-prompt listing only needs
+  `name`/`description`, but explicit invocation might hit disk. Mitigation: put synthetic
+  skills' `filePath`/`baseDir` on tmpfs, or rely on systemPrompt injection. Confirm by
+  reading `_expandSkillCommand` in `dist/core/agent-session.js` or testing.
+- **`os.tmpdir()` honoring `TMPDIR`.** Node's `os.tmpdir()` respects `TMPDIR` on Linux, so
+  setting `TMPDIR` to a tmpfs path redirects the spillover files. This is standard Node
+  behavior, not Pi-specific; verify the sandbox doesn't override `TMPDIR`.
+- **OAuth refresh writes.** If you use OAuth credentials (not API keys), token refresh in
+  `FileAuthStorageBackend` writes back to `auth.json`. With `AuthStorage.inMemory()` /
+  `InMemoryAuthStorageBackend`, refreshed tokens should stay in memory — confirm the refresh path
+  uses the injected backend (it goes through `withLock`/`withLockAsync`, which the
+  in-memory backend implements).
+- **`ModelRegistry` provider registration side effects.** `ModelRegistry.inMemory` avoids
+  `models.json`, but custom provider registration (Bedrock/Vertex) may read other on-disk
+  creds (`~/.aws`, ADC json). Out of scope if using API-key providers.
+- Version drift: verified at 0.79.4. Re-check `rpc-types.d.ts` for an auth message and
+  `resource-loader.d.ts` option names if upgrading.
+
+---
+
+## Sources
+
+Primary (package source / types — inspected from the published tarball; equivalent files
+on GitHub):
+
+- `@earendil-works/pi-coding-agent@0.79.4` npm tarball, files:
+  `dist/core/sdk.d.ts` (`CreateAgentSessionOptions`, `customTools`, `createAgentSession`),
+  `dist/core/resource-loader.d.ts` (`DefaultResourceLoaderOptions`: `systemPrompt`,
+  `appendSystemPrompt`, `systemPromptOverride`, `agentsFilesOverride`, `skillsOverride`,
+  `additionalSkillPaths`, `noContextFiles`, `noSkills`),
+  `dist/core/auth-storage.d.ts` + `dist/core/auth-storage.js` (`AuthStorage`,
+  `setRuntimeApiKey`, `inMemory`, `InMemoryAuthStorageBackend`),
+  `dist/core/session-manager.d.ts` + `.js` (`SessionManager.inMemory`, `getDefaultSessionDir`),
+  `dist/core/settings-manager.js` (`inMemory`), `dist/core/model-registry.js` (`inMemory`),
+  `dist/core/skills.d.ts` (`Skill`, `loadSkills`, `loadSkillsFromDir`),
+  `dist/core/extensions/types.d.ts` (`ToolDefinition`, `defineTool`, `registerTool`),
+  `dist/config.js` (`getAgentDir`, `ENV_AGENT_DIR=PI_CODING_AGENT_DIR`,
+  `ENV_SESSION_DIR=PI_CODING_AGENT_SESSION_DIR`, session/auth/bin paths),
+  `dist/cli/args.d.ts` (`--api-key`, `--system-prompt`, `--append-system-prompt`,
+  `--no-session`, `--session-dir`, `--skills`, `--no-skills`, `--no-context-files`),
+  `dist/modes/rpc/rpc-types.d.ts` (full `RpcCommand` union — no auth message),
+  `dist/core/bash-executor.js` + `dist/core/tools/output-accumulator.js` (tmpdir spillover),
+  `dist/utils/tools-manager.js` (rg/fd download, prefers system PATH binaries),
+  `dist/main.js` (`runMigrations`, session-dir resolution),
+  `examples/sdk/03-custom-prompt.ts`, `04-skills.ts`, `05-tools.ts`, `06-extensions.ts`,
+  `07-context-files.ts`, `09-api-keys-and-oauth.ts`, `11-sessions.ts`.
+- `@earendil-works/pi-ai@0.79.4` `dist/env-api-keys.js` — provider→env-var map
+  (`getApiKeyEnvVars`, `getEnvApiKey`).
+
+Docs / GitHub (corroborating):
+
+- SDK reference: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md
+- npm: https://www.npmjs.com/package/@earendil-works/pi-coding-agent
+- Docs site: https://pi.dev/docs/latest/sdk
+- DeepWiki overview: https://deepwiki.com/earendil-works/pi/7.1-pi-coding-agent-sdk
diff --git a/docs/design/agent-workflows/research/open-questions.md b/docs/design/agent-workflows/research/open-questions.md
new file mode 100644
index 0000000000..dd9d37fd47
--- /dev/null
+++ b/docs/design/agent-workflows/research/open-questions.md
@@ -0,0 +1,312 @@
+# Agent Workflows: Daytona and pi.dev due-diligence
+
+Status: research only. Broad due-diligence to surface what the focused research topics
+(interaction API, OTel instrumentation, sandbox creation, auth/secrets, sandbox-sharing)
+might miss. Every claim is cited. Items I could not verify from a primary source are
+marked UNVERIFIED. Researched 2026-06-15.
+
+## Summary
+
+- **pi.dev** is a young but very active open-source (MIT) agent harness from Earendil Inc.,
+  authored by Mario Zechner (GitHub `badlogic`, creator of libGDX). The npm package
+  `@earendil-works/pi-coding-agent` first published 2026-05-07 and is on **0.79.4** (released
+  the day of this research), shipping roughly weekly with frequent **breaking changes** in
+  the 0.x line. It runs locally as a CLI/SDK/RPC server; **it does not depend on Daytona**.
+- **Daytona** is a mature, well-funded ($5M, Upfront Ventures), SOC-2 open-source (AGPL-3.0)
+  sandbox platform for running AI-generated code. Sub-90ms container starts, usage-based
+  pricing, $200 free credits, US/EU regions. The managed cloud is the same codebase as the
+  OSS repo and can be self-hosted via Docker Compose.
+- **Biggest risks for this project:** (1) pi's 0.x velocity and breaking changes mean we
+  pin a version and budget for upgrade churn; the RPC/SDK contract is pi-specific and
+  **not** a portable cross-harness standard, so "configurable harness" is an abstraction
+  *we* own. (2) pi has **no first-party OpenTelemetry**; the only OTel path today is a
+  third-party community extension. (3) Daytona uses shared-kernel containers (not microVMs),
+  a weaker isolation story for hostile code. (4) Daytona's default **15-min auto-stop** can
+  kill long-running agents mid-run. (5) Network egress is restricted by default below Tier 3.
+
+## Maturity & risk
+
+**pi.dev**
+- Open source, **MIT** license; monorepo `earendil-works/pi` (mirror/origin also seen as
+  `badlogic/pi-mono`). Packages: `pi-coding-agent` (CLI), `pi-agent-core` (runtime, tool
+  calling, state), `pi-ai` (unified multi-provider LLM API), `pi-tui` (terminal UI). A
+  separate `pi-chat` repo does Slack/chat workflows.
+  [README](https://github.com/earendil-works/pi/blob/main/README.md),
+  [npm](https://www.npmjs.com/package/@earendil-works/pi-coding-agent)
+- Author: **Mario Zechner** (`badlogic`), an experienced OSS developer (libGDX). Earendil Inc.
+  is the company.
+  [HN](https://news.ycombinator.com/item?id=46629341),
+  [GitHub badlogic](https://github.com/badlogic)
+- **Very young, very active.** npm package created **2026-05-07**, latest **0.79.4** on
+  **2026-06-15**. Minor versions ship ~weekly (0.75.0 2026-05-17 through 0.79.4 2026-06-15;
+  ~15 releases in that month counting patch releases). Still firmly **pre-1.0**.
+  [npm metadata via `npm view`](https://www.npmjs.com/package/@earendil-works/pi-coding-agent),
+  [CHANGELOG](https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/CHANGELOG.md)
+- **Breaking-change history is real and frequent** (0.x). Recent examples from the changelog:
+  0.75.0 raised min Node to 22.19.0 and reworked tool selection from cwd-bound instances to
+  tool-name allowlists; 0.72.0 replaced `compat.reasoningEffortMap` with `thinkingLevelMap`;
+  0.71.0 removed built-in Gemini/Antigravity providers; 0.69.0 migrated TypeBox and
+  invalidated captured session-bound extension objects. A `legacy-node20` dist-tag (0.74.2)
+  exists for older Node.
+  [CHANGELOG](https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/CHANGELOG.md)
+- **Lock-in:** low at the model layer (15+ providers, MIT). But the integration surface
+  (RPC commands/events, extension API, session JSONL format) is **pi-specific** and changes
+  between minor versions, so coupling to pi is a real cost even though the code is open.
+- Community size: hard to quantify; active HN presence, third-party extensions appearing
+  (otel, sandboxing, oh-my-pi fork). Smaller and newer than Claude Code / Codex ecosystems.
+  [HN](https://news.ycombinator.com/item?id=47634337)
+
+**Daytona**
+- Open source, **AGPL-3.0**; repo `daytonaio/daytona` reports ~72k stars on the repo page
+  (other sources cite ~21k — figure is noisy, treat as "large, popular"). 200+ releases,
+  latest ~v0.187.0 (2026-06-11). Polyglot (TS/Go/Python/Ruby/Java SDKs).
+  [GitHub](https://github.com/daytonaio/daytona),
+  [stars/funding search](https://www.daytona.io/dotfiles/daytona-secures-5m-to-simplify-development-environments)
+- Company: Ivan Burazin (CEO, ex-Codeanywhere/Infobip), raised **$5M** (Upfront Ventures,
+  500 Emerging Europe). **SOC-2** compliant.
+  [PRNewswire](https://www.prnewswire.com/news-releases/daytona-secures-5m-to-simplify-development-environments-302181407.html)
+- **AGPL note:** the AGPL-3.0 license is copyleft and network-triggered. We consume Daytona
+  as a hosted service or via SDK over the network (not by linking/modifying its source), so
+  AGPL obligations should not reach Agenta's own code, but legal should confirm before any
+  self-host-and-modify path. The cloud and OSS share a codebase, so self-hosting is a real
+  fallback (Docker Compose stack + customer-managed compute/BYOC).
+  [GitHub](https://github.com/daytonaio/daytona)
+
+## Pricing & limits
+
+**Daytona** (managed cloud, pay-as-you-go, no minimum/commitment):
+- vCPU **$0.0504/h**; RAM **$0.0162/h per GiB**; storage **$0.000108/h per GiB** (first 5 GiB
+  free). Billed per second. GPU: H100 $3.95/h, RTX PRO 6000 $3.03/h. Windows/Android OS
+  add-ons extra. **$200 free credits** at signup (no card for trial); startups up to $50k.
+  [Pricing](https://www.daytona.io/pricing),
+  [pricing search](https://www.morphllm.com/comparisons/daytona-alternative)
+- **Cost intuition:** a 1 vCPU / 2 GiB sandbox ≈ $0.0504 + 2×$0.0162 = **~$0.083/h** of
+  active compute (storage extra). 10 such sandboxes running continuously ≈ **$0.83/h** ≈
+  ~$600/mo if never stopped; auto-stop after idle cuts this sharply since CPU/RAM stop
+  billing while stopped (storage persists). Costs scale with concurrency × active runtime,
+  not request count. (Derived from the per-hour rates above — arithmetic ours.)
+- **Rate limits (per minute, by tier):** Tier1 10k general / 300 create / 10k lifecycle;
+  Tier2 20k/400/20k; Tier3 40k/500/40k; Tier4 50k/600/50k; Enterprise custom.
+- **Resource quotas (per tier):** Tier1 10 vCPU / 20 GiB RAM / 30 GiB disk; Tier2
+  100/200/300; Tier3 250/500/2000; Tier4 500/1000/5000. Concurrency is gated by these
+  pooled quotas (how many sandboxes run at once depends on each one's size).
+- **Tier gating:** Tier1 email-verified; Tier2 card + $25 top-up; Tier3 $500 top-up; Tier4
+  $2000 top-up / 30 days; Enterprise contact.
+  [Limits](https://www.daytona.io/docs/en/limits/),
+  [DeepWiki quotas](https://deepwiki.com/daytonaio/daytona/6.3-resource-quotas-and-limits)
+
+**pi.dev**
+- The harness itself is free/MIT. Cost is the **LLM provider tokens** (BYO key or OAuth to
+  Claude Pro/Max, ChatGPT/Codex, Copilot, plus API-key providers) plus whatever sandbox you
+  run it in. No pi-side metering.
+  [providers search](https://hochej.github.io/pi-mono/coding-agent/rpc/),
+  [pi.dev](https://pi.dev/)
+
+## Operational concerns
+
+**Daytona**
+- **Cold start:** advertised sub-90ms sandbox creation (container-based).
+  [docs overview](https://www.daytona.io/docs), [vstorm](https://oss.vstorm.co/blog/daytona-sub-90ms-code-execution/)
+- **Lifecycle/timeouts:** default **auto-stop after 15 min** of inactivity, **auto-archive
+  after 7 days** stopped; auto-delete configurable. Stopped = storage kept, CPU/RAM freed;
+  archived = no quota. **Sharp edge:** a long-running process (e.g. a >15-min agent run with
+  no external interaction) can be auto-stopped mid-run because the process itself does not
+  count as "activity" — set/extend auto-stop for long agents.
+  [lifecycle search](https://www.zenml.io/blog/e2b-vs-daytona),
+  [Northflank](https://northflank.com/blog/daytona-vs-modal)
+- **Regions / residency:** shared regions **US** (`us`) and **EU** (`eu`); you can target a
+  region per sandbox. Custom Regions (BYO runners, full isolation, residency control) are
+  invite-only/experimental. Some sources note the **managed cloud is effectively single
+  primary region (us-east-1/iad1)** in practice — UNVERIFIED against official docs, treat
+  EU availability as "claimed, confirm before relying on it for residency".
+  [Regions](https://www.daytona.io/docs/en/regions/),
+  [single-region claim](https://www.zenml.io/blog/e2b-vs-daytona)
+- **Networking egress:** per-sandbox network stack with firewall. **Tier 1 & 2: restricted
+  egress by default; Tier 3 & 4: full internet by default.** Controls: `networkAllowList`
+  (CIDR, max 10 /32 entries) and `networkBlockAll`. Only Tier 3/4 can change firewall after
+  creation. All tiers get allowlisted access to npm/PyPI, Docker/k8s registries,
+  GitHub/GitLab, CDNs, and AI providers (Anthropic/OpenAI/Google). **Implication:** to reach
+  an arbitrary secrets endpoint or call a non-allowlisted internal service, plan for Tier 3+.
+  [Network limits](https://www.daytona.io/docs/en/network-limits/),
+  [egress issue](https://github.com/daytonaio/daytona/issues/3357)
+- **Isolation:** containers; despite dedicated-kernel claims, multiple comparisons note
+  Daytona shares the host kernel (not a Firecracker microVM) — a weaker boundary for
+  genuinely hostile code than E2B/Fly.
+  [morphllm](https://www.morphllm.com/comparisons/daytona-alternative)
+
+**pi.dev**
+- Runs as a local process; operational profile (cold start, scaling) is whatever sandbox/
+  host we run it on. No managed pi runtime to scale or rate-limit. Reliability is a function
+  of (a) pi's own stability at 0.x and (b) the chosen LLM provider's limits.
+
+## Local parity
+
+- **Strong yes — pi is local-first and needs no Daytona.** pi is a CLI/SDK/RPC harness that
+  runs in any project directory. Four surfaces: interactive TUI, print/JSON event-stream
+  mode, **RPC mode** (JSONL over stdin/stdout), and a **Node SDK** (`AgentSession`). The same
+  binary/SDK runs locally or inside a sandbox.
+  [docs index](https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/docs/index.md),
+  [RPC docs](https://pi.dev/docs/latest/rpc)
+- This makes "pull config from server, run the same harness locally" realistic: the agent
+  config (AGENTS.md, skills, model, tools, files) maps onto pi's own context model
+  (AGENTS.md/SYSTEM.md, skills, tool allowlists, presets/extensions).
+  [overview](https://pi.dev/), [docs index](https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/docs/index.md)
+- **What differs local vs sandboxed (the parity gaps we own):**
+  - **Sandbox/isolation layer.** Server runs pi inside Daytona; local runs pi on the host (or
+    pi's own local sandbox options: **Gondolin** QEMU micro-VM, plain Docker, OpenShell).
+    These are pi's *own* local isolation, not Daytona — so the file/secret startup hooks and
+    the FS/network surface differ between Daytona and a local run unless we replicate them.
+    [containerization search](https://github.com/pasky/pi-gondolin)
+  - **Secrets/auth injection.** Server injects secrets via startup hooks into the sandbox;
+    locally the user supplies keys/OAuth. Parity requires our wrapper to lay down the same
+    files/env both places.
+  - **Network egress.** Daytona's tiered firewall has no local equivalent; a tool that works
+    locally could be blocked in-sandbox below Tier 3.
+  - **Instrumentation.** OTel is an opt-in extension either way (see below); it is not on by
+    default, so parity depends on us loading the same extension/config in both modes.
+- Net: pi gives genuine local parity for the *agent loop*; the *environment* (sandbox,
+  secrets, egress, telemetry) is the part Agenta must make identical across local and server.
+
+## Harness swappability
+
+- **Important framing:** in pi, "harness" means *the agent loop you customize within pi*
+  (tools, prompts, auth, event loop), not a pluggable adapter where you drop in Codex or
+  Claude Code behind a common interface. pi's own docs/talks define the harness as "the set
+  of abstractions which transforms [the] IO machine into an 'agent'" and emphasize
+  composition *within* pi, not interchangeable backends.
+  [harness-engineering slides](https://dmg-egg.github.io/slides-harness-engineering-with-pi/)
+- pi supports many **models/providers** (Anthropic, OpenAI, Google, Bedrock, Mistral, xAI,
+  Groq, Cerebras, OpenRouter, Ollama, etc.) and **subscription OAuth** to Claude Pro/Max,
+  ChatGPT/Codex, and Copilot. But these are *models behind pi's loop*, not separate harnesses
+  like the Claude Code CLI or Codex CLI.
+  [providers/RPC search](https://hochej.github.io/pi-mono/coding-agent/rpc/)
+- The RPC protocol is rich (85+ commands, ~12 event types incl. `agent_start/end`,
+  `turn_start/end`, `message_*`, `tool_execution_*`, plus `get_state` exposing `sessionId`,
+  and `agent_end` carrying **all messages from the run** = the multi-message output). But it
+  is **pi-specific and unversioned** (no documented stability/deprecation policy), and pi's
+  own docs say to prefer `AgentSession` directly over the subprocess RPC when embedding in
+  Node. So it is a good integration surface for pi, **not** a neutral cross-harness standard.
+  [RPC docs](https://pi.dev/docs/latest/rpc)
+- **Conclusion for the design:** "configurable/swappable harness" is **an abstraction Agenta
+  must own.** If we ever want to run Codex CLI or Claude Code as alternative harnesses, we
+  define our own port (config in -> sandbox setup -> run -> normalized multi-message output +
+  session_id + traces out) and write per-harness adapters. pi will be the first and
+  best-fitting adapter because of its RPC/SDK, but it does not hand us a ready-made
+  multi-harness interface.
+
+## Gotchas / sharp edges
+
+- **pi 0.x churn.** Weekly releases with breaking changes (Node-version bumps, tool-selection
+  model changes, provider removals, session-object invalidation). Pin an exact version, test
+  upgrades, watch the changelog.
+  [CHANGELOG](https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/CHANGELOG.md)
+- **No first-party OTel in pi.** The only OpenTelemetry path is a **third-party community
+  extension** (`mprokopov/pi-otel-telemetry`), which emits one trace tree per prompt (turns,
+  LLM requests, tool calls) over OTLP. It is unofficial and unversioned against pi; the
+  instrumentation research topic should treat first-party telemetry as absent today.
+  [pi-otel repo](https://github.com/mprokopov/pi-otel-telemetry),
+  [pi-otel writeup](https://nikiforovall.blog/ai/productivity/2026/05/16/pi-otel.html)
+- **pi has no built-in permission system / MCP / sub-agents / plan mode** by design — they
+  are extension territory. Anything we assume "the agent will ask before X" must be added.
+  [README](https://github.com/earendil-works/pi/blob/main/README.md),
+  [docs index](https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/docs/index.md)
+- **JSONL framing is strict** in RPC mode: split on `\n` only; do not use Node `readline`
+  (it also splits on Unicode line separators), or records will be corrupted.
+  [RPC search](https://hochej.github.io/pi-mono/coding-agent/rpc/)
+- **Daytona 15-min auto-stop** can kill long agent runs mid-flight (process activity does not
+  reset the idle timer) — set auto-stop explicitly for agents.
+  [lifecycle search](https://www.zenml.io/blog/e2b-vs-daytona)
+- **Daytona egress is tiered**; below Tier 3 you cannot freely reach arbitrary endpoints and
+  cannot change the firewall post-creation. Budget for Tier 3 if agents call internal/custom
+  services.
+  [Network limits](https://www.daytona.io/docs/en/network-limits/)
+- **Daytona shared-kernel isolation** is weaker than microVM competitors for untrusted code.
+  [morphllm](https://www.morphllm.com/comparisons/daytona-alternative)
+- **pi.dev's own sandbox examples (Gondolin/Docker/OpenShell) are local/host-side**, with no
+  first-party Daytona integration — the pi <-> Daytona glue is ours to build.
+  [containerization search](https://github.com/pasky/pi-gondolin)
+
+## Alternatives (fallback landscape — one line each)
+
+Sandbox providers (alternatives to Daytona):
+- **E2B** — Firecracker microVM with a dedicated kernel per sandbox; strongest isolation for
+  untrusted code.
+  [morphllm](https://www.morphllm.com/comparisons/daytona-alternative)
+- **Modal** — native GPU sandboxes; the pick when agents need inference/GPU in-sandbox.
+  [morphllm](https://www.morphllm.com/comparisons/daytona-alternative)
+- **Fly.io (Machines / "Sprites")** — full filesystem persistence across sessions so agents
+  resume without rebuilding; Firecracker-based.
+  [morphllm](https://www.morphllm.com/comparisons/daytona-alternative)
+- **Morph** — VM branching/fork in <250ms for parallel exploration of multiple solution paths.
+  [morphllm](https://www.morphllm.com/comparisons/daytona-alternative)
+- **Freestyle** — full root + nested virtualization (Docker-in-VM) for heavy/custom envs.
+  [morphllm](https://www.morphllm.com/comparisons/daytona-alternative)
+- **Vercel Sandbox / Northflank / Cloudflare / microsandbox** — other credible options that
+  show up in 2026 comparisons; differentiators not deeply verified here. UNVERIFIED specifics.
+  [comparison](https://northflank.com/blog/ai-sandbox-pricing),
+  [comparison](https://betterstack.com/community/comparisons/best-sandbox-runners/)
+
+Harnesses (alternatives to pi.dev):
+- **Claude Code** (Anthropic) — the de-facto reference coding agent; more opinionated, larger
+  ecosystem, less "minimal/composable" than pi. Often cited by pi users as the thing they
+  came from.
+  [HN](https://news.ycombinator.com/item?id=47634337)
+- **Codex CLI** (OpenAI) — OpenAI's agent CLI; pi can use Codex *as a provider via OAuth*, but
+  as a *harness* it's a separate tool with its own loop.
+  [providers search](https://hochej.github.io/pi-mono/coding-agent/rpc/)
+- **oh-my-pi** — a community fork of pi adding subagents/LSP/browser/optimized tool harness;
+  signal that pi's design invites forks, and a possible drop-in if pi mainline diverges.
+  [oh-my-pi](https://github.com/can1357/oh-my-pi)
+
+## Open questions (for the focused topics / before committing)
+
+1. Pin strategy for pi version (exact pin + upgrade cadence) given weekly breaking 0.x
+   releases. Who owns watching the changelog?
+2. Telemetry: do we adopt/fork `pi-otel-telemetry`, or write our own pi extension to emit the
+   spans Agenta tracing expects? (No first-party OTel exists.) → instrumentation topic.
+3. Confirm Daytona EU region + data-residency guarantees against official docs/sales; the
+   "single-region us-east-1" claim needs verification before we promise EU residency.
+4. Decide the default auto-stop / max-run-duration for agent sandboxes so long runs aren't
+   killed at 15 min. → sandbox-creation topic.
+5. Which Daytona tier do we operate on? Egress + post-creation firewall + concurrency quotas
+   all hinge on Tier 3+. → auth/secrets + sandbox-creation topics.
+6. Define Agenta's own harness port (config -> setup -> run -> normalized output + session_id
+   + traces) since pi gives no neutral multi-harness interface; validate it against pi first,
+   then sketch a Codex/Claude-Code adapter to prove the abstraction. → pi.dev harness topic.
+7. Local-parity contract: which startup hooks (files, secrets, egress, telemetry) must be
+   replicated locally, and do we reuse pi's Gondolin/Docker locally or run bare on host?
+   → local-execution topic.
+8. AGPL review for any self-hosted-and-modified Daytona path (network copyleft).
+
+## Sources
+
+- pi.dev overview — https://pi.dev/
+- pi README — https://github.com/earendil-works/pi/blob/main/README.md
+- pi docs index — https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/docs/index.md
+- pi coding-agent CHANGELOG — https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/CHANGELOG.md
+- pi npm package — https://www.npmjs.com/package/@earendil-works/pi-coding-agent
+- pi RPC docs — https://pi.dev/docs/latest/rpc
+- pi RPC (mirror) — https://hochej.github.io/pi-mono/coding-agent/rpc/
+- Harness engineering with pi (slides) — https://dmg-egg.github.io/slides-harness-engineering-with-pi/
+- Mario Zechner GitHub — https://github.com/badlogic
+- HN discussion on pi — https://news.ycombinator.com/item?id=47634337 and https://news.ycombinator.com/item?id=46629341
+- pi-otel telemetry extension — https://github.com/mprokopov/pi-otel-telemetry
+- pi-otel writeup — https://nikiforovall.blog/ai/productivity/2026/05/16/pi-otel.html
+- pi-gondolin sandbox extension — https://github.com/pasky/pi-gondolin
+- oh-my-pi fork — https://github.com/can1357/oh-my-pi
+- Daytona docs overview — https://www.daytona.io/docs
+- Daytona limits — https://www.daytona.io/docs/en/limits/
+- Daytona resource quotas (DeepWiki) — https://deepwiki.com/daytonaio/daytona/6.3-resource-quotas-and-limits
+- Daytona regions — https://www.daytona.io/docs/en/regions/
+- Daytona network limits — https://www.daytona.io/docs/en/network-limits/
+- Daytona dynamic egress issue — https://github.com/daytonaio/daytona/issues/3357
+- Daytona pricing — https://www.daytona.io/pricing
+- Daytona GitHub — https://github.com/daytonaio/daytona
+- Daytona funding (PRNewswire) — https://www.prnewswire.com/news-releases/daytona-secures-5m-to-simplify-development-environments-302181407.html
+- Daytona funding (blog) — https://www.daytona.io/dotfiles/daytona-secures-5m-to-simplify-development-environments
+- E2B vs Daytona — https://www.zenml.io/blog/e2b-vs-daytona
+- Daytona vs Modal — https://northflank.com/blog/daytona-vs-modal
+- AI sandbox pricing comparison — https://northflank.com/blog/ai-sandbox-pricing
+- Daytona alternatives — https://www.morphllm.com/comparisons/daytona-alternative
+- Sandbox runners comparison — https://betterstack.com/community/comparisons/best-sandbox-runners/
+- Daytona sub-90ms (vstorm) — https://oss.vstorm.co/blog/daytona-sub-90ms-code-execution/
diff --git a/docs/design/agent-workflows/research/otel-instrumentation.md b/docs/design/agent-workflows/research/otel-instrumentation.md
new file mode 100644
index 0000000000..5f632e8ca6
--- /dev/null
+++ b/docs/design/agent-workflows/research/otel-instrumentation.md
@@ -0,0 +1,379 @@
+# OTel Instrumentation for the pi.dev Agent Harness
+
+Status: research only. No code changed. Research date: 2026-06-15.
+
+This file answers the five research questions in the agent-workflows brief:
+how to instrument the pi.dev harness with OpenTelemetry (OTel), what already
+exists, what span conventions to use, how spans get out of a sandbox, and how
+all of that lands in Agenta's existing OTel ingestion.
+
+## Summary
+
+- **pi.dev is "Pi", a minimal agent harness by Earendil Inc.** (the company is
+  "earendil-works" on GitHub, repo `earendil-works/pi`). It is a coding-agent
+  toolkit: a unified multi-provider LLM API, an agent loop with tool calling,
+  a TUI, and a CLI. It ships as npm packages `@earendil-works/pi-ai`,
+  `@earendil-works/pi-agent-core`, `@earendil-works/pi-coding-agent`,
+  `@earendil-works/pi-tui`. MIT licensed.
+- **"pi instruments" is not a built-in OTel exporter.** Pi has no native OTel
+  emitter in its docs. What it has is an **extension event system**: an
+  extension registers handlers with `pi.on(<event>, handler)` and gets
+  lifecycle events for the agent loop (session, agent_start/agent_end,
+  turn_start/turn_end, tool_execution_start/end, before_provider_request /
+  after_provider_response, message_start/message_end). "Instrumentation" =
+  writing (or installing) an extension that listens to those events and turns
+  them into OTel spans. There is no first-party Pi telemetry dashboard to
+  reuse.
+- **Three community OTel extensions for Pi already exist** and all emit OTLP:
+  `maxmalkin/pi-OTEL`, `mprokopov/pi-otel-telemetry`, and the `pi-otel` covered
+  by the nikiforovall blog. They all use **OTel GenAI semantic conventions**
+  (`gen_ai.*`), not OpenInference. They are TypeScript Pi extensions.
+- **Agenta already ingests exactly this.** Agenta exposes an OTLP/HTTP
+  protobuf endpoint at `POST /otlp/v1/traces` and normalizes incoming spans
+  through an adapter registry that already understands **OTel GenAI semconv**,
+  **OpenLLMetry (Traceloop)**, **OpenInference (Arize)**, **Logfire**, and
+  **Vercel AI**. A Pi extension that emits `gen_ai.*` spans over OTLP/HTTP to
+  Agenta's endpoint would flow through the existing pipeline with little or no
+  new backend code.
+- **Recommended path:** emit OTel GenAI-semconv spans from a Pi extension
+  (fork/reuse one of the three), export OTLP/HTTP to Agenta's
+  `/otlp/v1/traces` with `Authorization: ApiKey <key>` and `?project_id=<id>`,
+  and let the existing GenAI-semconv adapter map them. Add a thin Agenta-side
+  adapter only if we want richer agent/turn structure than `gen_ai.*` carries.
+
+## What "pi instruments" is
+
+**Product.** pi.dev = "Pi", "a minimal agent harness" by Earendil Inc. Tagline
+"Adapt Pi to your workflows, not the other way around." Four operating modes:
+interactive TUI, print/JSON output, RPC (stdin/stdout JSONL), and an SDK for
+embedding in Node.js. It deliberately omits MCP, sub-agents, permission popups,
+and plan mode from the core, expecting you to add them via extensions.
+Source: https://pi.dev/ , https://github.com/earendil-works/pi/blob/main/README.md
+
+**Packages** (npm, scope `@earendil-works`):
+- `pi-ai` — unified multi-provider LLM API (OpenAI, Anthropic, Google, etc.)
+- `pi-agent-core` — agent runtime: tool calling + state management
+- `pi-coding-agent` — interactive coding-agent CLI
+- `pi-tui` — terminal UI library
+Source: https://github.com/earendil-works/pi/blob/main/README.md
+
+**The instrumentation mechanism is the extension event bus, not a built-in
+exporter.** Pi's official docs have an "Extensions" page but **no telemetry /
+OTel / observability page**. Extensions are TypeScript modules that subscribe
+to lifecycle events:
+
+```ts
+pi.on(eventName, async (event, ctx) => {
+  // ctx is an ExtensionContext: ctx.sessionManager (read-only session),
+  // ctx.signal (abort-aware), ctx.ui (interaction)
+});
+```
+
+Events relevant to telemetry (exact names from the Extensions doc):
+- Session lifecycle: `session_start` (reasons: startup/reload/new/resume/fork),
+  `session_shutdown`, `project_trust`, `resources_discover`.
+- Agent loop: `before_agent_start`, `agent_start` (once per user prompt),
+  `agent_end` (has `event.messages`), `turn_start`, `turn_end` (per LLM
+  response cycle).
+- Messages: `message_start`, `message_update`, `message_end` (user, assistant,
+  tool-result messages).
+- Tools: `tool_execution_start` (has `toolCallId`, `toolName`, `args`),
+  `tool_execution_update`, `tool_execution_end`; plus `tool_call` (pre-exec,
+  can block) and `tool_result` (post-exec, can modify).
+- Provider/model: `before_provider_request` (built payload, before HTTP),
+  `after_provider_response` (HTTP status/headers, before stream consumed),
+  `model_select`, `thinking_level_select`.
+- Input: `input`, `user_bash`.
+Source: https://pi.dev/docs/latest/extensions
+
+So when the agent-workflows README says runs are "instrumented through pi
+instruments," concretely that means: **a Pi extension hooks these events and
+produces spans/metrics.** There is no proprietary "instruments" object to
+adopt; it is the standard extension API. (UNVERIFIED: whether "pi instruments"
+is an internal Agenta shorthand for a specific bundled extension vs. the
+generic extension mechanism. The public Pi docs only expose `pi.on` + tools.)
+
+Installation pattern for an extension (from pi-OTEL):
+`pi install git:github.com/<owner>/<repo>` (or `pi install npm:<pkg>`), then
+`/reload`. Source: https://github.com/maxmalkin/pi-OTEL
+
+## Existing libraries
+
+### Pi-specific OTel extensions (closest fit — reuse candidates)
+
+All three are TypeScript Pi extensions emitting OTLP and using OTel GenAI
+semconv. They differ mainly in span tree shape and whether they also emit
+metrics.
+
+1. **`maxmalkin/pi-OTEL`** — "OpenTelemetry harness for the Pi coding agent."
+   - Span tree: `pi.session` -> `pi.agent_turn` -> (`gen_ai.chat <model>`,
+     `tool.<name>`).
+   - Attributes follow OTel GenAI semconv. Honors standard OTLP env vars:
+     `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`, `OTEL_EXPORTER_OTLP_ENDPOINT`
+     (appends `/v1/traces`), `OTEL_EXPORTER_OTLP_HEADERS` (`k=v,k=v`),
+     `OTEL_SERVICE_NAME` (default `pi`), `OTEL_RESOURCE_ATTRIBUTES`.
+     Pi-specific: `PI_OTEL_DISABLED` (default `0`),
+     `PI_OTEL_CAPTURE_CONTENT` (default `0`, gates prompt/completion/tool I/O).
+     Same keys accepted in `settings.json` under `otel`. Falls back to
+     `http://localhost:4318/v1/traces` (OTLP/HTTP).
+   - Runtime commands: `/otel-status`, `/otel-flush`.
+   - Source: https://github.com/maxmalkin/pi-OTEL
+
+2. **`pi-otel` (nikiforovall)** — emits one trace tree per user prompt.
+   - Span tree: `pi.interaction` (root, per prompt) -> `pi.turn` ->
+     (`pi.llm_request`, `pi.tool.<name>`). Deliberately **does not** make the
+     session a span ("a pi session can run for hours; long-running root spans
+     are an OTel anti-pattern") — it correlates via `gen_ai.conversation.id`.
+   - Attributes: GenAI semconv — `gen_ai.system`, `gen_ai.request.model`,
+     `gen_ai.usage.input_tokens`, `gen_ai.usage.output_tokens`, finish reason,
+     tool call ids, `gen_ai.conversation.id`.
+   - Config: default endpoint `http://localhost:4317` (OTLP **gRPC**),
+     `settings.json` `otel` block `{enabled, endpoint, protocol:"grpc"}`,
+     `OTEL_*` env overrides, `PI_OTEL_DISABLED=1` to disable. Default backend
+     is a local .NET Aspire dashboard (auto-spawned via `/otel start`); any
+     OTLP backend works (Grafana LGTM, Jaeger, Honeycomb).
+   - Source: https://nikiforovall.blog/ai/productivity/2026/05/16/pi-otel.html
+
+3. **`mprokopov/pi-otel-telemetry`** — traces **and metrics**.
+   - Span tree: `session` (root) -> `agent.prompt` (per user message) ->
+     `agent.turn` (LLM call + tool cycle) -> `tool.<name>` (e.g. `tool.bash`,
+     `tool.read`, `tool.edit`). Span events: `llm.request`, `model.changed`,
+     `session.compacted`.
+   - Metrics: `pi.tokens.input`, `pi.tokens.output` (counters); `pi.tool.calls`,
+     `pi.tool.errors` (counters, labelled `tool.name`); `pi.tool.duration`
+     (histogram ms); `pi.turns`, `pi.prompts` (counters);
+     `pi.session.duration` (histogram s).
+   - Attributes: `session.id`, `session.cwd`, token counts, user identity;
+     turn spans `turn.index`, `llm.usage.input_tokens`,
+     `llm.usage.output_tokens`; tool spans `tool.name`, `tool.call_id`,
+     `tool.duration_ms`.
+   - Config: `OTEL_EXPORTER_OTLP_ENDPOINT` default `http://localhost:4318`
+     (OTLP/HTTP), `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` /
+     `..._METRICS_ENDPOINT` overrides, `PI_OTEL_DEBUG=true`.
+   - Source: https://github.com/mprokopov/pi-otel-telemetry
+
+**Takeaway:** there is no single canonical Pi OTel package; the three diverge on
+span-tree shape and span names (`pi.session` vs `pi.interaction` vs `session`).
+What they agree on is **GenAI semconv `gen_ai.*` attributes over OTLP**. For
+Agenta we should pick/fork one and pin the span tree we want; don't assume a
+stable upstream contract.
+
+### Framework instrumentations (not Pi-specific)
+
+- **OpenInference (Arize)** — OTel-based semantic conventions + auto-instrumentors
+  for LangChain, LlamaIndex, OpenAI SDK, etc. Defines 10 span kinds via the
+  required `openinference.span.kind` attribute: `LLM`, `EMBEDDING`,
+  `RETRIEVER`, `RERANKER`, `TOOL`, `CHAIN`, `AGENT`, `GUARDRAIL`, `EVALUATOR`,
+  `PROMPT`. It does **not** ship a Pi instrumentor — Pi isn't one of its
+  supported frameworks — so using OpenInference for Pi means writing the span
+  kinds by hand in a Pi extension. Fit: good vocabulary for agent/tool/chain
+  structure, but no off-the-shelf Pi support.
+  Sources: https://github.com/Arize-ai/openinference/blob/main/spec/semantic_conventions.md ,
+  https://arize.com/docs/ax/observe/tracing-concepts/openinference-semantic-conventions
+
+- **OpenLLMetry (Traceloop)** — OTel SDK + instrumentations that emit `gen_ai.*`
+  (plus `traceloop.*`, `llm.*`) attributes. Auto-instruments LLM providers and
+  some frameworks. No Pi instrumentor; same story as OpenInference — you'd
+  instrument by hand in a Pi extension, or rely on its provider-level
+  auto-instrumentation of the underlying LLM HTTP client (possible but
+  indirect, and Pi's `pi-ai` may not match a provider Traceloop patches).
+  (UNVERIFIED whether Traceloop's provider instrumentation intercepts
+  `@earendil-works/pi-ai`'s HTTP calls automatically.)
+
+- **OTel GenAI semantic conventions (official)** — the upstream spec the Pi
+  extensions follow. Operation names: `create_agent`, `invoke_agent`,
+  `execute_tool`, plus the chat/inference spans. Span naming guidance:
+  `invoke_agent {gen_ai.agent.name}` (or just `invoke_agent`), and
+  `execute_tool {gen_ai.tool.name}` for tool calls (used for MCP tool calls
+  too). Key attributes: `gen_ai.operation.name`, `gen_ai.agent.name`,
+  `gen_ai.agent.id`, `gen_ai.conversation.id`, `gen_ai.tool.name`,
+  `gen_ai.tool.call.id`, `gen_ai.request.model`, `gen_ai.usage.input_tokens`,
+  `gen_ai.usage.output_tokens`. This is the most "standard" and the most
+  future-proof target.
+  Sources: https://opentelemetry.io/docs/specs/semconv/gen-ai/ ,
+  https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-agent-spans/
+  (NOTE: the gen-ai pages now redirect to the
+  `open-telemetry/semantic-conventions` repo; the agent-spans operation
+  names above come from the indexed spec text and are UNVERIFIED against the
+  latest repo revision.)
+
+## Span / attribute conventions and how well they map to agent runs
+
+A multi-turn agent run = one logical conversation -> N user prompts ->
+per-prompt agent invocation -> M turns (each an LLM call) -> per-turn 0..K tool
+calls. All three conventions can express this; they differ in vocabulary.
+
+| Layer in a Pi run | OTel GenAI semconv | OpenInference span kind | Pi extension span (varies) |
+|---|---|---|---|
+| Whole conversation | `gen_ai.conversation.id` (correlation, not a span) | `session.id` attr / CHAIN root | `pi.session` / `session` (or skipped) |
+| Per-prompt agent invocation | `invoke_agent` op | `AGENT` | `pi.interaction` / `agent.prompt` / `pi.agent_turn` |
+| Per-turn LLM call | chat/inference span, `gen_ai.request.model` | `LLM` | `gen_ai.chat <model>` / `pi.turn` / `pi.llm_request` |
+| Tool call | `execute_tool`, `gen_ai.tool.name`, `gen_ai.tool.call.id` | `TOOL` | `tool.<name>` |
+| Glue/orchestration | (no dedicated kind) | `CHAIN` | n/a |
+| Retrieval / rerank / embeddings | embeddings spans | `RETRIEVER` / `RERANKER` / `EMBEDDING` | n/a |
+
+Assessment:
+- **GenAI semconv** maps cleanly to LLM calls and tool calls and has explicit
+  agent + tool operation names. Its weak spot is the multi-turn *tree*: it
+  leans on `gen_ai.conversation.id` for correlation rather than mandating a
+  session/turn span hierarchy, which is why the Pi extensions invent their own
+  parent spans (`pi.session`, `pi.interaction`, `pi.turn`). Good attribute
+  vocabulary; you still design the tree.
+- **OpenInference span kinds** (AGENT/CHAIN/LLM/TOOL/RETRIEVER) map *very*
+  cleanly to a nested agent run and are what Agenta's UI already keys off (see
+  next section). The cost: no Pi auto-instrumentor, so you set
+  `openinference.span.kind` yourself.
+- A pragmatic hybrid works: emit GenAI `gen_ai.*` attributes (what the Pi
+  extensions already produce) **and** set `openinference.span.kind` per span so
+  Agenta types the node correctly. Agenta's adapters read both.
+
+## Export-from-sandbox path
+
+Inside a Daytona (or other) sandbox the Pi extension runs the OTel SDK and
+exports OTLP. To reach Agenta's collector across the sandbox boundary:
+
+1. **Endpoint.** Agenta accepts OTLP/HTTP **protobuf** at `POST /otlp/v1/traces`
+   (mounted in `api/entrypoints/routers.py` with prefix `/otlp/v1`). Binary
+   protobuf only (`Content-Type: application/x-protobuf`); JSON OTLP is **not**
+   accepted. Batch size limit default 10 MB (env `AGENTA_OTLP_MAX_BATCH_BYTES`,
+   config `OTLPConfig.max_batch_bytes`); over-limit -> 413. (The router docstring
+   says "default 4 MB"; the actual env default in `env.py` is 10 MB — doc/code
+   drift worth noting.)
+   Files: `api/oss/src/apis/fastapi/otlp/router.py`,
+   `api/oss/src/utils/env.py` (`OTLPConfig`, line ~326),
+   `api/entrypoints/routers.py` (~line 770).
+   - So set `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=https://<agenta-host>/otlp/v1/traces`
+     and use the **OTLP/HTTP protobuf** exporter. The gRPC-default extension
+     (nikiforovall) would need reconfiguring to HTTP/protobuf, or a collector
+     sidecar to translate.
+2. **Auth + tenant scope.** Agenta's auth middleware expects
+   `Authorization: ApiKey <key>` (prefix `ApiKey `) and resolves
+   organization/workspace/project/user from it; `project_id` can also come
+   from a `?project_id=<uuid>` query param. So the exporter needs
+   `OTEL_EXPORTER_OTLP_HEADERS=Authorization=ApiKey <key>` and the project id
+   either in the key's scope or the URL query string. In EE the ingest path
+   also checks `EDIT_SPANS` permission and `TRACES_INGESTED` quota.
+   Files: `api/oss/src/middlewares/auth.py` (`_APIKEY_TOKEN_PREFIX = "ApiKey "`,
+   query `project_id` handling), `api/oss/src/apis/fastapi/otlp/router.py`
+   (EE permission + entitlement checks).
+3. **Secret delivery.** The Agenta API key is a secret; per the agent-workflows
+   README, secrets are injected into the sandbox by the startup hook. The key
+   and the OTLP endpoint should be injected the same way (env vars consumed by
+   the OTel SDK), so the harness running locally vs server-side only differs in
+   endpoint/key values — preserving the local/server parity requirement.
+4. **Trace-context propagation across the boundary.** Two cases:
+   - If the agent run is *initiated by* an Agenta backend request, propagate
+     W3C `traceparent`/`tracestate` into the sandbox (env or RPC metadata) so
+     the in-sandbox root span is a child of the backend span and the run shows
+     as one trace. (UNVERIFIED: whether Agenta currently sets/forwards
+     `traceparent` to invocations — needs a check of the invocation service.)
+   - If the run is standalone, the extension creates its own root and relies on
+     `gen_ai.conversation.id` / `session.id` for correlation; Agenta's
+     OpenInference + Logfire adapters map `session.id` /
+     `gen_ai.conversation.id` -> `ag.session.id`, which lines up with the
+     agent-workflows `session_id` concept.
+5. **Network egress.** The sandbox must be allowed outbound HTTPS to the Agenta
+   host. With Daytona this is a sandbox network-policy concern (UNVERIFIED for
+   our port). A collector/agent sidecar in the sandbox is an alternative that
+   also lets us batch, retry, and strip content centrally.
+
+## How it maps to Agenta's existing OTel ingestion
+
+Agenta already has the whole receive-and-normalize pipeline; a Pi agent is just
+another OTLP producer.
+
+- **Ingest.** `OTLPRouter.otlp_ingest` parses the protobuf
+  (`parse_otlp_stream`), converts each OTel span to an internal DTO
+  (`parse_from_otel_span_dto`), runs an EE quota soft-check, then queues spans
+  on a Redis stream for async persistence (same path as native ingest).
+  File: `api/oss/src/apis/fastapi/otlp/router.py`.
+- **Normalization via adapter registry.** `AdapterRegistry` runs, in order:
+  `OpenLLMmetryAdapter`, `OpenInferenceAdapter`, `LogfireAdapter`,
+  `VercelAIAdapter`, `DefaultAgentaAdapter`. Each maps its vendor attributes to
+  Agenta's canonical `ag.*` namespace.
+  File: `api/oss/src/apis/fastapi/otlp/extractors/adapter_registry.py`.
+- **GenAI semconv is already mapped.** `api/.../otlp/opentelemetry/semconv.py`
+  and the OpenLLMetry adapter map `gen_ai.system`, `gen_ai.request.model`,
+  `gen_ai.usage.prompt_tokens|completion_tokens|total_tokens`,
+  `gen_ai.prompt.*`, `gen_ai.completion.*`, etc. -> `ag.meta.*` /
+  `ag.data.*` / `ag.metrics.unit.tokens.*`. **This is largely what the Pi
+  OTel extensions emit**, so Pi `gen_ai.*` spans mostly normalize today.
+  - Caveat: the existing map uses the older `gen_ai.usage.prompt_tokens` /
+    `completion_tokens` names. The Pi extensions emit the newer
+    `gen_ai.usage.input_tokens` / `output_tokens`. Those newer keys are **not**
+    in `semconv.py` yet, so token metrics from Pi would be dropped until we add
+    the two aliases. (Verified by reading `semconv.py` — only `prompt_tokens` /
+    `completion_tokens` / `total_tokens` are present.)
+- **Span typing / agent structure.** `OpenInferenceAdapter` maps
+  `openinference.span.kind` -> `ag.type.node` with
+  `OPENINFERENCE_TO_AGENTA_SPAN_KIND_MAP`: `CHAIN->chain`, `RETRIEVER->query`,
+  `RERANKER->rerank`, `LLM->chat`, `EMBEDDING->embedding`, `AGENT->agent`,
+  `TOOL->tool`, `GUARDRAIL->task`, `EVALUATOR->task`. It also normalizes tool
+  definitions (`llm.tools.{i}.tool.json_schema`), tool calls, and
+  input/output messages into the canonical OpenAI shape Agenta's UI expects.
+  File: `api/oss/src/apis/fastapi/otlp/extractors/adapters/openinference_adapter.py`.
+- **Session correlation.** `session.id` (OpenInference) and
+  `gen_ai.conversation.id` (Logfire adapter) both map to `ag.session.id`,
+  which aligns with the agent-workflows `session_id`.
+
+**Net:** the lowest-effort integration is a Pi extension emitting GenAI-semconv
+spans **and** `openinference.span.kind` over OTLP/HTTP protobuf to
+`/otlp/v1/traces`. To get full fidelity we'd add a small amount of backend
+mapping (token-name aliases; optionally a dedicated "Pi/agent" adapter if we
+want first-class agent/turn nodes instead of generic chat/tool). No new ingest
+infrastructure is needed.
+
+## Open questions
+
+1. **Which span tree do we standardize on?** The three Pi extensions disagree
+   (`pi.session` vs `pi.interaction` vs `session`; whether the session is a
+   span at all). We must pin one to get a stable Agenta UI. The
+   "no long-running session root" argument (nikiforovall) matters if Pi
+   sessions can run for hours.
+2. **Build vs fork.** Fork `maxmalkin/pi-OTEL` (OTLP/HTTP, content gate) or
+   `mprokopov/pi-otel-telemetry` (also metrics) vs write our own minimal
+   extension? Need to read their actual source for license/quality and to see
+   the exact `pi.on(...)` wiring (the READMEs describe spans, not code).
+3. **Token attribute drift.** Add `gen_ai.usage.input_tokens` /
+   `output_tokens` (and any other newer `gen_ai.usage.*` keys) to Agenta's `semconv.py`
+   so Pi token metrics aren't silently dropped. Confirm against the live
+   GenAI semconv revision.
+4. **Trace-context propagation.** Does Agenta forward W3C `traceparent` into an
+   invocation today? If we want the in-sandbox spans stitched under the
+   originating backend span, we need to propagate context across the
+   harness/sandbox boundary (env var or RPC metadata). Needs a code check of
+   the invocation/workflow run path.
+5. **Content capture policy.** Pi extensions gate prompt/completion/tool I/O
+   behind `PI_OTEL_CAPTURE_CONTENT`. Decide default (privacy vs. eval
+   usefulness) and whether to enforce it server-side too.
+6. **Transport mismatch.** Agenta is OTLP/HTTP **protobuf only**. The
+   gRPC-default extension and any JSON-OTLP setup need reconfiguration or a
+   collector sidecar in the sandbox.
+7. **"pi instruments" terminology.** Confirm with whoever wrote the
+   agent-workflows README whether it refers to the generic `pi.on` extension
+   API or a specific Earendil/Agenta-internal "instruments" bundle. The public
+   Pi docs only expose `pi.on` + tool registration; no "instruments" object.
+8. **Doc/code drift.** OTLP router docstring says 4 MB max batch; `env.py`
+   default is 10 MB. Worth fixing when this work lands.
+
+## Sources
+
+- Pi product site: https://pi.dev/
+- Pi repo README: https://github.com/earendil-works/pi/blob/main/README.md
+- Pi extensions doc (event system / `pi.on`): https://pi.dev/docs/latest/extensions
+- Pi docs index: https://pi.dev/docs/latest
+- pi-OTEL extension (maxmalkin): https://github.com/maxmalkin/pi-OTEL
+- pi-otel-telemetry (mprokopov): https://github.com/mprokopov/pi-otel-telemetry
+- pi-otel blog (nikiforovall): https://nikiforovall.blog/ai/productivity/2026/05/16/pi-otel.html
+- Pi as customer-hosted agent runtime discussion: https://github.com/earendil-works/pi/discussions/3337
+- OTel GenAI semconv (index): https://opentelemetry.io/docs/specs/semconv/gen-ai/
+- OTel GenAI agent spans: https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-agent-spans/
+- OpenInference semantic conventions spec: https://github.com/Arize-ai/openinference/blob/main/spec/semantic_conventions.md
+- OpenInference conventions (Arize docs): https://arize.com/docs/ax/observe/tracing-concepts/openinference-semantic-conventions
+- Agenta OTLP ingest router: api/oss/src/apis/fastapi/otlp/router.py
+- Agenta adapter registry: api/oss/src/apis/fastapi/otlp/extractors/adapter_registry.py
+- Agenta GenAI/OpenLLMetry semconv map: api/oss/src/apis/fastapi/otlp/opentelemetry/semconv.py
+- Agenta OpenInference adapter: api/oss/src/apis/fastapi/otlp/extractors/adapters/openinference_adapter.py
+- Agenta auth middleware: api/oss/src/middlewares/auth.py
+- Agenta OTLP config: api/oss/src/utils/env.py (OTLPConfig)
+- Router mounting: api/entrypoints/routers.py
diff --git a/docs/design/agent-workflows/research/pi-interaction.md b/docs/design/agent-workflows/research/pi-interaction.md
new file mode 100644
index 0000000000..c5a1fee83c
--- /dev/null
+++ b/docs/design/agent-workflows/research/pi-interaction.md
@@ -0,0 +1,584 @@
+# Research: Programmatically driving the pi.dev agent harness
+
+Status: research only. No code changed outside this file.
+Scope: how the Agenta backend can drive a "pi.dev" harness for the new `agents`
+workflow type. Answers questions 1-7 from the research brief, with sources.
+
+## Summary
+
+- **pi.dev is the Pi coding agent** by Earendil Inc.: "a minimal, extensible agent
+  harness." It is a TypeScript/Node monorepo, MIT-licensed, distributed on npm.
+  Latest published version at time of research: **0.79.4**. The CLI binary is `pi`.
+- Three layers matter to us, smallest to largest:
+  - `@earendil-works/pi-ai` - unified multi-provider LLM API (`getModel`, `stream`,
+    `complete`, content blocks incl. images, image generation).
+  - `@earendil-works/pi-agent-core` - the agent loop: stateful `Agent` class, tool
+    calling, event stream, `sessionId`, before/after tool hooks, transport abstraction.
+  - `@earendil-works/pi-coding-agent` - the full harness + CLI: `createAgentSession`,
+    built-in tools (read/bash/edit/write/...), extensions/hooks, skills, AGENTS.md
+    loading, session persistence (JSONL), and four run surfaces (TUI, print/JSON, RPC,
+    SDK).
+- **Four run surfaces, three of them programmatic.** For a Python backend driving pi inside a
+  sandbox, the realistic options are (a) **RPC mode** (`pi --mode rpc`, JSONL over
+  stdin/stdout, bidirectional, supports follow-ups/steering/abort), or (b) **print/JSON
+  mode** (`pi --mode json "prompt"`, one-shot, JSON-lines events on stdout). The
+  **SDK** (`createAgentSession`) is the in-process TypeScript path and gives the richest
+  control; it is what you would use if any part of the harness is itself Node.
+- **Multi-message output, sessions, streaming, hooks, tools, model selection** are all
+  first-class and map cleanly onto the design doc's requirements. The one soft spot is
+  **"pi instruments"**: pi itself ships no built-in "instruments" product. The
+  observability story is OpenTelemetry via the community `pi-otel` extension (built on
+  pi's hooks), plus pi's own extensions/hooks API you can instrument against. See
+  Question 3 and the Open questions section.
+- **Swappable harness + local parity** are supported by design: the harness is the thing
+  behind a thin run surface (RPC/JSON/SDK), so a different harness (e.g. OpenAI Codex)
+  that speaks the same surface can be slotted in; and the same `pi` binary/SDK runs
+  locally and in the sandbox, which is exactly the parity the design wants.
+
+## What pi.dev is (with sources)
+
+"Pi is a minimal, extensible agent harness... Adapt Pi to your workflows, not the other
+way around." It deliberately omits things like sub-agents and plan mode so you compose
+them yourself via extensions.
+Source: https://pi.dev/ and https://github.com/earendil-works/pi
+
+Packages (all MIT, all `0.79.4` at research time; confirmed via the npm registry API):
+- `@earendil-works/pi-coding-agent` - "Coding agent CLI with read, bash, edit, write
+  tools and session management." Bin: `{"pi": "dist/cli.js"}`. Depends on
+  `pi-agent-core`, `pi-ai`, `pi-tui` (all `^0.79.4`), `typebox@1.x`, `undici`, etc.
+- `@earendil-works/pi-agent-core` - "General-purpose agent with transport abstraction,
+  state management, and attachment support."
+- `@earendil-works/pi-ai` - "Unified LLM API with automatic model discovery and provider
+  configuration."
+Source: `https://registry.npmjs.org/@earendil-works/pi-coding-agent` (and `/pi-ai`,
+`/pi-agent-core`), GitHub repo root README.
+
+Repository layout (monorepo):
+```
+packages/
+  coding-agent/   # CLI + harness (SDK lives here)
+  agent/          # @earendil-works/pi-agent-core
+  ai/             # @earendil-works/pi-ai
+  tui/            # @earendil-works/pi-tui
+```
+Key docs in-repo: `packages/coding-agent/docs/{sdk,extensions,json,rpc,models,settings,
+containerization}.md`.
+Source: https://github.com/earendil-works/pi/tree/main/packages
+
+Why this matches the design doc's "agent harness with tools, hooks, instruments,
+sessions, runs in sandboxes": pi provides tools (built-in + custom via TypeBox),
+25+ TypeScript hooks, JSONL sessions with a `sessionId`, a documented containerization
+story, and a community OTel instrumentation extension. The name "pi.dev" in the design
+doc is unambiguously this product.
+
+Install (host or inside sandbox image):
+```bash
+npm install @earendil-works/pi-coding-agent   # SDK + CLI
+# CLI is also installable via curl / PowerShell / pnpm / bun per pi.dev
+```
+Source: https://github.com/earendil-works/pi, https://pi.dev/
+
+---
+
+## Question 1 - How do you programmatically interact with pi.dev (API/SDK/CLI surface)?
+
+**Language:** TypeScript/Node. There is no first-party Python SDK; a Python backend
+drives pi over a process boundary (RPC or print/JSON mode) or shells out to the `pi` CLI.
+
+**Four run surfaces** (pi's own term):
+1. **Interactive TUI** - `pi` (not relevant to us).
+2. **Print / JSON mode** - `pi -p "query"` or `pi --mode json "query"`. One-shot;
+   emits results (text or JSON-lines events) to stdout. Good for stateless single runs.
+3. **RPC mode** - `pi --mode rpc`. JSON protocol over stdin/stdout; bidirectional and
+   long-lived. This is the canonical "drive it from another process/language" surface.
+4. **SDK** - `import { createAgentSession } from "@earendil-works/pi-coding-agent"`.
+   In-process, richest control. This is what you embed if your harness runner is Node.
+Sources: https://pi.dev/, https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md,
+https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/rpc.md,
+https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/json.md
+
+**SDK entrypoints** (from `docs/sdk.md`):
+```typescript
+import {
+  createAgentSession,
+  createAgentSessionRuntime,
+  SessionManager,
+  AuthStorage,
+  ModelRegistry,
+  DefaultResourceLoader,
+  defineTool,
+} from "@earendil-works/pi-coding-agent";
+
+const { session, extensionsResult, modelFallbackMessage } =
+  await createAgentSession({
+    cwd: process.cwd(),
+    model: myModel,
+    thinkingLevel: "medium",
+    tools: ["read", "bash", "edit"],
+    sessionManager: SessionManager.inMemory(),
+  });
+```
+`createAgentSessionRuntime(factory, options)` is the multi-session variant
+(`newSession()`, `switchSession()`, `fork()`, `importFromJsonl()`).
+
+The returned `AgentSession` interface (verbatim from docs):
+```typescript
+interface AgentSession {
+  prompt(text: string, options?: PromptOptions): Promise<void>;
+  steer(text: string): Promise<void>;
+  followUp(text: string): Promise<void>;
+  subscribe(listener: (event: AgentSessionEvent) => void): () => void;
+  setModel(model: Model): Promise<void>;
+  setThinkingLevel(level: ThinkingLevel): void;
+  cycleModel(): Promise<ModelCycleResult | undefined>;
+  navigateTree(targetId: string, options?: NavigateOptions): Promise<NavigateResult>;
+  compact(customInstructions?: string): Promise<CompactionResult>;
+  abort(): Promise<void>;
+  dispose(): void;
+  sessionFile: string | undefined;
+  sessionId: string;            // <-- session id, see Q7
+  agent: Agent;
+  model: Model | undefined;
+  thinkingLevel: ThinkingLevel;
+  messages: AgentMessage[];     // <-- multi-message output, see Q4
+  isStreaming: boolean;
+}
+```
+Source: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md
+
+**Low-level loop** (in `pi-agent-core`) if you want to drive turns yourself:
+```typescript
+import { agentLoop, agentLoopContinue } from "@earendil-works/pi-agent-core";
+for await (const event of agentLoop([userMessage], context, config)) { /* ... */ }
+```
+Source: https://github.com/earendil-works/pi/blob/main/packages/agent/README.md
+
+**Recommendation for Agenta:** drive pi over **RPC mode** from the Python backend
+process that owns the sandbox (long-lived, supports follow-ups/steering/abort and a
+stable JSONL contract), and reserve print/JSON mode for stateless single-shot runs. Use
+the SDK only if the in-sandbox runner is itself Node. RPC/JSON give the cleanest swappable
+boundary for a non-pi harness (Codex) later (Question 7).
+
+---
+
+## Question 2 - Sending messages and getting responses; streaming
+
+**SDK:** `await session.prompt(text, options?)` sends a user message and resolves when the
+agent turn completes. Mid-stream you can `steer()` (replace current op) or `followUp()`
+(queue after the turn). Streaming is via `subscribe()` callbacks (push-based observer,
+not an async generator at the session level):
+```typescript
+const unsubscribe = session.subscribe((event) => {
+  switch (event.type) {
+    case "message_update":
+      if (event.assistantMessageEvent.type === "text_delta") {
+        process.stdout.write(event.assistantMessageEvent.delta);   // streaming text
+      }
+      break;
+    case "tool_execution_start": /* event.toolName */ break;
+    case "tool_execution_end":   /* event.isError */ break;
+    case "turn_end":  /* event.message */ break;
+    case "agent_end": /* event.messages = full multi-message output */ break;
+  }
+});
+```
+Full event set: `agent_start`, `agent_end`, `turn_start`, `turn_end`, `message_start`,
+`message_update`, `message_end`, `tool_execution_start`, `tool_execution_update`,
+`tool_execution_end`, `queue_update`, `compaction_start/end`, `auto_retry_start/end`.
+Source: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md
+
+**pi-agent-core** is where the async-generator streaming lives: `agentLoop()` /
+`agentLoopContinue()` are `for await` async generators; the `Agent` class wraps them with
+`subscribe()`. The low-level `pi-ai` `stream()` emits `text_start/delta/end`,
+`thinking_*`, `toolcall_*`, `done`, `error`.
+Sources: https://github.com/earendil-works/pi/blob/main/packages/agent/README.md,
+https://github.com/earendil-works/pi/blob/main/packages/ai/README.md
+
+**RPC mode (cross-process / cross-language):** JSONL over stdin/stdout.
+- Framing: strict LF (`\n`)-delimited JSON. Strip a trailing `\r`. **Do not** use
+  Node `readline` or other readers that split on Unicode separators (e.g. `U+2028`),
+  because those characters can appear unescaped inside JSON string payloads.
+- Send a prompt (client -> pi stdin):
+  ```json
+  {"id": "req-1", "type": "prompt", "message": "Hello"}
+  ```
+  Ack (pi stdout): `{"id": "req-1", "type": "response", "command": "prompt", "success": true}`
+- Other commands: `steer`, `follow_up`, `abort`, `new_session`, `set_model`,
+  `cycle_model`, `get_state`, `get_messages`, `set_thinking_level`, `bash`,
+  `get_session_stats`, `switch_session`, `fork`, `clone`, `compact`, etc.
+- Events stream back as JSON lines **without** an `id` (same event names as the SDK):
+  ```json
+  {"type":"message_update","assistantMessageEvent":{"type":"text_delta","delta":"Hello"}}
+  {"type":"message_update","assistantMessageEvent":{"type":"text_end"}}
+  {"type":"agent_end","messages":[...]}
+  ```
+- The optional `id` on a command is echoed back on its `response` for correlation. There
+  is **no handshake** - the protocol starts immediately; the first client command begins
+  interaction.
+- Extension UI is also over the wire: `extension_ui_request` (stdout) /
+  `extension_ui_response` (stdin) for `select`/`confirm`/`input`/`editor`, plus
+  fire-and-forget `notify`/`setStatus`/`setWidget`.
+Source: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/rpc.md
+
+**Streaming summary:** SDK = observer callbacks; agent-core/pi-ai = async generators;
+RPC/JSON modes = JSON-lines event stream over stdout. No SSE or websockets in pi itself;
+if Agenta needs SSE to a frontend, the backend wraps the JSONL/observer stream and
+re-emits SSE.
+
+---
+
+## Question 3 - Startup hooks (file setup, secret injection, env prep)
+
+Startup work spans two layers: pi's rich **extension hook system** (in-process), and the
+**app-level startup ordering** of the sandbox, which Agenta controls itself:
+
+### 3a. pi extension hooks (in-process, TypeScript)
+Extensions are default-exported factory functions auto-discovered from:
+- Global: `~/.pi/agent/extensions/*.ts` (or `.../*/index.ts`)
+- Project: `.pi/extensions/*.ts` (or `.../*/index.ts`)
+- CLI: `pi -e ./path.ts`
+```typescript
+import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
+export default function (pi: ExtensionAPI) {
+  pi.on("session_start", async (event, ctx) => { /* file setup / state restore */ });
+  pi.registerTool({ /* ... */ });
+}
+```
+Factory functions may be **async**, which is the supported way to do startup
+initialization (e.g. fetch remote config) before the session begins.
+Source: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md
+
+**Relevant hook points (25+ total) for startup/setup:**
+- `project_trust` -> `{ trusted: "yes"|"no"|"undecided", remember? }` (gate before
+  loading dynamic configs).
+- `session_start` -> reason `"startup"|"reload"|"new"|"resume"|"fork"`. The documented
+  place for one-time per-session setup and state restoration. This is the natural
+  **file-setup hook**.
+- `session_shutdown` -> cleanup / persist state (`pi.appendEntry(...)`).
+- `resources_discover` -> contribute `skillPaths`/`promptPaths`/`themePaths` (how skills
+  get injected).
+- `before_agent_start` -> inject messages or modify the system prompt before the LLM turn.
+- `context` / `before_provider_request` / `after_provider_response` -> mutate the
+  messages/payload around each LLM call (good instrumentation points).
+- `tool_call` -> can **block** a tool (`{ block: true, reason }`); `tool_result` can
+  rewrite results.
+Source: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md
+
+**Secret injection at the pi layer** is via provider registration with env interpolation:
+```typescript
+pi.registerProvider("provider-name", {
+  name: "Display Name",
+  baseUrl: "https://api.example.com",
+  apiKey: "$ENV_VAR",         // "$VAR" / "${VAR}" interpolated; "$$" -> literal "$"
+  api: "anthropic-messages",
+  models: [/* ... */],
+});
+```
+And/or `AuthStorage` (SDK): resolution order is runtime overrides -> `auth.json` ->
+environment variables -> fallback resolver:
+```typescript
+const authStorage = AuthStorage.create();
+authStorage.setRuntimeApiKey("anthropic", process.env.MY_KEY); // not persisted
+```
+Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md,
+https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md
+
+### 3b. App-level (sandbox) startup ordering - Agenta's own hooks
+The design doc's "startup hooks set up files then secrets" is the **sandbox boot
+sequence**, which Agenta owns, not a pi API. pi's containerization doc shows secrets are
+injected as env vars at container start and files via bind mounts:
+```bash
+docker run --rm -it \
+  -e ANTHROPIC_API_KEY \
+  -v "$PWD:/workspace" \
+  -v pi-agent-home:/root/.pi/agent \
+  pi-sandbox
+```
+Three documented isolation modes: **Gondolin** (local micro-VM, tools run in VM, auth
+stays on host), **plain Docker** (whole pi process containerized), and **OpenShell**
+(policy-controlled gateway that can inject provider creds upstream so raw keys never
+enter the sandbox). For Agenta's Daytona target, the equivalent is: lay files into the
+workspace, then set secret env vars / write `auth.json`, then start `pi --mode rpc`.
+Source: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/containerization.md
+
+So "file setup then secrets" maps to: (1) sandbox provisioning lays config files
+(AGENTS.md, skills, files) into the workspace and `~/.pi/agent`; (2) secrets are set as
+env vars / `auth.json`; (3) pi boots and its own `session_start` extension hook can do any
+remaining in-process setup. Note: pi's own hooks fire **inside** pi after it starts, so
+they cannot themselves be the mechanism that installs pi's secrets before pi starts -
+that ordering belongs to the sandbox layer (pi reads the env vars and `auth.json` at
+boot).
+
+---
+
+## Question 4 - Returns as TEXT
+
+- **Streaming:** `message_update` events carry `assistantMessageEvent.type ===
+  "text_delta"` with `.delta`. Concatenate deltas for live text. (RPC/JSON modes emit the
+  same shape on stdout.)
+- **Final / multi-message:** the run produces an array of messages, not one completion.
+  - SDK: `session.messages` (all) and the `agent_end` event's `messages` array; per-turn
+    text is on `turn_end`'s `message`.
+  - The `agent_end` event is the canonical "full multi-message output" the design doc
+    wants. Each assistant message's `content` is an array of content blocks; text blocks
+    are `{ type: "text", text }`.
+- **print mode:** `pi -p "query"` prints assistant text to stdout directly (simplest text
+  path for a one-shot run).
+- **JSON mode filtering example** (text via `message_end`):
+  ```bash
+  pi --mode json "List files" 2>/dev/null | jq -c 'select(.type == "message_end")'
+  ```
+Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md,
+https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/json.md,
+https://github.com/earendil-works/pi/blob/main/packages/ai/README.md
+
+---
+
+## Question 5 - Returns as IMAGES and other binary/file artifacts
+
+pi-ai content blocks include an explicit image block; images are base64 + MIME type:
+```typescript
+type ContentBlock =
+  | { type: 'text';  text: string }
+  | { type: 'image'; data: string; mimeType: string }        // base64-encoded
+  | { type: 'toolCall'; id: string; name: string; arguments: Record<string, any> }
+  | { type: 'thinking'; thinking: string };
+```
+Tool results carry their own `content: ContentBlock[]`, so a tool can return an image
+block:
+```typescript
+{
+  role: 'toolResult';
+  toolCallId: string;
+  toolName: string;
+  content: ContentBlock[];   // may include { type: 'image', data, mimeType }
+  isError: boolean;
+  timestamp: number;
+}
+```
+- **Input images** (multimodal prompts): SDK `prompt(text, { images: [...] })` with
+  `ImageContent` = `{ type: "image", source: { type: "base64", mediaType, data } }`
+  (SDK shape). pi-agent-core's `prompt()` also accepts
+  `[{ type: "image", data, mimeType }]`.
+- **Generated images:** pi-ai exposes `getImageModel(provider, modelId)` and
+  `generateImages(model, input, options)` (one-shot image generation).
+- **Binary/file artifacts:** there is no dedicated "artifact" return channel. The two
+  practical paths are (a) tools return an `image` content block (base64), or (b) the
+  agent writes files to the sandbox workspace (write/bash tools) and Agenta collects them
+  from the filesystem after the run. pi-agent-core's package description explicitly
+  mentions "attachment support," which is worth confirming in source for non-image
+  binaries.
+Sources: https://github.com/earendil-works/pi/blob/main/packages/ai/README.md,
+https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md,
+`https://registry.npmjs.org/@earendil-works/pi-agent-core` (description). The
+attachment/binary specifics are **UNVERIFIED** beyond the image block - confirm in
+`packages/agent` source / `packages/ai` source.
+
+---
+
+## Question 6 - STRUCTURED OUTPUTS (JSON / schema-constrained)
+
+pi's idiomatic structured-output pattern is **a terminating tool**, not a provider-level
+`response_format`/`json_schema`. You define a tool whose TypeBox parameters are your
+output schema and return `terminate: true` so the agent stops without an extra LLM turn;
+the validated arguments are your structured object. See
+`packages/coding-agent/examples/extensions/structured-output.ts`:
+```typescript
+defineTool({
+  name: "save_structured_output",
+  parameters: Type.Object({
+    headline: Type.String({ description: "Short title for the result" }),
+    summary: Type.String({ description: "One-paragraph summary" }),
+    actionItems: Type.Array(Type.String(), { description: "Concrete next steps" }),
+  }),
+  async execute(_toolCallId, params) {
+    return {
+      content: [{ type: "text", text: `Saved structured output: ${params.headline}` }],
+      details: {                       // <-- machine-readable structured result
+        headline: params.headline,
+        summary: params.summary,
+        actionItems: params.actionItems,
+      } satisfies StructuredOutputDetails,
+      terminate: true,                 // <-- ends agent without follow-up turn
+    };
+  },
+});
+```
+You then read the structured object from that tool call's arguments / the tool result's
+`details`. TypeBox is the schema system throughout pi (`Type`, `Static`, `TSchema` are
+re-exported from `@earendil-works/pi-ai`), and `validateToolCall(tools, toolCall)`
+validates arguments against the schema before execution.
+Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/examples/extensions/structured-output.ts,
+https://github.com/earendil-works/pi/blob/main/packages/ai/README.md
+
+**UNVERIFIED:** whether `pi-ai`'s `complete()`/`stream()` accept a provider-native
+`responseFormat`/`jsonSchema` option (OpenAI/xAI-style strict JSON schema). The README
+did not document one; the documented, portable pattern is the terminating-tool approach
+above. Confirm by reading `packages/ai` source (`complete`/`stream` option types).
+
+---
+
+## Question 7 - Tools, model selection, and the session_id
+
+### Tools
+**Built-in:** enable per session: `tools: ["read", "bash", "edit", "write", "grep",
+"find", "ls"]`. Read-only mode = `["read","grep","find","ls"]`. `excludeTools: [...]`
+removes specific ones.
+
+**Custom (SDK):**
+```typescript
+import { Type } from "typebox";
+import { defineTool } from "@earendil-works/pi-coding-agent";
+const myTool = defineTool({
+  name: "my_tool",
+  label: "My Tool",
+  description: "Does something useful",
+  parameters: Type.Object({ input: Type.String({ description: "Input value" }) }),
+  execute: async (_toolCallId, params) => ({
+    content: [{ type: "text", text: `Result: ${params.input}` }],
+    details: {},
+  }),
+});
+await createAgentSession({ customTools: [myTool], tools: ["read", "bash", "my_tool"] });
+```
+**Custom (extension):** `pi.registerTool({...})` with the same shape plus TUI hooks
+(`renderCall`, `renderResult`), `promptSnippet`, `promptGuidelines`, and optional
+`onUpdate` streaming. `pi.getAllTools()`, `pi.getActiveTools()`, `pi.setActiveTools()`
+manage the active set at runtime. `tool_call` hooks can block tools; MCP is composed via
+extensions (not core).
+Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md,
+https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md
+
+### Model selection
+```typescript
+import { getModel } from "@earendil-works/pi-ai";
+const opus = getModel("anthropic", "claude-opus-4-5");      // built-in
+const custom = modelRegistry.find("my-provider", "my-model"); // from models.json
+const available = await modelRegistry.getAvailable();         // those with valid keys
+await createAgentSession({
+  model: opus,
+  thinkingLevel: "high",  // off | minimal | low | medium | high | xhigh
+  scopedModels: [ { model: opus, thinkingLevel: "high" }, { model: haiku, thinkingLevel: "off" } ],
+  authStorage, modelRegistry,
+});
+await session.setModel(newModel);   // runtime switch
+```
+If no model is provided: restore from session -> settings default -> first available.
+15+ providers (Anthropic, OpenAI, Google, Bedrock, Ollama, ...). RPC equivalent:
+`set_model`/`cycle_model`; CLI flags `--provider`, `--model`. Custom providers are added
+via `pi.registerProvider(...)`. This is the swap point for "run on OpenAI/Codex models."
+Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md,
+https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/models.md,
+https://pi.dev/
+
+### session_id
+- **Creation:** a session has a `sessionId`. In JSON mode the run opens with a header
+  line: `{"type":"session","version":3,"id":"<uuid>","timestamp":"...","cwd":"/path"}`.
+  The `id` is the session id (UUID). The SDK exposes it as `session.sessionId`; the
+  `Agent` constructor accepts an explicit `sessionId` (so Agenta can supply its own and
+  thread it through).
+- **Threading:** sessions persist as JSONL files (`SessionManager.create(cwd)` for
+  on-disk, `SessionManager.inMemory()` for none). `createAgentSessionRuntime` supports
+  `newSession`/`switchSession`/`fork`/`importFromJsonl`, i.e. resume and branch by
+  session. In RPC mode, `new_session`/`switch_session`/`fork`/`clone` manage sessions; the
+  client correlates its own requests with the optional `id` field on each command.
+- This matches the design doc's "carry a `session_id`... later have its state stored":
+  pi already persists session state to JSONL, and you can pass your own `sessionId`.
+Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/json.md,
+https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md,
+https://github.com/earendil-works/pi/blob/main/packages/agent/README.md,
+https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/rpc.md
+
+---
+
+## Instrumentation ("pi instruments") - important nuance
+
+The design doc says runs are "instrumented with pi instruments." Findings:
+- pi core ships **no product literally called "instruments."** Observability is delivered
+  through the **extension/hooks API** (you can instrument any of `context`,
+  `before_provider_request`, `after_provider_response`, `tool_call`, `tool_result`,
+  `agent_start/end`, `turn_start/end`, etc.).
+- The mature path is **`pi-otel`**, a community OpenTelemetry extension:
+  - Install: `pi install npm:pi-otel`; activate `/otel start`.
+  - Span tree per prompt: `pi.interaction` -> `pi.turn` -> `pi.llm_request` /
+    `pi.tool.<name>`, with GenAI semantic-convention attributes (model, token counts,
+    finish reason).
+  - Metrics: histograms for LLM request latency, token usage (input/output/cache), tool
+    execution time.
+  - Structured log events: `pi.session.start`, `pi.session.end`, `pi.tool.error`.
+  - Config via standard OTel env vars (`OTEL_EXPORTER_OTLP_ENDPOINT`,
+    `OTEL_EXPORTER_OTLP_HEADERS`) or `.pi/settings.json` `{ "otel": { endpoint, protocol } }`;
+    `PI_OTEL_DISABLED=1` disables it.
+- There is also a proposed (issue-stage) session usage stats sink via `PI_USAGE_DIR`.
+Sources: https://nikiforovall.blog/ai/productivity/2026/05/16/pi-otel.html,
+https://github.com/earendil-works/pi/issues/2054,
+https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md
+
+**Implication for Agenta:** "pi instruments" most likely means "instrument pi via its
+hooks (OTel-style)," and Agenta's existing OTel-based tracing/observability can ingest
+`pi-otel` OTLP output directly, or Agenta can write its own thin extension that emits
+spans on the same hook points. Confirm with the design owner whether "pi instruments"
+refers to `pi-otel`, a private Earendil "instruments" API, or just "instrumented via
+hooks" - this wording is **UNVERIFIED**.
+
+---
+
+## Local execution parity & swappable harness (design requirements)
+
+- **Parity:** the same `pi` binary / SDK that runs in the sandbox runs locally; pulling
+  the agent config (AGENTS.md, skills, model, tools, files, secrets) and starting pi
+  locally yields the same behavior. The four run surfaces are identical locally and in the
+  sandbox. The containerization doc shows that the host and the container run the same pi.
+- **Swappable harness:** because the contract is a thin run surface (RPC JSONL / JSON
+  events / SDK events), a non-pi harness (e.g. OpenAI Codex) can be slotted behind the
+  same surface if Agenta defines its harness port against the RPC/event shapes. Within pi,
+  model/provider swapping (incl. OpenAI) is `getModel`/`registerProvider`/`set_model` -
+  but "swap the whole harness" is an Agenta-side abstraction over the run surface, not a
+  pi feature.
+Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/containerization.md,
+https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/rpc.md, https://pi.dev/
+
+---
+
+## Open questions / unknowns
+
+1. **"pi instruments" exact meaning** - is it `pi-otel`, a private Earendil API, or
+   "instrument via hooks"? UNVERIFIED. Resolve with the design owner; if OTel, wire
+   `pi-otel` OTLP into Agenta's existing tracing.
+2. **Provider-native structured output** - does `pi-ai` `complete()`/`stream()` accept a
+   `responseFormat`/`jsonSchema` option, or is the terminating-tool pattern the only
+   supported route? UNVERIFIED; confirm in `packages/ai` source.
+3. **Non-image binary artifacts** - `pi-agent-core` advertises "attachment support," but
+   only the `image` content block is documented. How are arbitrary file/binary artifacts
+   returned (vs. written to the workspace and collected from disk)? UNVERIFIED; confirm in
+   `packages/agent`/`packages/ai` source.
+4. **Daytona specifically** - pi documents Gondolin / Docker / OpenShell, not Daytona. The
+   Daytona port is Agenta's to build (lay files -> set secrets -> `pi --mode rpc`); no pi
+   Daytona integration exists today.
+5. **Skills config -> pi** - how Agenta's stored "skills" map to pi skills (loaded via
+   `resources_discover` skillPaths and `~/.pi/agent` layout) needs a concrete mapping;
+   read `docs/settings.md` and the skills section of the SDK/extensions docs.
+6. **Exact `agent_end.messages` schema** for storing multi-message output - capture the
+   precise `AgentMessage`/content-block JSON (read `packages/agent` types) before
+   designing Agenta's storage shape.
+7. **Version pinning** - researched against `0.79.4`. The API is pre-1.0 and moving (RPC
+   command names, event names, hook names may change between minors); pin a version and
+   re-verify against that tag's docs before implementing.
+
+## Sources
+
+- https://pi.dev/ (and https://pi.dev/docs/latest)
+- https://github.com/earendil-works/pi (repo root, package layout)
+- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md
+- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md
+- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/rpc.md
+- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/json.md
+- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/models.md
+- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/containerization.md
+- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/examples/extensions/structured-output.ts
+- https://github.com/earendil-works/pi/blob/main/packages/agent/README.md
+- https://github.com/earendil-works/pi/blob/main/packages/ai/README.md
+- https://registry.npmjs.org/@earendil-works/pi-coding-agent (and /pi-ai, /pi-agent-core) - version, license, bin, deps
+- https://nikiforovall.blog/ai/productivity/2026/05/16/pi-otel.html (pi-otel OTel extension)
+- https://github.com/earendil-works/pi/issues/2054 (PI_USAGE_DIR usage stats proposal)
+- https://deepwiki.com/earendil-works/pi (and /7.1-pi-coding-agent-sdk, /6.3-extension-examples-and-patterns)
diff --git a/docs/design/agent-workflows/research/sandbox-sharing.md b/docs/design/agent-workflows/research/sandbox-sharing.md
new file mode 100644
index 0000000000..9c8ffbaded
--- /dev/null
+++ b/docs/design/agent-workflows/research/sandbox-sharing.md
@@ -0,0 +1,359 @@
+# Sandbox sharing: one sandbox for all agents, or one per agent?
+
+Status: research. Source of the question: the product owner wants v1 to mirror today's
+prompt-style workflows, which run against one shared runtime/service rather than one per
+workflow. The proposed shortcut is "reuse the same sandbox but connect it to a different
+volume at each execution."
+
+This file answers: can we reuse one Daytona sandbox across many agent executions, can the
+mounted volume change per execution, how do we isolate executions in a shared sandbox,
+what is the concurrency model, how pi.dev views sessions, and what v1 should actually do.
+
+## Summary
+
+- **Reusing one long-lived sandbox: yes, supported.** A Daytona sandbox is designed for
+  long-lived reuse across many tasks, and the Process API provides both stateless one-off
+  `exec()` / `code_run()` and stateful named **Sessions** (`create_session` /
+  `execute_session_command` / `delete_session`) for running many independent command
+  streams in one sandbox. [daytona-sandboxes][daytona-process]
+- **Swapping a different volume per execution: NO.** Daytona volumes are mounted **only at
+  sandbox creation** via `CreateSandboxFromSnapshotParams(volumes=[...])`. They cannot be
+  attached, detached, or changed on a running sandbox. Changing the mount requires
+  recreating the sandbox. The docs describe mounting only via the create params. So the literal
+  "reuse the sandbox, attach a different volume each run" idea is **not feasible in
+  Daytona today.** [daytona-volumes][daytona-volumes-src]
+- **Closest workable equivalent to "a volume per execution" without recreating the
+  sandbox:** give each execution its own **working directory** (e.g.
+  `/runs/<session_id>/`) and lay its config/files/secrets there per run, optionally with a
+  per-run OS user. That is the per-exec isolation lever in a shared sandbox, not volumes.
+  If you genuinely need a persistent named volume per agent, that belongs to the
+  sandbox-per-agent model, where `subpath` on one shared volume gives per-agent isolation
+  at create time. [daytona-process][daytona-volumes]
+- **Isolation in a shared sandbox is weak by default.** All sessions and execs in one
+  sandbox share one kernel, one filesystem, one process table, one network stack, and one
+  set of OS env vars. Filesystem bleed, leftover processes, and secret bleed are real and
+  must be managed by convention (per-run dirs, per-command `env`, cleanup), not by the
+  platform. Daytona's own positioning is "isolated sandbox **per execution**" for safety.
+  [daytona-sandboxes][daytona-blog-best]
+- **Concurrency is bounded and shares resources.** One sandbox defaults to 1 vCPU / 1 GiB
+  RAM (max 4 vCPU / 8 GiB), and an org's *total* active-sandbox budget is 4 vCPU / 8 GiB /
+  10 GiB disk. Many agent runs can be launched as concurrent sessions in one sandbox, but they
+  contend for that single sandbox's CPU/RAM/disk and can step on each other's files.
+  Daytona has an open issue to add a Parallel Sandbox Execution API precisely because one
+  sandbox is not a clean unit for parallel independent workflows today.
+  [daytona-sandboxes][daytona-parallel-issue]
+- **pi.dev does not need a dedicated machine per session, only a distinct session file and
+  working dir.** pi stores each session as a JSONL tree file; the SDK lets you point each
+  session at its own `cwd`, its own session file (`SessionManager.open(path)`), or its own
+  `agentDir`, and run in `--mode rpc --no-session`. So multiple pi sessions can coexist in
+  one environment as long as each gets its own directory/session file. This maps cleanly
+  onto "per-run working directory inside one shared sandbox." [pi-sdk][pi-docs]
+- **Recommendation for v1:** one shared, long-lived sandbox for all agents, isolation by
+  **per-run working directory + per-command env + cleanup**, NOT by per-run volumes.
+  Treat the volume-per-execution idea as not feasible and substitute per-run dirs.
+  Serialize or cap concurrency on the shared sandbox. Keep the sandbox-provider port
+  abstraction so the migration to **sandbox-per-agent / sandbox-per-run** (with a
+  per-agent volume via `subpath` at create time) is a config swap, not a rewrite.
+
+## Reusing one sandbox (sessions / exec model)
+
+Daytona explicitly designs sandboxes for long-lived reuse: they keep filesystem state
+across stop/start, can be archived and restored, and resized without recreation.
+[daytona-sandboxes] Agenta already has the integration scaffolding: `DaytonaConfig` in
+`api/oss/src/utils/env.py` carries `DAYTONA_API_KEY`, `DAYTONA_API_URL`,
+`DAYTONA_SNAPSHOT`, `DAYTONA_TARGET`, which tells us the plan is snapshot-based sandbox
+creation.
+
+The Process API gives two execution modes inside one sandbox:
+
+- **One-off, stateless:** `exec(command, cwd=None, env=None, timeout=None)` and
+  `code_run(code, params=None, timeout=None)`. Each invocation starts fresh; good for
+  isolated commands. Both accept per-call `cwd` and `env`. [daytona-process]
+- **Stateful Sessions:** named background sessions that persist state across commands.
+  [daytona-process]
+
+Python session example (verbatim shape from the docs): [daytona-process-src]
+
+```python
+session_id = "interactive-session"
+sandbox.process.create_session(session_id)
+
+command = sandbox.process.execute_session_command(
+    session_id,
+    SessionExecuteRequest(
+        command="pip uninstall requests",
+        run_async=True,
+    ),
+)
+# later
+sandbox.process.get_session(session_id)     # status + command history
+sandbox.process.delete_session(session_id)  # cleanup
+```
+
+`SessionExecuteRequest` fields: `command` and `run_async` (Python) / `runAsync` (TS).
+[daytona-process-src] Sessions are the natural home for one agent run: create a session
+per run keyed by `session_id`, fire the harness command, monitor it, delete the session
+when done. Many sessions can live in one sandbox at once.
+
+**Keeping the shared sandbox alive.** A running sandbox auto-stops after
+`autoStopInterval` (default 15 min). Critically, **internal/background processes do NOT
+reset the timer** — only lifecycle changes, preview network requests, active SSH, and
+Toolbox SDK calls do. For an always-on shared sandbox, set `autoStopInterval: 0` or call
+`sandbox.refreshActivity()` periodically. [daytona-sandboxes]
+
+## Volumes — can they change per execution?
+
+**No.** This is the central finding and it kills the literal proposal.
+
+> "Once a volume is created, it can be mounted to a sandbox by specifying it in the
+> `CreateSandboxFromSnapshotParams` object." [daytona-volumes-src]
+
+Volumes mount **only at sandbox creation**. There is no API to attach/detach or swap a
+volume on a running sandbox; the docs describe mounting exclusively through the create
+params, and contain no running-sandbox mount operation. Changing what is mounted requires
+**recreating** the sandbox. [daytona-volumes][daytona-volumes-src]
+
+Mounting example (Python): [daytona-volumes]
+
+```python
+from daytona import CreateSandboxFromSnapshotParams, Daytona, VolumeMount
+
+daytona = Daytona()
+volume = daytona.volume.get("my-volume", create=True)
+
+params = CreateSandboxFromSnapshotParams(
+    language="python",
+    volumes=[
+        VolumeMount(
+            volume_id=volume.id,
+            mount_path="/home/daytona/volume",
+            subpath="users/alice",   # optional per-tenant prefix
+        )
+    ],
+)
+sandbox = daytona.create(params)
+```
+
+`VolumeMount` fields: `volume_id`, `mount_path` (absolute, not `/`, not a system dir like
+`/proc`, `/etc`, `/bin`...), and optional `subpath`. [daytona-volumes][daytona-volumes-src]
+
+Other volume facts that matter:
+
+- **Persistence:** "The volume will persist even after the sandbox is removed." Good for
+  producer/consumer state across sandbox lifecycles. [daytona-volumes-src]
+- **`subpath` isolation:** a sandbox mounted at `users/alice` cannot reach `users/bob` via
+  `../bob`; isolation is at the FUSE mount boundary. This is the supported way to give each
+  *sandbox* (created per agent/run) its own slice of one shared volume — but again, only at
+  create time. [daytona-volumes][daytona-volumes-src]
+- **FUSE limits:** volumes are FUSE mounts — slower than local disk, not usable for block
+  storage (e.g. DB files), and "not transactional": concurrent writes to the same path are
+  last-write-wins. [daytona-volumes-src]
+- **FUSE permission bugs:** an open issue reports `mv`, repeated `touch`, `stat`, and
+  `shutil.copystat()` failing with permission errors inside FUSE volumes. This makes
+  volumes a poor surface for frequent per-run file manipulation even where they do apply.
+  [daytona-fuse-issue]
+
+**Conclusion for the question as posed:** "reuse one sandbox, connect a different volume
+each execution" is not achievable in Daytona. Volumes are a create-time-only mount.
+
+### Alternatives to per-execution volumes (in one shared sandbox)
+
+1. **Per-run working directory (recommended).** Lay each run's config/files/secrets under
+   `/runs/<session_id>/` (or a temp dir) and run the harness with that as `cwd`. Clean it
+   up on completion. This is the direct in-sandbox analog of "a different volume per run"
+   and avoids the FUSE limits entirely. `exec`/`execute_session_command` already take
+   `cwd`. [daytona-process]
+2. **Copy files in/out per run** via the filesystem/Toolbox API, scoped to the per-run dir.
+3. **Per-run OS user** for stronger separation (file ownership, home dir) if root isn't
+   required by the harness. (Standard Linux; UNVERIFIED whether Daytona's default image
+   permits adding users without extra config.)
+4. **Recreate-per-run with a volume** (this is sandbox-per-run, not sandbox-sharing): if a
+   *persistent* per-agent volume is a hard requirement, create a fresh sandbox per run with
+   `volumes=[VolumeMount(volume_id, mount_path, subpath="agents/<agent_id>")]`. This is the
+   migration target, not v1.
+
+## Isolation in a shared sandbox
+
+A single Daytona sandbox is "isolated" from *other sandboxes and the host* — it gets a
+dedicated kernel, filesystem, network stack, and resource allocation. [daytona-sandboxes]
+But **within** one sandbox there is no isolation between executions. All sessions and execs
+share:
+
+- **One filesystem** — files written by run A are visible to run B unless you scope each
+  run to its own directory and clean up. Filesystem bleed is the default.
+- **One process table** — a leftover/background process from a prior run keeps running
+  (and does not even reset the auto-stop timer). You must track and kill per-run PIDs.
+  [daytona-sandboxes]
+- **One set of OS environment variables** — sandbox-level env is global. Secret bleed is a
+  real risk if you `export` a secret. Mitigate by passing secrets per command via the `env`
+  parameter of `exec` / `execute_session_command` rather than setting them globally, and by
+  scoping secret files to the per-run dir. [daytona-process]
+- **One network stack** — ports and outbound identity are shared.
+
+Practical isolation recipe for a shared sandbox:
+
+- Unique `session_id` per run; one Daytona Session per run.
+- Per-run working dir `/runs/<session_id>/`; never write run state outside it.
+- Pass secrets via per-command `env`, not global exports; keep secret files inside the
+  per-run dir with tight permissions; delete on completion.
+- Explicit cleanup: kill the run's process group, remove the run dir, `delete_session`.
+- Optional per-run OS user for ownership separation.
+
+Even with all of this, one sandbox is a **soft** isolation boundary: all runs in it share
+one kernel, and the default runtime is Docker. For untrusted agent code or cross-tenant
+separation, this is weaker than sandbox-per-run. Daytona's own marketing leans on "isolated
+sandbox **per execution**" for exactly this reason, and notes the default Docker isolation
+is weaker than microVMs. [daytona-blog-best]
+
+## Concurrency
+
+- **Resource budget.** One sandbox defaults to 1 vCPU / 1 GiB / 3 GiB disk, max
+  4 vCPU / 8 GiB / 10 GiB. The whole org's active-sandbox budget is also 4 vCPU / 8 GiB /
+  10 GiB. So a single shared sandbox is a small box, and packing many concurrent agent runs
+  into it means they contend for that fixed slice. [daytona-sandboxes]
+- **Mechanically parallel, practically contended.** You *can* open multiple sessions and
+  run them concurrently in one sandbox, but they share CPU/RAM/disk and the filesystem, so
+  heavy or untrusted runs can starve or corrupt each other. There is no per-session cgroup
+  isolation documented. (UNVERIFIED: no documented per-session CPU/memory quota.)
+- **Daytona itself flags this gap.** Open issue "Design and Implement Parallel Sandbox
+  Execution API" states that "developers working on AI agents or multi-threaded workflows
+  face limitations when trying to run multiple tasks concurrently," and that the current
+  workaround is "running multiple independent sandboxes manually (inefficient and
+  resource-heavy)." The proposed fix is forking sandbox state (filesystem + memory) — i.e.
+  Daytona's answer to parallel independent runs is *more sandboxes*, not more sessions in
+  one. [daytona-parallel-issue]
+
+Realistic v1 concurrency model for a shared sandbox: **serialize, or cap to a small N** of
+concurrent sessions, each in its own working dir, sized to fit the sandbox's CPU/RAM. If
+throughput needs to scale, that is the trigger to move to sandbox-per-run.
+
+## pi.dev session / workspace model
+
+pi (by Earendil Inc.) is a minimal, extensible agent harness — the harness Agenta's agent
+workflow defaults to. It runs as an interactive TUI, a print/JSON one-shot, an RPC process
+(stdin/stdout JSONL), or embedded via a Node SDK. [pi-home][pi-docs]
+
+Key points for sharing one sandbox:
+
+- **Sessions are files, not machines.** pi stores each session as a JSONL tree file
+  (branchable history). It does not require a dedicated host per session. [pi-docs]
+- **Per-session isolation is by path.** The SDK's `SessionManager` controls where state
+  lives: `SessionManager.create(cwd)` (new session in a directory),
+  `SessionManager.continueRecent(cwd)`, `SessionManager.open("/path/to/session.jsonl")`
+  (explicit file), and `SessionManager.inMemory()` (ephemeral). You can also point at a
+  different global config via `agentDir`. [pi-sdk]
+- **Multiple pi sessions coexist** in one environment by giving each a distinct `cwd`,
+  distinct session file, and/or distinct `agentDir` — "each combination isolates session
+  state, credentials, and settings files." [pi-sdk]
+- **Context comes from the working dir.** pi loads `AGENTS.md` / `SYSTEM.md` from
+  `~/.pi/agent/`, parent dirs, and the cwd, so the per-run working dir naturally carries
+  per-run agent config. [pi-home]
+- **Non-interactive runs:** `pi --mode rpc --no-session` (or `runRpcMode(runtime)`) for a
+  programmatic, sessionless subprocess driven over JSONL RPC on stdin/stdout. [pi-sdk]
+
+Implication: pi's design is fully compatible with "one shared sandbox, many runs." Each
+agent run = one pi process pointed at its own per-run `cwd` (carrying that run's
+`AGENTS.md`, skills, files) and its own session file. pi gives Agenta the per-run state
+isolation that Daytona volumes do **not**. Agenta's `session_id` should map to (a) the pi
+session file name, (b) the per-run working directory, and (c) the Daytona Session id —
+one id threading all three layers.
+
+## Recommendation for v1 + migration path
+
+### v1: one shared sandbox, isolation by directory (not by volume)
+
+1. **One long-lived shared Daytona sandbox** created from `DAYTONA_SNAPSHOT`, with
+   `autoStopInterval: 0` (or periodic `refreshActivity()`), reused across all agents.
+   Matches the PO's "one runtime for all" goal and the existing prompt-runtime shared model.
+2. **Per-run isolation by working directory, not volume.** For each run, create
+   `/runs/<session_id>/`, lay down that agent's config (`AGENTS.md`, skills, files) and
+   secrets there via startup hooks, and run pi with that dir as `cwd` and its own session
+   file. The "different volume per execution" intent is satisfied by a different *directory*
+   per execution. This sidesteps Daytona's create-time-only volume limit and the FUSE
+   permission/perf problems. [daytona-process][daytona-volumes][daytona-fuse-issue]
+3. **One Daytona Session per run**, keyed by `session_id`; secrets passed via per-command
+   `env`, never global exports. [daytona-process]
+4. **Mandatory cleanup** after each run: kill the run's process group, delete the run dir,
+   `delete_session`. This is what contains filesystem/process/secret bleed in a shared box.
+5. **Bounded concurrency:** serialize, or cap to a small N sized to the sandbox's 1–4 vCPU.
+   [daytona-sandboxes]
+6. **Keep the sandbox-provider port thin** so the unit of isolation (shared vs per-run) is
+   a config choice behind the same interface, as the design doc already anticipates.
+
+Honest framing for the PO: "one sandbox for all agents" is achievable, but **not by
+swapping volumes** — by swapping working directories. The volume idea is the right
+*instinct* (per-run isolated storage) attached to the wrong Daytona primitive. Use
+directories in v1; use volumes only when you move to per-run/per-agent sandboxes.
+
+### Migration path to per-agent / per-run sandboxes
+
+When isolation, security (untrusted code), or concurrency throughput outgrow the shared
+box:
+
+- Flip the provider port from "reuse shared sandbox" to "create sandbox per run."
+- At creation, mount a per-agent persistent volume slice with
+  `VolumeMount(volume_id, mount_path, subpath="agents/<agent_id>")` — this is where the
+  "volume per agent" idea finally becomes native and correct. [daytona-volumes]
+- Optionally enable stronger isolation (Kata/Sysbox) for untrusted code.
+  [daytona-blog-best]
+- Lean on snapshot warm-starts to keep per-run create latency low. [daytona-sandboxes]
+
+Because pi already isolates by `cwd`/session file and `session_id` threads all layers, the
+run-orchestration code barely changes between the two models; only the
+"get-a-sandbox" step swaps.
+
+## Open questions
+
+- **Per-session resource quotas.** Can Daytona cap CPU/RAM/disk per Session (cgroups)
+  inside one sandbox, or is the only quota the whole-sandbox allocation? Not found in docs
+  — UNVERIFIED. If none, concurrent runs cannot be resource-isolated within one sandbox.
+- **Default image users/permissions.** Does the snapshot image allow adding/switching OS
+  users per run without root issues? UNVERIFIED.
+- **Toolbox filesystem API surface** for laying down per-run files/secrets and reading
+  outputs (upload/download/permissions) — needs confirmation against the Daytona Toolbox
+  SDK docs; sibling research on the sandbox port should pin this down.
+- **pi `--no-session` vs Agenta `session_id`.** Agenta wants a `session_id` per run for
+  future state storage; pi can run sessionless (`--no-session`) or with an explicit session
+  file. Decide whether Agenta persists the pi JSONL session file (per the design doc's
+  "future session storage") or treats runs as sessionless and stores its own trace. The
+  design doc's session-storage goal points to keeping pi session files.
+- **Concurrency ceiling.** Exact safe N of parallel pi runs in one 1–4 vCPU sandbox needs
+  empirical testing; treat as serialize-first until measured.
+- **Daytona Parallel Sandbox Execution API status.** Issue #4001 is a proposal; if/when it
+  ships (fork filesystem+memory), it could change the cheapest path for parallel runs.
+  [daytona-parallel-issue]
+
+## Sources
+
+- [daytona-sandboxes] Daytona — Sandboxes (lifecycle, states, auto-stop/archive/delete,
+  refreshActivity, resource limits, per-sandbox isolation):
+  https://www.daytona.io/docs/en/sandboxes/
+- [daytona-process] Daytona — Process and Code Execution (exec/code_run vs Sessions, cwd,
+  env, create/execute/get/delete session): https://www.daytona.io/docs/en/process-code-execution/
+- [daytona-process-src] Daytona docs source — process-code-execution.mdx (verbatim session
+  example, SessionExecuteRequest fields):
+  https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/process-code-execution.mdx
+- [daytona-volumes] Daytona — Volumes (creation, VolumeMount, mount_path/subpath, FUSE,
+  mounting via CreateSandboxFromSnapshotParams): https://www.daytona.io/docs/en/volumes/
+- [daytona-volumes-src] Daytona docs source — volumes.mdx (verbatim "mounted at creation",
+  persistence, FUSE not transactional, last-write-wins):
+  https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/volumes.mdx
+- [daytona-fuse-issue] Daytona GitHub issue #3331 — FUSE volume permission limitations
+  (mv/touch/stat/copystat failures): https://github.com/daytonaio/daytona/issues/3331
+- [daytona-parallel-issue] Daytona GitHub issue #4001 — Design and Implement Parallel
+  Sandbox Execution API (fork filesystem+memory; current workaround = many sandboxes):
+  https://github.com/daytonaio/daytona/issues/4001
+- [daytona-blog-best] Northflank — "Best code execution sandbox for AI agents 2026"
+  (isolated sandbox per execution; Docker-default isolation weaker than microVMs):
+  https://northflank.com/blog/best-code-execution-sandbox-for-ai-agents
+- [pi-home] pi.dev — product overview (harness, modes, AGENTS.md/SYSTEM.md context):
+  https://pi.dev
+- [pi-docs] pi.dev — docs index (session tree, JSONL session format, RPC/SDK modes):
+  https://pi.dev/docs/latest
+- [pi-sdk] pi.dev — SDK/RPC (SessionManager.create/continueRecent/open/inMemory, cwd,
+  agentDir, runRpcMode, `--mode rpc --no-session`): https://pi.dev/docs/latest/sdk
+- Agenta repo — `api/oss/src/utils/env.py` `DaytonaConfig` (DAYTONA_API_KEY,
+  DAYTONA_API_URL, DAYTONA_SNAPSHOT, DAYTONA_TARGET).
+- Agenta repo — `docs/design/agent-workflows/README.md` (agent workflow context, sandbox +
+  pi harness + session_id) and `docs/design/prompt-runtime-unification/README.md` (existing
+  shared prompt runtime model).
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/README.md b/docs/design/agent-workflows/wp-1-pi-tracing/README.md
new file mode 100644
index 0000000000..0e0d1ee46a
--- /dev/null
+++ b/docs/design/agent-workflows/wp-1-pi-tracing/README.md
@@ -0,0 +1,73 @@
+# WP-1: Tracing Pi in Agenta
+
+Status: done. Working code in [`poc/`](poc/). To embed it in the agent runtime, follow
+[`integrating-the-tracing-extension.md`](integrating-the-tracing-extension.md).
+
+## Goal
+
+Install Pi locally, run an agent, and get its telemetry into Agenta as a clean, structured
+trace. Success looks like: a local Pi run shows up in Agenta observability as a sensible
+span tree (session at the root, turns under it, LLM calls and tool calls as child spans)
+with token usage and timings intact.
+
+## Scope
+
+In:
+
+- Run Pi locally (`@earendil-works/pi-coding-agent`), pin an exact version.
+- A Pi extension on the `pi.on(...)` event bus that converts lifecycle events
+  (`session_start`, `turn_*`, `before_provider_request`/`after_provider_response`,
+  `tool_execution_*`, `message_*`) into OTel spans.
+- Export OTLP/HTTP protobuf to Agenta's `POST /api/otlp/v1/traces`.
+- Make the span tree read well in Agenta's UI.
+
+Out (later work packages):
+
+- Running inside Daytona. Local only here.
+- The agent service itself (that is WP-2). This WP produces the tracing extension that
+  WP-2 later embeds.
+
+## Approach (grounded in research)
+
+See [`../research/otel-instrumentation.md`](../research/otel-instrumentation.md) and
+[`../research/pi-interaction.md`](../research/pi-interaction.md).
+
+- Pi emits no OTel on its own. Either adopt/fork a community extension (`pi-otel*`) or write
+  our own on the event bus. Writing our own is likely cleaner since we control the span
+  shape.
+- Emit OTel GenAI semantic conventions (`gen_ai.*`) plus `openinference.span.kind`
+  (AGENT / CHAIN / LLM / TOOL) so Agenta types the nodes correctly. Agenta's adapter
+  registry already understands both.
+- Export over OTLP/HTTP protobuf with `Authorization: ApiKey <key>`; `?project_id=<uuid>` is not needed because the project is resolved from the key.
+
+## Known gotchas to handle
+
+- **Token attribute drift.** Pi-style extensions emit `gen_ai.usage.input_tokens` /
+  `output_tokens`, but Agenta's `semconv.py` maps the older
+  `prompt_tokens` / `completion_tokens` / `total_tokens`. Normalize in the extension or add
+  aliases in Agenta; otherwise token metrics are dropped silently.
+- **Transport.** Agenta accepts OTLP/HTTP protobuf only: neither gRPC nor JSON-encoded OTLP.
+  Configure the exporter accordingly.
+- **Trace-context propagation.** Whether a W3C `traceparent` is threaded into the run so
+  in-sandbox spans nest under an originating backend span is UNVERIFIED. Confirm during this
+  WP.
+
+## Definition of done
+
+- A local Pi run produces one trace in Agenta with a coherent span tree.
+- LLM and tool spans are typed correctly and carry model, latency, and token usage.
+- No silently dropped attributes (token usage in particular is present).
+- The exporter config (endpoint, auth, project) is injected, not hard-coded, so it carries
+  over to the sandboxed and service contexts later.
+
+## Open questions
+
+- Adopt a community `pi-otel` extension or write our own? Lean: write our own.
+- Final span-tree shape to standardize on (session vs interaction root naming).
+- Does Agenta forward `traceparent` into an invocation for nesting?
+
+## Links
+
+- [`../research/otel-instrumentation.md`](../research/otel-instrumentation.md)
+- [`../research/pi-interaction.md`](../research/pi-interaction.md)
+- [Project README](../README.md)
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/integrating-the-tracing-extension.md b/docs/design/agent-workflows/wp-1-pi-tracing/integrating-the-tracing-extension.md
new file mode 100644
index 0000000000..0be59d585c
--- /dev/null
+++ b/docs/design/agent-workflows/wp-1-pi-tracing/integrating-the-tracing-extension.md
@@ -0,0 +1,186 @@
+# Integrating the Pi tracing extension into the agent runtime
+
+Status: ready to integrate. Audience: whoever builds the Dockerized Pi agent runtime
+(WP-2 service, WP-3 sandbox). Source of the working code: [`poc/`](poc/).
+
+## What this gives you
+
+A Pi extension that turns Pi's `pi.on(...)` lifecycle events into OpenTelemetry spans and
+ships them to Agenta over OTLP/HTTP protobuf. Once it is loaded, every agent run shows up
+in Agenta observability as a clean span tree with inputs, outputs, token usage, cost, and
+latency, and runs in the same session are grouped by `session.id`.
+
+It is one self-contained file, `poc/agenta-otel.ts`. Copy it into the runtime as is. It is
+written to be embedded, not just demoed. `poc/run.ts` is only an example driver; you will
+write your own runner, but you can copy its wiring.
+
+This was verified end to end against the dev box: complex multi-tool runs, parallel tool
+calls, structured returns, and multi-prompt sessions all trace correctly, and the agent
+root reports the correct whole-run token and cost totals.
+
+## The span tree it produces
+
+```
+invoke_agent              openinference.span.kind = AGENT   (root, one per user prompt)
+  turn N                  CHAIN
+    chat <model>          LLM    model, latency, token usage, finish reason, messages
+    execute_tool <name>   TOOL   args in, result out
+```
+
+Agenta types nodes from `openinference.span.kind` (AGENT to agent, CHAIN to chain, LLM to
+chat, TOOL to tool) and groups sessions from `session.id`. No backend change is needed.
+
+## How to wire it in
+
+The runtime is Node embedding Pi through the SDK, so use the SDK path. It is the one the
+extension is built for, and it is the only path where session id and model name reach the
+spans.
+
+```ts
+import {
+  AuthStorage, createAgentSession, DefaultResourceLoader,
+  getAgentDir, ModelRegistry, SessionManager,
+} from "@earendil-works/pi-coding-agent";
+import agentaOtel, { runConfig, shutdownTracing } from "./agenta-otel";
+
+const loader = new DefaultResourceLoader({
+  cwd,
+  agentDir: getAgentDir(),
+  extensionFactories: [agentaOtel],   // <-- register the extension in-process
+});
+await loader.reload();
+
+const { session } = await createAgentSession({
+  cwd, model, authStorage, modelRegistry,
+  tools: ["read", "bash", "edit", "write", "ls"],
+  sessionManager: SessionManager.inMemory(cwd),
+  resourceLoader: loader,
+});
+
+// Hand the session id and model to the extension so spans carry them.
+runConfig.sessionId = session.sessionId;
+runConfig.provider = model.provider;
+runConfig.requestModel = model.id;
+
+await session.prompt(userPrompt);     // run one or more prompts in the session
+// ...
+await shutdownTracing();              // flush before the process or container exits
+```
+
+If you instead run Pi from the CLI (`pi -e ./agenta-otel.ts ...`), the extension still
+emits spans and flushes on `session_shutdown`, but `runConfig` is never set, so spans lose
+`session.id` and the model name in the span title. Prefer the SDK path.
+
+## What you must not change, and why
+
+These five choices are load-bearing. They were each found by reading how Agenta ingests
+and normalizes spans. Changing them silently drops data.
+
+1. **Atomic, parent-first export per trace.** The extension uses a small custom
+   `TraceBatchProcessor`, not the OTel `BatchSpanProcessor`. It buffers a trace and exports
+   all of its spans in one OTLP request when the root span ends, ordered parent before
+   child. Agenta rolls token and cost totals up the tree by sorting spans on
+   millisecond-resolution `start_time` and attaching a span only once its parent is
+   present. The default batch processor splits long runs on its 5 second timer, and
+   same-millisecond siblings (`agent_start` and `turn_start` fire in the same millisecond)
+   tie and drop a subtree. Either one makes the agent root undercount, showing only the
+   last turn instead of the whole run. Keep the custom processor.
+
+2. **`ag.data.inputs` must be a JSON object.** Agenta moves any non-object input to
+   `ag.unsupported`. The agent and tool spans emit `input.value` as a JSON object. The chat
+   span emits OpenInference `llm.input_messages.*` and `llm.output_messages.*` so it renders
+   as a real message thread. Do not emit a raw string as `input.value`.
+
+3. **Both token naming conventions.** The extension writes token usage under the current
+   GenAI names (`gen_ai.usage.input_tokens` / `output_tokens`) and the legacy names
+   (`prompt_tokens` / `completion_tokens`). Agenta's default `semconv.py` only maps the
+   legacy names today. Emit both or token metrics drop.
+
+4. **`openinference.span.kind` on every span.** This is what types the node in the UI.
+
+5. **`session.id` and `gen_ai.conversation.id` on the root.** Both map to `ag.session.id`,
+   which groups runs into a session. Set them from the Pi `sessionId`.
+
+## Configuration
+
+All config is read from the environment at first use, so set it before the first run.
+
+| Env var | Meaning |
+|---|---|
+| `AGENTA_HOST` | Agenta base URL, for example `http://144.76.237.122:8280`. A trailing slash is stripped. |
+| `AGENTA_API_KEY` | Agenta project API key. The project is resolved from the key, so no `project_id` is needed. |
+| `PI_OTEL_CAPTURE_CONTENT` | Set to `0` to drop prompts, completions, and tool I/O from spans. Default is on. |
+| `OTEL_SERVICE_NAME` | Resource `service.name`, default `pi-agent`. |
+
+The exporter posts to `${AGENTA_HOST}/api/otlp/v1/traces`. Note the `/api` prefix. The
+transport is OTLP/HTTP protobuf only (`@opentelemetry/exporter-trace-otlp-proto`), with
+header `Authorization: ApiKey <key>`. JSON OTLP and gRPC are rejected.
+
+These are the same env vars whether the runtime runs locally or in a container, which keeps
+local and server behavior identical.
+
+## Dockerized runtime notes
+
+- **Inject the two Agenta env vars** (`AGENTA_HOST`, `AGENTA_API_KEY`) into the container as
+  secrets at start. They are separate from the LLM provider credentials.
+- **Allow outbound network** from the sandbox to the Agenta host over HTTP or HTTPS.
+- **Flush before the container exits.** Call `shutdownTracing()` at the end of the run. The
+  per-trace processor already exports each trace when its root span ends, so a completed
+  trace is usually shipped mid-run, but a final flush guards the last trace. If the
+  container is killed before the flush, the last trace can be lost. If you cannot call
+  `shutdownTracing()`, make sure `SIGTERM` triggers Pi's `session_shutdown`, which the
+  extension also flushes on.
+- **Node 22 or newer** is required by Pi 0.79.4.
+- **LLM auth in the sandbox is your concern, not the tracing.** The interactive ChatGPT
+  Codex login used in the POC is local only. In the container use a non-interactive
+  credential (an API key or a transplanted token).
+- **Trace context across the boundary is done for the WP-2 service.** The agent service
+  threads a W3C `traceparent` into the run and starts the agent span as a child of the
+  Agenta `/invoke` span, so the whole agent run is part of the response trace. See
+  [tracing-in-the-agent-service.md](tracing-in-the-agent-service.md). Standalone runs (no
+  `traceparent`) still create their own root and correlate by `session.id`.
+
+## Dependencies
+
+Pin these in the runtime image (the OTel versions are a known-compatible set):
+
+```
+@earendil-works/pi-coding-agent  0.79.4
+@opentelemetry/api               1.9.0
+@opentelemetry/exporter-trace-otlp-proto  0.54.0
+@opentelemetry/resources         1.28.0
+@opentelemetry/sdk-trace-base    1.28.0
+@opentelemetry/sdk-trace-node    1.28.0
+@opentelemetry/semantic-conventions  1.28.0
+```
+
+## How to verify it works
+
+1. On startup you should see `[agenta-otel] exporting spans to .../api/otlp/v1/traces`.
+2. After a run, fetch the trace and check the tree and totals:
+   ```
+   curl -s "${AGENTA_HOST}/api/spans/?trace_id=<id>" -H "Authorization: ApiKey ${AGENTA_API_KEY}"
+   ```
+   Expect `invoke_agent` (agent) over `turn N` (chain) over `chat` (chat) and
+   `execute_tool` (tool). Expect `ag.data.inputs` and `ag.data.outputs` on the agent, chat,
+   and tool spans, and nothing under `ag.unsupported`. Expect the agent root's
+   `ag.metrics.tokens.cumulative` to equal the sum of the chat spans' incrementals.
+3. Or open Agenta observability and confirm the trace reads well and the root shows the
+   full-run token count and cost.
+
+## Reference: attributes per span
+
+| Span | Key attributes the extension sets |
+|---|---|
+| `invoke_agent` (AGENT) | `openinference.span.kind=AGENT`, `gen_ai.operation.name=invoke_agent`, `session.id`, `gen_ai.conversation.id`, `input.value` as `{prompt}`, `output.value` final text |
+| `turn N` (CHAIN) | `openinference.span.kind=CHAIN`, `pi.turn.index` |
+| `chat <model>` (LLM) | `openinference.span.kind=LLM`, `gen_ai.system`, `gen_ai.request.model`, `gen_ai.response.model`, `gen_ai.response.finish_reasons`, `gen_ai.usage.{input,output,prompt,completion,total}_tokens`, `llm.input_messages.*`, `llm.output_messages.*` |
+| `execute_tool <name>` (TOOL) | `openinference.span.kind=TOOL`, `gen_ai.tool.name`, `gen_ai.tool.call.id`, `input.value` as the args object, `output.value` the result |
+
+## One known gap, not on the agent side
+
+The Agenta Sessions tab groups our `session.id` correctly, and the per-session API
+(`POST /api/traces/query` filtering `ag.session.id`) returns the right traces with costs,
+but the Sessions table's aggregate columns render empty on the current dev build. The data
+is correct and queryable. This is a frontend rendering gap, not something the instrumentation
+or the runtime can fix.
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/poc/.env.example b/docs/design/agent-workflows/wp-1-pi-tracing/poc/.env.example
new file mode 100644
index 0000000000..a1ca16a17b
--- /dev/null
+++ b/docs/design/agent-workflows/wp-1-pi-tracing/poc/.env.example
@@ -0,0 +1,7 @@
+# Agenta collector (the runner also falls back to the repo-root .env.test.local).
+AGENTA_HOST=http://144.76.237.122:8280/
+AGENTA_API_KEY=your-agenta-project-api-key
+
+# Optional:
+# PI_OTEL_CAPTURE_CONTENT=0      # drop prompt/response/tool I/O from spans
+# OTEL_SERVICE_NAME=pi-agent
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/poc/README.md b/docs/design/agent-workflows/wp-1-pi-tracing/poc/README.md
new file mode 100644
index 0000000000..8d78fc4532
--- /dev/null
+++ b/docs/design/agent-workflows/wp-1-pi-tracing/poc/README.md
@@ -0,0 +1,86 @@
+# WP-1 POC: trace the Pi agent harness into Agenta
+
+Installs [Pi](https://pi.dev) locally, runs a small tool-using agent, and exports the
+run to Agenta observability as a clean OpenTelemetry trace.
+
+## What's here
+
+- `agenta-otel.ts` — the deliverable: a Pi extension that turns `pi.on(...)` lifecycle
+  events into OTel spans and exports them (OTLP/HTTP protobuf) to Agenta. WP-2 embeds
+  this file as-is.
+- `run.ts` — a runner that registers the extension in-process and drives one prompt.
+
+## Span tree
+
+```
+invoke_agent              (openinference.span.kind = AGENT, carries session.id)
+  turn N                  (CHAIN)
+    chat <model>          (LLM   — model, latency, token usage, finish reason)
+    execute_tool <name>   (TOOL  — args + result)
+```
+
+Token usage is emitted under both the current (`input_tokens`/`output_tokens`) and
+legacy (`prompt_tokens`/`completion_tokens`) GenAI names, so Agenta maps it regardless
+of which adapter claims the span.
+
+## Setup
+
+```bash
+pnpm install --ignore-workspace
+```
+
+### Authenticate Pi (one time)
+
+The runner uses `~/.pi/agent/auth.json`. Log in with your ChatGPT subscription — no API
+key, no per-token billing:
+
+```bash
+pnpm exec pi          # opens the TUI
+/login                # choose "ChatGPT Plus/Pro (Codex)", finish the browser OAuth
+# then quit the TUI
+```
+
+Alternatively, export `OPENAI_API_KEY` or `ANTHROPIC_API_KEY`.
+
+### Credentials for Agenta
+
+The runner reads `AGENTA_HOST` / `AGENTA_API_KEY` from a local `.env` (see `.env.example`)
+or, failing that, from the repo-root `.env.test.local`.
+
+## Run
+
+```bash
+pnpm start                 # uses gpt-5.5 by default
+PI_MODEL=gpt-5.4 pnpm start # pick another available model
+```
+
+The runner prints the `trace_id` and a `/api/spans/?trace_id=...` fetch URL on exit.
+Then open Agenta observability and find the `invoke_agent` trace.
+
+> Note: `gpt-5.3-codex-spark` is **not** usable on a ChatGPT (Codex) login — requests fail with HTTP 400.
+> Use `gpt-5.5` / `gpt-5.4`.
+
+## Verified mapping (Agenta conventional semantics)
+
+A run produces a coherent tree that types and maps correctly:
+
+```
+invoke_agent (agent)   ag.data.inputs={prompt}, ag.data.outputs=text, ag.session.id, cumulative tokens
+  turn N (chain)
+    chat <model> (chat) ag.data.inputs.prompt[] + ag.data.outputs.completion[] (OpenInference
+                        messages), ag.meta.request.model, incremental token usage
+    execute_tool <name> (tool)  ag.data.inputs={args}, ag.data.outputs=result
+```
+
+Two things make the data land in `ag.data` instead of `ag.unsupported`:
+`ag.data.inputs` must be a **JSON object** (Agenta exiles non-dict inputs), so the agent and
+tool spans emit `input.value` as JSON; the chat span emits OpenInference
+`llm.input_messages.*` / `llm.output_messages.*` so it renders as a message thread. Token
+usage is emitted under both the new (`input_tokens`) and legacy (`prompt_tokens`) names.
+
+A third thing makes the **agent-root token/cost totals correct**: Agenta rolls metrics up
+its span tree by sorting on millisecond-resolution `start_time` and attaching a span only
+once its parent is present. Same-millisecond siblings (e.g. `agent_start`/`turn_start`)
+tie and can drop a subtree from the roll-up. So the extension buffers each trace and
+exports it in one OTLP batch when the root span ends, ordered **parent-first** — without
+this, a multi-turn agent root undercounts (shows only the last turn's tokens/cost).
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/poc/agenta-otel.ts b/docs/design/agent-workflows/wp-1-pi-tracing/poc/agenta-otel.ts
new file mode 100644
index 0000000000..a11d959d36
--- /dev/null
+++ b/docs/design/agent-workflows/wp-1-pi-tracing/poc/agenta-otel.ts
@@ -0,0 +1,414 @@
+/**
+ * agenta-otel — a Pi extension that turns Pi's `pi.on(...)` lifecycle events into
+ * OpenTelemetry spans and exports them (OTLP/HTTP protobuf) to Agenta.
+ *
+ * Span tree (one per user prompt):
+ *   invoke_agent            (openinference.span.kind = AGENT)
+ *     turn N                (CHAIN)
+ *       chat <model>        (LLM)   — the provider request for that turn
+ *       execute_tool <name> (TOOL)  — each tool the turn ran
+ *
+ * Agenta's OpenInference adapter types nodes off `openinference.span.kind`
+ * (AGENT->agent, CHAIN->chain, LLM->chat, TOOL->tool) and `session.id` ->
+ * `ag.session.id`. Token usage is emitted under BOTH the legacy
+ * (`prompt_tokens`/`completion_tokens`) and current
+ * (`input_tokens`/`output_tokens`) GenAI names so it maps regardless of which
+ * Agenta adapter claims the span.
+ *
+ * Works two ways with the same file:
+ *   - SDK: pass the default export to DefaultResourceLoader.extensionFactories,
+ *     then call shutdownTracing() after the run to flush (see run.ts).
+ *   - CLI: `pi -e ./agenta-otel.ts`; the session_shutdown handler flushes on exit.
+ *
+ * Config (read lazily so the runner can load .env first):
+ *   AGENTA_HOST, AGENTA_API_KEY         — exporter endpoint + auth (required)
+ *   PI_OTEL_CAPTURE_CONTENT=0           — disable prompt/response/tool I/O capture
+ *   OTEL_SERVICE_NAME                   — resource service.name (default "pi-agent")
+ */
+import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
+import {
+  context,
+  trace,
+  SpanStatusCode,
+  type Context,
+  type Span,
+} from "@opentelemetry/api";
+import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-proto";
+import { Resource } from "@opentelemetry/resources";
+import type {
+  ReadableSpan,
+  SpanExporter,
+  SpanProcessor,
+} from "@opentelemetry/sdk-trace-base";
+import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
+import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
+
+/**
+ * Buffer a trace's spans and export them in ONE OTLP batch when its local root
+ * ends. Agenta computes cumulative (rolled-up) token/cost metrics per ingest
+ * batch, so a trace split across batches (BatchSpanProcessor's timer does that
+ * on long runs) loses the root aggregation. The local root is the span with no
+ * parent, or whose parent is remote (propagated trace context, never ends here).
+ */
+class TraceBatchProcessor implements SpanProcessor {
+  private readonly buffers = new Map<string, ReadableSpan[]>();
+  private readonly localRoots = new Set<string>(); // ids of spans whose end completes a trace
+  constructor(private readonly exporter: SpanExporter) {}
+  onStart(span: ReadableSpan, parentContext: Context): void {
+    const remoteParent = trace.getSpanContext(parentContext)?.isRemote === true;
+    if (!span.parentSpanId || remoteParent) this.localRoots.add(span.spanContext().spanId);
+  }
+  onEnd(span: ReadableSpan): void {
+    const { traceId, spanId } = span.spanContext();
+    const spans = this.buffers.get(traceId) ?? [];
+    spans.push(span);
+    this.buffers.set(traceId, spans);
+    if (!this.localRoots.delete(spanId)) return; // descendant: keep buffering
+    // Local root ended: all descendants ended earlier, so the trace is complete.
+    this.buffers.delete(traceId);
+    this.exporter.export(orderParentFirst(spans), () => {});
+  }
+  forceFlush(): Promise<void> {
+    const leftovers = orderParentFirst([...this.buffers.values()].flat());
+    this.buffers.clear();
+    if (leftovers.length === 0) return Promise.resolve();
+    return new Promise((resolve) => this.exporter.export(leftovers, () => resolve()));
+  }
+  shutdown(): Promise<void> {
+    return this.forceFlush().then(() => this.exporter.shutdown());
+  }
+}
+
+/**
+ * Return `spans` rearranged so each span follows its parent (preorder DFS,
+ * siblings in their incoming order). Agenta keeps millisecond timestamps and
+ * builds its roll-up tree from a start_time sort, attaching a span only once
+ * its parent has been seen; events fired within one millisecond
+ * (agent_start/turn_start) tie, and a child that sorts ahead of its parent is
+ * dropped from the cumulative tree. Sending parents first lets the backend's
+ * stable sort keep them ahead on ties.
+ */
+function orderParentFirst(spans: ReadableSpan[]): ReadableSpan[] {
+  const knownIds = new Set(spans.map((span) => span.spanContext().spanId));
+  const kids = new Map<string, ReadableSpan[]>();
+  const tops: ReadableSpan[] = [];
+  for (const span of spans) {
+    const parentId = span.parentSpanId;
+    if (!parentId || !knownIds.has(parentId)) {
+      tops.push(span);
+      continue;
+    }
+    const siblings = kids.get(parentId);
+    if (siblings) siblings.push(span);
+    else kids.set(parentId, [span]);
+  }
+  // Explicit stack instead of recursion; children are pushed reversed so they pop in order.
+  const ordered: ReadableSpan[] = [];
+  const stack = [...tops].reverse();
+  for (let span = stack.pop(); span; span = stack.pop()) {
+    ordered.push(span);
+    const children = kids.get(span.spanContext().spanId) ?? [];
+    for (let i = children.length - 1; i >= 0; i--) stack.push(children[i]);
+  }
+  // Spans never reached from a root (defensive) are appended so nothing is dropped.
+  const seen = new Set(ordered);
+  for (const span of spans) if (!seen.has(span)) ordered.push(span);
+  return ordered;
+}
+
+/** Shared, mutable run metadata: the runner sets it before prompting so spans can carry session + model. */
+export const runConfig: {
+  sessionId?: string;
+  provider?: string;
+  requestModel?: string;
+  /** Filled by the extension on agent_start so the runner can print/fetch the trace. */
+  traceId?: string;
+} = {};
+
+let provider: NodeTracerProvider | undefined; // set by initTracing(), reset to undefined by shutdownTracing()
+let captureContent = true; // overwritten in initTracing(): false only when PI_OTEL_CAPTURE_CONTENT is "0"
+
+/**
+ * Create and register the tracer provider on the first call; later calls are
+ * no-ops. The environment is read here, not at import, so a runner can load
+ * .env before the first run.
+ */
+function initTracing(): void {
+  if (provider) return;
+
+  const rawHost = process.env.AGENTA_HOST || "https://cloud.agenta.ai";
+  const url = `${rawHost.replace(/\/+$/, "")}/api/otlp/v1/traces`;
+  const apiKey = process.env.AGENTA_API_KEY || "";
+  captureContent = process.env.PI_OTEL_CAPTURE_CONTENT !== "0";
+
+  if (!apiKey) {
+    console.warn(
+      "[agenta-otel] AGENTA_API_KEY is not set — the collector will reject spans with 401.",
+    );
+  }
+  console.log(`[agenta-otel] exporting spans to ${url} (content capture: ${captureContent})`);
+
+  const exporter = new OTLPTraceExporter({
+    url,
+    headers: { Authorization: `ApiKey ${apiKey}` },
+    timeoutMillis: 10_000,
+  });
+  const serviceName = process.env.OTEL_SERVICE_NAME || "pi-agent";
+  provider = new NodeTracerProvider({
+    resource: new Resource({ [ATTR_SERVICE_NAME]: serviceName }),
+  });
+  provider.addSpanProcessor(new TraceBatchProcessor(exporter));
+  provider.register();
+}
+
+/** Flush and shut down tracing after a run, and release the global provider slot so a later run can register. */
+export async function shutdownTracing(): Promise<void> {
+  try {
+    await provider?.forceFlush();
+    await provider?.shutdown();
+  } finally {
+    if (provider) trace.disable(); // otherwise the next provider.register() is rejected as a duplicate
+    provider = undefined;
+  }
+}
+
+const tracer = () => trace.getTracer("agenta-pi-otel", "0.1.0"); // looked up on each call, not cached at import
+
+// --- per-run span state, module-level and therefore shared (the POC runs one prompt at a time) ---
+let agentSpan: Span | undefined;
+let agentCtx: Context | undefined;
+let pendingPrompt: string | undefined;
+let currentTurn: { span: Span; ctx: Context; index?: number } | undefined;
+let llmSpan: Span | undefined;
+let lastContextMessages: any[] | undefined;
+const toolSpans = new Map<string, Span>(); // NOTE(review): presumably keyed by tool call id — confirm in the handlers
+
+/** Record `value` as output.value (→ ag.data.outputs): strings as-is, other values as JSON; empty results are skipped. */
+function setOutput(span: Span, value: unknown): void {
+  if (!captureContent || value == null) return;
+  const text = typeof value === "string" ? value : JSON.stringify(value);
+  if (text) span.setAttribute("output.value", text); // stringify yields undefined for functions/symbols
+}
+
+/**
+ * Attach `obj` to the span as input.value, serialised to JSON and tagged with a
+ * JSON mime type. ag.data.inputs must be a dict: a non-object (raw string)
+ * input would be relocated to ag.unsupported by Agenta.
+ */
+function setInputs(span: Span, obj: Record<string, unknown>): void {
+  if (!captureContent) return;
+  span.setAttributes({ "input.value": JSON.stringify(obj), "input.mime_type": "application/json" });
+}
+
+function oiRole(role: string): string {
+  return role === "toolResult" ? "tool" : role; // only "toolResult" is renamed; user | assistant | system pass through
+}
+
+/** Plain text of a message: string content as-is, otherwise its "text" blocks joined ("" if neither). */
+function messageText(msg: any): string {
+  const content = msg?.content;
+  if (typeof content === "string") return content;
+  if (!Array.isArray(content)) return "";
+  const pieces: any[] = [];
+  for (const block of content) {
+    if (block?.type === "text") pieces.push(block.text);
+  }
+  return pieces.join("");
+}
+
+/**
+ * Write `messages` onto `span` as OpenInference structured message attributes
+ * under `prefix`, so Agenta renders a proper message thread:
+ * `llm.input_messages.*` -> ag.data.inputs.prompt.*,
+ * `llm.output_messages.*` -> ag.data.outputs.completion.*.
+ */
+function emitMessages(span: Span, prefix: string, messages: any[]): void {
+  if (!captureContent || !Array.isArray(messages)) return;
+  messages.forEach((message, index) => {
+    const key = `${prefix}.${index}.message`;
+    span.setAttribute(`${key}.role`, oiRole(message.role));
+    const body = messageText(message);
+    if (body) span.setAttribute(`${key}.content`, body);
+    // A tool result carries the id of the call it answers.
+    if (message.role === "toolResult" && message.toolCallId) {
+      span.setAttribute(`${key}.tool_call_id`, message.toolCallId);
+    }
+    if (!Array.isArray(message.content)) return;
+    // Tool calls are numbered among themselves, not by their position in `content`.
+    const calls = message.content.filter((block: any) => block?.type === "toolCall");
+    for (const [n, call] of calls.entries()) {
+      const callKey = `${key}.tool_calls.${n}.tool_call`;
+      if (call.id) span.setAttribute(`${callKey}.id`, call.id);
+      span.setAttribute(`${callKey}.function.name`, call.name);
+      const args = JSON.stringify(call.arguments ?? {});
+      span.setAttribute(`${callKey}.function.arguments`, args);
+    }
+  });
+}
+
+/** Flatten a tool result to text: strings as-is, arrays via their "text" items, `.content` unwrapped, else JSON. */
+function toolResultText(result: any): string {
+  if (!result) return "";
+  if (typeof result === "string") return result;
+  if (!Array.isArray(result)) {
+    // Unwrap { content: ... } wrappers; anything else is serialised whole.
+    return result.content ? toolResultText(result.content) : JSON.stringify(result);
+  }
+  const texts: any[] = [];
+  for (const item of result) if (item?.type === "text") texts.push(item.text);
+  return texts.join("");
+}
+
+/**
+ * Return the text of the most recent assistant message in `messages`, or ""
+ * when `messages` is not an array or contains no assistant message.
+ */
+function lastAssistantText(messages: any): string {
+  if (!Array.isArray(messages)) return "";
+  // Scan from the end so the latest assistant message wins.
+  let index = messages.length;
+  while (index-- > 0) {
+    const candidate = messages[index];
+    if (candidate?.role === "assistant") return messageText(candidate);
+  }
+  return "";
+}
+
+/**
+ * Fill an LLM span from a finished assistant message (model, tokens, finish, output).
+ *
+ * Writes, in order:
+ *   - the `gen_ai.*` identity attributes that are present on the message
+ *     (system, request/response model, response id, finish reasons);
+ *   - token usage, when `msg.usage` exists, under both the current and the
+ *     legacy attribute names, plus cache and cost figures when available;
+ *   - the message itself as `llm.output_messages`;
+ *   - an ERROR status when `stopReason === "error"` or an `errorMessage` is set.
+ *
+ * The span is not ended here; that is left to the caller.
+ */
+function applyAssistant(span: Span, msg: any): void {
+  const { provider: system, model, responseModel, responseId, stopReason } = msg;
+
+  if (system) span.setAttribute("gen_ai.system", system);
+  if (model) span.setAttribute("gen_ai.request.model", model);
+  if (responseModel || model) {
+    span.setAttribute("gen_ai.response.model", responseModel ?? model);
+  }
+  if (responseId) span.setAttribute("gen_ai.response.id", responseId);
+  if (stopReason) {
+    span.setAttribute("gen_ai.response.finish_reasons", [String(stopReason)]);
+  }
+
+  const usage = msg.usage;
+  if (usage) {
+    const inputTokens = usage.input ?? 0;
+    const outputTokens = usage.output ?? 0;
+
+    // Current GenAI names (mapped by Agenta's logfire adapter) ...
+    span.setAttribute("gen_ai.usage.input_tokens", inputTokens);
+    span.setAttribute("gen_ai.usage.output_tokens", outputTokens);
+    // ... and legacy names (mapped by Agenta's semconv.py). Emit both so token
+    // usage is never silently dropped regardless of which adapter wins.
+    span.setAttribute("gen_ai.usage.prompt_tokens", inputTokens);
+    span.setAttribute("gen_ai.usage.completion_tokens", outputTokens);
+
+    // Prefer the reported total; otherwise derive it from input + output.
+    const totalTokens = usage.totalTokens ?? (inputTokens + outputTokens);
+    span.setAttribute("gen_ai.usage.total_tokens", totalTokens);
+
+    if (usage.cacheRead) {
+      span.setAttribute("gen_ai.usage.cache_read_input_tokens", usage.cacheRead);
+    }
+    if (usage.cacheWrite) {
+      span.setAttribute("gen_ai.usage.cache_creation_input_tokens", usage.cacheWrite);
+    }
+    if (usage.cost?.total != null) {
+      span.setAttribute("gen_ai.usage.cost", usage.cost.total);
+    }
+  }
+
+  emitMessages(span, "llm.output_messages", [msg]);
+
+  const failed = stopReason === "error" || Boolean(msg.errorMessage);
+  if (failed) {
+    span.setStatus({ code: SpanStatusCode.ERROR, message: msg.errorMessage });
+  }
+}
+
+/**
+ * Pi extension entry point: translate Pi lifecycle events into an
+ * OpenTelemetry span tree.
+ *
+ * Spans produced by the handlers below (children normally nest as shown,
+ * falling back to the agent context or the active context when no turn is
+ * open):
+ *
+ *   invoke_agent              agent_start .. agent_end
+ *     turn <n>                turn_start .. turn_end
+ *       chat <model>          before_provider_request .. assistant message_end
+ *       execute_tool <name>   tool_execution_start .. tool_execution_end
+ *
+ * All in-flight state (agent/turn/LLM spans, tool spans, pending prompt, last
+ * context messages) lives in variables declared outside this function.
+ * NOTE(review): because that state is single-valued, overlapping agent runs
+ * would overwrite each other; confirm runs are sequential.
+ *
+ * A span that misses its closing event is ended at the next safe point
+ * (next provider request, turn end, agent end) instead of being dropped:
+ * a span that is never ended is never exported.
+ *
+ * @param pi Extension API used to subscribe to Pi events.
+ */
+export default function agentaOtel(pi: ExtensionAPI): void {
+  initTracing();
+  const t = tracer();
+
+  // End the open LLM span, if any, filling it from `msg` first when given.
+  const closeLlmSpan = (msg?: any): void => {
+    if (!llmSpan) return;
+    if (msg) applyAssistant(llmSpan, msg);
+    llmSpan.end();
+    llmSpan = undefined;
+  };
+
+  // End the open turn span, if any.
+  const closeTurn = (): void => {
+    if (!currentTurn) return;
+    currentTurn.span.end();
+    currentTurn = undefined;
+  };
+
+  // Remember the prompt so agent_start can record it as the agent span input.
+  pi.on("before_agent_start", async (event: any) => {
+    pendingPrompt = event?.prompt;
+  });
+
+  // Root span for the whole agent run.
+  pi.on("agent_start", async () => {
+    agentSpan = t.startSpan("invoke_agent");
+    agentSpan.setAttribute("openinference.span.kind", "AGENT");
+    agentSpan.setAttribute("gen_ai.operation.name", "invoke_agent");
+    agentSpan.setAttribute("gen_ai.agent.name", "pi");
+    if (runConfig.sessionId) {
+      agentSpan.setAttribute("session.id", runConfig.sessionId);
+      agentSpan.setAttribute("gen_ai.conversation.id", runConfig.sessionId);
+    }
+    setInputs(agentSpan, { prompt: pendingPrompt ?? "" });
+    // Publish the trace id on the shared run config and keep a context that
+    // carries the agent span, so later spans can be parented to it.
+    runConfig.traceId = agentSpan.spanContext().traceId;
+    agentCtx = trace.setSpan(context.active(), agentSpan);
+  });
+
+  // The messages handed to the next LLM call — the chat span's input.
+  pi.on("context", async (event: any) => {
+    if (Array.isArray(event?.messages)) lastContextMessages = event.messages;
+  });
+
+  // One CHAIN span per turn, parented to the agent span when there is one.
+  pi.on("turn_start", async (event: any) => {
+    const parent = agentCtx ?? context.active();
+    const name = event?.turnIndex != null ? `turn ${event.turnIndex}` : "turn";
+    const span = t.startSpan(name, undefined, parent);
+    span.setAttribute("openinference.span.kind", "CHAIN");
+    if (event?.turnIndex != null) span.setAttribute("pi.turn.index", event.turnIndex);
+    currentTurn = { span, ctx: trace.setSpan(parent, span), index: event?.turnIndex };
+  });
+
+  // One LLM span per provider request.
+  pi.on("before_provider_request", async (_event: any, ctx: any) => {
+    // A previous request that never produced an assistant message would
+    // otherwise lose its span when `llmSpan` is reassigned below.
+    closeLlmSpan();
+    const parent = currentTurn?.ctx ?? agentCtx ?? context.active();
+    // Values on runConfig take precedence over the model Pi reports.
+    const modelId = runConfig.requestModel ?? ctx?.model?.id;
+    const providerName = runConfig.provider ?? ctx?.model?.provider;
+    llmSpan = t.startSpan(modelId ? `chat ${modelId}` : "chat", undefined, parent);
+    llmSpan.setAttribute("openinference.span.kind", "LLM");
+    llmSpan.setAttribute("gen_ai.operation.name", "chat");
+    if (providerName) llmSpan.setAttribute("gen_ai.system", providerName);
+    if (modelId) llmSpan.setAttribute("gen_ai.request.model", modelId);
+    if (lastContextMessages) emitMessages(llmSpan, "llm.input_messages", lastContextMessages);
+  });
+
+  // The assistant message completes the LLM span; other roles are ignored.
+  pi.on("message_end", async (event: any) => {
+    const msg = event?.message;
+    if (!msg || msg.role !== "assistant") return;
+    closeLlmSpan(msg);
+  });
+
+  // One TOOL span per tool execution, tracked by tool call id.
+  pi.on("tool_execution_start", async (event: any) => {
+    const parent = currentTurn?.ctx ?? agentCtx ?? context.active();
+    const name = event?.toolName ? `execute_tool ${event.toolName}` : "execute_tool";
+    const span = t.startSpan(name, undefined, parent);
+    span.setAttribute("openinference.span.kind", "TOOL");
+    span.setAttribute("gen_ai.operation.name", "execute_tool");
+    if (event?.toolName) span.setAttribute("gen_ai.tool.name", event.toolName);
+    if (event?.toolCallId) span.setAttribute("gen_ai.tool.call.id", event.toolCallId);
+    setInputs(span, (event?.args as Record<string, unknown>) ?? {});
+    if (event?.toolCallId) {
+      toolSpans.set(event.toolCallId, span);
+    } else {
+      // Without an id the end event cannot find this span again; end it now
+      // so the call is at least exported rather than left open forever.
+      span.end();
+    }
+  });
+
+  pi.on("tool_execution_end", async (event: any) => {
+    const span = event?.toolCallId ? toolSpans.get(event.toolCallId) : undefined;
+    if (!span) return;
+    setOutput(span, toolResultText(event?.result));
+    if (event?.isError) span.setStatus({ code: SpanStatusCode.ERROR });
+    span.end();
+    toolSpans.delete(event.toolCallId);
+  });
+
+  pi.on("turn_end", async (event: any) => {
+    // Safety net: if the LLM span is still open (no assistant message_end seen),
+    // close it from the turn's assistant message when there is one, and close
+    // it regardless so it does not outlive its turn.
+    closeLlmSpan(event?.message);
+    closeTurn();
+  });
+
+  pi.on("agent_end", async (event: any) => {
+    // Close whatever an aborted or failed run left open before the root span
+    // ends, so these spans are still exported.
+    toolSpans.forEach((span: Span) => span.end());
+    toolSpans.clear();
+    closeLlmSpan();
+    closeTurn();
+
+    if (!agentSpan) return;
+    setOutput(agentSpan, lastAssistantText(event?.messages));
+    agentSpan.end();
+    agentSpan = undefined;
+    agentCtx = undefined;
+    lastContextMessages = undefined;
+  });
+
+  // CLI (`pi -e`) flush path. The SDK runner additionally calls shutdownTracing().
+  pi.on("session_shutdown", async () => {
+    try {
+      await provider?.forceFlush();
+    } catch {
+      /* best effort */
+    }
+  });
+}
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/poc/package.json b/docs/design/agent-workflows/wp-1-pi-tracing/poc/package.json
new file mode 100644
index 0000000000..e3d23ae603
--- /dev/null
+++ b/docs/design/agent-workflows/wp-1-pi-tracing/poc/package.json
@@ -0,0 +1,25 @@
+{
+  "name": "wp-1-pi-tracing-poc",
+  "version": "0.0.0",
+  "private": true,
+  "type": "module",
+  "description": "WP-1 POC: trace the Pi agent harness into Agenta via an OTel extension.",
+  "scripts": {
+    "start": "tsx run.ts",
+    "login": "pi"
+  },
+  "dependencies": {
+    "@earendil-works/pi-coding-agent": "0.79.4",
+    "@opentelemetry/api": "1.9.0",
+    "@opentelemetry/exporter-trace-otlp-proto": "0.54.0",
+    "@opentelemetry/resources": "1.28.0",
+    "@opentelemetry/sdk-trace-base": "1.28.0",
+    "@opentelemetry/sdk-trace-node": "1.28.0",
+    "@opentelemetry/semantic-conventions": "1.28.0",
+    "dotenv": "17.2.3"
+  },
+  "devDependencies": {
+    "tsx": "4.19.2",
+    "@types/node": "22.10.2"
+  }
+}
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/poc/pnpm-lock.yaml b/docs/design/agent-workflows/wp-1-pi-tracing/poc/pnpm-lock.yaml
new file mode 100644
index 0000000000..54c94564b7
--- /dev/null
+++ b/docs/design/agent-workflows/wp-1-pi-tracing/poc/pnpm-lock.yaml
@@ -0,0 +1,1842 @@
+lockfileVersion: '9.0'
+
+settings:
+  autoInstallPeers: true
+  excludeLinksFromLockfile: false
+
+importers:
+
+  .:
+    dependencies:
+      '@earendil-works/pi-coding-agent':
+        specifier: 0.79.4
+        version: 0.79.4(ws@8.21.0)(zod@4.4.3)
+      '@opentelemetry/api':
+        specifier: 1.9.0
+        version: 1.9.0
+      '@opentelemetry/exporter-trace-otlp-proto':
+        specifier: 0.54.0
+        version: 0.54.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources':
+        specifier: 1.28.0
+        version: 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-base':
+        specifier: 1.28.0
+        version: 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-node':
+        specifier: 1.28.0
+        version: 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions':
+        specifier: 1.28.0
+        version: 1.28.0
+      dotenv:
+        specifier: 17.2.3
+        version: 17.2.3
+    devDependencies:
+      '@types/node':
+        specifier: 22.10.2
+        version: 22.10.2
+      tsx:
+        specifier: 4.19.2
+        version: 4.19.2
+
+packages:
+
+  '@anthropic-ai/sdk@0.91.1':
+    resolution: {integrity: sha512-LAmu761tSN9r66ixvmciswUj/ZC+1Q4iAfpedTfSVLeswRwnY3n2Nb6Tsk+cLPP28aLOPWeMgIuTuCcMC6W/iw==}
+    hasBin: true
+    peerDependencies:
+      zod: ^3.25.0 || ^4.0.0
+    peerDependenciesMeta:
+      zod:
+        optional: true
+
+  '@aws-crypto/crc32@5.2.0':
+    resolution: {integrity: sha512-nLbCWqQNgUiwwtFsen1AdzAtvuLRsQS8rYgMuxCrdKf9kOssamGLuPwyTY9wyYblNr9+1XM8v6zoDTPPSIeANg==}
+    engines: {node: '>=16.0.0'}
+
+  '@aws-crypto/sha256-browser@5.2.0':
+    resolution: {integrity: sha512-AXfN/lGotSQwu6HNcEsIASo7kWXZ5HYWvfOmSNKDsEqC4OashTp8alTmaz+F7TC2L083SFv5RdB+qU3Vs1kZqw==}
+
+  '@aws-crypto/sha256-js@5.2.0':
+    resolution: {integrity: sha512-FFQQyu7edu4ufvIZ+OadFpHHOt+eSTBaYaki44c+akjg7qZg9oOQeLlk77F6tSYqjDAFClrHJk9tMf0HdVyOvA==}
+    engines: {node: '>=16.0.0'}
+
+  '@aws-crypto/supports-web-crypto@5.2.0':
+    resolution: {integrity: sha512-iAvUotm021kM33eCdNfwIN//F77/IADDSs58i+MDaOqFrVjZo9bAal0NK7HurRuWLLpF1iLX7gbWrjHjeo+YFg==}
+
+  '@aws-crypto/util@5.2.0':
+    resolution: {integrity: sha512-4RkU9EsI6ZpBve5fseQlGNUWKMa1RLPQ1dnjnQoe07ldfIzcsGb5hC5W0Dm7u423KWzawlrpbjXBrXCEv9zazQ==}
+
+  '@aws-sdk/client-bedrock-runtime@3.1048.0':
+    resolution: {integrity: sha512-u+NT61JZEkRFtpL0CAw1N1dwxnaLgwVXQl/zjJxTGgLyS/jTIdg2SdoEoCTHxgDyCnqa1HEi9QOoE9/pYRNpOQ==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/core@3.974.20':
+    resolution: {integrity: sha512-7sDi2B2N3mc3nf1nz6FyEx/FCrJ1N1QnBmraHHQNabFaeAh2IaOOLml48/rHOD1bICHgTRkbBgNTvUzEr5Z35g==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/credential-provider-env@3.972.46':
+    resolution: {integrity: sha512-+GPXVS2srMOlH74S+SmC1gVuP2TvUZ0siuC0onKO93q+udP+M72dmY8wJfVQ5CX9z/9X5A1HHwz5yRIGBtskvQ==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/credential-provider-http@3.972.48':
+    resolution: {integrity: sha512-fA5loSdlocacRxyUXtpoHSMuk5rsIKRDzQYVMnMxjcmFeZshaJlJ8lymy/hYKji6sne/UmNGj5pxuEs6kq/Qcg==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/credential-provider-ini@3.972.53':
+    resolution: {integrity: sha512-ZfdhIOR41q8TcWEnUac+gCOb+O2LBWdHLmjedXpXz4IEFW2ppNuFcm6p0sMTavpM+zD5TYfpH5Gp7guRyqSgsQ==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/credential-provider-login@3.972.52':
+    resolution: {integrity: sha512-9hu2oR0qH7Fst5Tzdx+UWxm+w5zCXtErTLtOOW5hwwQc170CLwOeniRxyFY6s9mHfGEfC5zFukNBdKBwJR8mhQ==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/credential-provider-node@3.972.55':
+    resolution: {integrity: sha512-zMGLa/dhESVqmCD7mmIFFKSwSFrJGScvCXcjvBZEVOOMauFS5JRQvLTMukFpMEFWiV6dTAlsen2ATDBulLPtbg==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/credential-provider-process@3.972.46':
+    resolution: {integrity: sha512-VUoNFBIjWrUN8NbFiQiuxQEgFjvziAlBRPK+ddh27aj65gk0BYu6bLZnrdrNZwpW6vAihtSUtEMQ1PUJ32QRPA==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/credential-provider-sso@3.972.52':
+    resolution: {integrity: sha512-nb2/n4o/HQf+FVpVbZe9vCTFngmuDoIsltMgLAtjixaKzvzhB4J8WSDFyWgnErgLHk55ctWH+I4PU+LIHhyffg==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/credential-provider-web-identity@3.972.52':
+    resolution: {integrity: sha512-lKj6aRSGbqLmpYmM24bY7a1Xmfcq2vkE3hv8CSPYfc1yCu0BPu/XEJ1L4Fm61MsU6ULLNSG8UGsffNoFUBjESA==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/eventstream-handler-node@3.972.21':
+    resolution: {integrity: sha512-mVC0hOmwGJmNFezZ+wM8Sqfap/LjsMavEf2Evl0YWrLAcrdZOEdjnY8nRvgakVViWJSGm2eJxLuPVHGdeV06kA==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/middleware-eventstream@3.972.17':
+    resolution: {integrity: sha512-tdbnXbw73ww62ABWP0G0Z/euvFowEEvAoi/zG4NaZo7HJFpfGho/Z65HyVzkJLT1cMsUregr4pTyxljlarT0wA==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/middleware-websocket@3.972.28':
+    resolution: {integrity: sha512-SCW06Zjugn86pq7+dxGnFcyWJuEWHT753HTU/Vj/OzVxP+NoShwdAr4ynxAcvWL883OgRVbSqW3ohnjIxwXjjw==}
+    engines: {node: '>= 14.0.0'}
+
+  '@aws-sdk/nested-clients@3.997.20':
+    resolution: {integrity: sha512-IYJuLpXp2DEILVQpQOy0PMpkftv0AHEOCn52o0atyOaumA0CdWQ3klPyXdViGYLbNpESsVFMVybvHUeZAuiGxA==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/signature-v4-multi-region@3.996.34':
+    resolution: {integrity: sha512-mx1L5qlumSOt/nKM3BFaHE2HVkWwz0i4Bw0pyYO42FfX/FeLlo8YI6csC0gSPprEk6fTIqI+CZN9RwUwKd5krQ==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/token-providers@3.1048.0':
+    resolution: {integrity: sha512-k0y/GcuesuSfWyUM0WamrGyeZmltRYaPbHO82UDA6mZ/doB+FOHKutikPAtSXMn/hDz970cF+iRuuiYO9VEbAA==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/token-providers@3.1066.0':
+    resolution: {integrity: sha512-UqEUJq7dqa44hneLDUcX7UJy95cg8YqEWyakRpvIPnrNS3Mq+UlQHgCDGu5pvwAPtlIW4qcYbvW6reG6++FyvA==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/types@3.973.12':
+    resolution: {integrity: sha512-43ajd1NF0RMgX5k0hxCNUyEdrtFUsb2aHT2QvpktSC/2Eyb2Jr/JPVqdp0XIoaHWikZJq5tNWSLO6kB5q2eMCA==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/util-locate-window@3.965.7':
+    resolution: {integrity: sha512-M0D6oIpohdNHjc7udzTHEQyot0+0iuA36jc2I9Hps+f/GtKi2HO/pyijQnCnNcwZqLB5+rtn81z3eZK/GyjAmA==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/xml-builder@3.972.29':
+    resolution: {integrity: sha512-fk0niuGFxfi8yIJuMVM4mhwObkiQSuwZFj3tAPrLVx64Pk3BkrEIpqjzHKY4hKoEBUD6Jg/S74Zj9jy+5F3DnQ==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws/lambda-invoke-store@0.2.4':
+    resolution: {integrity: sha512-iY8yvjE0y651BixKNPgmv1WrQc+GZ142sb0z4gYnChDDY2YqI4P/jsSopBWrKfAt7LOJAkOXt7rC/hms+WclQQ==}
+    engines: {node: '>=18.0.0'}
+
+  '@babel/runtime@7.29.7':
+    resolution: {integrity: sha512-Nq8OhGWiZIZGV6hLHoyAKLLcJihP/xFeBMGJoUrxTX2psI8dCifzLhZISFb+VWS3wFMRDmCGw5R+dOySCqPLhw==}
+    engines: {node: '>=6.9.0'}
+
+  '@earendil-works/pi-agent-core@0.79.4':
+    resolution: {integrity: sha512-xkaZ3yK2XbP9HYdHrrdj/6HqZPM0o/mwbjMSU4RTJyR3HjDG0ZrPz76Hg6s0W+G4u6PpJr1mGx/srCG+3eQA8A==}
+    engines: {node: '>=22.19.0'}
+
+  '@earendil-works/pi-ai@0.79.4':
+    resolution: {integrity: sha512-Z1j+YP+6ZyPBKDUoc5m0GO/o1hPK17fWeErtDgegCTpm2dcKzuFvL/7GTqHeJkVkfpeXRwO37xOfgozQbK6EUw==}
+    engines: {node: '>=22.19.0'}
+    hasBin: true
+
+  '@earendil-works/pi-coding-agent@0.79.4':
+    resolution: {integrity: sha512-PthzVzM5m4XH/hrU+2fVjuwuH5M4eMFWbd0NCRScH14XKpwlPc8/Fh6JDz0jQb5kTBT9oQT183YLTHVVulFL9A==}
+    engines: {node: '>=22.19.0'}
+    hasBin: true
+
+  '@earendil-works/pi-tui@0.79.4':
+    resolution: {integrity: sha512-/ZhfFiHSBMH7AbDrBQIN+UWlJnl9tSEpLYICRGGMzmNfyCqX+30NYacIhyOEaD8R5rS6wJZysAOPU0yNwigbXw==}
+    engines: {node: '>=22.19.0'}
+
+  '@esbuild/aix-ppc64@0.23.1':
+    resolution: {integrity: sha512-6VhYk1diRqrhBAqpJEdjASR/+WVRtfjpqKuNw11cLiaWpAT/Uu+nokB+UJnevzy/P9C/ty6AOe0dwueMrGh/iQ==}
+    engines: {node: '>=18'}
+    cpu: [ppc64]
+    os: [aix]
+
+  '@esbuild/android-arm64@0.23.1':
+    resolution: {integrity: sha512-xw50ipykXcLstLeWH7WRdQuysJqejuAGPd30vd1i5zSyKK3WE+ijzHmLKxdiCMtH1pHz78rOg0BKSYOSB/2Khw==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [android]
+
+  '@esbuild/android-arm@0.23.1':
+    resolution: {integrity: sha512-uz6/tEy2IFm9RYOyvKl88zdzZfwEfKZmnX9Cj1BHjeSGNuGLuMD1kR8y5bteYmwqKm1tj8m4cb/aKEorr6fHWQ==}
+    engines: {node: '>=18'}
+    cpu: [arm]
+    os: [android]
+
+  '@esbuild/android-x64@0.23.1':
+    resolution: {integrity: sha512-nlN9B69St9BwUoB+jkyU090bru8L0NA3yFvAd7k8dNsVH8bi9a8cUAUSEcEEgTp2z3dbEDGJGfP6VUnkQnlReg==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [android]
+
+  '@esbuild/darwin-arm64@0.23.1':
+    resolution: {integrity: sha512-YsS2e3Wtgnw7Wq53XXBLcV6JhRsEq8hkfg91ESVadIrzr9wO6jJDMZnCQbHm1Guc5t/CdDiFSSfWP58FNuvT3Q==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [darwin]
+
+  '@esbuild/darwin-x64@0.23.1':
+    resolution: {integrity: sha512-aClqdgTDVPSEGgoCS8QDG37Gu8yc9lTHNAQlsztQ6ENetKEO//b8y31MMu2ZaPbn4kVsIABzVLXYLhCGekGDqw==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [darwin]
+
+  '@esbuild/freebsd-arm64@0.23.1':
+    resolution: {integrity: sha512-h1k6yS8/pN/NHlMl5+v4XPfikhJulk4G+tKGFIOwURBSFzE8bixw1ebjluLOjfwtLqY0kewfjLSrO6tN2MgIhA==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [freebsd]
+
+  '@esbuild/freebsd-x64@0.23.1':
+    resolution: {integrity: sha512-lK1eJeyk1ZX8UklqFd/3A60UuZ/6UVfGT2LuGo3Wp4/z7eRTRYY+0xOu2kpClP+vMTi9wKOfXi2vjUpO1Ro76g==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [freebsd]
+
+  '@esbuild/linux-arm64@0.23.1':
+    resolution: {integrity: sha512-/93bf2yxencYDnItMYV/v116zff6UyTjo4EtEQjUBeGiVpMmffDNUyD9UN2zV+V3LRV3/on4xdZ26NKzn6754g==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [linux]
+
+  '@esbuild/linux-arm@0.23.1':
+    resolution: {integrity: sha512-CXXkzgn+dXAPs3WBwE+Kvnrf4WECwBdfjfeYHpMeVxWE0EceB6vhWGShs6wi0IYEqMSIzdOF1XjQ/Mkm5d7ZdQ==}
+    engines: {node: '>=18'}
+    cpu: [arm]
+    os: [linux]
+
+  '@esbuild/linux-ia32@0.23.1':
+    resolution: {integrity: sha512-VTN4EuOHwXEkXzX5nTvVY4s7E/Krz7COC8xkftbbKRYAl96vPiUssGkeMELQMOnLOJ8k3BY1+ZY52tttZnHcXQ==}
+    engines: {node: '>=18'}
+    cpu: [ia32]
+    os: [linux]
+
+  '@esbuild/linux-loong64@0.23.1':
+    resolution: {integrity: sha512-Vx09LzEoBa5zDnieH8LSMRToj7ir/Jeq0Gu6qJ/1GcBq9GkfoEAoXvLiW1U9J1qE/Y/Oyaq33w5p2ZWrNNHNEw==}
+    engines: {node: '>=18'}
+    cpu: [loong64]
+    os: [linux]
+
+  '@esbuild/linux-mips64el@0.23.1':
+    resolution: {integrity: sha512-nrFzzMQ7W4WRLNUOU5dlWAqa6yVeI0P78WKGUo7lg2HShq/yx+UYkeNSE0SSfSure0SqgnsxPvmAUu/vu0E+3Q==}
+    engines: {node: '>=18'}
+    cpu: [mips64el]
+    os: [linux]
+
+  '@esbuild/linux-ppc64@0.23.1':
+    resolution: {integrity: sha512-dKN8fgVqd0vUIjxuJI6P/9SSSe/mB9rvA98CSH2sJnlZ/OCZWO1DJvxj8jvKTfYUdGfcq2dDxoKaC6bHuTlgcw==}
+    engines: {node: '>=18'}
+    cpu: [ppc64]
+    os: [linux]
+
+  '@esbuild/linux-riscv64@0.23.1':
+    resolution: {integrity: sha512-5AV4Pzp80fhHL83JM6LoA6pTQVWgB1HovMBsLQ9OZWLDqVY8MVobBXNSmAJi//Csh6tcY7e7Lny2Hg1tElMjIA==}
+    engines: {node: '>=18'}
+    cpu: [riscv64]
+    os: [linux]
+
+  '@esbuild/linux-s390x@0.23.1':
+    resolution: {integrity: sha512-9ygs73tuFCe6f6m/Tb+9LtYxWR4c9yg7zjt2cYkjDbDpV/xVn+68cQxMXCjUpYwEkze2RcU/rMnfIXNRFmSoDw==}
+    engines: {node: '>=18'}
+    cpu: [s390x]
+    os: [linux]
+
+  '@esbuild/linux-x64@0.23.1':
+    resolution: {integrity: sha512-EV6+ovTsEXCPAp58g2dD68LxoP/wK5pRvgy0J/HxPGB009omFPv3Yet0HiaqvrIrgPTBuC6wCH1LTOY91EO5hQ==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [linux]
+
+  '@esbuild/netbsd-x64@0.23.1':
+    resolution: {integrity: sha512-aevEkCNu7KlPRpYLjwmdcuNz6bDFiE7Z8XC4CPqExjTvrHugh28QzUXVOZtiYghciKUacNktqxdpymplil1beA==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [netbsd]
+
+  '@esbuild/openbsd-arm64@0.23.1':
+    resolution: {integrity: sha512-3x37szhLexNA4bXhLrCC/LImN/YtWis6WXr1VESlfVtVeoFJBRINPJ3f0a/6LV8zpikqoUg4hyXw0sFBt5Cr+Q==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [openbsd]
+
+  '@esbuild/openbsd-x64@0.23.1':
+    resolution: {integrity: sha512-aY2gMmKmPhxfU+0EdnN+XNtGbjfQgwZj43k8G3fyrDM/UdZww6xrWxmDkuz2eCZchqVeABjV5BpildOrUbBTqA==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [openbsd]
+
+  '@esbuild/sunos-x64@0.23.1':
+    resolution: {integrity: sha512-RBRT2gqEl0IKQABT4XTj78tpk9v7ehp+mazn2HbUeZl1YMdaGAQqhapjGTCe7uw7y0frDi4gS0uHzhvpFuI1sA==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [sunos]
+
+  '@esbuild/win32-arm64@0.23.1':
+    resolution: {integrity: sha512-4O+gPR5rEBe2FpKOVyiJ7wNDPA8nGzDuJ6gN4okSA1gEOYZ67N8JPk58tkWtdtPeLz7lBnY6I5L3jdsr3S+A6A==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [win32]
+
+  '@esbuild/win32-ia32@0.23.1':
+    resolution: {integrity: sha512-BcaL0Vn6QwCwre3Y717nVHZbAa4UBEigzFm6VdsVdT/MbZ38xoj1X9HPkZhbmaBGUD1W8vxAfffbDe8bA6AKnQ==}
+    engines: {node: '>=18'}
+    cpu: [ia32]
+    os: [win32]
+
+  '@esbuild/win32-x64@0.23.1':
+    resolution: {integrity: sha512-BHpFFeslkWrXWyUPnbKm+xYYVYruCinGcftSBaa8zoF9hZO4BcSCFUvHVTtzpIY6YzUnYtuEhZ+C9iEXjxnasg==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [win32]
+
+  '@google/genai@1.52.0':
+    resolution: {integrity: sha512-gwSvbpiN/17O9TbsqSsE/OzZcpv5Fo4RQjdngGgogtuB9RsyJ8ZHhX5KjHj1bp5N9snN2eK8LDGXSaWW2hof8Q==}
+    engines: {node: '>=20.0.0'}
+    peerDependencies:
+      '@modelcontextprotocol/sdk': ^1.25.2
+    peerDependenciesMeta:
+      '@modelcontextprotocol/sdk':
+        optional: true
+
+  '@mariozechner/clipboard-darwin-arm64@0.3.9':
+    resolution: {integrity: sha512-BfgV7vCEWZwJwZJw03r6bP5+tf0iI/ANuQYCxi9RNn7FrWB3yzGuMKCrNLRl6V761vXRdL8+OqZ0wd4TqlsNOQ==}
+    engines: {node: '>= 10'}
+    cpu: [arm64]
+    os: [darwin]
+
+  '@mariozechner/clipboard-darwin-universal@0.3.9':
+    resolution: {integrity: sha512-BGGR4iA9Z2shAjI65eI5xtyb3LYNlDW9X3gxKxDbqtbnREohsrqznov6zpKoIrsRWpzlYVEdKphS7ksJ0/ndSQ==}
+    engines: {node: '>= 10'}
+    os: [darwin]
+
+  '@mariozechner/clipboard-darwin-x64@0.3.9':
+    resolution: {integrity: sha512-4kURmCbS6nt8uYhtmWpUcJWyPHfmAr5dTpXD1nO3pIfa+TSQ9DbrGOYCKH+aEFW47XhQ4Vp8ZTszie+wfFvDKg==}
+    engines: {node: '>= 10'}
+    cpu: [x64]
+    os: [darwin]
+
+  '@mariozechner/clipboard-linux-arm64-gnu@0.3.9':
+    resolution: {integrity: sha512-g59OkUGP2DDfCOIKypHeYgv2M55u/cKvXa5dSxFbEJ34XvIQMdcVmpKCkGUro3ZgefXiGVdwguvTMQGpHWzIXw==}
+    engines: {node: '>= 10'}
+    cpu: [arm64]
+    os: [linux]
+    libc: [glibc]
+
+  '@mariozechner/clipboard-linux-arm64-musl@0.3.9':
+    resolution: {integrity: sha512-AGuJdgKsmJdm4Pych7kv3sqe591ERRaAHW3xjLooiFzn8J+PxUyof++7YZrB5Y5tpnTO+K18Og3taj2NpluCRQ==}
+    engines: {node: '>= 10'}
+    cpu: [arm64]
+    os: [linux]
+    libc: [musl]
+
+  '@mariozechner/clipboard-linux-riscv64-gnu@0.3.9':
+    resolution: {integrity: sha512-DXBEAiuMpk7dhS1a9NzNxVAFi1vaKoPu7rQNgY8LIDLGrK3lnIp3nT10DUum+PKVJoJppIP+NAA8IZe4DMNDPw==}
+    engines: {node: '>= 10'}
+    cpu: [riscv64]
+    os: [linux]
+    libc: [glibc]
+
+  '@mariozechner/clipboard-linux-x64-gnu@0.3.9':
+    resolution: {integrity: sha512-WORrMLd6EpElEME7JRKfSaY34nW1P5LbdgK5YNCS1ncG2LqmITsSMEJ8nh2mpvxb3TxqbOOKgY7k9eMJYlW9Mw==}
+    engines: {node: '>= 10'}
+    cpu: [x64]
+    os: [linux]
+    libc: [glibc]
+
+  '@mariozechner/clipboard-linux-x64-musl@0.3.9':
+    resolution: {integrity: sha512-/DHn+1DrfL6oRaPPWXaOKvonFFrni666fxd+zFqiQEfvBH0tsHVWjq9iqBk0oDp0qaPA72lIMy5BptxISBEhZQ==}
+    engines: {node: '>= 10'}
+    cpu: [x64]
+    os: [linux]
+    libc: [musl]
+
+  '@mariozechner/clipboard-win32-arm64-msvc@0.3.9':
+    resolution: {integrity: sha512-O5FHD3ErkMwMhNzAfu3ggy0ug4z7btZuoQgwwxlzPrwV2bxlD6WDpqBY4NCgICAgZdDKdp+loUEKVAVt8aYnhQ==}
+    engines: {node: '>= 10'}
+    cpu: [arm64]
+    os: [win32]
+
+  '@mariozechner/clipboard-win32-x64-msvc@0.3.9':
+    resolution: {integrity: sha512-ihQC3EufqEY81vhXBgVBtK4prL+wc62zJsSvxrgz7K1hsdt6OObz6v9p3Rn1OG3GJksTTKMJF0u/guMISHPhSA==}
+    engines: {node: '>= 10'}
+    cpu: [x64]
+    os: [win32]
+
+  '@mariozechner/clipboard@0.3.9':
+    resolution: {integrity: sha512-ABnA53mdfkGZwOFUdZNv2S0CWGO/EIuPj8Vv9xmBFmSYg/qFc7ihO6q5FcQjvoE67kZpWkEc4AhD6B/os04yuA==}
+    engines: {node: '>= 10'}
+
+  '@mistralai/mistralai@2.2.1':
+    resolution: {integrity: sha512-uKU8CZmL2RzYKmplsU01hii4p3pe4HqJefpWNRWXm1Tcm0Sm4xXfwSLIy4k7ZCPlbETCGcp69E7hZs+WOJ5itQ==}
+
+  '@nodable/entities@2.2.0':
+    resolution: {integrity: sha512-9uGyhaQavEUMC8AIddIjau4NsnsXhou+j5sBAGojCM1oxmQpVKTWR/9JxABD6UAv12vpIms55fPZKFQEhG6uBg==}
+
+  '@opentelemetry/api-logs@0.54.0':
+    resolution: {integrity: sha512-9HhEh5GqFrassUndqJsyW7a0PzfyWr2eV2xwzHLIS+wX3125+9HE9FMRAKmJRwxZhgZGwH3HNQQjoMGZqmOeVA==}
+    engines: {node: '>=14'}
+
+  '@opentelemetry/api@1.9.0':
+    resolution: {integrity: sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==}
+    engines: {node: '>=8.0.0'}
+
+  '@opentelemetry/context-async-hooks@1.28.0':
+    resolution: {integrity: sha512-igcl4Ve+F1N2063PJUkesk/GkYyuGIWinYkSyAFTnIj3gzrOgvOA4k747XNdL47HRRL1w/qh7UW8NDuxOLvKFA==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/core@1.27.0':
+    resolution: {integrity: sha512-yQPKnK5e+76XuiqUH/gKyS8wv/7qITd5ln56QkBTf3uggr0VkXOXfcaAuG330UfdYu83wsyoBwqwxigpIG+Jkg==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/core@1.28.0':
+    resolution: {integrity: sha512-ZLwRMV+fNDpVmF2WYUdBHlq0eOWtEaUJSusrzjGnBt7iSRvfjFE3RXYUZJrqou/wIDWV0DwQ5KIfYe9WXg9Xqw==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/exporter-trace-otlp-proto@0.54.0':
+    resolution: {integrity: sha512-cpDQj5wl7G8pLu3lW94SnMpn0C85A9Ehe7+JBow2IL5DGPWXTkynFngMtCC3PpQzQgzlyOVe0MVZfoBB3M5ECA==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
+  '@opentelemetry/otlp-exporter-base@0.54.0':
+    resolution: {integrity: sha512-g+H7+QleVF/9lz4zhaR9Dt4VwApjqG5WWupy5CTMpWJfHB/nLxBbX73GBZDgdiNfh08nO3rNa6AS7fK8OhgF5g==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
+  '@opentelemetry/otlp-transformer@0.54.0':
+    resolution: {integrity: sha512-jRexIASQQzdK4AjfNIBfn94itAq4Q8EXR9d3b/OVbhd3kKQKvMr7GkxYDjbeTbY7hHCOLcLfJ3dpYQYGOe8qOQ==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
+  '@opentelemetry/propagator-b3@1.28.0':
+    resolution: {integrity: sha512-Q7HVDIMwhN5RxL4bECMT4BdbyYSAKkC6U/RGn4NpO/cbqP6ZRg+BS7fPo/pGZi2w8AHfpIGQFXQmE8d2PC5xxQ==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/propagator-jaeger@1.28.0':
+    resolution: {integrity: sha512-wKJ94+s8467CnIRgoSRh0yXm/te0QMOwTq9J01PfG/RzYZvlvN8aRisN2oZ9SznB45dDGnMj3BhUlchSA9cEKA==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/resources@1.27.0':
+    resolution: {integrity: sha512-jOwt2VJ/lUD5BLc+PMNymDrUCpm5PKi1E9oSVYAvz01U/VdndGmrtV3DU1pG4AwlYhJRHbHfOUIlpBeXCPw6QQ==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/resources@1.28.0':
+    resolution: {integrity: sha512-cIyXSVJjGeTICENN40YSvLDAq4Y2502hGK3iN7tfdynQLKWb3XWZQEkPc+eSx47kiy11YeFAlYkEfXwR1w8kfw==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/sdk-logs@0.54.0':
+    resolution: {integrity: sha512-HeWvOPiWhEw6lWvg+lCIi1WhJnIPbI4/OFZgHq9tKfpwF3LX6/kk3+GR8sGUGAEZfbjPElkkngzvd2s03zbD7Q==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.4.0 <1.10.0'
+
+  '@opentelemetry/sdk-metrics@1.27.0':
+    resolution: {integrity: sha512-JzWgzlutoXCydhHWIbLg+r76m+m3ncqvkCcsswXAQ4gqKS+LOHKhq+t6fx1zNytvLuaOUBur7EvWxECc4jPQKg==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.3.0 <1.10.0'
+
+  '@opentelemetry/sdk-trace-base@1.27.0':
+    resolution: {integrity: sha512-btz6XTQzwsyJjombpeqCX6LhiMQYpzt2pIYNPnw0IPO/3AhT6yjnf8Mnv3ZC2A4eRYOjqrg+bfaXg9XHDRJDWQ==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/sdk-trace-base@1.28.0':
+    resolution: {integrity: sha512-ceUVWuCpIao7Y5xE02Xs3nQi0tOGmMea17ecBdwtCvdo9ekmO+ijc9RFDgfifMl7XCBf41zne/1POM3LqSTZDA==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/sdk-trace-node@1.28.0':
+    resolution: {integrity: sha512-N0sYfYXvHpP0FNIyc+UfhLnLSTOuZLytV0qQVrDWIlABeD/DWJIGttS7nYeR14gQLXch0M1DW8zm3VeN6Opwtg==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/semantic-conventions@1.27.0':
+    resolution: {integrity: sha512-sAay1RrB+ONOem0OZanAR1ZI/k7yDpnOQSQmTMuGImUQb2y8EbSaCJ94FQluM74xoU03vlb2d2U90hZluL6nQg==}
+    engines: {node: '>=14'}
+
+  '@opentelemetry/semantic-conventions@1.28.0':
+    resolution: {integrity: sha512-lp4qAiMTD4sNWW4DbKLBkfiMZ4jbAboJIGOQr5DvciMRI494OapieI9qiODpOt0XBr1LjIDy1xAGAnVs5supTA==}
+    engines: {node: '>=14'}
+
+  '@protobufjs/aspromise@1.1.2':
+    resolution: {integrity: sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==}
+
+  '@protobufjs/base64@1.1.2':
+    resolution: {integrity: sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==}
+
+  '@protobufjs/codegen@2.0.5':
+    resolution: {integrity: sha512-zgXFLzW3Ap33e6d0Wlj4MGIm6Ce8O89n/apUaGNB/jx+hw+ruWEp7EwGUshdLKVRCxZW12fp9r40E1mQrf/34g==}
+
+  '@protobufjs/eventemitter@1.1.1':
+    resolution: {integrity: sha512-vW1GmwMZNnL+gMRaovlh9yZX74kc+TTU3FObkkurpMaRtBfLP3ldjS9KQWlwZgraRE0+dheEEoAxdzcJQ8eXZg==}
+
+  '@protobufjs/fetch@1.1.1':
+    resolution: {integrity: sha512-GpptLrs57adMSuHi3VNj0mAF8dwh36LMaYF6XyJ6JMWlVsc+t42tm1HSEDmOs3A8fC9yyeisgLhsTVQokOZ0zw==}
+
+  '@protobufjs/float@1.0.2':
+    resolution: {integrity: sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==}
+
+  '@protobufjs/path@1.1.2':
+    resolution: {integrity: sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==}
+
+  '@protobufjs/pool@1.1.0':
+    resolution: {integrity: sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==}
+
+  '@protobufjs/utf8@1.1.1':
+    resolution: {integrity: sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg==}
+
+  '@silvia-odwyer/photon-node@0.3.4':
+    resolution: {integrity: sha512-bnly4BKB3KDTFxrUIcgCLbaeVVS8lrAkri1pEzskpmxu9MdfGQTy8b8EgcD83ywD3RPMsIulY8xJH5Awa+t9fA==}
+
+  '@smithy/core@3.24.7':
+    resolution: {integrity: sha512-KoUi4M1f3BG6kzN1FnCwL7oyFptTbyBJKjR6yhSib+JHRdUmM1o+VwsFtJ66NZCkCzVfJMWRHJNo0R0jznp0Pg==}
+    engines: {node: '>=18.0.0'}
+
+  '@smithy/credential-provider-imds@4.3.9':
+    resolution: {integrity: sha512-ZlfJ/4Fa3jYb+3eaohPfG9utX9HmdhFNcFtpoGAhUhdynAOmGXtmigbi7eEiONKM+ykHw8RwKuDEb85Lx7t7fA==}
+    engines: {node: '>=18.0.0'}
+
+  '@smithy/fetch-http-handler@5.4.7':
+    resolution: {integrity: sha512-NslaM2ir0N2hisDmzXLstPaVINZheh8SokyOC++kzFPloZucL2R7Y7bS57mSzx/1Fc/fqmn7twjkeezTTrV0EA==}
+    engines: {node: '>=18.0.0'}
+
+  '@smithy/is-array-buffer@2.2.0':
+    resolution: {integrity: sha512-GGP3O9QFD24uGeAXYUjwSTXARoqpZykHadOmA8G5vfJPK0/DC67qa//0qvqrJzL1xc8WQWX7/yc7fwudjPHPhA==}
+    engines: {node: '>=14.0.0'}
+
+  '@smithy/node-http-handler@4.7.3':
+    resolution: {integrity: sha512-/jPhevcTFPMVl6KNjbaI47iOg1zxC7IsnX4PQDGVZKMFceOXtB8IEYaB7a9VvkP/3oC60WzTeKocvSI7vLT0vA==}
+    engines: {node: '>=18.0.0'}
+
+  '@smithy/node-http-handler@4.7.8':
+    resolution: {integrity: sha512-f+DbsWUwSbtMu1a/j8Y93KiU1SRg9nyzfjereqn1BJ33QOTUXxdlYvVXMhAYl1vuR1Kmna5aIJe09KSIfyFNYw==}
+    engines: {node: '>=18.0.0'}
+
+  '@smithy/signature-v4@5.4.7':
+    resolution: {integrity: sha512-LwQZazFayImv+IOm0S0enoLeUJwmAlhGC5O6YCcLWezyu08dF46GOxPOq35OpBIHkgd7OvNvBStIFwVNyrvoBw==}
+    engines: {node: '>=18.0.0'}
+
+  '@smithy/types@4.14.4':
+    resolution: {integrity: sha512-B2S9+UGm1+/pHkcx3ZoLVX1a+pmSk8rqxRR+ZsNqZaJ5q9FWX9AFGQVM4qG5+OBeQUZVy99HY8HqW8gK/wgXzQ==}
+    engines: {node: '>=18.0.0'}
+
+  '@smithy/util-buffer-from@2.2.0':
+    resolution: {integrity: sha512-IJdWBbTcMQ6DA0gdNhh/BwrLkDR+ADW5Kr1aZmd4k3DIF6ezMV4R2NIAmT08wQJ3yUK82thHWmC/TnK/wpMMIA==}
+    engines: {node: '>=14.0.0'}
+
+  '@smithy/util-utf8@2.3.0':
+    resolution: {integrity: sha512-R8Rdn8Hy72KKcebgLiv8jQcQkXoLMOGGv5uI1/k0l+snqkOzQ1R0ChUBCxWMlBsFMekWjq0wRudIweFs7sKT5A==}
+    engines: {node: '>=14.0.0'}
+
+  '@types/node@22.10.2':
+    resolution: {integrity: sha512-Xxr6BBRCAOQixvonOye19wnzyDiUtTeqldOOmj3CkeblonbccA12PFwlufvRdrpjXxqnmUaeiU5EOA+7s5diUQ==}
+
+  '@types/retry@0.12.0':
+    resolution: {integrity: sha512-wWKOClTTiizcZhXnPY4wikVAwmdYHp8q6DmC+EJUzAMsycb7HB32Kh9RN4+0gExjmPmZSAQjgURXIGATPegAvA==}
+
+  agent-base@7.1.4:
+    resolution: {integrity: sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==}
+    engines: {node: '>= 14'}
+
+  anynum@1.0.0:
+    resolution: {integrity: sha512-xjR9/zBVnUOP6ztMIIgShjsxui80nQUQH+5xJnvrYLs+90bF25/KJqaAi8mk+B4RDtX1Nspi6fmp4YTEts8SfA==}
+
+  balanced-match@4.0.4:
+    resolution: {integrity: sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA==}
+    engines: {node: 18 || 20 || >=22}
+
+  base64-js@1.5.1:
+    resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==}
+
+  bignumber.js@9.3.1:
+    resolution: {integrity: sha512-Ko0uX15oIUS7wJ3Rb30Fs6SkVbLmPBAKdlm7q9+ak9bbIeFf0MwuBsQV6z7+X768/cHsfg+WlysDWJcmthjsjQ==}
+
+  bowser@2.14.1:
+    resolution: {integrity: sha512-tzPjzCxygAKWFOJP011oxFHs57HzIhOEracIgAePE4pqB3LikALKnSzUyU4MGs9/iCEUuHlAJTjTc5M+u7YEGg==}
+
+  brace-expansion@5.0.6:
+    resolution: {integrity: sha512-kLpxurY4Z4r9sgMsyG0Z9uzsBlgiU/EFKhj/h91/8yHu0edo7XuixOIH3VcJ8kkxs6/jPzoI6U9Vj3WqbMQ94g==}
+    engines: {node: 18 || 20 || >=22}
+
+  buffer-equal-constant-time@1.0.1:
+    resolution: {integrity: sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==}
+
+  chalk@5.6.2:
+    resolution: {integrity: sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA==}
+    engines: {node: ^12.17.0 || ^14.13 || >=16.0.0}
+
+  cross-spawn@7.0.6:
+    resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==}
+    engines: {node: '>= 8'}
+
+  data-uri-to-buffer@4.0.1:
+    resolution: {integrity: sha512-0R9ikRb668HB7QDxT1vkpuUBtqc53YyAwMwGeUFKRojY/NWKvdZ+9UYtRfGmhqNbRkTSVpMbmyhXipFFv2cb/A==}
+    engines: {node: '>= 12'}
+
+  debug@4.4.3:
+    resolution: {integrity: sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==}
+    engines: {node: '>=6.0'}
+    peerDependencies:
+      supports-color: '*'
+    peerDependenciesMeta:
+      supports-color:
+        optional: true
+
+  diff@8.0.4:
+    resolution: {integrity: sha512-DPi0FmjiSU5EvQV0++GFDOJ9ASQUVFh5kD+OzOnYdi7n3Wpm9hWWGfB/O2blfHcMVTL5WkQXSnRiK9makhrcnw==}
+    engines: {node: '>=0.3.1'}
+
+  dotenv@17.2.3:
+    resolution: {integrity: sha512-JVUnt+DUIzu87TABbhPmNfVdBDt18BLOWjMUFJMSi/Qqg7NTYtabbvSNJGOJ7afbRuv9D/lngizHtP7QyLQ+9w==}
+    engines: {node: '>=12'}
+
+  ecdsa-sig-formatter@1.0.11:
+    resolution: {integrity: sha512-nagl3RYrbNv6kQkeJIpt6NJZy8twLB/2vtz6yN9Z4vRKHN4/QZJIEbqohALSgwKdnksuY3k5Addp5lg8sVoVcQ==}
+
+  esbuild@0.23.1:
+    resolution: {integrity: sha512-VVNz/9Sa0bs5SELtn3f7qhJCDPCF5oMEl5cO9/SSinpE9hbPVvxbd572HH5AKiP7WD8INO53GgfDDhRjkylHEg==}
+    engines: {node: '>=18'}
+    hasBin: true
+
+  extend@3.0.2:
+    resolution: {integrity: sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==}
+
+  fast-xml-builder@1.2.0:
+    resolution: {integrity: sha512-00aAWieqff+ZJhsXA4g1g7M8k+7AYoMUUHF+/zFb5U6Uv/P0Vl4QZo84/IcufzYalLuEj9928bXN9PbbFzMF0Q==}
+
+  fast-xml-parser@5.7.3:
+    resolution: {integrity: sha512-C0AaNuC+mscy6vrAQKAc/rMq+zAPHodfHGZu4sGVehvAQt/JLG1O5zEcYcXSY5zSqr4YVgxsB+pHXTq0i7eDlg==}
+    hasBin: true
+
+  fetch-blob@3.2.0:
+    resolution: {integrity: sha512-7yAQpD2UMJzLi1Dqv7qFYnPbaPx7ZfFK6PiIxQ4PfkGPyNyl2Ugx+a/umUonmKqjhM4DnfbMvdX6otXq83soQQ==}
+    engines: {node: ^12.20 || >= 14.13}
+
+  formdata-polyfill@4.0.10:
+    resolution: {integrity: sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g==}
+    engines: {node: '>=12.20.0'}
+
+  fsevents@2.3.3:
+    resolution: {integrity: sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==}
+    engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0}
+    os: [darwin]
+
+  gaxios@7.1.5:
+    resolution: {integrity: sha512-5FZy72Rh8LhtjmvDrKkI+lVhrsQrVKVsItxMoDm5mNQE+xR0WVIIs+jzPSJgBvKVsLi24fZhXJIsNI0bihDzFg==}
+    engines: {node: '>=18'}
+
+  gcp-metadata@8.1.2:
+    resolution: {integrity: sha512-zV/5HKTfCeKWnxG0Dmrw51hEWFGfcF2xiXqcA3+J90WDuP0SvoiSO5ORvcBsifmx/FoIjgQN3oNOGaQ5PhLFkg==}
+    engines: {node: '>=18'}
+
+  get-east-asian-width@1.6.0:
+    resolution: {integrity: sha512-QRbvDIbx6YklUe6RxeTeleMR0yv3cYH6PsPZHcnVn7xv7zO1BHN8r0XETu8n6Ye3Q+ahtSarc3WgtNWmehIBfA==}
+    engines: {node: '>=18'}
+
+  get-tsconfig@4.14.0:
+    resolution: {integrity: sha512-yTb+8DXzDREzgvYmh6s9vHsSVCHeC0G3PI5bEXNBHtmshPnO+S5O7qgLEOn0I5QvMy6kpZN8K1NKGyilLb93wA==}
+
+  glob@13.0.6:
+    resolution: {integrity: sha512-Wjlyrolmm8uDpm/ogGyXZXb1Z+Ca2B8NbJwqBVg0axK9GbBeoS7yGV6vjXnYdGm6X53iehEuxxbyiKp8QmN4Vw==}
+    engines: {node: 18 || 20 || >=22}
+
+  google-auth-library@10.7.0:
+    resolution: {integrity: sha512-QpTAbNJ36TliZLx3TTtahR8HG0hN9RllL1e3FymOvQSIKK8JmgV58H924ub2wa2DsS3ANjjP1Aw1N+Ramc8hqQ==}
+    engines: {node: '>=18'}
+
+  google-logging-utils@1.1.3:
+    resolution: {integrity: sha512-eAmLkjDjAFCVXg7A1unxHsLf961m6y17QFqXqAXGj/gVkKFrEICfStRfwUlGNfeCEjNRa32JEWOUTlYXPyyKvA==}
+    engines: {node: '>=14'}
+
+  graceful-fs@4.2.11:
+    resolution: {integrity: sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==}
+
+  highlight.js@10.7.3:
+    resolution: {integrity: sha512-tzcUFauisWKNHaRkN4Wjl/ZA07gENAjFl3J/c480dprkGTg5EQstgaNFqBfUqCq54kZRIEcreTsAgF/m2quD7A==}
+
+  hosted-git-info@9.0.3:
+    resolution: {integrity: sha512-Hc+ghLoSt6QaYZUv0WBiIvmMDZuZZ7oaDvdH8MbfOO4lOsxdXLEvuC6ePoGs9H1X9oCLyq6+NVN0MKqD+ydxyg==}
+    engines: {node: ^20.17.0 || >=22.9.0}
+
+  http-proxy-agent@7.0.2:
+    resolution: {integrity: sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==}
+    engines: {node: '>= 14'}
+
+  https-proxy-agent@7.0.6:
+    resolution: {integrity: sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==}
+    engines: {node: '>= 14'}
+
+  ignore@7.0.5:
+    resolution: {integrity: sha512-Hs59xBNfUIunMFgWAbGX5cq6893IbWg4KnrjbYwX3tx0ztorVgTDA6B2sxf8ejHJ4wz8BqGUMYlnzNBer5NvGg==}
+    engines: {node: '>= 4'}
+
+  isexe@2.0.0:
+    resolution: {integrity: sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==}
+
+  jiti@2.7.0:
+    resolution: {integrity: sha512-AC/7JofJvZGrrneWNaEnJeOLUx+JlGt7tNa0wZiRPT4MY1wmfKjt2+6O2p2uz2+skll8OZZmJMNqeke7kKbNgQ==}
+    hasBin: true
+
+  json-bigint@1.0.0:
+    resolution: {integrity: sha512-SiPv/8VpZuWbvLSMtTDU8hEfrZWg/mH/nV/b4o0CYbSxu1UIQPLdwKOCIyLQX+VIPO5vrLX3i8qtqFyhdPSUSQ==}
+
+  json-schema-to-ts@3.1.1:
+    resolution: {integrity: sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g==}
+    engines: {node: '>=16'}
+
+  jwa@2.0.1:
+    resolution: {integrity: sha512-hRF04fqJIP8Abbkq5NKGN0Bbr3JxlQ+qhZufXVr0DvujKy93ZCbXZMHDL4EOtodSbCWxOqR8MS1tXA5hwqCXDg==}
+
+  jws@4.0.1:
+    resolution: {integrity: sha512-EKI/M/yqPncGUUh44xz0PxSidXFr/+r0pA70+gIYhjv+et7yxM+s29Y+VGDkovRofQem0fs7Uvf4+YmAdyRduA==}
+
+  long@5.3.2:
+    resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==}
+
+  lru-cache@11.5.1:
+    resolution: {integrity: sha512-RPimw/7aMdv2oqRrxKwvZXcPfwBrn/JZ2xYcY9Hus/6LaS3VOAKVWKWgNLCFSiOm1ESXinjsDlidVU7JlnCN2A==}
+    engines: {node: 20 || >=22}
+
+  marked@15.0.12:
+    resolution: {integrity: sha512-8dD6FusOQSrpv9Z1rdNMdlSgQOIP880DHqnohobOmYLElGEqAL/JvxvuxZO16r4HtjTlfPRDC1hbvxC9dPN2nA==}
+    engines: {node: '>= 18'}
+    hasBin: true
+
+  minimatch@10.2.5:
+    resolution: {integrity: sha512-MULkVLfKGYDFYejP07QOurDLLQpcjk7Fw+7jXS2R2czRQzR56yHRveU5NDJEOviH+hETZKSkIk5c+T23GjFUMg==}
+    engines: {node: 18 || 20 || >=22}
+
+  minipass@7.1.3:
+    resolution: {integrity: sha512-tEBHqDnIoM/1rXME1zgka9g6Q2lcoCkxHLuc7ODJ5BxbP5d4c2Z5cGgtXAku59200Cx7diuHTOYfSBD8n6mm8A==}
+    engines: {node: '>=16 || 14 >=14.17'}
+
+  ms@2.1.3:
+    resolution: {integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==}
+
+  node-domexception@1.0.0:
+    resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==}
+    engines: {node: '>=10.5.0'}
+    deprecated: Use your platform's native DOMException instead
+
+  node-fetch@3.3.2:
+    resolution: {integrity: sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA==}
+    engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0}
+
+  openai@6.26.0:
+    resolution: {integrity: sha512-zd23dbWTjiJ6sSAX6s0HrCZi41JwTA1bQVs0wLQPZ2/5o2gxOJA5wh7yOAUgwYybfhDXyhwlpeQf7Mlgx8EOCA==}
+    hasBin: true
+    peerDependencies:
+      ws: ^8.18.0
+      zod: ^3.25 || ^4.0
+    peerDependenciesMeta:
+      ws:
+        optional: true
+      zod:
+        optional: true
+
+  p-retry@4.6.2:
+    resolution: {integrity: sha512-312Id396EbJdvRONlngUx0NydfrIQ5lsYu0znKVUzVvArzEIt08V1qhtyESbGVd1FGX7UKtiFp5uwKZdM8wIuQ==}
+    engines: {node: '>=8'}
+
+  partial-json@0.1.7:
+    resolution: {integrity: sha512-Njv/59hHaokb/hRUjce3Hdv12wd60MtM9Z5Olmn+nehe0QDAsRtRbJPvJ0Z91TusF0SuZRIvnM+S4l6EIP8leA==}
+
+  path-expression-matcher@1.5.0:
+    resolution: {integrity: sha512-cbrerZV+6rvdQrrD+iGMcZFEiiSrbv9Tfdkvnusy6y0x0GKBXREFg/Y65GhIfm0tnLntThhzCnfKwp1WRjeCyQ==}
+    engines: {node: '>=14.0.0'}
+
+  path-key@3.1.1:
+    resolution: {integrity: sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==}
+    engines: {node: '>=8'}
+
+  path-scurry@2.0.2:
+    resolution: {integrity: sha512-3O/iVVsJAPsOnpwWIeD+d6z/7PmqApyQePUtCndjatj/9I5LylHvt5qluFaBT3I5h3r1ejfR056c+FCv+NnNXg==}
+    engines: {node: 18 || 20 || >=22}
+
+  proper-lockfile@4.1.2:
+    resolution: {integrity: sha512-TjNPblN4BwAWMXU8s9AEz4JmQxnD1NNL7bNOY/AKUzyamc379FWASUhc/K1pL2noVb+XmZKLL68cjzLsiOAMaA==}
+
+  protobufjs@7.6.4:
+    resolution: {integrity: sha512-RJJPTTpvFfHcWLkIa2JFWK4XvtSzS0yEWDmunqHXli1h3JlkbcQZXDZdcWxv+JK3Xsl5/UFDPZ0iGm7DAengYw==}
+    engines: {node: '>=12.0.0'}
+
+  resolve-pkg-maps@1.0.0:
+    resolution: {integrity: sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==}
+
+  retry@0.12.0:
+    resolution: {integrity: sha512-9LkiTwjUh6rT555DtE9rTX+BKByPfrMzEAtnlEtdEwr3Nkffwiihqe2bWADg+OQRjt9gl6ICdmB/ZFDCGAtSow==}
+    engines: {node: '>= 4'}
+
+  retry@0.13.1:
+    resolution: {integrity: sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==}
+    engines: {node: '>= 4'}
+
+  safe-buffer@5.2.1:
+    resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==}
+
+  semver@7.8.0:
+    resolution: {integrity: sha512-AcM7dV/5ul4EekoQ29Agm5vri8JNqRyj39o0qpX6vDF2GZrtutZl5RwgD1XnZjiTAfncsJhMI48QQH3sN87YNA==}
+    engines: {node: '>=10'}
+    hasBin: true
+
+  semver@7.8.4:
+    resolution: {integrity: sha512-rUCObTnP32Q08R2uuIrt7r9PlEonuTmtuXYcW6s5kjdlj3xbnwe+21yXptAUYcMAABLkYYTtnmzb3w3EDZfueA==}
+    engines: {node: '>=10'}
+    hasBin: true
+
+  shebang-command@2.0.0:
+    resolution: {integrity: sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==}
+    engines: {node: '>=8'}
+
+  shebang-regex@3.0.0:
+    resolution: {integrity: sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==}
+    engines: {node: '>=8'}
+
+  signal-exit@3.0.7:
+    resolution: {integrity: sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==}
+
+  strnum@2.4.0:
+    resolution: {integrity: sha512-sHrVyWWdq28RbhjuJdZsA1SnGRJV6NiXbk6AXBxDOsgAcA+lmpUZCYjOdLBxkXMwis6RRe7dlZt4VlIWFVzkmg==}
+
+  ts-algebra@2.0.0:
+    resolution: {integrity: sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw==}
+
+  tslib@2.8.1:
+    resolution: {integrity: sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==}
+
+  tsx@4.19.2:
+    resolution: {integrity: sha512-pOUl6Vo2LUq/bSa8S5q7b91cgNSjctn9ugq/+Mvow99qW6x/UZYwzxy/3NmqoT66eHYfCVvFvACC58UBPFf28g==}
+    engines: {node: '>=18.0.0'}
+    hasBin: true
+
+  typebox@1.1.38:
+    resolution: {integrity: sha512-pZ0aQPmMmXoUvSbeuWf/Hzsc+avNw/Zd6VeE8CFgkVGWyuHPJvqeJJDeJqLve+K70LvjYIoleGcoJHPT17cWoA==}
+
+  undici-types@6.20.0:
+    resolution: {integrity: sha512-Ny6QZ2Nju20vw1SRHe3d9jVu6gJ+4e3+MMpqu7pqE5HT6WsTSlce++GQmK5UXS8mzV8DSYHrQH+Xrf2jVcuKNg==}
+
+  undici@8.3.0:
+    resolution: {integrity: sha512-TkUDgb6tl7KOGZ+7e8E3d2FYgUQgF6z5YypqjWmixVQSQERFcVrVg0ySADm2LVLRh5ljAaHTCR5Fmz3Q34rB7Q==}
+    engines: {node: '>=22.19.0'}
+
+  web-streams-polyfill@3.3.3:
+    resolution: {integrity: sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==}
+    engines: {node: '>= 8'}
+
+  which@2.0.2:
+    resolution: {integrity: sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==}
+    engines: {node: '>= 8'}
+    hasBin: true
+
+  ws@8.21.0:
+    resolution: {integrity: sha512-Vsp28b7DRcimFQvrqu2Wek3z1iYxDCWqHYB8Qsnk/S4RfaCQzPGPyBNuVjJV3cd6UiKtUtp6sNM77gWvzcCH+g==}
+    engines: {node: '>=10.0.0'}
+    peerDependencies:
+      bufferutil: ^4.0.1
+      utf-8-validate: '>=5.0.2'
+    peerDependenciesMeta:
+      bufferutil:
+        optional: true
+      utf-8-validate:
+        optional: true
+
+  xml-naming@0.1.0:
+    resolution: {integrity: sha512-k8KO9hrMyNk6tUWqUfkTEZbezRRpONVOzUTnc97VnCvyj6Tf9lyUR9EDAIeiVLv56jsMcoXEwjW8Kv5yPY52lw==}
+    engines: {node: '>=16.0.0'}
+
+  yaml@2.9.0:
+    resolution: {integrity: sha512-2AvhNX3mb8zd6Zy7INTtSpl1F15HW6Wnqj0srWlkKLcpYl/gMIMJiyuGq2KeI2YFxUPjdlB+3Lc10seMLtL4cA==}
+    engines: {node: '>= 14.6'}
+    hasBin: true
+
+  zod-to-json-schema@3.25.2:
+    resolution: {integrity: sha512-O/PgfnpT1xKSDeQYSCfRI5Gy3hPf91mKVDuYLUHZJMiDFptvP41MSnWofm8dnCm0256ZNfZIM7DSzuSMAFnjHA==}
+    peerDependencies:
+      zod: ^3.25.28 || ^4
+
+  zod@4.4.3:
+    resolution: {integrity: sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ==}
+
+snapshots:
+
+  '@anthropic-ai/sdk@0.91.1(zod@4.4.3)':
+    dependencies:
+      json-schema-to-ts: 3.1.1
+    optionalDependencies:
+      zod: 4.4.3
+
+  '@aws-crypto/crc32@5.2.0':
+    dependencies:
+      '@aws-crypto/util': 5.2.0
+      '@aws-sdk/types': 3.973.12
+      tslib: 2.8.1
+
+  '@aws-crypto/sha256-browser@5.2.0':
+    dependencies:
+      '@aws-crypto/sha256-js': 5.2.0
+      '@aws-crypto/supports-web-crypto': 5.2.0
+      '@aws-crypto/util': 5.2.0
+      '@aws-sdk/types': 3.973.12
+      '@aws-sdk/util-locate-window': 3.965.7
+      '@smithy/util-utf8': 2.3.0
+      tslib: 2.8.1
+
+  '@aws-crypto/sha256-js@5.2.0':
+    dependencies:
+      '@aws-crypto/util': 5.2.0
+      '@aws-sdk/types': 3.973.12
+      tslib: 2.8.1
+
+  '@aws-crypto/supports-web-crypto@5.2.0':
+    dependencies:
+      tslib: 2.8.1
+
+  '@aws-crypto/util@5.2.0':
+    dependencies:
+      '@aws-sdk/types': 3.973.12
+      '@smithy/util-utf8': 2.3.0
+      tslib: 2.8.1
+
+  '@aws-sdk/client-bedrock-runtime@3.1048.0':
+    dependencies:
+      '@aws-crypto/sha256-browser': 5.2.0
+      '@aws-crypto/sha256-js': 5.2.0
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/credential-provider-node': 3.972.55
+      '@aws-sdk/eventstream-handler-node': 3.972.21
+      '@aws-sdk/middleware-eventstream': 3.972.17
+      '@aws-sdk/middleware-websocket': 3.972.28
+      '@aws-sdk/token-providers': 3.1048.0
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/fetch-http-handler': 5.4.7
+      '@smithy/node-http-handler': 4.7.3
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/core@3.974.20':
+    dependencies:
+      '@aws-sdk/types': 3.973.12
+      '@aws-sdk/xml-builder': 3.972.29
+      '@aws/lambda-invoke-store': 0.2.4
+      '@smithy/core': 3.24.7
+      '@smithy/signature-v4': 5.4.7
+      '@smithy/types': 4.14.4
+      bowser: 2.14.1
+      tslib: 2.8.1
+
+  '@aws-sdk/credential-provider-env@3.972.46':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/credential-provider-http@3.972.48':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/fetch-http-handler': 5.4.7
+      '@smithy/node-http-handler': 4.7.8
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/credential-provider-ini@3.972.53':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/credential-provider-env': 3.972.46
+      '@aws-sdk/credential-provider-http': 3.972.48
+      '@aws-sdk/credential-provider-login': 3.972.52
+      '@aws-sdk/credential-provider-process': 3.972.46
+      '@aws-sdk/credential-provider-sso': 3.972.52
+      '@aws-sdk/credential-provider-web-identity': 3.972.52
+      '@aws-sdk/nested-clients': 3.997.20
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/credential-provider-imds': 4.3.9
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/credential-provider-login@3.972.52':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/nested-clients': 3.997.20
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/credential-provider-node@3.972.55':
+    dependencies:
+      '@aws-sdk/credential-provider-env': 3.972.46
+      '@aws-sdk/credential-provider-http': 3.972.48
+      '@aws-sdk/credential-provider-ini': 3.972.53
+      '@aws-sdk/credential-provider-process': 3.972.46
+      '@aws-sdk/credential-provider-sso': 3.972.52
+      '@aws-sdk/credential-provider-web-identity': 3.972.52
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/credential-provider-imds': 4.3.9
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/credential-provider-process@3.972.46':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/credential-provider-sso@3.972.52':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/nested-clients': 3.997.20
+      '@aws-sdk/token-providers': 3.1066.0
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/credential-provider-web-identity@3.972.52':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/nested-clients': 3.997.20
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/eventstream-handler-node@3.972.21':
+    dependencies:
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/middleware-eventstream@3.972.17':
+    dependencies:
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/middleware-websocket@3.972.28':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/fetch-http-handler': 5.4.7
+      '@smithy/signature-v4': 5.4.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/nested-clients@3.997.20':
+    dependencies:
+      '@aws-crypto/sha256-browser': 5.2.0
+      '@aws-crypto/sha256-js': 5.2.0
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/signature-v4-multi-region': 3.996.34
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/fetch-http-handler': 5.4.7
+      '@smithy/node-http-handler': 4.7.8
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/signature-v4-multi-region@3.996.34':
+    dependencies:
+      '@aws-sdk/types': 3.973.12
+      '@smithy/signature-v4': 5.4.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/token-providers@3.1048.0':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/nested-clients': 3.997.20
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/token-providers@3.1066.0':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/nested-clients': 3.997.20
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/types@3.973.12':
+    dependencies:
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/util-locate-window@3.965.7':
+    dependencies:
+      tslib: 2.8.1
+
+  '@aws-sdk/xml-builder@3.972.29':
+    dependencies:
+      '@smithy/types': 4.14.4
+      fast-xml-parser: 5.7.3
+      tslib: 2.8.1
+
+  '@aws/lambda-invoke-store@0.2.4': {}
+
+  '@babel/runtime@7.29.7': {}
+
+  '@earendil-works/pi-agent-core@0.79.4(ws@8.21.0)(zod@4.4.3)':
+    dependencies:
+      '@earendil-works/pi-ai': 0.79.4(ws@8.21.0)(zod@4.4.3)
+      ignore: 7.0.5
+      typebox: 1.1.38
+      yaml: 2.9.0
+    transitivePeerDependencies:
+      - '@modelcontextprotocol/sdk'
+      - bufferutil
+      - supports-color
+      - utf-8-validate
+      - ws
+      - zod
+
+  '@earendil-works/pi-ai@0.79.4(ws@8.21.0)(zod@4.4.3)':
+    dependencies:
+      '@anthropic-ai/sdk': 0.91.1(zod@4.4.3)
+      '@aws-sdk/client-bedrock-runtime': 3.1048.0
+      '@google/genai': 1.52.0
+      '@mistralai/mistralai': 2.2.1
+      '@smithy/node-http-handler': 4.7.3
+      http-proxy-agent: 7.0.2
+      https-proxy-agent: 7.0.6
+      openai: 6.26.0(ws@8.21.0)(zod@4.4.3)
+      partial-json: 0.1.7
+      typebox: 1.1.38
+    transitivePeerDependencies:
+      - '@modelcontextprotocol/sdk'
+      - bufferutil
+      - supports-color
+      - utf-8-validate
+      - ws
+      - zod
+
+  '@earendil-works/pi-coding-agent@0.79.4(ws@8.21.0)(zod@4.4.3)':
+    dependencies:
+      '@earendil-works/pi-agent-core': 0.79.4(ws@8.21.0)(zod@4.4.3)
+      '@earendil-works/pi-ai': 0.79.4(ws@8.21.0)(zod@4.4.3)
+      '@earendil-works/pi-tui': 0.79.4
+      '@silvia-odwyer/photon-node': 0.3.4
+      chalk: 5.6.2
+      cross-spawn: 7.0.6
+      diff: 8.0.4
+      glob: 13.0.6
+      highlight.js: 10.7.3
+      hosted-git-info: 9.0.3
+      ignore: 7.0.5
+      jiti: 2.7.0
+      minimatch: 10.2.5
+      proper-lockfile: 4.1.2
+      semver: 7.8.0
+      typebox: 1.1.38
+      undici: 8.3.0
+      yaml: 2.9.0
+    optionalDependencies:
+      '@mariozechner/clipboard': 0.3.9
+    transitivePeerDependencies:
+      - '@modelcontextprotocol/sdk'
+      - bufferutil
+      - supports-color
+      - utf-8-validate
+      - ws
+      - zod
+
+  '@earendil-works/pi-tui@0.79.4':
+    dependencies:
+      get-east-asian-width: 1.6.0
+      marked: 15.0.12
+
+  '@esbuild/aix-ppc64@0.23.1':
+    optional: true
+
+  '@esbuild/android-arm64@0.23.1':
+    optional: true
+
+  '@esbuild/android-arm@0.23.1':
+    optional: true
+
+  '@esbuild/android-x64@0.23.1':
+    optional: true
+
+  '@esbuild/darwin-arm64@0.23.1':
+    optional: true
+
+  '@esbuild/darwin-x64@0.23.1':
+    optional: true
+
+  '@esbuild/freebsd-arm64@0.23.1':
+    optional: true
+
+  '@esbuild/freebsd-x64@0.23.1':
+    optional: true
+
+  '@esbuild/linux-arm64@0.23.1':
+    optional: true
+
+  '@esbuild/linux-arm@0.23.1':
+    optional: true
+
+  '@esbuild/linux-ia32@0.23.1':
+    optional: true
+
+  '@esbuild/linux-loong64@0.23.1':
+    optional: true
+
+  '@esbuild/linux-mips64el@0.23.1':
+    optional: true
+
+  '@esbuild/linux-ppc64@0.23.1':
+    optional: true
+
+  '@esbuild/linux-riscv64@0.23.1':
+    optional: true
+
+  '@esbuild/linux-s390x@0.23.1':
+    optional: true
+
+  '@esbuild/linux-x64@0.23.1':
+    optional: true
+
+  '@esbuild/netbsd-x64@0.23.1':
+    optional: true
+
+  '@esbuild/openbsd-arm64@0.23.1':
+    optional: true
+
+  '@esbuild/openbsd-x64@0.23.1':
+    optional: true
+
+  '@esbuild/sunos-x64@0.23.1':
+    optional: true
+
+  '@esbuild/win32-arm64@0.23.1':
+    optional: true
+
+  '@esbuild/win32-ia32@0.23.1':
+    optional: true
+
+  '@esbuild/win32-x64@0.23.1':
+    optional: true
+
+  '@google/genai@1.52.0':
+    dependencies:
+      google-auth-library: 10.7.0
+      p-retry: 4.6.2
+      protobufjs: 7.6.4
+      ws: 8.21.0
+    transitivePeerDependencies:
+      - bufferutil
+      - supports-color
+      - utf-8-validate
+
+  '@mariozechner/clipboard-darwin-arm64@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard-darwin-universal@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard-darwin-x64@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard-linux-arm64-gnu@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard-linux-arm64-musl@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard-linux-riscv64-gnu@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard-linux-x64-gnu@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard-linux-x64-musl@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard-win32-arm64-msvc@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard-win32-x64-msvc@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard@0.3.9':
+    optionalDependencies:
+      '@mariozechner/clipboard-darwin-arm64': 0.3.9
+      '@mariozechner/clipboard-darwin-universal': 0.3.9
+      '@mariozechner/clipboard-darwin-x64': 0.3.9
+      '@mariozechner/clipboard-linux-arm64-gnu': 0.3.9
+      '@mariozechner/clipboard-linux-arm64-musl': 0.3.9
+      '@mariozechner/clipboard-linux-riscv64-gnu': 0.3.9
+      '@mariozechner/clipboard-linux-x64-gnu': 0.3.9
+      '@mariozechner/clipboard-linux-x64-musl': 0.3.9
+      '@mariozechner/clipboard-win32-arm64-msvc': 0.3.9
+      '@mariozechner/clipboard-win32-x64-msvc': 0.3.9
+    optional: true
+
+  '@mistralai/mistralai@2.2.1':
+    dependencies:
+      ws: 8.21.0
+      zod: 4.4.3
+      zod-to-json-schema: 3.25.2(zod@4.4.3)
+    transitivePeerDependencies:
+      - bufferutil
+      - utf-8-validate
+
+  '@nodable/entities@2.2.0': {}
+
+  '@opentelemetry/api-logs@0.54.0':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+
+  '@opentelemetry/api@1.9.0': {}
+
+  '@opentelemetry/context-async-hooks@1.28.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+
+  '@opentelemetry/core@1.27.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/semantic-conventions': 1.27.0
+
+  '@opentelemetry/core@1.28.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/semantic-conventions': 1.27.0
+
+  '@opentelemetry/exporter-trace-otlp-proto@0.54.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-exporter-base': 0.54.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-transformer': 0.54.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-base': 1.27.0(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/otlp-exporter-base@0.54.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-transformer': 0.54.0(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/otlp-transformer@0.54.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api-logs': 0.54.0
+      '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-logs': 0.54.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-metrics': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-base': 1.27.0(@opentelemetry/api@1.9.0)
+      protobufjs: 7.6.4
+
+  '@opentelemetry/propagator-b3@1.28.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/propagator-jaeger@1.28.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/resources@1.27.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions': 1.27.0
+
+  '@opentelemetry/resources@1.28.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions': 1.27.0
+
+  '@opentelemetry/sdk-logs@0.54.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api-logs': 0.54.0
+      '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/sdk-metrics@1.27.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/sdk-trace-base@1.27.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions': 1.27.0
+
+  '@opentelemetry/sdk-trace-base@1.28.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions': 1.27.0
+
+  '@opentelemetry/sdk-trace-node@1.28.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/context-async-hooks': 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/propagator-b3': 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/propagator-jaeger': 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-base': 1.28.0(@opentelemetry/api@1.9.0)
+      semver: 7.8.4
+
+  '@opentelemetry/semantic-conventions@1.27.0': {}
+
+  '@opentelemetry/semantic-conventions@1.28.0': {}
+
+  '@protobufjs/aspromise@1.1.2': {}
+
+  '@protobufjs/base64@1.1.2': {}
+
+  '@protobufjs/codegen@2.0.5': {}
+
+  '@protobufjs/eventemitter@1.1.1': {}
+
+  '@protobufjs/fetch@1.1.1':
+    dependencies:
+      '@protobufjs/aspromise': 1.1.2
+
+  '@protobufjs/float@1.0.2': {}
+
+  '@protobufjs/path@1.1.2': {}
+
+  '@protobufjs/pool@1.1.0': {}
+
+  '@protobufjs/utf8@1.1.1': {}
+
+  '@silvia-odwyer/photon-node@0.3.4': {}
+
+  '@smithy/core@3.24.7':
+    dependencies:
+      '@aws-crypto/crc32': 5.2.0
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@smithy/credential-provider-imds@4.3.9':
+    dependencies:
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@smithy/fetch-http-handler@5.4.7':
+    dependencies:
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@smithy/is-array-buffer@2.2.0':
+    dependencies:
+      tslib: 2.8.1
+
+  '@smithy/node-http-handler@4.7.3':
+    dependencies:
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@smithy/node-http-handler@4.7.8':
+    dependencies:
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@smithy/signature-v4@5.4.7':
+    dependencies:
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@smithy/types@4.14.4':
+    dependencies:
+      tslib: 2.8.1
+
+  '@smithy/util-buffer-from@2.2.0':
+    dependencies:
+      '@smithy/is-array-buffer': 2.2.0
+      tslib: 2.8.1
+
+  '@smithy/util-utf8@2.3.0':
+    dependencies:
+      '@smithy/util-buffer-from': 2.2.0
+      tslib: 2.8.1
+
+  '@types/node@22.10.2':
+    dependencies:
+      undici-types: 6.20.0
+
+  '@types/retry@0.12.0': {}
+
+  agent-base@7.1.4: {}
+
+  anynum@1.0.0: {}
+
+  balanced-match@4.0.4: {}
+
+  base64-js@1.5.1: {}
+
+  bignumber.js@9.3.1: {}
+
+  bowser@2.14.1: {}
+
+  brace-expansion@5.0.6:
+    dependencies:
+      balanced-match: 4.0.4
+
+  buffer-equal-constant-time@1.0.1: {}
+
+  chalk@5.6.2: {}
+
+  cross-spawn@7.0.6:
+    dependencies:
+      path-key: 3.1.1
+      shebang-command: 2.0.0
+      which: 2.0.2
+
+  data-uri-to-buffer@4.0.1: {}
+
+  debug@4.4.3:
+    dependencies:
+      ms: 2.1.3
+
+  diff@8.0.4: {}
+
+  dotenv@17.2.3: {}
+
+  ecdsa-sig-formatter@1.0.11:
+    dependencies:
+      safe-buffer: 5.2.1
+
+  esbuild@0.23.1:
+    optionalDependencies:
+      '@esbuild/aix-ppc64': 0.23.1
+      '@esbuild/android-arm': 0.23.1
+      '@esbuild/android-arm64': 0.23.1
+      '@esbuild/android-x64': 0.23.1
+      '@esbuild/darwin-arm64': 0.23.1
+      '@esbuild/darwin-x64': 0.23.1
+      '@esbuild/freebsd-arm64': 0.23.1
+      '@esbuild/freebsd-x64': 0.23.1
+      '@esbuild/linux-arm': 0.23.1
+      '@esbuild/linux-arm64': 0.23.1
+      '@esbuild/linux-ia32': 0.23.1
+      '@esbuild/linux-loong64': 0.23.1
+      '@esbuild/linux-mips64el': 0.23.1
+      '@esbuild/linux-ppc64': 0.23.1
+      '@esbuild/linux-riscv64': 0.23.1
+      '@esbuild/linux-s390x': 0.23.1
+      '@esbuild/linux-x64': 0.23.1
+      '@esbuild/netbsd-x64': 0.23.1
+      '@esbuild/openbsd-arm64': 0.23.1
+      '@esbuild/openbsd-x64': 0.23.1
+      '@esbuild/sunos-x64': 0.23.1
+      '@esbuild/win32-arm64': 0.23.1
+      '@esbuild/win32-ia32': 0.23.1
+      '@esbuild/win32-x64': 0.23.1
+
+  extend@3.0.2: {}
+
+  fast-xml-builder@1.2.0:
+    dependencies:
+      path-expression-matcher: 1.5.0
+      xml-naming: 0.1.0
+
+  fast-xml-parser@5.7.3:
+    dependencies:
+      '@nodable/entities': 2.2.0
+      fast-xml-builder: 1.2.0
+      path-expression-matcher: 1.5.0
+      strnum: 2.4.0
+
+  fetch-blob@3.2.0:
+    dependencies:
+      node-domexception: 1.0.0
+      web-streams-polyfill: 3.3.3
+
+  formdata-polyfill@4.0.10:
+    dependencies:
+      fetch-blob: 3.2.0
+
+  fsevents@2.3.3:
+    optional: true
+
+  gaxios@7.1.5:
+    dependencies:
+      extend: 3.0.2
+      https-proxy-agent: 7.0.6
+      node-fetch: 3.3.2
+    transitivePeerDependencies:
+      - supports-color
+
+  gcp-metadata@8.1.2:
+    dependencies:
+      gaxios: 7.1.5
+      google-logging-utils: 1.1.3
+      json-bigint: 1.0.0
+    transitivePeerDependencies:
+      - supports-color
+
+  get-east-asian-width@1.6.0: {}
+
+  get-tsconfig@4.14.0:
+    dependencies:
+      resolve-pkg-maps: 1.0.0
+
+  glob@13.0.6:
+    dependencies:
+      minimatch: 10.2.5
+      minipass: 7.1.3
+      path-scurry: 2.0.2
+
+  google-auth-library@10.7.0:
+    dependencies:
+      base64-js: 1.5.1
+      ecdsa-sig-formatter: 1.0.11
+      gaxios: 7.1.5
+      gcp-metadata: 8.1.2
+      google-logging-utils: 1.1.3
+      jws: 4.0.1
+    transitivePeerDependencies:
+      - supports-color
+
+  google-logging-utils@1.1.3: {}
+
+  graceful-fs@4.2.11: {}
+
+  highlight.js@10.7.3: {}
+
+  hosted-git-info@9.0.3:
+    dependencies:
+      lru-cache: 11.5.1
+
+  http-proxy-agent@7.0.2:
+    dependencies:
+      agent-base: 7.1.4
+      debug: 4.4.3
+    transitivePeerDependencies:
+      - supports-color
+
+  https-proxy-agent@7.0.6:
+    dependencies:
+      agent-base: 7.1.4
+      debug: 4.4.3
+    transitivePeerDependencies:
+      - supports-color
+
+  ignore@7.0.5: {}
+
+  isexe@2.0.0: {}
+
+  jiti@2.7.0: {}
+
+  json-bigint@1.0.0:
+    dependencies:
+      bignumber.js: 9.3.1
+
+  json-schema-to-ts@3.1.1:
+    dependencies:
+      '@babel/runtime': 7.29.7
+      ts-algebra: 2.0.0
+
+  jwa@2.0.1:
+    dependencies:
+      buffer-equal-constant-time: 1.0.1
+      ecdsa-sig-formatter: 1.0.11
+      safe-buffer: 5.2.1
+
+  jws@4.0.1:
+    dependencies:
+      jwa: 2.0.1
+      safe-buffer: 5.2.1
+
+  long@5.3.2: {}
+
+  lru-cache@11.5.1: {}
+
+  marked@15.0.12: {}
+
+  minimatch@10.2.5:
+    dependencies:
+      brace-expansion: 5.0.6
+
+  minipass@7.1.3: {}
+
+  ms@2.1.3: {}
+
+  node-domexception@1.0.0: {}
+
+  node-fetch@3.3.2:
+    dependencies:
+      data-uri-to-buffer: 4.0.1
+      fetch-blob: 3.2.0
+      formdata-polyfill: 4.0.10
+
+  openai@6.26.0(ws@8.21.0)(zod@4.4.3):
+    optionalDependencies:
+      ws: 8.21.0
+      zod: 4.4.3
+
+  p-retry@4.6.2:
+    dependencies:
+      '@types/retry': 0.12.0
+      retry: 0.13.1
+
+  partial-json@0.1.7: {}
+
+  path-expression-matcher@1.5.0: {}
+
+  path-key@3.1.1: {}
+
+  path-scurry@2.0.2:
+    dependencies:
+      lru-cache: 11.5.1
+      minipass: 7.1.3
+
+  proper-lockfile@4.1.2:
+    dependencies:
+      graceful-fs: 4.2.11
+      retry: 0.12.0
+      signal-exit: 3.0.7
+
+  protobufjs@7.6.4:
+    dependencies:
+      '@protobufjs/aspromise': 1.1.2
+      '@protobufjs/base64': 1.1.2
+      '@protobufjs/codegen': 2.0.5
+      '@protobufjs/eventemitter': 1.1.1
+      '@protobufjs/fetch': 1.1.1
+      '@protobufjs/float': 1.0.2
+      '@protobufjs/path': 1.1.2
+      '@protobufjs/pool': 1.1.0
+      '@protobufjs/utf8': 1.1.1
+      '@types/node': 22.10.2
+      long: 5.3.2
+
+  resolve-pkg-maps@1.0.0: {}
+
+  retry@0.12.0: {}
+
+  retry@0.13.1: {}
+
+  safe-buffer@5.2.1: {}
+
+  semver@7.8.0: {}
+
+  semver@7.8.4: {}
+
+  shebang-command@2.0.0:
+    dependencies:
+      shebang-regex: 3.0.0
+
+  shebang-regex@3.0.0: {}
+
+  signal-exit@3.0.7: {}
+
+  strnum@2.4.0:
+    dependencies:
+      anynum: 1.0.0
+
+  ts-algebra@2.0.0: {}
+
+  tslib@2.8.1: {}
+
+  tsx@4.19.2:
+    dependencies:
+      esbuild: 0.23.1
+      get-tsconfig: 4.14.0
+    optionalDependencies:
+      fsevents: 2.3.3
+
+  typebox@1.1.38: {}
+
+  undici-types@6.20.0: {}
+
+  undici@8.3.0: {}
+
+  web-streams-polyfill@3.3.3: {}
+
+  which@2.0.2:
+    dependencies:
+      isexe: 2.0.0
+
+  ws@8.21.0: {}
+
+  xml-naming@0.1.0: {}
+
+  yaml@2.9.0: {}
+
+  zod-to-json-schema@3.25.2(zod@4.4.3):
+    dependencies:
+      zod: 4.4.3
+
+  zod@4.4.3: {}
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/poc/run.ts b/docs/design/agent-workflows/wp-1-pi-tracing/poc/run.ts
new file mode 100644
index 0000000000..03164e6311
--- /dev/null
+++ b/docs/design/agent-workflows/wp-1-pi-tracing/poc/run.ts
@@ -0,0 +1,197 @@
+/**
+ * WP-1 runner: install Pi, run a small tool-using agent task, and export the run
+ * to Agenta as OpenTelemetry traces via the agenta-otel extension.
+ *
+ * Auth: uses AuthStorage.create(), which reads ~/.pi/agent/auth.json. Log in once
+ * with `pnpm exec pi` -> `/login` -> "ChatGPT Plus/Pro (Codex)" (no API key needed),
+ * or set OPENAI_API_KEY / ANTHROPIC_API_KEY in the environment.
+ *
+ * Run: `pnpm start`
+ */
+import dotenv from "dotenv";
+import { existsSync, mkdtempSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { dirname, join } from "node:path";
+import { fileURLToPath } from "node:url";
+
+import {
+  AuthStorage,
+  createAgentSession,
+  DefaultResourceLoader,
+  getAgentDir,
+  ModelRegistry,
+  SessionManager,
+} from "@earendil-works/pi-coding-agent";
+
+import agentaOtel, { runConfig, shutdownTracing } from "./agenta-otel.ts";
+
+// Populate process.env up front: dotenv's default lookup runs first, then the nearest
+// .env.test.local found by climbing from this file's directory (at most 8 levels).
+function loadEnv(): void {
+  dotenv.config();
+  let current = dirname(fileURLToPath(import.meta.url));
+  for (let level = 0; level < 8; level += 1) {
+    const envFile = join(current, ".env.test.local");
+    if (existsSync(envFile)) {
+      dotenv.config({ path: envFile });
+      return;
+    }
+    const above = dirname(current);
+    if (above === current) return;
+    current = above;
+  }
+}
+
+type Scenario = { name: string; seed: (dir: string) => void; prompts: string[] }; // seed(dir) prepares the working dir; prompts are sent in order
+
+const SCENARIOS: Record<string, Scenario> = { // built-in scenarios, looked up by the PI_SCENARIO env var in pickScenario
+  simple: { // one prompt: read the seeded notes.txt, then write greeting.txt
+    name: "simple",
+    seed: (dir) =>
+      writeFileSync(
+        join(dir, "notes.txt"),
+        "TODO: greet the user by name (use 'Mahmoud')\n" +
+          "TODO: add a two-line haiku about tracing\n",
+      ),
+    prompts: [
+      "Read notes.txt in the current directory, then create greeting.txt that " +
+        "addresses each TODO. Keep it short.",
+    ],
+  },
+  // Default scenario: many tool calls across several turns, ending in a structured (JSON) reply.
+  complex: {
+    name: "complex",
+    seed: (dir) => {
+      writeFileSync(
+        join(dir, "alpha.py"),
+        "def add(a, b):\n    return a + b\n\n\ndef sub(a, b):\n    return a - b\n",
+      );
+      writeFileSync(
+        join(dir, "beta.py"),
+        "import math\n\n\ndef area(r):\n    return math.pi * r * r\n",
+      );
+      writeFileSync(join(dir, "README.md"), "# demo\n\nA tiny demo package.\n");
+    },
+    prompts: [
+      "Explore this directory: list the files, read every .py file, and use bash " +
+        "(wc -l) to count the total number of lines across the .py files. Then write " +
+        "REPORT.md describing what each .py file does and the total line count. " +
+        "Finally, reply with ONLY a JSON object: " +
+        '{"files": ["..."], "total_py_lines": <int>, "report": "REPORT.md"}.',
+    ],
+  },
+  // A longer, multi-prompt session with no seed files: each prompt is its own trace, all sharing one session.id.
+  session: {
+    name: "session",
+    seed: () => {},
+    prompts: [
+      "Create todo.md with exactly 3 short tasks about adding distributed tracing to a service.",
+      "Append 2 more tasks to todo.md, then show me the full file with the bash 'cat' command.",
+      'Read todo.md and reply with ONLY a JSON object: {"count": <number of tasks>, "tasks": ["..."]}.',
+    ],
+  },
+};
+
+// CLI prompts win (as a "custom" scenario seeded like "complex"); else PI_SCENARIO, else "complex".
+function pickScenario(cliPrompts: string[]): Scenario {
+  if (cliPrompts.length > 0) return { name: "custom", seed: SCENARIOS.complex.seed, prompts: cliPrompts };
+  const key = process.env.PI_SCENARIO || "complex";
+  // Own-key check: a name like "toString" would otherwise resolve to an Object.prototype member.
+  return Object.prototype.hasOwnProperty.call(SCENARIOS, key) ? SCENARIOS[key] : SCENARIOS.complex;
+}
+
+// Seed a scratch dir, run the scenario's prompts through a Pi session wired to the
+// agenta-otel extension, then flush spans and print the session id and trace links.
+async function main(): Promise<void> {
+  loadEnv();
+
+  // A throwaway working dir seeded per scenario so the agent actually uses tools.
+  const cwd = mkdtempSync(join(tmpdir(), "pi-poc-"));
+  const scenario = pickScenario(process.argv.slice(2));
+  scenario.seed(cwd);
+
+  const authStorage = AuthStorage.create();
+  const modelRegistry = ModelRegistry.create(authStorage);
+  const available = await modelRegistry.getAvailable();
+  if (available.length === 0) {
+    console.error(
+      "\nNo model is available. Authenticate Pi first:\n" +
+        "  pnpm exec pi   then  /login  ->  \"ChatGPT Plus/Pro (Codex)\"\n" +
+        "or export OPENAI_API_KEY / ANTHROPIC_API_KEY.\n",
+    );
+    process.exit(1);
+  }
+
+  const wanted = process.env.PI_MODEL; // "gpt-5.5" or "openai-codex/gpt-5.5"
+  const isWanted = (m: any) => m.id === wanted || `${m.provider}/${m.id}` === wanted;
+  const model =
+    (wanted && available.find(isWanted)) ||
+    available.find((m: any) => m.id === "gpt-5.5") ||
+    available.find((m: any) => !/spark|mini/i.test(m.id)) ||
+    available[0];
+  if (wanted && !isWanted(model)) {
+    console.warn(`[run] PI_MODEL="${wanted}" not available; using ${model.id}`);
+  }
+  console.log(`[run] scenario: ${scenario.name} (${scenario.prompts.length} prompt(s))`);
+  console.log(`[run] model: ${model.provider}/${model.id}`);
+  console.log(`[run] cwd:   ${cwd}`);
+
+  const loader = new DefaultResourceLoader({
+    cwd,
+    agentDir: getAgentDir(),
+    extensionFactories: [agentaOtel],
+  });
+  await loader.reload();
+
+  const { session } = await createAgentSession({
+    cwd,
+    model,
+    authStorage,
+    modelRegistry,
+    tools: ["read", "bash", "edit", "write", "ls"],
+    sessionManager: SessionManager.inMemory(cwd),
+    resourceLoader: loader,
+  });
+
+  // Hand the session id + model to the extension so spans carry them.
+  runConfig.sessionId = session.sessionId;
+  runConfig.provider = model.provider;
+  runConfig.requestModel = model.id;
+
+  session.subscribe((event: any) => {
+    if (event.type === "message_update" && event.assistantMessageEvent?.type === "text_delta") {
+      process.stdout.write(event.assistantMessageEvent.delta);
+    } else if (event.type === "tool_execution_start") {
+      process.stdout.write(`\n[tool] ${event.toolName}\n`);
+    }
+  });
+
+  const traceIds: string[] = [];
+  try {
+    for (let i = 0; i < scenario.prompts.length; i++) {
+      const p = scenario.prompts[i];
+      console.log(`\n[run] prompt ${i + 1}/${scenario.prompts.length}: ${p}\n`);
+      await session.prompt(p);
+      if (runConfig.traceId) traceIds.push(runConfig.traceId);
+    }
+  } finally {
+    // Flush even when a prompt throws; main().catch exits at once, which would drop the spans.
+    console.log("\n\n[run] flushing spans to Agenta...");
+    session.dispose();
+    await shutdownTracing();
+  }
+
+  const host = (process.env.AGENTA_HOST || "").replace(/\/+$/, "");
+  console.log("[run] flushed.");
+  console.log(`[run] session_id=${session.sessionId}`);
+  traceIds.forEach((tid, i) => {
+    console.log(`[run] trace ${i + 1}: ${tid}`);
+    console.log(`        ${host}/api/spans/?trace_id=${tid}`);
+  });
+  process.exit(0);
+}
+
+main().catch((err) => { // any rejection from main() lands here
+  console.error(err);
+  process.exit(1); // report failure through a non-zero exit status
+});
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/tracing-in-the-agent-service.md b/docs/design/agent-workflows/wp-1-pi-tracing/tracing-in-the-agent-service.md
new file mode 100644
index 0000000000..0bb4b12777
--- /dev/null
+++ b/docs/design/agent-workflows/wp-1-pi-tracing/tracing-in-the-agent-service.md
@@ -0,0 +1,113 @@
+# Tracing the agent run into the response, like completion and chat
+
+Status: built and verified end to end against the dev box (2026-06-15). Audience:
+whoever works on the agent service (WP-2) and its tracing.
+
+This is the follow-on to [integrating-the-tracing-extension.md](integrating-the-tracing-extension.md).
+That doc made a standalone Pi run show up in Agenta as its own trace. This one wires
+the same extension into the WP-2 agent service so the agent's whole run becomes part
+of the `/invoke` trace, the way completion and chat nest their LLM spans under the
+workflow span.
+
+## What changed and why
+
+Completion and chat are traced as one tree: the SDK opens a workflow span for the
+`/invoke` request, the LLM call nests under it, and the response carries that
+`trace_id`. Open the trace and you see the whole call.
+
+The agent service runs the model work in a separate Node process (the Pi wrapper), so
+its spans were not part of that tree. The WP-1 doc flagged the fix as future work:
+thread a W3C `traceparent` across the boundary and start the agent span as its child.
+That is what this change does.
+
+The result is one tree under the response's `trace_id`:
+
+```
+_agent                 workflow   (the Python /invoke span, root)
+  invoke_agent         AGENT      (the Pi run, now a child of _agent)
+    turn N             CHAIN
+      chat <model>     LLM        model, tokens, cost, message thread
+      execute_tool ... TOOL
+```
+
+Verified shape from a live run (trace `0f47e5f5...`): four spans, one trace, the
+`chat` span carrying `ag.data.inputs`/`outputs` as a message thread, token usage
+(598/21/619), and cost, with nothing in `ag.unsupported`.
+
+## How it works
+
+Three seams carry the context from the Python service to the Pi spans.
+
+1. **Capture (Python, `services/oss/src/agent.py`).** Inside the instrumented
+   `_agent` handler the current OpenTelemetry span is the workflow span. `_trace_context()`
+   reads it with the SDK's `propagation.inject()`, which yields the `traceparent`,
+   `baggage`, and the request `Authorization`. It also reads the OTLP endpoint from
+   `ag.tracing.otlp_url`, the exact URL the Python spans use. This is best effort: if
+   capture fails the run still works, just without cross-trace linking.
+
+2. **Carry (`services/oss/src/agent_pi`).** `HarnessRequest` gains a `TraceContext`
+   (`ports.py`). `TraceContext.to_wire()` serializes it to the camelCase shape the
+   wrapper expects, and both harness adapters send it: the local subprocess one
+   (`pi_harness.py`) and the HTTP sidecar one (`pi_http_harness.py`).
+
+3. **Consume (Node, `services/agent/src/agenta-otel.ts`).** When a `traceparent` is
+   present the extension starts `invoke_agent` as a child of that remote span, so the
+   whole Pi subtree shares the caller's `trace_id`. It exports each trace to the
+   endpoint and with the `Authorization` the caller passed, falling back to env. The
+   runner (`runPi.ts`) flushes the trace before it returns the result.
+
+Because the Python span and the Pi spans share one `trace_id` and the Pi root points
+at the Python span, Agenta merges them into one tree at ingest. No backend change.
+
+## What is different from the POC extension
+
+The service build keeps the POC's span tree and every load-bearing attribute choice
+(read the [five rules](integrating-the-tracing-extension.md#what-you-must-not-change-and-why)
+again before touching attributes). It adds three things the service needs:
+
+- **Per-run state, not module globals.** The POC ran one prompt at a time. The HTTP
+  sidecar can drive several runs in one process, so all span state lives in the
+  closure `createAgentaOtel()` returns. Only the tracer, provider, and exporter cache
+  stay process wide.
+- **A remote parent.** `invoke_agent` nests under the incoming `traceparent` instead
+  of starting a fresh root. The parent has no end event in this process, so the
+  per-trace batch flushes by trace id after the run rather than only on root-end.
+- **Per-trace export target.** The OTLP endpoint and `Authorization` come from the run
+  config, so one shared process can serve more than one project. They fall back to
+  `AGENTA_HOST` / `AGENTA_API_KEY` when the caller passes nothing.
+
+## Auth and endpoint
+
+The Node side ships spans to the same place and with the same credentials as the
+Python span. When the request carries `Authorization` (the project key or service
+secret) the wrapper uses it verbatim, matching how the SDK exporter authorizes per
+trace. With auth disabled locally there is no request credential, so the wrapper falls
+back to the container's `AGENTA_API_KEY`. Set `AGENTA_AGENT_CAPTURE_CONTENT=0` on the
+Python service to drop prompts, completions, and tool I/O from the spans.
+
+For the HTTP sidecar the endpoint passed from Python is the URL the Python container
+uses to reach Agenta. The sidecar must be able to reach the same host. On one Docker
+network the internal hostname resolves from both; if it does not, the sidecar's
+`AGENTA_HOST` fallback applies.
+
+## How to verify
+
+1. Start the service (`entrypoints.agent_main:app`) with `AGENTA_HOST` and
+   `AGENTA_API_KEY` set and a Pi login or provider key available.
+2. POST a chat-style body to `/agent/v0/invoke` and read `x-ag-trace-id` from the
+   response headers (it equals `trace_id` in the body).
+3. Fetch the trace and confirm the merged tree and the totals:
+   ```
+   curl -s "${AGENTA_HOST}/api/spans/?trace_id=<id>" -H "Authorization: ApiKey ${AGENTA_API_KEY}"
+   ```
+   Expect `_agent` (workflow) over `invoke_agent` (agent) over `turn N` (chain) over
+   `chat <model>` (llm), all sharing one `trace_id`, with token usage and cost on the `chat`
+   span and nothing under `ag.unsupported`.
+
+## Files
+
+- `services/oss/src/agent.py` — `_trace_context()` captures the workflow span context.
+- `services/oss/src/agent_pi/ports.py` — `TraceContext` and `HarnessRequest.trace`.
+- `services/oss/src/agent_pi/pi_harness.py`, `pi_http_harness.py` — forward the context.
+- `services/agent/src/agenta-otel.ts` — the service build of the extension.
+- `services/agent/src/runPi.ts` — registers the extension, sets run config, flushes.
diff --git a/docs/design/agent-workflows/wp-2-agent-service/README.md b/docs/design/agent-workflows/wp-2-agent-service/README.md
new file mode 100644
index 0000000000..c0a5731f6a
--- /dev/null
+++ b/docs/design/agent-workflows/wp-2-agent-service/README.md
@@ -0,0 +1,124 @@
+# WP-2: Agent service wrapping Pi
+
+Status: not started.
+
+## Goal
+
+Stand up a new service that wraps Pi and exposes an interface like Agenta's completion/chat
+services, so we can talk to an agent: set it up (auth, AGENTS.md), send a message, and get the response streamed back. Local only for the POC. No Daytona yet.
+
+Basically we want:
+
+- A new Docker service that has the same structure as completion and chat,
+- that exposes endpoints with the same interface as chat,
+- and to which you can send a message history and context and get back a response.
+
+
+
+
+## Scope
+
+In:
+
+- A thin TypeScript harness-wrapper that drives Pi's SDK (`createAgentSession`).
+- Configure the agent fully in memory: AGENTS.md, LLM auth, model. Skills and custom tools
+  can be stubbed for the first cut.
+- Expose our own protocol on a port: a send-message / get-response surface that mirrors the
+  shape of the existing completion/chat services.
+
+Out (later work packages):
+
+- Daytona sandbox. The wrapper runs as a local process for the POC.
+- Swapping in other harnesses (Codex, Claude Code). Design the protocol so it is possible,
+  but only implement Pi here.
+- Persisting sessions or storing config server-side. Use a config passed in at startup.
+- Streaming the multi-message output back to the caller.
+- Multi-message output.
+- Tools.
+
+In step 1 we will hard-code the auth for pi.dev (an OpenAI API key, for instance, or Codex). We won't have any configuration, just the ability to run things. The docker compose setup will reload automatically on change, which means we can simply edit the files in the volume locally and change things there.
+
+In the implementation we will first think about the ports and adapters, so that even though the first MVP is very simple, it has the right ports and adapters.
+
+First, there is a clear port between our agent implementation and setting up and calling pi.dev. Pi is one implementation of this port.
+
+There is also another port for setting up the run environment, so it is not just about setting up the agent but also the run environment.
+
+This is because you might run pi.dev or Claude Code locally, just as you might run either of them in Daytona or something else.
+
+We need to set these up, each with an adapter. For the run environment: starting the env, shutting down, pausing, connecting a volume.
+
+Then for pi.dev: setting up, invoking, stopping? (all the RPC interactions), shutting down.
+
+For pi.dev it might make sense to have two adapters: one for RPC and the other for JSON.
+
+Success for this WP1 is:
+- I go to the UI
+- Create a new agent (with some hard-coded config, say "hello world")
+- I run it in the playground and I see the output. 
+
+Note that instrumentation might be needed here; we are working in parallel on the research for that.
+
+
+As soon as we have that, we can start adding a config to the playground: first AGENTS.md, then authentication (the model used), then setting up tools. After that we can talk about streaming, multi-message output, and intermediate messages.
+
+
+
+
+--- The rest of the article might be out of date for some parts. The main requirements are above ---
+
+
+## Approach (grounded in research)
+
+See [`../research/pi-interaction.md`](../research/pi-interaction.md),
+[`../research/auth-secrets.md`](../research/auth-secrets.md), and
+[`../research/diskless-in-memory-config.md`](../research/diskless-in-memory-config.md).
+
+- Use the **SDK**, not RPC. The SDK is what exposes the in-memory overrides and runtime
+  credential injection; RPC mode cannot inject credentials post-spawn.
+- Inject everything in memory:
+  - AGENTS.md via `systemPromptOverride` / `appendSystemPrompt` / `agentsFilesOverride`,
+    with `noContextFiles` so no on-disk AGENTS.md leaks in.
+  - LLM auth via `setRuntimeApiKey(provider, key)` or `AuthStorage.inMemory()` (env at
+    spawn also works).
+  - State via `SessionManager.inMemory()`, `SettingsManager.inMemory()`,
+    `ModelRegistry.inMemory()`.
+- Diskless: set `TMPDIR` to a per-run tmpfs for bash output spillover; pre-install `rg`/`fd`
+  so search tools do not write binaries to disk.
+- Stream output via `session.subscribe()` callbacks (`message_update` -> `text_delta`),
+  mapping Pi events onto the service's streamed response.
+- This wrapper is the "works with our port" contract and the swappable-harness seam. Keep
+  the protocol harness-agnostic.
+
+## Interface to mirror
+
+Match the existing Agenta completion/chat service surface so callers and the playground can
+treat an agent like the other workflow types. Reconcile the single-output completion/chat
+shape with Pi's multi-message output (the response is a list of messages, not one
+completion).
+
+## Definition of done
+
+- The service starts locally with a passed-in config (AGENTS.md text, model, provider key).
+- A caller can send a message and receive the streamed multi-message response.
+- Auth and AGENTS.md are applied in memory, with nothing invocation-specific written to a
+  persistent disk.
+- The same wrapper binary runs as a plain local process (parity baseline for later sandbox
+  and pull-config-and-run-locally work).
+
+## Open questions
+
+- Where the service lives in the repo (a new entry under `services/`, or alongside `api/`),
+  and how a Node service fits the Python backend. Decide before writing code.
+- The exact protocol on the port (JSON-lines over stdio, a small HTTP/SSE server, or
+  websockets). Pick the one that matches how Agenta calls completion/chat today.
+- How the multi-message output maps to the completion/chat response contract.
+- Whether WP-1's tracing extension is embedded here from the start or added after.
+
+## Links
+
+- [`../research/pi-interaction.md`](../research/pi-interaction.md)
+- [`../research/auth-secrets.md`](../research/auth-secrets.md)
+- [`../research/diskless-in-memory-config.md`](../research/diskless-in-memory-config.md)
+- [`../research/sandbox-sharing.md`](../research/sandbox-sharing.md)
+- [Project README](../README.md)
diff --git a/docs/design/agent-workflows/wp-2-agent-service/implementation-plan.md b/docs/design/agent-workflows/wp-2-agent-service/implementation-plan.md
new file mode 100644
index 0000000000..81f8cb6e88
--- /dev/null
+++ b/docs/design/agent-workflows/wp-2-agent-service/implementation-plan.md
@@ -0,0 +1,273 @@
+# WP-2 implementation plan: agent service wrapping Pi
+
+Status: MVP built and verified by curl (2026-06-15). Decisions below were taken; the
+"Implemented" section records what shipped. Original decision points are kept marked
+**[DECISION]** for history.
+
+## Implemented (MVP, verified by curl)
+
+Per the decisions: a Python service exposes the Agenta `/invoke` contract (auth,
+middleware, CORS via `ag.create_app`) and calls a thin TypeScript Pi wrapper. Standalone,
+verified with curl. Pi runs on the local login (`openai-codex` / `gpt-5.5`).
+
+What shipped:
+
+- TypeScript Pi wrapper: `services/agent/` (`src/runPi.ts`, `src/cli.ts`). One-shot
+  JSON-over-stdio: read a request on stdin, drive Pi's SDK (`createAgentSession`) with
+  AGENTS.md injected in memory, write the reply as JSON on stdout. Pinned
+  `@earendil-works/pi-coding-agent@0.79.4`. Editable config in `services/agent/config/`
+  (`AGENTS.md`, `agent.json`), read per request so edits need no restart.
+- Python service: `services/oss/src/agent.py` mirrors `chat.py` (`ag.create_app` +
+  `ag.workflow` + `ag.route`, `is_chat` flag). Ports and adapters in
+  `services/oss/src/agent_pi/`: `Harness` port + `PiHarness` (spawns the wrapper over the
+  JSON transport), `Runtime` port + `LocalRuntime` (local subprocess; Daytona slots in
+  here later).
+- Standalone entrypoint: `services/entrypoints/agent_main.py` mounts only the agent app +
+  `/health` for isolated local runs.
+
+How to run and verify locally:
+
+```bash
+cd services/agent && pnpm install            # once
+cd ../ && set -a && source ../.env.test.local && set +a
+AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED=false \
+  uv run uvicorn entrypoints.agent_main:app --host 0.0.0.0 --port 8090
+
+curl -s -X POST http://localhost:8090/agent/v0/invoke -H "Content-Type: application/json" \
+  -d '{"data":{"inputs":{"messages":[{"role":"user","content":"Hi, who are you?"}]}}}'
+# -> {"data":{"outputs":{"role":"assistant","content":"Hi! I'm your friendly hello-world AI assistant."}}, "status":{"code":200}, ...}
+```
+
+## Dockerized (verified by curl)
+
+The agent now runs fully in Docker via a dedicated, self-contained compose that does not
+touch other stacks. Two containers:
+
+- `agent-pi`: the TypeScript Pi wrapper as an HTTP sidecar
+  (`services/agent/src/server.ts`, `docker/Dockerfile.dev`). It copies the read-only
+  mounted `~/.pi/agent` login into a writable container path at startup, so OAuth refresh
+  never writes back to the host. `node_modules` is baked into the image; `src` is
+  bind-mounted so `tsx watch` hot-reloads code edits. Adding npm deps needs a rebuild.
+- `agent-api`: the Python agent service, built from the current services dev Dockerfile
+  (`agenta-agent-api:dev`, a dedicated tag). Selects the HTTP harness via
+  `AGENTA_AGENT_PI_URL` and calls the sidecar in-network. Published on host port 8092.
+
+The Python -> Pi seam is now two adapters behind the same Harness port: `PiHarness`
+(subprocess, local) and `PiHttpHarness` (HTTP, docker). `agent.py` picks by env.
+
+Run and verify:
+
+```bash
+docker compose -f services/agent/docker-compose.agent.yml up --build -d
+curl localhost:8092/health
+curl -s -X POST localhost:8092/agent/v0/invoke -H 'Content-Type: application/json' \
+  -d '{"data":{"inputs":{"messages":[{"role":"user","content":"Hi, who are you?"}]}}}'
+# -> 200, {"data":{"outputs":{"role":"assistant","content":"Hello from your friendly Docker agent!"}}, ...}
+docker compose -f services/agent/docker-compose.agent.yml down   # tear down
+```
+
+Note: do not reuse the stale `agenta-oss-dev-services:latest` image (Python 3.11, old SDK
+without `route(app=...)`); the compose builds a fresh `agenta-agent-api:dev` from the
+current Dockerfile instead.
+
+Known gaps / next steps: the auth header is bypassed for local curl; streaming, multi-message
+output, and tools are not implemented; tracing across the boundary is being wired in (OTel deps
+and `agenta-otel.ts` in the wrapper, `TraceContext` in the ports), but the HTTP path and OTLP
+target still need finishing; `agenta:builtin:agent:v0` must still be registered as a real workflow
+type + template (WP-6), and a real dev stack pointed at the sidecar so it runs from the playground.
+
+---
+
+Status: draft for review. Add inline comments anywhere. Decision points are marked
+**[DECISION]** and have a recommended default.
+
+## Context
+
+Agenta runs prompt-style workflows today (completion, chat, LLM-as-a-judge). Each is a
+Python FastAPI app exposing `/invoke` and `/inspect`, all mounted in one `services`
+container (`services/entrypoints/main.py`). The backend and playground call a service by
+POSTing a `WorkflowInvokeRequest` to `{serviceUrl}/invoke` and reading
+`WorkflowBatchResponse.data.outputs` back.
+
+WP-2 adds a new kind of workflow: an agent. An agent runs a harness (Pi by default) that
+drives a model over multiple turns. Pi is a TypeScript/Node SDK
+(`@earendil-works/pi-coding-agent`, pinned `0.79.4`). It has no Python SDK. So the agent
+service is a Node service, the first non-Python service in the dev stack.
+
+This work package builds only the service. It runs Pi locally (no Daytona), with hardcoded
+config (AGENTS.md text, model, provider key from env). The goal is to stand up the right
+ports and adapters even for the simplest MVP, so Daytona and other harnesses slot in later
+without reshaping the service.
+
+Source: `wp-2-agent-service/README.md` and the research it links
+(`research/pi-interaction.md`, `research/diskless-in-memory-config.md`).
+
+## What I confirmed in the codebase
+
+- All Python services run in one `services` container, each mounted at its own path and
+  exposing `/invoke` + `/inspect` (`services/entrypoints/main.py:135`).
+- The chat handler takes `inputs`, `messages`, and `parameters`
+  (`services/oss/src/chat.py:18`). The routing decorator pulls these from the
+  `WorkflowInvokeRequest` envelope.
+- The playground resolves `serviceUrl` from the workflow's `data.url` (or builds it from
+  `data.uri`) and POSTs directly from the browser to `{serviceUrl}/invoke`
+  (`web/packages/agenta-entities/src/workflow/state/runnableSetup.ts:246`). So the service
+  needs the same request/response shapes and CORS as the Python services
+  (`services/entrypoints/main.py:115`).
+- The dev stack hot-reloads via bind mounts plus uvicorn `--reload`, and traefik routes
+  `PathPrefix(/services/)` after stripping the prefix
+  (`hosting/docker-compose/oss/docker-compose.dev.yml:351`).
+- Research confirms Pi runs fully diskless through its SDK: in-memory auth, AGENTS.md,
+  model, and sessions (`research/diskless-in-memory-config.md`).
+
+## Scope
+
+In:
+- A new Node/TypeScript service that exposes the Agenta `/invoke` contract directly.
+- Drives Pi through its SDK (`createAgentSession`) in-process, config in memory.
+- Hardcoded config: AGENTS.md text, model id, provider key from env. Config read from a
+  mounted file so it is editable and hot-reloads.
+- Ports and adapters wired from the start (see Architecture).
+- Dockerized with hot-reload, wired into the OSS dev compose and traefik.
+
+Out (later WPs, per the design doc):
+- Daytona sandbox. The runtime adapter is the local process for now.
+- Streaming and multi-message output. This cut returns the final assistant text as a
+  single `data.outputs`.
+- Custom tools and skills. Stubbed for the first cut.
+- Server-side config persistence. Config is passed in at startup.
+- Other harnesses (Codex, Claude Code). Design the port for them, implement only Pi.
+
+## Architecture: ports and adapters
+
+The service is harness-agnostic at its core, with the two ports the design doc calls out.
+
+```
+HTTP layer (Fastify or Express): POST /invoke, POST /inspect, GET /health, CORS
+        |
+Core (no Pi, no Daytona):
+   AgentRunner.run(config, messages, inputs) -> { output }
+        |                                  |
+   Port: Harness                      Port: Runtime (environment)
+   setup(config)                      start() / shutdown()
+   invoke(messages, inputs)           pause() / connectVolume()
+   stop() / shutdown()
+        |                                  |
+   Adapter: PiSdkHarness              Adapter: LocalRuntime
+   (createAgentSession,              (in-process; the Node process
+    in-memory auth + AGENTS.md         itself is the run environment)
+    + model, SessionManager
+    .inMemory())                      [later: DaytonaRuntime in WP-3]
+   [later: PiRpcHarness]
+```
+
+- Harness port: the seam between our service and the agent engine. Pi is one
+  implementation. The MVP ships one adapter, `PiSdkHarness`. The doc also floats RPC and
+  JSON adapters; the port shape leaves room for `PiRpcHarness` later.
+  **[DECISION]** Drive Pi via the SDK in-process for the MVP (recommended: simplest for a
+  Node service, gives in-memory auth + AGENTS.md + model), rather than spawning `pi --mode
+  rpc`.
+- Runtime port: the seam for the run environment (start, shutdown, pause, connect volume).
+  The MVP adapter is `LocalRuntime` (the Node process). `DaytonaRuntime` lands in WP-3
+  behind the same port.
+
+### PiSdkHarness (the MVP adapter)
+
+Per `research/diskless-in-memory-config.md`:
+- `AuthStorage.inMemory()` + `setRuntimeApiKey(provider, key)` for the LLM key.
+- `DefaultResourceLoader` with `noContextFiles: true` and `agentsFilesOverride` (or
+  `systemPromptOverride`) to inject AGENTS.md text in memory.
+- `SessionManager.inMemory()`, `SettingsManager.inMemory()`,
+  `ModelRegistry.inMemory(auth)` so nothing persists.
+- `model: getModel(provider, modelId)`.
+- `TMPDIR` set to a tmpfs for Pi's bash output spillover (the one forced write).
+- MVP run: `await session.prompt(text)`, then read the final assistant text from
+  `session.messages` (or the `agent_end` event). Return it as `data.outputs`. No
+  streaming.
+
+## HTTP contract (mirror chat)
+
+- `POST /invoke`: accept `{ data: { parameters, inputs }, references?, ... }`. Pull the
+  user message from `inputs`/`messages` the way chat does
+  (`services/oss/src/chat.py:18`). Return
+  `{ version, data: { outputs }, status: { code: 200 }, trace_id, span_id }`.
+- `POST /inspect`: return the parameters/inputs schema. The MVP can return a minimal
+  static schema, enough for the backend inspect path.
+- `GET /health`: `{ status: "ok" }`.
+- CORS: allow the same origins as the Python services so the browser can call it directly.
+
+Auth note: the Python services verify an `Authorization: Secret {token}` header via SDK
+middleware. The local MVP can accept the header without verifying it. Real verification is
+a later concern. Flagging this as a known gap.
+
+## Repo placement and Docker
+
+- New Node project at `services/agent/`: own `package.json`, `tsconfig.json`, `src/` (with
+  `http/`, `core/`, `adapters/pi/`, `adapters/runtime/`), `config/` (the editable
+  AGENTS.md and model config), and `docker/Dockerfile.dev` + `docker/Dockerfile.gh`.
+- Pin `@earendil-works/pi-coding-agent@0.79.4` and `@earendil-works/pi-ai@0.79.4`.
+- Hot-reload: run with `tsx watch` (or `node --watch`). Bind-mount `services/agent/src` and
+  `services/agent/config`; keep `node_modules` in the image via an anonymous volume so the
+  host/container split does not break it.
+- New compose service block in `hosting/docker-compose/oss/docker-compose.dev.yml` (model
+  the existing `services` block at line 351). Own port (for example 8090), traefik router
+  `PathPrefix(/agent/)` that strips the prefix, env_file for the provider key.
+- The provider key (for example `OPENAI_API_KEY`) goes in the dev env file the compose
+  service reads.
+
+## Verification
+
+1. Bring up the OSS dev stack with the new service:
+   `./hosting/docker-compose/run.sh --oss --dev --build`.
+2. `curl http://localhost/agent/health` returns ok.
+3. `curl -X POST http://localhost/agent/invoke` with a chat-style body and a message;
+   confirm the response carries the agent reply in `data.outputs`. This is the core WP-2
+   definition of done.
+4. Edit `services/agent/config/AGENTS.md`; confirm the change is picked up without a
+   rebuild.
+5. End-to-end demo (only if decided in scope below): register an agent workflow whose
+   `data.url` points at the agent service, open it in the playground, send a message, see
+   the output.
+
+## Decisions to confirm
+
+**[DECISION 1] Service shape.** Recommended: a pure Node service that speaks `/invoke`
+directly (matches the doc, fewest moving parts). Alternative: a Python shim in the existing
+services container that bridges to a Node Pi sidecar (reuses Agenta auth/tracing
+middleware, adds a hop).
+> Your call: Use Python and have it call TS for the moment. The Python service provides authentication, middleware, and other shared functionality.
+
+**[DECISION 2] How far this iteration goes.** Option A: standalone service, verified by
+curl (the true WP-2 definition of done). Option B: also wire the minimal end-to-end so you
+can create an agent and run it in the playground (overlaps WP-6's workflow-type
+registration).
+> Your call: Let's start with the standalone service verified by curl
+
+**[DECISION 3] LLM key for Pi.** `.env.test.local` only has Agenta cloud creds, not a model
+key. Pi needs a real provider key to run. Which provider and model for the hardcoded
+"hello world" agent (for example OpenAI `gpt-4o-mini`)? Can you supply the key as an env
+var for a live verification, or should I build without live verification for now?
+> Your call: I have set up auth via a local Pi login (`openai-codex` / `gpt-5.5`).
+
+**[DECISION 4] Pi driving mode.** Recommended: SDK in-process. Alternative: `pi --mode rpc`
+subprocess. SDK is simpler here and supports in-memory auth and AGENTS.md.
+> Your call:
+I have set up auth. What's left — your one-time Pi login:
+`~/.pi/agent` doesn't exist yet, so no model is available. Pi can't reuse the `~/.codex` token directly; it needs its own login (same ChatGPT account, browser OAuth — I can't drive that for you):
+
+```bash
+cd docs/design/agent-workflows/wp-1-pi-tracing/poc
+pnpm exec pi          # TUI opens
+# type:  /login  →  choose "ChatGPT Plus/Pro (Codex)"  →  finish browser OAuth  →  quit
+pnpm start            # runs the agent, exports the trace
+```
+
+(Or `export OPENAI_API_KEY=...` / `ANTHROPIC_API_KEY=...` instead of logging in.)
+
+After `pnpm start`, watch for `[agenta-otel] exporting spans to .../api/otlp/v1/traces` and `[run] flushed`, then open Agenta observability on the dev box and find the `invoke_agent` trace — verify the tree types correctly and the `chat` span carries model, latency, and token usage.
+
+Want me to wait while you log in, then I'll run it and verify the trace in Agenta together — or would you rather I add the Pi-native model-usage cost (`gen_ai.usage.cost`) display check to the verification while you do that?
+
+
+ Logged in to ChatGPT Plus/Pro (Codex Subscription). Selected gpt-5.5. Credentials saved
+ to /home/mahmoud/.pi/agent/auth.json
diff --git a/docs/design/agent-workflows/wp-3-daytona-sandbox/README.md b/docs/design/agent-workflows/wp-3-daytona-sandbox/README.md
new file mode 100644
index 0000000000..89a775f7e0
--- /dev/null
+++ b/docs/design/agent-workflows/wp-3-daytona-sandbox/README.md
@@ -0,0 +1,99 @@
+# WP-3: Daytona sandbox running Pi
+
+Status: **POC complete** against Daytona cloud (`target=eu`). See
+[`poc/`](poc/README.md). Ran in parallel with WP-1 and WP-2.
+
+## Goal
+
+Prove the sandbox track end to end: create a Daytona sandbox with Pi installed, inject the
+agent's files and secrets, run an agent, stream the output back, and tear down. This takes
+the local Pi wrapper (WP-2) and shows it running inside a sandbox. The two can be developed
+in parallel, since the Daytona lifecycle and image work do not depend on the wrapper being
+finished.
+
+## What the POC established
+
+The POC ([`poc/`](poc/README.md)) does the full loop against Daytona cloud and answers the
+key unknowns:
+
+- **Bake Pi into a snapshot.** `build_snapshot.py` builds `agenta-pi-harness` from
+  `node:22-bookworm` + Pi `0.79.4` + ripgrep/fd in ~26s. Daytona injects its toolbox daemon
+  into the custom image, so `process.exec` / `fs` / sessions work on a plain node base (no
+  need to layer on `daytonaio/sandbox`).
+- **Cold start is about one second on a warm runner.** Creating a sandbox from the prebuilt
+  snapshot takes ~0.7-1.1s on a warm runner, with an occasional few-second spike when a runner
+  pulls the custom image cold. That beats installing Pi per run (npm install alone is ~3s).
+- **Inject config + secret, run, stream, tear down.** `run_agent.py` lays an `AGENTS.md`
+  and a task file into a per-run dir, injects the provider credential (env var or uploaded
+  credential file), runs Pi headless in `--mode json`, streams the typed event lines, and
+  deletes the sandbox. The agent honored the injected `AGENTS.md` and used tools
+  (`read`, `read`, `write`).
+- **Gotcha: Pi blocks on a trust prompt.** With an `AGENTS.md` in cwd, Pi asks to trust
+  project-local files and hangs in a non-interactive session. Pass `--approve` and run with
+  stdin from `/dev/null`. This was the main trap.
+
+Full findings, the measured numbers, and how to run it: [`poc/README.md`](poc/README.md).
+
+## Scope
+
+In:
+
+- Create a Daytona sandbox from the Python SDK (`pip install daytona`,
+  `Daytona` / `AsyncDaytona`): `create` -> `process.exec` / sessions -> `stop` -> `delete`.
+- Bake Pi into a Daytona snapshot (declarative `Image` builder or Dockerfile) so runs skip
+  per-run `npm install`. Pre-install `rg` / `fd`.
+- Inject files (`fs.upload_file` / `upload_files`) and secrets (`env_vars` at create, or
+  per-exec `env`).
+- Run Pi headless and stream stdout/stderr back (session with `run_async=True`,
+  `get_session_command_logs_async`).
+- Expose and use the port via `get_preview_link(port)` (the "works with our port" contract).
+- One shared long-lived sandbox (`auto_stop_interval: 0`), per-run working directory plus a
+  per-run tmpfs for `TMPDIR`, bounded concurrency.
+
+Out:
+
+- Volume-per-execution. Not feasible in Daytona (volumes mount at create time only); use the
+  per-run dir + tmpfs approach instead.
+- The provider abstraction for non-Daytona sandboxes. Keep the seam thin, but only implement
+  Daytona here.
+
+## Approach (grounded in research)
+
+See [`../research/daytona-sandbox.md`](../research/daytona-sandbox.md) and
+[`../research/sandbox-sharing.md`](../research/sandbox-sharing.md).
+
+## Definition of done
+
+- [x] A script creates a sandbox from a Pi snapshot, injects an AGENTS.md and a provider
+  key, runs an agent, streams the multi-message output, and tears down cleanly.
+- [x] Nothing invocation-specific is written to a persistent volume. No volume is mounted;
+  each run uses a per-run dir plus a `TMPDIR` inside it, and the sandbox is deleted at the
+  end.
+- [x] Cold-start with the custom snapshot is measured and recorded (`poc/README.md`).
+
+## Open questions
+
+Answered by the POC:
+
+- Daytona cloud works end to end with the provided `eu` credentials; the node-base snapshot
+  gets a working toolbox; cold start from the prebuilt snapshot is ~0.7-1.1s on a warm runner.
+- Secret injection has two working paths: `env_vars` at create (secret-as-env) and an
+  uploaded credential file via `fs.upload_file` (secret-as-file).
+
+Still open:
+
+- Self-hosted Daytona vs Daytona cloud (AGPL review if self-host-and-modify). POC used
+  cloud only.
+- Whether an actively streaming session resets the auto-stop idle timer. Sidestepped with
+  `auto_stop_interval=0` and owning the lifecycle; not independently confirmed.
+- Realistic safe parallel-run count for one small sandbox (needs load testing).
+- The snapshot build/version pipeline: who builds and pins `agenta-pi-harness` per agent
+  revision, and where that runs (CI or config-publish time).
+
+## Links
+
+- [`poc/`](poc/README.md) — the working POC (build snapshot, run agent, bench cold start)
+- [`../research/daytona-sandbox.md`](../research/daytona-sandbox.md)
+- [`../research/sandbox-sharing.md`](../research/sandbox-sharing.md)
+- [`../research/diskless-in-memory-config.md`](../research/diskless-in-memory-config.md)
+- [Project README](../README.md)
diff --git a/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/README.md b/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/README.md
new file mode 100644
index 0000000000..452322d858
--- /dev/null
+++ b/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/README.md
@@ -0,0 +1,118 @@
+# WP-3 POC: run a Pi agent in a Daytona cloud sandbox
+
+Bakes Pi into a Daytona snapshot, then creates a sandbox from it, injects the agent's
+credential and config, runs the agent headless, streams its multi-message output back,
+and tears the sandbox down. Runs against **Daytona cloud** (`target=eu`).
+
+This is the sandbox half of the agent runtime. It validates the `DaytonaRuntime` adapter
+that WP-2 leaves room for behind its `Runtime` port (`start` -> create sandbox, inject
+config -> lay down the per-run dir, `invoke` -> run Pi and stream, `shutdown` -> delete).
+
+## What's here
+
+- `build_snapshot.py` — bake Pi (+ ripgrep, fd) into the reusable `agenta-pi-harness`
+  snapshot so per-run cold start skips `npm install`. Run once.
+- `run_agent.py` — the deliverable. Create -> inject -> run -> stream -> tear down.
+- `bench_coldstart.py` — measure cold start, Pi snapshot vs the default image.
+- `cleanup.py` — list sandboxes and delete leaked WP-3 ones.
+
+## Setup
+
+Needs `uv` and Daytona cloud credentials. Export them (the dev values live in
+`hosting/docker-compose/ee/.env.ee.dev.local`):
+
+```bash
+export DAYTONA_API_KEY=dtn_...
+export DAYTONA_API_URL=https://app.daytona.io/api
+export DAYTONA_TARGET=eu
+```
+
+Each script declares its own deps inline, so `uv run <script>.py` is enough.
+
+### Build the snapshot (once)
+
+```bash
+uv run build_snapshot.py        # ~26s; idempotent, pass --force to rebuild
+```
+
+### Provider credential
+
+The agent needs a model. `run_agent.py` supports two injection paths:
+
+- `--auth codex` (default): uploads your local Pi ChatGPT login
+  (`~/.pi/agent/auth.json`) into the sandbox and runs on `openai-codex/gpt-5.5`. This is
+  the **secret-as-file** path and needs no paid key. Log in once locally with `pi` then
+  `/login` -> "ChatGPT Plus/Pro (Codex)".
+- `--auth anthropic|openai|google`: injects the matching `*_API_KEY` env var into the
+  sandbox (`env_vars`) and runs on that provider. This is the **secret-as-env** path.
+
+## Run
+
+```bash
+uv run run_agent.py                       # codex / gpt-5.5
+uv run run_agent.py --auth anthropic      # needs ANTHROPIC_API_KEY with credit
+uv run run_agent.py --keep                # leave the sandbox up for debugging
+```
+
+The agent reads a task file and an injected `AGENTS.md`, then writes `greeting.txt`. A
+clean run streams `[tool] read`, `[tool] read`, `[tool] write`, prints the reconstructed
+multi-message transcript and the file the agent produced, then deletes the sandbox.
+
+## What the POC proves (definition of done)
+
+- A script creates a sandbox from a Pi snapshot, injects an `AGENTS.md` and a provider
+  credential, runs an agent, streams the multi-message output, and tears down cleanly.
+  **Done.** The agent honored the injected `AGENTS.md` (it signed the file `-- signed,
+  Pip`, an instruction that exists only in the injected file) and used the `read`/`write`
+  tools to do the task.
+- Nothing invocation-specific is written to a persistent volume. **Done.** No Daytona
+  volume is mounted. Each run gets `/home/daytona/runs/<id>/` for its config, session
+  file, and `TMPDIR` (Pi's only forced write, the bash output spillover). The sandbox is
+  deleted at the end.
+- Cold start with the custom snapshot is measured and recorded. **Done.** See below.
+
+## Cold start (measured)
+
+`create()` to `STARTED`, Daytona cloud `eu`:
+
+| snapshot              | min   | mean  | max   | notes                                  |
+| --------------------- | ----- | ----- | ----- | -------------------------------------- |
+| `agenta-pi-harness`   | 0.7s  | ~1s   | 4.9s  | sub-1.1s warm; spikes when a runner pulls the custom image cold |
+| `daytona-small`       | 0.66s | 0.86s | 1.06s | steadier; the base image is pre-cached on every runner |
+
+The prebuilt Pi snapshot lands in the same ~0.7-1.1s range as the stock image on a warm
+runner, and occasionally pays a one-time image-pull penalty (a few seconds) on a cold
+runner. Both beat installing Pi at runtime, where `npm install` alone is ~3s every run.
+The agent task itself (gpt-5.5, read + read + write) ran in ~11s.
+
+## Findings and gotchas
+
+- **The node base image works.** Daytona injects its toolbox daemon into the custom image,
+  so `process.exec` / `fs` / sessions work on a `node:22-bookworm` base. No need to layer
+  on `daytonaio/sandbox`. Inside: `USER=root`, `HOME=/root`, `PWD=/home/daytona`.
+- **Pi blocks on a trust prompt without `--approve`.** With an `AGENTS.md` in the working
+  dir, Pi asks to trust project-local files and hangs in a non-interactive session. Pass
+  `--approve` (and run with stdin from `/dev/null`) so headless runs never stall. This was
+  the single biggest gotcha; the run looked hung when it was waiting on a prompt.
+- **Streaming maps cleanly.** Run Pi as `--mode json`; each stdout line is one typed
+  event. `get_session_command_logs_async(session_id, cmd_id, on_stdout, on_stderr)` streams
+  them live. The `agent_end` event carries the full `messages[]` array (the multi-message
+  output); `message_end` events carry per-message token usage and cost. Chunks are not
+  line-aligned, so buffer and split on `\n`.
+- **`SessionExecuteRequest` has no `env`/`cwd`.** Only the one-shot `process.exec` does. For
+  the streaming session path, inject the key via `env_vars` at create time and `cd` into
+  the per-run dir inside the command string.
+- **All three stored API keys are dead** (Anthropic: no credit; OpenAI: invalid; Gemini:
+  expired), so the POC defaults to the developer's ChatGPT login. Production needs a real
+  provider key or org credential injected the same way (`env_vars` or a credential file).
+- **Avoid `gpt-5.3-codex-spark` on a ChatGPT login** (it 400s). Use `gpt-5.5` / `gpt-5.4`.
+
+## Open questions answered vs. still open
+
+Answered: prebuilt-snapshot cold start (sub-1.1s warm), the node-base toolbox question
+(works), and the secret-injection path (env var or uploaded credential file).
+
+Still open: realistic safe parallel-run count in one shared sandbox (needs load testing,
+not measured here); whether an actively streaming session resets the auto-stop timer (we
+sidestep it with `auto_stop_interval=0`); and the snapshot build/version pipeline (who
+builds and pins `agenta-pi-harness` per agent revision).
diff --git a/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/bench_coldstart.py b/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/bench_coldstart.py
new file mode 100644
index 0000000000..780ddc0a3a
--- /dev/null
+++ b/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/bench_coldstart.py
@@ -0,0 +1,49 @@
+# /// script
+# requires-python = ">=3.11"
+# dependencies = [
+#   "daytona",
+# ]
+# ///
+"""Measure sandbox cold start from the prebuilt Pi snapshot vs the default
+`daytona-small`, to answer the WP-3 open question. Creates N sandboxes per snapshot
+(serially), times `create()` -> STARTED, then deletes. Prints per-create timings and
+a summary."""
+
+import os
+import statistics
+import time
+
+from daytona import CreateSandboxFromSnapshotParams, Daytona, DaytonaConfig
+
+N = 3
+SNAPSHOTS = ["agenta-pi-harness", "daytona-small"]
+
+daytona = Daytona(
+    DaytonaConfig(
+        api_key=os.environ["DAYTONA_API_KEY"],
+        api_url=os.environ.get("DAYTONA_API_URL", "https://app.daytona.io/api"),
+        target=os.environ.get("DAYTONA_TARGET", "eu"),
+    )
+)
+
+results: dict[str, list[float]] = {}
+for snap in SNAPSHOTS:
+    times: list[float] = []
+    for i in range(N):
+        t = time.monotonic()
+        sb = daytona.create(
+            CreateSandboxFromSnapshotParams(snapshot=snap, auto_stop_interval=0),
+            timeout=120,
+        )
+        dt = time.monotonic() - t
+        times.append(dt)
+        print(f"{snap:20} run {i + 1}/{N}: {dt:.2f}s  state={sb.state}", flush=True)
+        daytona.delete(sb)
+    results[snap] = times
+
+print("\n=== cold-start summary (create -> STARTED) ===")
+for snap, times in results.items():
+    print(
+        f"{snap:20} min={min(times):.2f}s  mean={statistics.mean(times):.2f}s  "
+        f"max={max(times):.2f}s  n={len(times)}"
+    )
diff --git a/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/build_snapshot.py b/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/build_snapshot.py
new file mode 100644
index 0000000000..d990803712
--- /dev/null
+++ b/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/build_snapshot.py
@@ -0,0 +1,95 @@
+# /// script
+# requires-python = ">=3.11"
+# dependencies = [
+#   "daytona",
+# ]
+# ///
+"""
+WP-3 step 1: bake Pi into a reusable Daytona snapshot so per-run cold start skips
+`npm install`. Built from a Node base image with the Pi coding agent and the search
+binaries (rg, fd) Pi expects pre-installed.
+
+Idempotent: skips the build if the snapshot already exists, unless --force is passed
+(which deletes and rebuilds it). Streams the build logs and prints the wall-clock
+build time.
+
+Run:
+    DAYTONA_API_KEY=... DAYTONA_API_URL=... DAYTONA_TARGET=eu \
+        uv run build_snapshot.py [--force]
+"""
+
+import os
+import sys
+import time
+
+from daytona import (
+    CreateSnapshotParams,
+    Daytona,
+    DaytonaConfig,
+    Image,
+    Resources,
+)
+
+SNAPSHOT_NAME = "agenta-pi-harness"
+PI_PACKAGE = "@earendil-works/pi-coding-agent@0.79.4"
+
+
+def client() -> Daytona:
+    return Daytona(
+        DaytonaConfig(
+            api_key=os.environ["DAYTONA_API_KEY"],
+            api_url=os.environ.get("DAYTONA_API_URL", "https://app.daytona.io/api"),
+            target=os.environ.get("DAYTONA_TARGET", "eu"),
+        )
+    )
+
+
+def snapshot_exists(daytona: Daytona, name: str) -> bool:
+    page = daytona.snapshot.list()
+    items = getattr(page, "items", page)
+    return any(getattr(s, "name", None) == name for s in items)
+
+
+def main() -> None:
+    force = "--force" in sys.argv
+    daytona = client()
+
+    if snapshot_exists(daytona, SNAPSHOT_NAME):
+        if not force:
+            print(
+                f"snapshot '{SNAPSHOT_NAME}' already exists; pass --force to rebuild."
+            )
+            return
+        print(f"deleting existing snapshot '{SNAPSHOT_NAME}' to rebuild...")
+        snap = daytona.snapshot.get(SNAPSHOT_NAME)
+        daytona.snapshot.delete(snap)
+
+    # Node base + Pi + search binaries. fd ships on Debian as `fdfind`; Pi looks for
+    # `fd`, so symlink it. --ignore-scripts matches the Pi README's install guidance.
+    image = (
+        Image.base("node:22-bookworm")
+        .run_commands(
+            "apt-get update && apt-get install -y --no-install-recommends ripgrep fd-find && rm -rf /var/lib/apt/lists/*",
+            "ln -sf $(command -v fdfind) /usr/local/bin/fd",
+            f"npm install -g --ignore-scripts {PI_PACKAGE}",
+            "pi --version || true",
+        )
+        .workdir("/home/daytona")
+    )
+
+    print(f"building snapshot '{SNAPSHOT_NAME}' (this builds + pushes an image)...")
+    started = time.monotonic()
+    daytona.snapshot.create(
+        CreateSnapshotParams(
+            name=SNAPSHOT_NAME,
+            image=image,
+            resources=Resources(cpu=2, memory=4, disk=8),
+        ),
+        on_logs=print,
+    )
+    elapsed = time.monotonic() - started
+    print(f"\nsnapshot '{SNAPSHOT_NAME}' built in {elapsed:.1f}s")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/cleanup.py b/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/cleanup.py
new file mode 100644
index 0000000000..341bccd3e8
--- /dev/null
+++ b/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/cleanup.py
@@ -0,0 +1,43 @@
+# /// script
+# requires-python = ">=3.11"
+# dependencies = [
+#   "daytona",
+# ]
+# ///
+"""List all sandboxes; delete live WP-3-labeled ones, or only the ids passed as arguments.
+
+Run:
+    DAYTONA_API_KEY=... DAYTONA_API_URL=... DAYTONA_TARGET=eu uv run cleanup.py [sandbox_id ...]
+"""
+
+import os
+import sys
+
+from daytona import Daytona, DaytonaConfig
+
+daytona = Daytona(
+    DaytonaConfig(
+        api_key=os.environ["DAYTONA_API_KEY"],  # required: KeyError if unset
+        api_url=os.environ.get("DAYTONA_API_URL", "https://app.daytona.io/api"),
+        target=os.environ.get("DAYTONA_TARGET", "eu"),
+    )
+)
+
+ids = sys.argv[1:]  # explicit sandbox ids; when given, label-based cleanup is skipped
+boxes = list(daytona.list())
+print(f"{len(boxes)} sandbox(es):")
+for b in boxes:
+    labels = getattr(b, "labels", {}) or {}  # tolerate a missing or None labels attribute
+    is_wp3 = labels.get("agenta-wp") == "wp-3"
+    print(f"  id={b.id} state={b.state} labels={labels}")
+    if b.id in ids or (  # an explicitly named id is deleted regardless of label or state
+        not ids
+        and is_wp3  # NOTE(review): state test below relies on str(enum) == "SandboxState.X"
+        and str(b.state) not in ("SandboxState.ARCHIVED", "SandboxState.DELETED")
+    ):
+        print(f"    -> deleting {b.id}")
+        try:
+            daytona.delete(b)
+            print("    deleted.")
+        except Exception as e:  # noqa: BLE001 - best effort: report and continue with the next one
+            print(f"    delete failed: {e}")
diff --git a/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/run_agent.py b/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/run_agent.py
new file mode 100644
index 0000000000..4e2f63ab59
--- /dev/null
+++ b/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/run_agent.py
@@ -0,0 +1,325 @@
+# /// script
+# requires-python = ">=3.11"
+# dependencies = [
+#   "daytona",
+# ]
+# ///
+"""
+WP-3 deliverable: run a Pi agent inside a Daytona cloud sandbox end to end.
+
+Steps, matching the WP-3 definition of done:
+  1. Create a sandbox from the prebuilt `agenta-pi-harness` snapshot (Pi baked in).
+     Time the cold start.
+  2. Inject the provider credential (see "Auth" below) and lay the agent's config
+     into a per-run working directory: AGENTS.md (the agent's instructions) plus a
+     task input file. Nothing is written to a persistent volume; the per-run dir is
+     the isolation unit (sandbox-sharing research), and TMPDIR is pinned inside it so
+     bash spillover stays contained.
+  3. Run Pi headless in `--mode json` inside a Daytona session and stream the JSON
+     event lines back live.
+  4. Reconstruct the multi-message output (assistant text + tool calls/results) and
+     token usage from the streamed events.
+  5. Tear down: delete the session and the sandbox.
+
+Auth (PI_AUTH env or --auth):
+  - codex (default): upload the developer's Pi ChatGPT login (~/.pi/agent/auth.json)
+    into the sandbox and run on openai-codex/gpt-5.5. This is the secret-as-file
+    injection path and is what works without a paid provider key.
+  - anthropic | openai | google: inject the matching *_API_KEY env var into the
+    sandbox (env_vars) and run on that provider. This is the secret-as-env path.
+
+Run:
+    DAYTONA_API_KEY=... DAYTONA_API_URL=... DAYTONA_TARGET=eu \
+        uv run run_agent.py [--keep] [--auth codex|anthropic|openai|google] [--model ID]
+
+  --keep   leave the sandbox running (skip teardown) for debugging
+  --auth   credential strategy (default: codex; --auth-json PATH overrides the login file)
+  --model  override the model id
+"""
+
+import asyncio
+import json
+import os
+import sys
+import time
+import uuid
+from pathlib import Path
+
+from daytona import (
+    AsyncDaytona,
+    CreateSandboxFromSnapshotParams,
+    DaytonaConfig,
+    SessionExecuteRequest,
+)
+
+SNAPSHOT_NAME = "agenta-pi-harness"
+
+# --auth value -> (default model id, API-key env var name; None = file-based auth)
+PROVIDERS = {
+    "anthropic": ("claude-sonnet-4-5", "ANTHROPIC_API_KEY"),
+    "openai": ("gpt-4o-mini", "OPENAI_API_KEY"),
+    "google": ("gemini-2.0-flash", "GEMINI_API_KEY"),
+    "codex": ("gpt-5.5", None),  # openai-codex, auth via uploaded auth.json
+}
+
+# The agent's instructions. Pi auto-discovers AGENTS.md from the working dir, so a
+# behavioural marker here ("sign off as Pip") proves the injected config is honored.
+AGENTS_MD = """\
+# Greeter agent
+
+You are a terse assistant running in a sandbox.
+
+- Do exactly what the task file asks, nothing more.
+- Always end any file you create with a final line: `-- signed, Pip`
+"""
+
+# A task the agent must read with a tool, then act on. Forces a read -> write tool
+# sequence, which exercises the multi-message output path.
+TASK_TXT = """\
+TODO: greet the user by name (use "Mahmoud")
+TODO: state the current working directory
+TODO: add a one-line haiku about sandboxes
+Write the result to greeting.txt.
+"""
+
+PROMPT = (
+    "Read task.txt in the current directory and carry out every TODO in it. "
+    "Follow the instructions in AGENTS.md."
+)
+
+
+def log(msg: str) -> None:  # status line to stdout
+    print(msg, flush=True)  # flush so status lines stay ordered with directly written deltas
+
+
+class EventCollector:
+    """Parses Pi's --mode json event stream into a multi-message output."""
+
+    def __init__(self) -> None:
+        self.buffer = ""
+        self.session_id: str | None = None
+        self.messages: list[dict] = []  # final messages[] from agent_end
+        self.usage: dict | None = None
+        self.tool_calls: list[str] = []
+        self.error: str | None = None
+
+    def feed_stdout(self, chunk: str) -> None:
+        self.buffer += chunk
+        while "\n" in self.buffer:
+            line, self.buffer = self.buffer.split("\n", 1)
+            if line.strip():
+                self._handle_line(line.strip())
+
+    def feed_stderr(self, chunk: str) -> None:
+        text = chunk.rstrip()
+        if text:
+            log(f"  [stderr] {text}")
+
+    def flush(self) -> None:
+        if self.buffer.strip():
+            self._handle_line(self.buffer.strip())
+            self.buffer = ""
+
+    def _handle_line(self, line: str) -> None:
+        try:
+            ev = json.loads(line)
+        except json.JSONDecodeError:
+            log(f"  [raw] {line[:200]}")
+            return
+
+        etype = ev.get("type")
+        if etype == "session":
+            self.session_id = ev.get("id")
+            log(f"  [session] {self.session_id}")
+        elif etype == "message_update":
+            ame = ev.get("assistantMessageEvent", {})
+            if ame.get("type") == "text_delta":
+                sys.stdout.write(ame.get("delta", ""))
+                sys.stdout.flush()
+            elif ame.get("type") in ("tool_call_start", "tool_start"):
+                name = ame.get("toolName") or ame.get("name", "?")
+                log(f"\n  [tool-call] {name}")
+                self.tool_calls.append(name)
+        elif etype in ("tool_execution_start", "tool_start"):
+            name = ev.get("toolName") or ev.get("name", "?")
+            log(f"\n  [tool] {name}")
+            self.tool_calls.append(name)
+        elif etype == "message_end":
+            msg = ev.get("message", {})
+            if msg.get("usage"):
+                self.usage = msg["usage"]
+            if msg.get("stopReason") == "error":
+                self.error = (msg.get("errorMessage") or "")[:300]
+        elif etype == "agent_end":
+            self.messages = ev.get("messages", [])
+            log("\n  [agent_end]")
+        elif etype == "error":
+            self.error = json.dumps(ev)[:300]
+            log(f"\n  [error] {self.error}")
+
+
+def render_messages(messages: list[dict]) -> str:
+    """Flatten Pi's messages[] into a readable multi-message transcript."""
+    out: list[str] = []
+    for m in messages:
+        role = m.get("role", "?")
+        parts: list[str] = []
+        for c in m.get("content", []):
+            ctype = c.get("type")
+            if ctype == "text":
+                parts.append(c.get("text", ""))
+            elif ctype in ("tool_use", "toolUse"):
+                parts.append(
+                    f"<tool_use {c.get('name')} {json.dumps(c.get('input', {}))[:160]}>"
+                )
+            elif ctype in ("tool_result", "toolResult"):
+                parts.append(f"<tool_result {json.dumps(c.get('content'))[:160]}>")
+            else:
+                parts.append(f"<{ctype}>")
+        out.append(f"[{role}] " + " ".join(p for p in parts if p))
+    return "\n".join(out)
+
+
+def arg(name: str, default: str) -> str:
+    return sys.argv[sys.argv.index(name) + 1] if name in sys.argv else default
+
+
+async def main() -> None:
+    keep = "--keep" in sys.argv
+    auth = arg("--auth", os.environ.get("PI_AUTH", "codex"))  # the flag wins over PI_AUTH
+    if auth not in PROVIDERS:
+        log(f"unknown --auth '{auth}'; choose one of {list(PROVIDERS)}")
+        sys.exit(1)
+    default_model, key_env = PROVIDERS[auth]
+    model = arg("--model", default_model)
+    provider = "openai-codex" if auth == "codex" else auth  # "codex" maps to openai-codex
+
+    # Resolve the credential to inject.
+    env_vars: dict[str, str] = {}
+    auth_json: bytes | None = None
+    if auth == "codex":  # secret-as-file: read the local Pi login now, upload it later
+        auth_path = Path(arg("--auth-json", str(Path.home() / ".pi/agent/auth.json")))
+        if not auth_path.exists():
+            log(f"codex auth requires {auth_path}; run `pi` then `/login` first.")
+            sys.exit(1)
+        auth_json = auth_path.read_bytes()
+    else:  # secret-as-env: forward the provider API key from this process's environment
+        val = os.environ.get(key_env or "", "")
+        if not val:
+            log(f"--auth {auth} requires {key_env} in the environment.")
+            sys.exit(1)
+        env_vars[key_env] = val
+
+    run_id = uuid.uuid4().hex[:12]  # short random id shared by the run dir, session and label
+    run_dir = f"/home/daytona/runs/{run_id}"
+    session_id = f"agenta-run-{run_id}"
+    timings: dict[str, float] = {}
+
+    config = DaytonaConfig(
+        api_key=os.environ["DAYTONA_API_KEY"],
+        api_url=os.environ.get("DAYTONA_API_URL", "https://app.daytona.io/api"),
+        target=os.environ.get("DAYTONA_TARGET", "eu"),
+    )
+
+    async with AsyncDaytona(config) as daytona:
+        log(
+            f"[1/5] creating sandbox from '{SNAPSHOT_NAME}' (provider={provider} model={model})..."
+        )
+        t0 = time.monotonic()
+        sandbox = await daytona.create(
+            CreateSandboxFromSnapshotParams(
+                snapshot=SNAPSHOT_NAME,
+                env_vars=env_vars or None,
+                auto_stop_interval=0,  # own the lifecycle; no idle auto-stop
+                labels={"agenta-wp": "wp-3", "run-id": run_id},
+            ),
+            timeout=120,
+        )
+        timings["cold_start_s"] = time.monotonic() - t0
+        log(f"      sandbox {sandbox.id} ready in {timings['cold_start_s']:.2f}s")
+
+        try:
+            log(f"[2/5] injecting credential + AGENTS.md + task.txt into {run_dir} ...")
+            await sandbox.fs.create_folder(run_dir, "755")
+            await sandbox.fs.create_folder(f"{run_dir}/tmp", "777")
+            await sandbox.fs.upload_file(AGENTS_MD.encode(), f"{run_dir}/AGENTS.md")
+            await sandbox.fs.upload_file(TASK_TXT.encode(), f"{run_dir}/task.txt")
+            if auth_json is not None:
+                # Secret-as-file: drop the Pi login where Pi looks for it ($HOME=/root).
+                await sandbox.fs.create_folder("/root/.pi/agent", "700")
+                await sandbox.fs.upload_file(auth_json, "/root/.pi/agent/auth.json")
+                await sandbox.fs.set_file_permissions(
+                    "/root/.pi/agent/auth.json", mode="600"
+                )
+
+            log("[3/5] running Pi headless (--mode json), streaming events:\n")
+            # --approve trusts the project-local AGENTS.md so Pi does not block on an
+            # interactive trust prompt. stdin from /dev/null guards against any other
+            # read. cwd is the per-run dir so AGENTS.md/task.txt are discovered.
+            pi_cmd = (
+                f"cd {run_dir} && TMPDIR={run_dir}/tmp "
+                f"pi -p {json.dumps(PROMPT)} "  # NOTE(review): JSON quoting, not shlex.quote
+                f"--mode json --approve --provider {provider} --model {model} "  # NOTE(review): unquoted argv
+                f"-t read,bash,edit,write,ls "
+                f"--session-dir {run_dir}/.pi-sessions --name {session_id} "
+                f"< /dev/null"
+            )
+
+            await sandbox.process.create_session(session_id)
+            collector = EventCollector()
+            t1 = time.monotonic()
+            resp = await sandbox.process.execute_session_command(
+                session_id,
+                SessionExecuteRequest(command=pi_cmd, run_async=True),
+            )
+            cmd_id = resp.cmd_id
+            await sandbox.process.get_session_command_logs_async(  # presumably returns at end of stream
+                session_id,
+                cmd_id,
+                collector.feed_stdout,
+                collector.feed_stderr,
+            )
+            collector.flush()  # parse a final line that had no trailing newline
+            timings["agent_run_s"] = time.monotonic() - t1
+
+            info = await sandbox.process.get_session_command(session_id, cmd_id)
+            exit_code = getattr(info, "exit_code", None)  # reported in the summary only
+
+            log("\n\n[4/5] reconstructed multi-message output:")
+            log("-" * 64)
+            log(render_messages(collector.messages) or "(no messages)")
+            log("-" * 64)
+
+            try:
+                produced = await sandbox.process.exec(
+                    f"cat {run_dir}/greeting.txt", timeout=30
+                )
+                log("\ngreeting.txt produced by the agent:")
+                log(getattr(produced, "result", str(produced)))
+            except Exception as e:  # noqa: BLE001
+                log(f"(could not read greeting.txt: {e})")
+
+            log("\nsummary:")
+            log(f"  pi session id : {collector.session_id}")
+            log(f"  daytona run id: {run_id}")
+            log(f"  exit code     : {exit_code}")
+            log(f"  tool calls    : {collector.tool_calls}")
+            log(f"  token usage   : {collector.usage}")
+            log(f"  error         : {collector.error}")
+            log(f"  cold start    : {timings['cold_start_s']:.2f}s")
+            log(f"  agent run     : {timings['agent_run_s']:.2f}s")
+        finally:
+            if keep:
+                log(f"\n[5/5] --keep set; leaving sandbox {sandbox.id} running.")
+            else:
+                log(f"\n[5/5] tearing down session + sandbox {sandbox.id} ...")
+                try:
+                    await sandbox.process.delete_session(session_id)
+                except Exception:  # noqa: BLE001 - best effort: the sandbox is deleted next regardless
+                    pass
+                await daytona.delete(sandbox)
+                log("      deleted.")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/docs/design/agent-workflows/wp-4-multi-message-output/README.md b/docs/design/agent-workflows/wp-4-multi-message-output/README.md
new file mode 100644
index 0000000000..cac5894e47
--- /dev/null
+++ b/docs/design/agent-workflows/wp-4-multi-message-output/README.md
@@ -0,0 +1,55 @@
+# WP-4: Multi-message output shape
+
+Status: not started. Feeds the interface in WP-2.
+
+## Goal
+
+Define how an agent's multi-message output is represented, streamed, stored, and surfaced,
+and how it maps onto Agenta, whose existing workflows (completion, chat) return a single
+output. This is a design and investigation task, not a service build.
+
+## Scope
+
+In:
+
+- The output schema: an agent run returns a list of messages, each with content blocks
+  (text, image, tool calls / results), not one completion.
+- The streaming contract: how partial messages arrive (`message_update` -> `text_delta`)
+  and how a consumer assembles the final list (`agent_end.messages` / `session.messages`).
+- The mapping onto Agenta's storage and display: how a message list fits the current
+  output/trace data model, and how the playground and observability would render it.
+- How images and other non-text blocks are carried (base64 content blocks; Pi also has
+  `generateImages()`).
+
+Out:
+
+- Building the service (WP-2) or the tracing (WP-1). This WP produces the schema and mapping
+  those depend on.
+
+## Approach (grounded in research)
+
+See [`../research/pi-interaction.md`](../research/pi-interaction.md).
+
+- Pi streams via `subscribe()` callbacks and exposes the full set on `agent_end.messages`.
+- Structured output uses a terminating tool (TypeBox schema), read from the tool args.
+- Examine Agenta's existing output model and the trace ingestion so the message list and the
+  span tree (WP-1) tell a consistent story.
+
+## Definition of done
+
+- A written schema for agent multi-message output, with the streaming contract.
+- A mapping from that schema onto Agenta's storage and onto playground / observability
+  rendering, with the gaps versus single-output workflows called out.
+
+## Open questions
+
+- Reconcile a message list with the single-output completion/chat response contract (ties to
+  WP-5).
+- Whether non-text artifacts (images, files) are inlined, stored, or referenced.
+- How the message list relates to the trace span tree so they do not duplicate or diverge.
+
+## Links
+
+- [`../research/pi-interaction.md`](../research/pi-interaction.md)
+- [`wp-5-chat-vs-completion/`](../wp-5-chat-vs-completion/README.md)
+- [Project README](../README.md)
diff --git a/docs/design/agent-workflows/wp-5-chat-vs-completion/README.md b/docs/design/agent-workflows/wp-5-chat-vs-completion/README.md
new file mode 100644
index 0000000000..e9ea245ef7
--- /dev/null
+++ b/docs/design/agent-workflows/wp-5-chat-vs-completion/README.md
@@ -0,0 +1,51 @@
+# WP-5: Chat vs completion interface
+
+Status: not started. Feeds the interface in WP-2.
+
+## Goal
+
+Decide the interface contract an agent exposes, comparing Agenta's chat and completion
+shapes. Working assumption: start with chat, with a single input. Agents are multi-turn and
+conversational, so chat is the natural fit, and a single input keeps the first cut small.
+
+## Scope
+
+In:
+
+- Compare the existing completion and chat service contracts in Agenta (request shape,
+  input handling, response shape, streaming).
+- Define a minimal v1 agent contract: chat with one input. Spell out the request, the
+  response (which is multi-message, see WP-4), and how a turn maps to a Pi prompt.
+- Identify what is deferred: multiple inputs, structured inputs, tools exposed as inputs,
+  history handling.
+
+Out:
+
+- Implementing the service (WP-2). This WP produces the contract WP-2 implements.
+- The full multi-message output schema (WP-4 owns that; this WP references it).
+
+## Approach (grounded in research)
+
+- Pi is driven turn by turn (`prompt` / `followUp` / `steer`) and threads a `session_id`,
+  which lines up with chat semantics.
+- Lean on the existing chat contract so an agent can sit beside the other workflow types in
+  the playground with minimal new surface.
+
+## Definition of done
+
+- A short decision doc: chat selected over completion (with the reasoning), the minimal
+  single-input chat request/response contract for v1, and the path to richer inputs and
+  multi-turn history later.
+
+## Open questions
+
+- How conversation history is held: in the Pi session (`session_id`) only, or also passed
+  in the request.
+- Whether v1 is single-turn (one input, one multi-message answer) or already multi-turn.
+- How the single-input chat contract reconciles with the multi-message response from WP-4.
+
+## Links
+
+- [`wp-4-multi-message-output/`](../wp-4-multi-message-output/README.md)
+- [`../research/pi-interaction.md`](../research/pi-interaction.md)
+- [Project README](../README.md)
diff --git a/docs/design/agent-workflows/wp-6-workflow-type-and-template/README.md b/docs/design/agent-workflows/wp-6-workflow-type-and-template/README.md
new file mode 100644
index 0000000000..77e7830cdd
--- /dev/null
+++ b/docs/design/agent-workflows/wp-6-workflow-type-and-template/README.md
@@ -0,0 +1,84 @@
+# WP-6: Agent as a new workflow type and template
+
+Status: not started. Backend integration; ties WP-2, WP-4, and WP-5 together.
+
+## Goal
+
+Register "agent" as a new workflow type in Agenta, define its configuration schema, expose
+it as a template users can pick like completion / chat / LLM-as-a-judge, and wire a workflow
+invocation to connect to a running agent (the WP-2 service, later the WP-3 sandbox).
+
+## Scope
+
+In:
+
+- Add "agent" as a new workflow type alongside completion / chat / judge, stored and
+  versioned as a workflow revision.
+- Define the revision configuration schema: AGENTS.md, skills, model, tools, secrets, files,
+  and harness (Pi by default, configurable).
+- Add a starter template so a user can create an agent workflow with sensible defaults.
+- Define the connection: how an invocation routes to the harness over the port, threads
+  `session_id`, and streams the multi-message output back.
+
+Out:
+
+- The service implementation (WP-2) and the sandbox (WP-3). This WP defines the type, the
+  config, the template, and the connection contract; it does not build the runtime.
+- The playground UI for agents. Later.
+
+## Configuration, especially the model
+
+- **Model.** The agent's model must resolve providers and keys through the same path as
+  chat / completion (the provider/model resolution aligned in prompt-runtime-unification),
+  then be handed to the harness. For Pi that means resolving the provider key (into
+  `setRuntimeApiKey` or env) and selecting the model in Pi's `ModelRegistry` (`set_model`).
+  Pi supports 15+ providers, and this is also the OpenAI/Codex swap point.
+- **Harness.** A config field selects the harness (Pi default, configurable). Decide whether
+  the harness choice constrains the available model list.
+- **Secrets and files.** Define how secrets (for example an OpenAI key) and config files
+  attach to a revision, and how they map to the in-memory injection from the diskless
+  finding (`systemPromptOverride` for AGENTS.md, `setRuntimeApiKey` for auth) rather than
+  being written to disk.
+- **Skills and tools.** How they are declared in config and passed to the harness
+  (`skillsOverride` / `customTools`).
+
+## Connection to the agent
+
+- How a workflow invocation reaches the harness: the port contract from WP-2
+  (works-with-our-port), `session_id` threading, and streaming the multi-message output
+  (WP-4) back through the workflow response (chat-first per WP-5).
+
+## Needs a grounding pass first
+
+The backend workflow-type / template / revision model was not covered in the research round.
+Before writing config schemas, investigate how existing workflow types (completion, chat,
+judge) are registered, where templates live, and how revision `parameters` are shaped.
+The prompt-runtime-unification "Future Directions" sketched giving the judge a shared
+`prompt` block; agents would similarly get an `agent` / `harness` block under the revision
+parameters. Confirm the actual registration points in `api/`.
+
+## Definition of done
+
+- A documented config schema for the agent workflow type, with model resolution and harness
+  selection spelled out.
+- A defined connection contract from workflow invocation to the running agent.
+- A plan to register the type and ship a starter template.
+
+## Open questions
+
+- Where workflow types and templates are registered in the backend (to confirm in the
+  grounding pass).
+- Whether the harness choice constrains the available models.
+- How secrets and files attach to a revision and reach the harness in memory.
+- How agent config maps onto the existing workflow / revision / variant model
+  (`artifact.name` is the entity name, `revision.name` is the variant name).
+
+## Links
+
+- [`wp-2-agent-service/`](../wp-2-agent-service/README.md)
+- [`wp-4-multi-message-output/`](../wp-4-multi-message-output/README.md)
+- [`wp-5-chat-vs-completion/`](../wp-5-chat-vs-completion/README.md)
+- [`../research/diskless-in-memory-config.md`](../research/diskless-in-memory-config.md)
+- [`../research/auth-secrets.md`](../research/auth-secrets.md)
+- [`../../prompt-runtime-unification/README.md`](../../prompt-runtime-unification/README.md)
+- [Project README](../README.md)
diff --git a/docs/design/agent-workflows/wp-7-tools/README.md b/docs/design/agent-workflows/wp-7-tools/README.md
new file mode 100644
index 0000000000..225c77eb26
--- /dev/null
+++ b/docs/design/agent-workflows/wp-7-tools/README.md
@@ -0,0 +1,214 @@
+# WP-7: Runnable tools as agent configuration
+
+Status: design draft. Builds on WP-2 (agent service) and WP-6 (workflow type and template).
+
+## Goal
+
+Make runnable tools part of an agent's configuration. Start with Composio actions, and keep
+the door open to workflow-as-tool and MCP without reworking the agent path. The agent runs on
+the Pi harness, which drives its own multi-turn loop and executes tools in-loop. So the work
+is to declare tools in the agent revision config, resolve those declarations into tools Pi can
+call, and run each tool call through Agenta's existing tools subsystem.
+
+## What already exists (the key reuse)
+
+Agenta already ships a full, provider-agnostic tools subsystem in `api/oss/src/core/tools/`.
+It is not wired to agents yet, but the hard parts are done and verified against the code:
+
+- **Tools are callable from the backend today.** `ToolsService.execute_tool(...)`
+  (`api/oss/src/core/tools/service.py:389`) runs a Composio action. It is exposed as
+  `POST /tools/call` (`api/oss/src/apis/fastapi/tools/router.py:891`). The endpoint takes an
+  OpenAI-style envelope whose `function.name` encodes
+  `tools.{provider}.{integration}.{action}.{connection}` (router.py:916), looks up the
+  project-scoped connection, checks it is active and valid, and dispatches through the gateway
+  using the stored `provider_connection_id` (router.py:1000).
+- **Tool auth is settable today, per project.** Composio connections are created via
+  `POST /tools/connections/` (signed-state OAuth) and activated via
+  `GET /tools/connections/callback` (router.py:173). The `connected_account_id` is stored in
+  the `tool_connections` table (`api/oss/src/dbs/postgres/tools/dbes.py:38`), scoped to the
+  project, with `is_active` and `is_valid` flags. The Composio API key lives only in the
+  backend (`ComposioConfig`, `api/oss/src/utils/env.py`).
+- **The gateway is provider-agnostic.** `ToolsGatewayInterface`
+  (`api/oss/src/core/tools/interfaces.py:88`) defines `get_action` (line 128) and `execute`
+  (line 175). A registry dispatches by `provider_key`. The Composio adapter implements both
+  (`providers/composio/adapter.py:122` and `:381`, which POSTs to `/tools/execute/{slug}`).
+- **A catalog with JSON Schemas exists.** `ToolsService.get_action(...)` (service.py:120)
+  returns `ToolCatalogActionDetails.schemas.inputs`, a JSON Schema built by the Composio
+  adapter, ready to hand to a model as a tool definition.
+
+Only two things are missing: attaching tool references to the agent config, and letting
+Pi call them during a run.
+
+## Do not copy the chat tools contract
+
+The completion and chat handlers carry tools as
+`parameters["tools"] = {internal: [...], external: [...]}`. There, external tools are not
+executed server-side. The `llm_v0` loop returns HTTP 202 "tool_requested" and the client
+executes them. The agent path is the opposite. Pi runs the loop and must execute tools in-loop.
+We reuse the tools subsystem, but not the chat tools-config shape.
+
+## Scope
+
+In:
+
+- A provider-agnostic `tools` list in the agent revision config (WP-6 `parameters`).
+- A backend resolver that turns each tool reference into a tool Pi can call.
+- An execution bridge so a Pi tool call routes back through `POST /tools/call`.
+- The Composio path end to end, plus the extension argument for MCP and workflow-as-tool.
+
+Out:
+
+- Building the MCP and workflow-as-tool adapters. This WP defines the shape they slot into.
+- Per-invoke injection of LLM provider keys from the vault. That is orthogonal and tracked
+  with WP-6.
+- A tool-picker UI in the agent playground. Later.
+
+## Configuration shape
+
+Store one provider-agnostic list under the agent revision `parameters["tools"]`. Each entry is
+a discriminated union on `type`. Config holds references and display metadata only, never
+secrets.
+
+```json
+{
+  "model": "gpt-5.5",
+  "tools": [
+    { "type": "builtin", "name": "read" },
+    {
+      "type": "composio",
+      "integration": "gmail",
+      "action": "SEND_EMAIL",
+      "connection": "gmail-team",
+      "name": "gmail__SEND_EMAIL"
+    }
+  ]
+}
+```
+
+- `builtin` entries are the current `List[str]` of Pi built-in tool names
+  (`services/oss/src/agent_pi/ports.py:95`). They pass straight into Pi's `tools: string[]`.
+  This reconciles the existing field: it becomes the `builtin` subset.
+- `composio` entries carry the exact slug segments `/tools/call` already parses: `integration`,
+  `action`, and `connection`. The backend owns slug encoding in one place.
+- `connection` is a project-scoped **slug**, resolved to the live connection row at run time.
+  This keeps a single config revision promotable across environments where the underlying
+  connection differs but the slug is stable.
+- `name` is the function name shown to the model. The description and input schema resolve live
+  from the catalog so config never drifts from the provider.
+
+## Execution bridge
+
+Route tool execution back through Agenta's existing `POST /tools/call`. Pi never sees the
+Composio key. End to end:
+
+1. The backend invokes the agent workflow with the resolved config (WP-6).
+2. The backend **resolves** each `composio` reference: `ToolsService.get_action(...)` for the
+   input schema, plus a connection-slug lookup that fails fast if the connection is missing,
+   inactive, or invalid. It builds a resolved spec `{ name, description, inputSchema, callRef }`
+   where `callRef` is `tools.composio.{integration}.{action}.{connection}`. `builtin` references
+   pass through as names.
+3. The backend injects the specs plus a callback context (endpoint and authorization) into the
+   harness request. The endpoint and credential reuse the mechanism that already threads the
+   OTLP credential down to the wrapper (`TraceContext`, ports.py:60).
+4. The TS wrapper (`services/agent/src/runPi.ts`) turns each spec into a Pi `customTool`: name,
+   description, JSON-Schema params, and an async `execute(args)` closure. It passes them via
+   `createAgentSession({ tools, customTools })` next to the existing `tools` (runPi.ts:179).
+5. The model emits a tool call. Pi runs the matching `execute(args)` closure itself.
+6. The closure does one `POST {endpoint}/tools/call` with the envelope
+   `{ data: { function: { name: callRef, arguments: args } } }` and the callback Authorization.
+7. The backend runs the existing path: `RUN_TOOLS` permission check, project-scoped connection
+   lookup, then `ToolsService.execute_tool(...)` to Composio with the stored
+   `connected_account_id`.
+8. The result string returns to the closure, Pi feeds it to the model, and the loop continues.
+9. Tracing is free. `services/agent/src/agenta-otel.ts` already spans
+   `tool_execution_start` and `tool_execution_end`, so the Composio call appears under the
+   agent's invoke span.
+
+Why route through the backend rather than inject provider creds into the sandbox:
+
+- The Composio API key and the connection auth stay server-side, out of the sandbox.
+- It reuses the tested path: connection lookup, active and valid gating, EE `RUN_TOOLS`, adapter
+  dispatch, and error mapping.
+- Execution happens outside the sandbox, which is the Daytona-friendly property we want.
+- New providers work through the same callback with no sandbox or Pi changes.
+
+## Auth model: two distinct kinds, never mixed
+
+- **LLM provider keys** live in the vault (`api/oss/src/core/secrets/`). They are injected into
+  Pi as env or a runtime key. Today `runPi.ts` reads a local login or `*_API_KEY` env. Per-invoke
+  vault injection is orthogonal to tools and tracked with WP-6.
+- **Tool connection auth** (Composio OAuth) lives behind `/tools/call`, scoped to the project,
+  and is fully settable today. Pi and the sandbox never see it.
+
+## Where resolution happens: the backend, not the services runner
+
+`ToolsService`, the catalog, the connections, and the project scope all live in `api`. WP-6
+already has the backend invoking the agent workflow with the config in hand, so resolving
+references into `customTools` is a natural pre-invoke step there. The `services` runner stays a
+thin harness driver that receives ready specs plus a callback URL. Until WP-6 lands, a Composio
+demo could resolve inside `services/oss/src/agent.py` by calling the `api` catalog over HTTP,
+but the real design resolves in `api`.
+
+## Extensibility to MCP and workflow-as-tool
+
+Both are just a new `ToolsGatewayInterface` adapter plus a new config `type`, with no change to
+Pi, the bridge, or the sandbox:
+
+- **MCP adapter.** Map `get_action` to MCP `tools/list` and `execute` to MCP `tools/call`.
+  Register under `provider_key="mcp"`. Config gains `type: "mcp"`. The `callRef` becomes
+  `tools.mcp.{server}.{action}.{connection}`, which the existing 5-segment parser handles.
+- **Workflow-as-tool adapter.** Return the target workflow's input schema from `get_action`, and
+  call the workflow `/invoke` from `execute`. Register under `provider_key="workflow"`.
+
+Because the bridge only ever speaks the OpenAI-style envelope to `/tools/call`, and `/tools/call`
+dispatches purely by `provider_key` through the registry, the agent side stays provider-blind.
+
+## Implementation sketch (Composio MVP)
+
+- `services/oss/src/agent_pi/ports.py` — add `custom_tools` and `tool_callback` to
+  `HarnessRequest`.
+- `services/oss/src/agent_pi/config.py` — evolve `tools` from `List[str]` to the discriminated
+  shape; split `builtin` from runnable references.
+- `services/oss/src/agent_pi/pi_http_harness.py` and `pi_harness.py` — serialize the new fields
+  onto the wire.
+- `services/agent/src/runPi.ts` — build Pi `customTools` and the `/tools/call` closure.
+- `api/oss/src/core/tools/service.py` — add `resolve_connection_by_slug(...)`, extracted from the
+  router so the resolver and `call_tool` share it.
+- WP-6 invoke path in `api` — the resolver that turns `parameters["tools"]` into `custom_tools`
+  and `tool_callback`. Reuse `router.py` `call_tool` unchanged as the execution endpoint.
+
+## Risks and open questions
+
+1. **Blocking latency in Pi's loop.** Each tool call goes from Pi to `/tools/call` to Composio and back,
+   serialized per turn. The agent timeout is 180s and the Composio client timeout is 30s. Surface
+   per-tool timeouts as tool-error strings, not run failures, and keep a generous overall budget.
+2. **Connection-slug resolution.** Pre-validate every referenced connection at resolve time and
+   fail the invoke early with a clear message, rather than letting the model hit a runtime tool
+   error mid-loop. Decide the behavior when one environment lacks a connection a shared revision
+   references.
+3. **EE `RUN_TOOLS` scoping.** The callback credential must carry `RUN_TOOLS` for the project.
+   Recommend scoping to the invoking user's permissions, threaded like the OTLP credential, so an
+   agent run cannot call tools the user could not.
+4. **Streaming.** The agent `/invoke` returns a single final message. Intermediate tool calls are
+   visible only via the trace today. A streaming channel for tool events is out of scope for the
+   MVP but should be flagged as a known limitation.
+5. **Slug encoding round-trip.** The `__` vs `.` convention only holds if integration, action, and
+   connection names never contain `__` or `.`. The connection slug rules already guard this; verify
+   Composio action keys do too. Send `arguments` as a dict to avoid double-encoding.
+
+## Definition of done
+
+- A documented config schema for agent tools, with the discriminated `type` and the Composio
+  fields spelled out.
+- A backend resolver that turns references into `customTools` and validates connections up front.
+- An execution bridge that routes Pi tool calls through `/tools/call`, verified with a Composio
+  smoke run, with the call nested under the agent invoke span and the Composio key absent from the
+  sandbox.
+
+## Links
+
+- [`wp-2-agent-service/`](../wp-2-agent-service/README.md)
+- [`wp-6-workflow-type-and-template/`](../wp-6-workflow-type-and-template/README.md)
+- [`../research/auth-secrets.md`](../research/auth-secrets.md)
+- [`../research/diskless-in-memory-config.md`](../research/diskless-in-memory-config.md)
+- [Project README](../README.md)
diff --git a/hosting/docker-compose/ee/docker-compose.dev.yml b/hosting/docker-compose/ee/docker-compose.dev.yml
index 812578fa9c..27974d996f 100644
--- a/hosting/docker-compose/ee/docker-compose.dev.yml
+++ b/hosting/docker-compose/ee/docker-compose.dev.yml
@@ -394,6 +394,8 @@ services:
             - ${ENV_FILE:-./.env.ee.dev}
         environment:
             DOCKER_NETWORK_MODE: ${DOCKER_NETWORK_MODE:-bridge}
+            # Agent workflow (WP-2): reach the Pi harness sidecar in-network.
+            AGENTA_AGENT_PI_URL: http://agent-pi:8765
         # === NETWORK ============================================== #
         networks:
             - agenta-network
@@ -411,6 +413,40 @@ services:
         # === LIFECYCLE ============================================ #
         restart: always
 
+    agent-pi:
+        # === IMAGE ================================================ #
+        # Pi harness sidecar for the agent workflow (WP-2). The services container
+        # calls it in-network at http://agent-pi:8765 (AGENTA_AGENT_PI_URL).
+        build:
+            context: ../../../services/agent
+            dockerfile: docker/Dockerfile.dev
+        # === EXECUTION ============================================ #
+        # No file watcher (the host's inotify watch limit is shared across stacks). Copy the
+        # read-only mounted Pi login into a writable path so OAuth refresh stays
+        # in-container.
+        command: >
+            sh -c "mkdir -p /pi-agent && cp -a /pi-agent-ro/. /pi-agent/ 2>/dev/null || true;
+            exec node_modules/.bin/tsx src/server.ts"
+        # === CONFIGURATION ======================================== #
+        env_file:
+            - ${ENV_FILE:-./.env.ee.dev}
+        environment:
+            PORT: "8765"
+            PI_CODING_AGENT_DIR: /pi-agent
+            # Tracing export fallback (used when a request carries no usable OTLP
+            # credential). Must be reachable from this container.
+            AGENTA_HOST: ${AGENTA_HOST:-http://144.76.237.122:8280}
+            AGENTA_API_KEY: ${AGENTA_API_KEY:-}
+        # === STORAGE ============================================== #
+        volumes:
+            - ../../../services/agent/src:/app/src
+            - ${HOME}/.pi/agent:/pi-agent-ro:ro
+        # === NETWORK ============================================== #
+        networks:
+            - agenta-network
+        # === LIFECYCLE ============================================ #
+        restart: always
+
     postgres:
         # === IMAGE ================================================ #
         image: postgres:17
diff --git a/sdks/python/agenta/sdk/engines/running/interfaces.py b/sdks/python/agenta/sdk/engines/running/interfaces.py
index d84908a164..cf587e21fb 100644
--- a/sdks/python/agenta/sdk/engines/running/interfaces.py
+++ b/sdks/python/agenta/sdk/engines/running/interfaces.py
@@ -524,6 +524,42 @@ def llm_inputs_schema(
     ),
 )
 
+agent_v0_interface = WorkflowRevisionData(
+    uri="agenta:builtin:agent:v0",
+    schemas=dict(  # type: ignore
+        parameters=obj(
+            properties={
+                "model": scalar(
+                    jtype="string",
+                    default="gpt-5.5",
+                    description="Model the agent runs on.",
+                ),
+                "agents_md": scalar(
+                    jtype="string",
+                    default=(
+                        "You are a friendly hello-world agent running on the "
+                        "Agenta agent service.\n\n- Greet the user warmly.\n- "
+                        "Answer the user's message in one or two short sentences."
+                    ),
+                    description="The agent's instructions (AGENTS.md).",
+                ),
+            },
+            additional_properties=True,
+        ),
+        inputs=llm_inputs_schema(
+            include_messages=True,
+        ),
+        outputs={
+            "$schema": "https://json-schema.org/draft/2020-12/schema",
+            **semantic_field(
+                x_ag_type_ref="message",
+                jtype="object",
+                description="Final assistant message returned by the agent.",
+            ),
+        },
+    ),
+)
+
 completion_v0_interface = WorkflowRevisionData(
     uri="agenta:builtin:completion:v0",
     schemas=dict(  # type: ignore
diff --git a/sdks/python/agenta/sdk/engines/running/utils.py b/sdks/python/agenta/sdk/engines/running/utils.py
index da84036e5a..a55a7069ca 100644
--- a/sdks/python/agenta/sdk/engines/running/utils.py
+++ b/sdks/python/agenta/sdk/engines/running/utils.py
@@ -51,6 +51,7 @@
     # --- OLD URI
     chat_v0_interface,
     completion_v0_interface,
+    agent_v0_interface,
     echo_v0_interface,
     auto_exact_match_v0_interface,
     auto_regex_test_v0_interface,
@@ -88,6 +89,7 @@
             # --- OLD URI
             chat=dict(v0=chat_v0_interface),
             completion=dict(v0=completion_v0_interface),
+            agent=dict(v0=agent_v0_interface),
             echo=dict(v0=echo_v0_interface),
             auto_exact_match=dict(v0=auto_exact_match_v0_interface),
             auto_regex_test=dict(v0=auto_regex_test_v0_interface),
@@ -243,6 +245,15 @@ def _catalog_entry() -> dict:
                     presets=[],
                 )
             ),
+            agent=dict(
+                v0=dict(
+                    name="agent",
+                    description="Agent that runs tools over multiple turns on the Pi harness.",
+                    categories=None,
+                    flags=None,
+                    presets=[],
+                )
+            ),
             #
             echo=dict(v0=_catalog_entry()),
             auto_exact_match=dict(v0=_catalog_entry()),
@@ -282,6 +293,18 @@ def _catalog_entry() -> dict:
             # --- OLD URI
             chat=dict(v0=WorkflowRevisionData()),
             completion=dict(v0=WorkflowRevisionData()),
+            agent=dict(
+                v0=WorkflowRevisionData(
+                    parameters={
+                        "model": "gpt-5.5",
+                        "agents_md": (
+                            "You are a friendly hello-world agent running on the "
+                            "Agenta agent service.\n\n- Greet the user warmly.\n- "
+                            "Answer the user's message in one or two short sentences."
+                        ),
+                    }
+                )
+            ),
             echo=dict(v0=WorkflowRevisionData()),
             auto_exact_match=dict(v0=WorkflowRevisionData()),
             auto_regex_test=dict(v0=WorkflowRevisionData()),
@@ -543,12 +566,12 @@ def infer_url_from_uri(uri: Optional[str]) -> Optional[str]:
     # agenta:builtin:* — application-only (not evaluators)
     ("builtin", "chat"): (True, False, False),
     ("builtin", "completion"): (True, False, False),
+    ("builtin", "agent"): (True, False, False),
     # agenta:builtin:* — both evaluator and application
     ("builtin", "llm"): (True, True, False),
     # agenta:builtin:* — evaluator-only
     ("builtin", "match"): (False, True, False),
     ("builtin", "prompt"): (False, True, False),
-    ("builtin", "agent"): (False, True, False),
     ("builtin", "echo"): (False, True, False),
     ("builtin", "human"): (False, True, False),
     ("builtin", "auto_exact_match"): (False, True, False),
diff --git a/services/agent/.dockerignore b/services/agent/.dockerignore
new file mode 100644
index 0000000000..e250b4f174
--- /dev/null
+++ b/services/agent/.dockerignore
@@ -0,0 +1,3 @@
+node_modules
+*.log
+.git
diff --git a/services/agent/README.md b/services/agent/README.md
new file mode 100644
index 0000000000..c920de19f9
--- /dev/null
+++ b/services/agent/README.md
@@ -0,0 +1,73 @@
+# Agent service: Pi wrapper (WP-2)
+
+This is the TypeScript side of the agent workflow service. It is a thin wrapper that
+drives the [Pi](https://pi.dev) agent harness for a single run. The Python service
+(`services/oss/src/agent.py`) calls it; see
+`docs/design/agent-workflows/wp-2-agent-service/`.
+
+## What it does
+
+`src/cli.ts` reads one JSON request on stdin, runs Pi once via the SDK
+(`createAgentSession`), and writes one JSON result on stdout. AGENTS.md is injected in
+memory; the session and working dir are throwaway. stdout is the result channel only;
+logs go to stderr.
+
+Request (stdin):
+
+```json
+{
+  "agentsMd": "You are a hello-world agent.",
+  "model": "gpt-5.5",
+  "prompt": "Hi there",
+  "messages": [{"role": "user", "content": "Hi there"}],
+  "tools": []
+}
+```
+
+Result (stdout):
+
+```json
+{ "ok": true, "output": "Hello! ...", "sessionId": "...", "model": "openai-codex/gpt-5.5", "traceId": "..." }
+```
+
+## Tracing
+
+When the request carries a `trace` block, the run is traced into Agenta as
+OpenTelemetry spans and nested under the caller's `/invoke` span, so the agent's whole
+run is part of the same trace (the way completion/chat nest their LLM spans). The
+Python service fills `trace` in from the live workflow span; see
+`docs/design/agent-workflows/wp-1-pi-tracing/tracing-in-the-agent-service.md`.
+
+```json
+{
+  "prompt": "Hi there",
+  "trace": {
+    "traceparent": "00-<32hex trace>-<16hex span>-01",
+    "endpoint": "https://host/api/otlp/v1/traces",
+    "authorization": "ApiKey ...",
+    "captureContent": true
+  }
+}
+```
+
+With no `trace` block the run is traced standalone using `AGENTA_HOST` /
+`AGENTA_API_KEY`, or not at all when neither is set. The extension lives in
+`src/agenta-otel.ts`.
+
+## Auth
+
+`AuthStorage.create()` reads `~/.pi/agent/auth.json`. Log in once with `pnpm exec pi`
+then `/login`, or set `OPENAI_API_KEY` / `ANTHROPIC_API_KEY`.
+
+## Local use
+
+```bash
+pnpm install
+echo '{"agentsMd":"You are a hello-world agent.","prompt":"Hi"}' | pnpm run run:cli
+```
+
+## Config
+
+`config/AGENTS.md` and `config/agent.json` hold the hardcoded MVP config. They are read
+by the Python service and passed into the request, so editing them changes the agent
+without a code change.
diff --git a/services/agent/config/AGENTS.md b/services/agent/config/AGENTS.md
new file mode 100644
index 0000000000..767a2cdd49
--- /dev/null
+++ b/services/agent/config/AGENTS.md
@@ -0,0 +1,7 @@
+# Hello-world agent
+
+You are a friendly hello-world agent running on the Agenta agent service.
+
+- Greet the user warmly.
+- Answer the user's message in one or two short sentences.
+- Do not use tools. Keep replies plain text.
diff --git a/services/agent/config/agent.json b/services/agent/config/agent.json
new file mode 100644
index 0000000000..adc26f793c
--- /dev/null
+++ b/services/agent/config/agent.json
@@ -0,0 +1,4 @@
+{
+  "model": "gpt-5.5",
+  "tools": []
+}
diff --git a/services/agent/docker-compose.agent.yml b/services/agent/docker-compose.agent.yml
new file mode 100644
index 0000000000..43f733d1c7
--- /dev/null
+++ b/services/agent/docker-compose.agent.yml
@@ -0,0 +1,98 @@
+# Dedicated, self-contained compose for the agent service (WP-2).
+#
+# Runs the agent fully in Docker, invokable by curl, without touching any other stack:
+#
+#   agent-pi   - the TypeScript Pi wrapper as an HTTP sidecar. Uses the local Pi login
+#                (~/.pi/agent) copied in at startup so token refresh never writes to the
+#                host. Reachable only on the internal network.
+#   agent-api  - the Python agent service (reuses the prebuilt services dev image). Speaks
+#                the Agenta /invoke contract and calls agent-pi over HTTP. Published on a
+#                host port for curl.
+#
+# Bring up:
+#   docker compose -f services/agent/docker-compose.agent.yml up --build
+# Verify:
+#   curl localhost:8092/health
+#   curl -X POST localhost:8092/agent/v0/invoke -H 'Content-Type: application/json' \
+#     -d '{"data":{"inputs":{"messages":[{"role":"user","content":"hi"}]}}}'
+# Tear down:
+#   docker compose -f services/agent/docker-compose.agent.yml down
+
+name: agenta-agent
+
+services:
+    agent-pi:
+        build:
+            context: .
+            dockerfile: docker/Dockerfile.dev
+        # Copy the read-only mounted login into a writable container path so OAuth token
+        # refresh works and never writes back to the host ~/.pi/agent.
+        command: >
+            sh -c "mkdir -p /pi-agent && cp -a /pi-agent-ro/. /pi-agent/ 2>/dev/null || true;
+            exec node_modules/.bin/tsx watch src/server.ts"
+        environment:
+            PORT: "8765"
+            PI_CODING_AGENT_DIR: /pi-agent
+            # Tracing export fallback when the request carries no Authorization
+            # (auth disabled locally). Must be reachable from this container.
+            AGENTA_HOST: ${AGENTA_HOST:-http://144.76.237.122:8280}
+            AGENTA_API_KEY: ${AGENTA_API_KEY:-}
+        volumes:
+            - ./src:/app/src
+            - ${HOME}/.pi/agent:/pi-agent-ro:ro
+        networks:
+            - agent-net
+        restart: unless-stopped
+
+    agent-api:
+        # Built from the current services dev Dockerfile (Python 3.13, current SDK +
+        # deps). A dedicated tag so we never clobber other stacks' images.
+        image: agenta-agent-api:dev
+        build:
+            context: ../..
+            dockerfile: services/oss/docker/Dockerfile.dev
+        command:
+            [
+                "uvicorn",
+                "entrypoints.agent_main:app",
+                "--host",
+                "0.0.0.0",
+                "--port",
+                "8080",
+                "--reload",
+                "--reload-dir",
+                "/app/oss/src",
+                "--reload-dir",
+                "/app/entrypoints",
+                "--reload-dir",
+                "/sdks/python/agenta",
+                "--reload-exclude",
+                "*.pyc",
+                "--reload-exclude",
+                "__pycache__",
+            ]
+        environment:
+            # Local curl: skip the remote credential check (the Python layer still runs
+            # its auth/middleware stack; it just passes the header through).
+            AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED: "false"
+            # Drives the harness selection: HTTP harness -> the agent-pi sidecar.
+            AGENTA_AGENT_PI_URL: http://agent-pi:8765
+            # Tracing export target. Must be reachable from this container AND from the
+            # agent-pi sidecar (the endpoint is passed across to nest the Pi spans), so
+            # use the host IP, not localhost. The API key authorizes the OTLP export.
+            AGENTA_HOST: ${AGENTA_HOST:-http://144.76.237.122:8280}
+            AGENTA_API_KEY: ${AGENTA_API_KEY:-}
+        volumes:
+            - ..:/app
+            - ../../sdks/python:/sdks/python
+            - ../../clients/python:/clients/python
+        ports:
+            - "8092:8080"
+        depends_on:
+            - agent-pi
+        networks:
+            - agent-net
+        restart: unless-stopped
+
+networks:
+    agent-net:
diff --git a/services/agent/docker-compose.stack.yml b/services/agent/docker-compose.stack.yml
new file mode 100644
index 0000000000..774e942517
--- /dev/null
+++ b/services/agent/docker-compose.stack.yml
@@ -0,0 +1,86 @@
+# Same-origin demo: the agent served exactly like chat and completion.
+#
+# Runs the FULL services app (entrypoints.main, which now mounts /agent/v0 next to
+# /chat/v0 and /completion/v0) behind its own traefik, so the agent answers at
+# {origin}/services/agent/v0/invoke just like {origin}/services/chat/v0/invoke. The
+# Pi sidecar is called in-network. This is the integration; a full dev stack (with the
+# web app) would serve the playground at the same origin so there is no CORS at all.
+#
+# Bring up (creds for tracing/export come from the shell):
+#   set -a && source .env.test.local && set +a
+#   docker compose -f services/agent/docker-compose.stack.yml up --build -d
+# Verify:
+#   curl -X POST localhost:8480/services/agent/v0/invoke -H 'content-type: application/json' \
+#     -d '{"data":{"inputs":{"messages":[{"role":"user","content":"hi"}]}}}'
+
+name: agenta-agent-stack
+
+services:
+    traefik:
+        image: traefik:2
+        command:
+            - --providers.docker
+            - --providers.docker.constraints=Label(`com.docker.compose.project`,`agenta-agent-stack`)
+            - --entrypoints.web.address=:80
+        volumes:
+            - /var/run/docker.sock:/var/run/docker.sock
+        ports:
+            - "8480:80"
+        networks:
+            - stack-net
+        restart: unless-stopped
+
+    services:
+        image: agenta-agent-api:dev
+        command:
+            [
+                "uvicorn",
+                "entrypoints.main:app",
+                "--host",
+                "0.0.0.0",
+                "--port",
+                "8080",
+                "--root-path",
+                "/services",
+            ]
+        environment:
+            AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED: "false"
+            AGENTA_AGENT_PI_URL: http://agent-pi:8765
+            AGENTA_HOST: ${AGENTA_HOST:-http://144.76.237.122:8280}
+            AGENTA_API_KEY: ${AGENTA_API_KEY:-}
+        volumes:
+            - ..:/app
+            - ../../sdks/python:/sdks/python
+            - ../../clients/python:/clients/python
+        networks:
+            - stack-net
+        labels:
+            - "traefik.http.routers.aservices.rule=PathPrefix(`/services/`)"
+            - "traefik.http.routers.aservices.entrypoints=web"
+            - "traefik.http.middlewares.aservices-strip.stripprefix.prefixes=/services"
+            - "traefik.http.middlewares.aservices-strip.stripprefix.forceslash=true"
+            - "traefik.http.routers.aservices.middlewares=aservices-strip"
+            - "traefik.http.services.aservices.loadbalancer.server.port=8080"
+        restart: unless-stopped
+
+    agent-pi:
+        build:
+            context: .
+            dockerfile: docker/Dockerfile.dev
+        command: >
+            sh -c "mkdir -p /pi-agent && cp -a /pi-agent-ro/. /pi-agent/ 2>/dev/null || true;
+            exec node_modules/.bin/tsx src/server.ts"
+        environment:
+            PORT: "8765"
+            PI_CODING_AGENT_DIR: /pi-agent
+            AGENTA_HOST: ${AGENTA_HOST:-http://144.76.237.122:8280}
+            AGENTA_API_KEY: ${AGENTA_API_KEY:-}
+        volumes:
+            - ./src:/app/src
+            - ${HOME}/.pi/agent:/pi-agent-ro:ro
+        networks:
+            - stack-net
+        restart: unless-stopped
+
+networks:
+    stack-net:
diff --git a/services/agent/docker/Dockerfile.dev b/services/agent/docker/Dockerfile.dev
new file mode 100644
index 0000000000..2b2320600e
--- /dev/null
+++ b/services/agent/docker/Dockerfile.dev
@@ -0,0 +1,28 @@
+# Pi harness sidecar (WP-2), dev image.
+#
+# Runs the TypeScript Pi wrapper as an HTTP server. The Python agent service calls
+# it in-network. Source is bind-mounted in dev so `tsx watch` hot-reloads; node_modules
+# stays baked into the image. Build context is services/agent.
+
+FROM node:24-slim
+
+WORKDIR /app
+
+RUN corepack enable
+
+# Install deps as a cached layer (manifest + lockfile only).
+COPY package.json pnpm-lock.yaml ./
+RUN pnpm install --frozen-lockfile
+
+# Fallback copy for non-mounted runs; in dev these are bind-mounted over.
+COPY tsconfig.json ./
+COPY src ./src
+
+ENV NODE_ENV=development \
+    PORT=8765
+
+EXPOSE 8765
+
+# Call the local tsx binary directly to avoid pnpm/corepack HOME writes when the
+# container runs as a non-root host uid.
+CMD ["node_modules/.bin/tsx", "watch", "src/server.ts"]
diff --git a/services/agent/package.json b/services/agent/package.json
new file mode 100644
index 0000000000..5f2a39fb88
--- /dev/null
+++ b/services/agent/package.json
@@ -0,0 +1,27 @@
+{
+  "name": "agenta-agent-pi-wrapper",
+  "version": "0.0.0",
+  "private": true,
+  "type": "module",
+  "packageManager": "pnpm@10.30.0",
+  "description": "WP-2: thin TypeScript wrapper that drives the Pi agent harness for one run. Reads a JSON request on stdin, returns a JSON result on stdout.",
+  "scripts": {
+    "run:cli": "tsx src/cli.ts",
+    "serve": "tsx src/server.ts",
+    "serve:watch": "tsx watch src/server.ts",
+    "login": "pi"
+  },
+  "dependencies": {
+    "@earendil-works/pi-coding-agent": "0.79.4",
+    "@opentelemetry/api": "1.9.0",
+    "@opentelemetry/exporter-trace-otlp-proto": "0.54.0",
+    "@opentelemetry/resources": "1.28.0",
+    "@opentelemetry/sdk-trace-base": "1.28.0",
+    "@opentelemetry/sdk-trace-node": "1.28.0",
+    "@opentelemetry/semantic-conventions": "1.28.0"
+  },
+  "devDependencies": {
+    "tsx": "4.19.2",
+    "@types/node": "22.10.2"
+  }
+}
diff --git a/services/agent/pnpm-lock.yaml b/services/agent/pnpm-lock.yaml
new file mode 100644
index 0000000000..eab8e5fb3a
--- /dev/null
+++ b/services/agent/pnpm-lock.yaml
@@ -0,0 +1,1826 @@
+lockfileVersion: '9.0'
+
+settings:
+  autoInstallPeers: true
+  excludeLinksFromLockfile: false
+
+importers:
+
+  .:
+    dependencies:
+      '@earendil-works/pi-coding-agent':
+        specifier: 0.79.4
+        version: 0.79.4(ws@8.21.0)(zod@4.4.3)
+      '@opentelemetry/api':
+        specifier: 1.9.0
+        version: 1.9.0
+      '@opentelemetry/exporter-trace-otlp-proto':
+        specifier: 0.54.0
+        version: 0.54.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources':
+        specifier: 1.28.0
+        version: 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-base':
+        specifier: 1.28.0
+        version: 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-node':
+        specifier: 1.28.0
+        version: 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions':
+        specifier: 1.28.0
+        version: 1.28.0
+    devDependencies:
+      '@types/node':
+        specifier: 22.10.2
+        version: 22.10.2
+      tsx:
+        specifier: 4.19.2
+        version: 4.19.2
+
+packages:
+
+  '@anthropic-ai/sdk@0.91.1':
+    resolution: {integrity: sha512-LAmu761tSN9r66ixvmciswUj/ZC+1Q4iAfpedTfSVLeswRwnY3n2Nb6Tsk+cLPP28aLOPWeMgIuTuCcMC6W/iw==}
+    hasBin: true
+    peerDependencies:
+      zod: ^3.25.0 || ^4.0.0
+    peerDependenciesMeta:
+      zod:
+        optional: true
+
+  '@aws-crypto/crc32@5.2.0':
+    resolution: {integrity: sha512-nLbCWqQNgUiwwtFsen1AdzAtvuLRsQS8rYgMuxCrdKf9kOssamGLuPwyTY9wyYblNr9+1XM8v6zoDTPPSIeANg==}
+    engines: {node: '>=16.0.0'}
+
+  '@aws-crypto/sha256-browser@5.2.0':
+    resolution: {integrity: sha512-AXfN/lGotSQwu6HNcEsIASo7kWXZ5HYWvfOmSNKDsEqC4OashTp8alTmaz+F7TC2L083SFv5RdB+qU3Vs1kZqw==}
+
+  '@aws-crypto/sha256-js@5.2.0':
+    resolution: {integrity: sha512-FFQQyu7edu4ufvIZ+OadFpHHOt+eSTBaYaki44c+akjg7qZg9oOQeLlk77F6tSYqjDAFClrHJk9tMf0HdVyOvA==}
+    engines: {node: '>=16.0.0'}
+
+  '@aws-crypto/supports-web-crypto@5.2.0':
+    resolution: {integrity: sha512-iAvUotm021kM33eCdNfwIN//F77/IADDSs58i+MDaOqFrVjZo9bAal0NK7HurRuWLLpF1iLX7gbWrjHjeo+YFg==}
+
+  '@aws-crypto/util@5.2.0':
+    resolution: {integrity: sha512-4RkU9EsI6ZpBve5fseQlGNUWKMa1RLPQ1dnjnQoe07ldfIzcsGb5hC5W0Dm7u423KWzawlrpbjXBrXCEv9zazQ==}
+
+  '@aws-sdk/client-bedrock-runtime@3.1048.0':
+    resolution: {integrity: sha512-u+NT61JZEkRFtpL0CAw1N1dwxnaLgwVXQl/zjJxTGgLyS/jTIdg2SdoEoCTHxgDyCnqa1HEi9QOoE9/pYRNpOQ==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/core@3.974.20':
+    resolution: {integrity: sha512-7sDi2B2N3mc3nf1nz6FyEx/FCrJ1N1QnBmraHHQNabFaeAh2IaOOLml48/rHOD1bICHgTRkbBgNTvUzEr5Z35g==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/credential-provider-env@3.972.46':
+    resolution: {integrity: sha512-+GPXVS2srMOlH74S+SmC1gVuP2TvUZ0siuC0onKO93q+udP+M72dmY8wJfVQ5CX9z/9X5A1HHwz5yRIGBtskvQ==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/credential-provider-http@3.972.48':
+    resolution: {integrity: sha512-fA5loSdlocacRxyUXtpoHSMuk5rsIKRDzQYVMnMxjcmFeZshaJlJ8lymy/hYKji6sne/UmNGj5pxuEs6kq/Qcg==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/credential-provider-ini@3.972.53':
+    resolution: {integrity: sha512-ZfdhIOR41q8TcWEnUac+gCOb+O2LBWdHLmjedXpXz4IEFW2ppNuFcm6p0sMTavpM+zD5TYfpH5Gp7guRyqSgsQ==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/credential-provider-login@3.972.52':
+    resolution: {integrity: sha512-9hu2oR0qH7Fst5Tzdx+UWxm+w5zCXtErTLtOOW5hwwQc170CLwOeniRxyFY6s9mHfGEfC5zFukNBdKBwJR8mhQ==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/credential-provider-node@3.972.55':
+    resolution: {integrity: sha512-zMGLa/dhESVqmCD7mmIFFKSwSFrJGScvCXcjvBZEVOOMauFS5JRQvLTMukFpMEFWiV6dTAlsen2ATDBulLPtbg==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/credential-provider-process@3.972.46':
+    resolution: {integrity: sha512-VUoNFBIjWrUN8NbFiQiuxQEgFjvziAlBRPK+ddh27aj65gk0BYu6bLZnrdrNZwpW6vAihtSUtEMQ1PUJ32QRPA==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/credential-provider-sso@3.972.52':
+    resolution: {integrity: sha512-nb2/n4o/HQf+FVpVbZe9vCTFngmuDoIsltMgLAtjixaKzvzhB4J8WSDFyWgnErgLHk55ctWH+I4PU+LIHhyffg==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/credential-provider-web-identity@3.972.52':
+    resolution: {integrity: sha512-lKj6aRSGbqLmpYmM24bY7a1Xmfcq2vkE3hv8CSPYfc1yCu0BPu/XEJ1L4Fm61MsU6ULLNSG8UGsffNoFUBjESA==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/eventstream-handler-node@3.972.21':
+    resolution: {integrity: sha512-mVC0hOmwGJmNFezZ+wM8Sqfap/LjsMavEf2Evl0YWrLAcrdZOEdjnY8nRvgakVViWJSGm2eJxLuPVHGdeV06kA==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/middleware-eventstream@3.972.17':
+    resolution: {integrity: sha512-tdbnXbw73ww62ABWP0G0Z/euvFowEEvAoi/zG4NaZo7HJFpfGho/Z65HyVzkJLT1cMsUregr4pTyxljlarT0wA==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/middleware-websocket@3.972.28':
+    resolution: {integrity: sha512-SCW06Zjugn86pq7+dxGnFcyWJuEWHT753HTU/Vj/OzVxP+NoShwdAr4ynxAcvWL883OgRVbSqW3ohnjIxwXjjw==}
+    engines: {node: '>= 14.0.0'}
+
+  '@aws-sdk/nested-clients@3.997.20':
+    resolution: {integrity: sha512-IYJuLpXp2DEILVQpQOy0PMpkftv0AHEOCn52o0atyOaumA0CdWQ3klPyXdViGYLbNpESsVFMVybvHUeZAuiGxA==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/signature-v4-multi-region@3.996.34':
+    resolution: {integrity: sha512-mx1L5qlumSOt/nKM3BFaHE2HVkWwz0i4Bw0pyYO42FfX/FeLlo8YI6csC0gSPprEk6fTIqI+CZN9RwUwKd5krQ==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/token-providers@3.1048.0':
+    resolution: {integrity: sha512-k0y/GcuesuSfWyUM0WamrGyeZmltRYaPbHO82UDA6mZ/doB+FOHKutikPAtSXMn/hDz970cF+iRuuiYO9VEbAA==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/token-providers@3.1066.0':
+    resolution: {integrity: sha512-UqEUJq7dqa44hneLDUcX7UJy95cg8YqEWyakRpvIPnrNS3Mq+UlQHgCDGu5pvwAPtlIW4qcYbvW6reG6++FyvA==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/types@3.973.12':
+    resolution: {integrity: sha512-43ajd1NF0RMgX5k0hxCNUyEdrtFUsb2aHT2QvpktSC/2Eyb2Jr/JPVqdp0XIoaHWikZJq5tNWSLO6kB5q2eMCA==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/util-locate-window@3.965.7':
+    resolution: {integrity: sha512-M0D6oIpohdNHjc7udzTHEQyot0+0iuA36jc2I9Hps+f/GtKi2HO/pyijQnCnNcwZqLB5+rtn81z3eZK/GyjAmA==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/xml-builder@3.972.29':
+    resolution: {integrity: sha512-fk0niuGFxfi8yIJuMVM4mhwObkiQSuwZFj3tAPrLVx64Pk3BkrEIpqjzHKY4hKoEBUD6Jg/S74Zj9jy+5F3DnQ==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws/lambda-invoke-store@0.2.4':
+    resolution: {integrity: sha512-iY8yvjE0y651BixKNPgmv1WrQc+GZ142sb0z4gYnChDDY2YqI4P/jsSopBWrKfAt7LOJAkOXt7rC/hms+WclQQ==}
+    engines: {node: '>=18.0.0'}
+
+  '@babel/runtime@7.29.7':
+    resolution: {integrity: sha512-Nq8OhGWiZIZGV6hLHoyAKLLcJihP/xFeBMGJoUrxTX2psI8dCifzLhZISFb+VWS3wFMRDmCGw5R+dOySCqPLhw==}
+    engines: {node: '>=6.9.0'}
+
+  '@earendil-works/pi-agent-core@0.79.4':
+    resolution: {integrity: sha512-xkaZ3yK2XbP9HYdHrrdj/6HqZPM0o/mwbjMSU4RTJyR3HjDG0ZrPz76Hg6s0W+G4u6PpJr1mGx/srCG+3eQA8A==}
+    engines: {node: '>=22.19.0'}
+
+  '@earendil-works/pi-ai@0.79.4':
+    resolution: {integrity: sha512-Z1j+YP+6ZyPBKDUoc5m0GO/o1hPK17fWeErtDgegCTpm2dcKzuFvL/7GTqHeJkVkfpeXRwO37xOfgozQbK6EUw==}
+    engines: {node: '>=22.19.0'}
+    hasBin: true
+
+  '@earendil-works/pi-coding-agent@0.79.4':
+    resolution: {integrity: sha512-PthzVzM5m4XH/hrU+2fVjuwuH5M4eMFWbd0NCRScH14XKpwlPc8/Fh6JDz0jQb5kTBT9oQT183YLTHVVulFL9A==}
+    engines: {node: '>=22.19.0'}
+    hasBin: true
+
+  '@earendil-works/pi-tui@0.79.4':
+    resolution: {integrity: sha512-/ZhfFiHSBMH7AbDrBQIN+UWlJnl9tSEpLYICRGGMzmNfyCqX+30NYacIhyOEaD8R5rS6wJZysAOPU0yNwigbXw==}
+    engines: {node: '>=22.19.0'}
+
+  '@esbuild/aix-ppc64@0.23.1':
+    resolution: {integrity: sha512-6VhYk1diRqrhBAqpJEdjASR/+WVRtfjpqKuNw11cLiaWpAT/Uu+nokB+UJnevzy/P9C/ty6AOe0dwueMrGh/iQ==}
+    engines: {node: '>=18'}
+    cpu: [ppc64]
+    os: [aix]
+
+  '@esbuild/android-arm64@0.23.1':
+    resolution: {integrity: sha512-xw50ipykXcLstLeWH7WRdQuysJqejuAGPd30vd1i5zSyKK3WE+ijzHmLKxdiCMtH1pHz78rOg0BKSYOSB/2Khw==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [android]
+
+  '@esbuild/android-arm@0.23.1':
+    resolution: {integrity: sha512-uz6/tEy2IFm9RYOyvKl88zdzZfwEfKZmnX9Cj1BHjeSGNuGLuMD1kR8y5bteYmwqKm1tj8m4cb/aKEorr6fHWQ==}
+    engines: {node: '>=18'}
+    cpu: [arm]
+    os: [android]
+
+  '@esbuild/android-x64@0.23.1':
+    resolution: {integrity: sha512-nlN9B69St9BwUoB+jkyU090bru8L0NA3yFvAd7k8dNsVH8bi9a8cUAUSEcEEgTp2z3dbEDGJGfP6VUnkQnlReg==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [android]
+
+  '@esbuild/darwin-arm64@0.23.1':
+    resolution: {integrity: sha512-YsS2e3Wtgnw7Wq53XXBLcV6JhRsEq8hkfg91ESVadIrzr9wO6jJDMZnCQbHm1Guc5t/CdDiFSSfWP58FNuvT3Q==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [darwin]
+
+  '@esbuild/darwin-x64@0.23.1':
+    resolution: {integrity: sha512-aClqdgTDVPSEGgoCS8QDG37Gu8yc9lTHNAQlsztQ6ENetKEO//b8y31MMu2ZaPbn4kVsIABzVLXYLhCGekGDqw==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [darwin]
+
+  '@esbuild/freebsd-arm64@0.23.1':
+    resolution: {integrity: sha512-h1k6yS8/pN/NHlMl5+v4XPfikhJulk4G+tKGFIOwURBSFzE8bixw1ebjluLOjfwtLqY0kewfjLSrO6tN2MgIhA==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [freebsd]
+
+  '@esbuild/freebsd-x64@0.23.1':
+    resolution: {integrity: sha512-lK1eJeyk1ZX8UklqFd/3A60UuZ/6UVfGT2LuGo3Wp4/z7eRTRYY+0xOu2kpClP+vMTi9wKOfXi2vjUpO1Ro76g==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [freebsd]
+
+  '@esbuild/linux-arm64@0.23.1':
+    resolution: {integrity: sha512-/93bf2yxencYDnItMYV/v116zff6UyTjo4EtEQjUBeGiVpMmffDNUyD9UN2zV+V3LRV3/on4xdZ26NKzn6754g==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [linux]
+
+  '@esbuild/linux-arm@0.23.1':
+    resolution: {integrity: sha512-CXXkzgn+dXAPs3WBwE+Kvnrf4WECwBdfjfeYHpMeVxWE0EceB6vhWGShs6wi0IYEqMSIzdOF1XjQ/Mkm5d7ZdQ==}
+    engines: {node: '>=18'}
+    cpu: [arm]
+    os: [linux]
+
+  '@esbuild/linux-ia32@0.23.1':
+    resolution: {integrity: sha512-VTN4EuOHwXEkXzX5nTvVY4s7E/Krz7COC8xkftbbKRYAl96vPiUssGkeMELQMOnLOJ8k3BY1+ZY52tttZnHcXQ==}
+    engines: {node: '>=18'}
+    cpu: [ia32]
+    os: [linux]
+
+  '@esbuild/linux-loong64@0.23.1':
+    resolution: {integrity: sha512-Vx09LzEoBa5zDnieH8LSMRToj7ir/Jeq0Gu6qJ/1GcBq9GkfoEAoXvLiW1U9J1qE/Y/Oyaq33w5p2ZWrNNHNEw==}
+    engines: {node: '>=18'}
+    cpu: [loong64]
+    os: [linux]
+
+  '@esbuild/linux-mips64el@0.23.1':
+    resolution: {integrity: sha512-nrFzzMQ7W4WRLNUOU5dlWAqa6yVeI0P78WKGUo7lg2HShq/yx+UYkeNSE0SSfSure0SqgnsxPvmAUu/vu0E+3Q==}
+    engines: {node: '>=18'}
+    cpu: [mips64el]
+    os: [linux]
+
+  '@esbuild/linux-ppc64@0.23.1':
+    resolution: {integrity: sha512-dKN8fgVqd0vUIjxuJI6P/9SSSe/mB9rvA98CSH2sJnlZ/OCZWO1DJvxj8jvKTfYUdGfcq2dDxoKaC6bHuTlgcw==}
+    engines: {node: '>=18'}
+    cpu: [ppc64]
+    os: [linux]
+
+  '@esbuild/linux-riscv64@0.23.1':
+    resolution: {integrity: sha512-5AV4Pzp80fhHL83JM6LoA6pTQVWgB1HovMBsLQ9OZWLDqVY8MVobBXNSmAJi//Csh6tcY7e7Lny2Hg1tElMjIA==}
+    engines: {node: '>=18'}
+    cpu: [riscv64]
+    os: [linux]
+
+  '@esbuild/linux-s390x@0.23.1':
+    resolution: {integrity: sha512-9ygs73tuFCe6f6m/Tb+9LtYxWR4c9yg7zjt2cYkjDbDpV/xVn+68cQxMXCjUpYwEkze2RcU/rMnfIXNRFmSoDw==}
+    engines: {node: '>=18'}
+    cpu: [s390x]
+    os: [linux]
+
+  '@esbuild/linux-x64@0.23.1':
+    resolution: {integrity: sha512-EV6+ovTsEXCPAp58g2dD68LxoP/wK5pRvgy0J/HxPGB009omFPv3Yet0HiaqvrIrgPTBuC6wCH1LTOY91EO5hQ==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [linux]
+
+  '@esbuild/netbsd-x64@0.23.1':
+    resolution: {integrity: sha512-aevEkCNu7KlPRpYLjwmdcuNz6bDFiE7Z8XC4CPqExjTvrHugh28QzUXVOZtiYghciKUacNktqxdpymplil1beA==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [netbsd]
+
+  '@esbuild/openbsd-arm64@0.23.1':
+    resolution: {integrity: sha512-3x37szhLexNA4bXhLrCC/LImN/YtWis6WXr1VESlfVtVeoFJBRINPJ3f0a/6LV8zpikqoUg4hyXw0sFBt5Cr+Q==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [openbsd]
+
+  '@esbuild/openbsd-x64@0.23.1':
+    resolution: {integrity: sha512-aY2gMmKmPhxfU+0EdnN+XNtGbjfQgwZj43k8G3fyrDM/UdZww6xrWxmDkuz2eCZchqVeABjV5BpildOrUbBTqA==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [openbsd]
+
+  '@esbuild/sunos-x64@0.23.1':
+    resolution: {integrity: sha512-RBRT2gqEl0IKQABT4XTj78tpk9v7ehp+mazn2HbUeZl1YMdaGAQqhapjGTCe7uw7y0frDi4gS0uHzhvpFuI1sA==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [sunos]
+
+  '@esbuild/win32-arm64@0.23.1':
+    resolution: {integrity: sha512-4O+gPR5rEBe2FpKOVyiJ7wNDPA8nGzDuJ6gN4okSA1gEOYZ67N8JPk58tkWtdtPeLz7lBnY6I5L3jdsr3S+A6A==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [win32]
+
+  '@esbuild/win32-ia32@0.23.1':
+    resolution: {integrity: sha512-BcaL0Vn6QwCwre3Y717nVHZbAa4UBEigzFm6VdsVdT/MbZ38xoj1X9HPkZhbmaBGUD1W8vxAfffbDe8bA6AKnQ==}
+    engines: {node: '>=18'}
+    cpu: [ia32]
+    os: [win32]
+
+  '@esbuild/win32-x64@0.23.1':
+    resolution: {integrity: sha512-BHpFFeslkWrXWyUPnbKm+xYYVYruCinGcftSBaa8zoF9hZO4BcSCFUvHVTtzpIY6YzUnYtuEhZ+C9iEXjxnasg==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [win32]
+
+  '@google/genai@1.52.0':
+    resolution: {integrity: sha512-gwSvbpiN/17O9TbsqSsE/OzZcpv5Fo4RQjdngGgogtuB9RsyJ8ZHhX5KjHj1bp5N9snN2eK8LDGXSaWW2hof8Q==}
+    engines: {node: '>=20.0.0'}
+    peerDependencies:
+      '@modelcontextprotocol/sdk': ^1.25.2
+    peerDependenciesMeta:
+      '@modelcontextprotocol/sdk':
+        optional: true
+
+  '@mariozechner/clipboard-darwin-arm64@0.3.9':
+    resolution: {integrity: sha512-BfgV7vCEWZwJwZJw03r6bP5+tf0iI/ANuQYCxi9RNn7FrWB3yzGuMKCrNLRl6V761vXRdL8+OqZ0wd4TqlsNOQ==}
+    engines: {node: '>= 10'}
+    cpu: [arm64]
+    os: [darwin]
+
+  '@mariozechner/clipboard-darwin-universal@0.3.9':
+    resolution: {integrity: sha512-BGGR4iA9Z2shAjI65eI5xtyb3LYNlDW9X3gxKxDbqtbnREohsrqznov6zpKoIrsRWpzlYVEdKphS7ksJ0/ndSQ==}
+    engines: {node: '>= 10'}
+    os: [darwin]
+
+  '@mariozechner/clipboard-darwin-x64@0.3.9':
+    resolution: {integrity: sha512-4kURmCbS6nt8uYhtmWpUcJWyPHfmAr5dTpXD1nO3pIfa+TSQ9DbrGOYCKH+aEFW47XhQ4Vp8ZTszie+wfFvDKg==}
+    engines: {node: '>= 10'}
+    cpu: [x64]
+    os: [darwin]
+
+  '@mariozechner/clipboard-linux-arm64-gnu@0.3.9':
+    resolution: {integrity: sha512-g59OkUGP2DDfCOIKypHeYgv2M55u/cKvXa5dSxFbEJ34XvIQMdcVmpKCkGUro3ZgefXiGVdwguvTMQGpHWzIXw==}
+    engines: {node: '>= 10'}
+    cpu: [arm64]
+    os: [linux]
+    libc: [glibc]
+
+  '@mariozechner/clipboard-linux-arm64-musl@0.3.9':
+    resolution: {integrity: sha512-AGuJdgKsmJdm4Pych7kv3sqe591ERRaAHW3xjLooiFzn8J+PxUyof++7YZrB5Y5tpnTO+K18Og3taj2NpluCRQ==}
+    engines: {node: '>= 10'}
+    cpu: [arm64]
+    os: [linux]
+    libc: [musl]
+
+  '@mariozechner/clipboard-linux-riscv64-gnu@0.3.9':
+    resolution: {integrity: sha512-DXBEAiuMpk7dhS1a9NzNxVAFi1vaKoPu7rQNgY8LIDLGrK3lnIp3nT10DUum+PKVJoJppIP+NAA8IZe4DMNDPw==}
+    engines: {node: '>= 10'}
+    cpu: [riscv64]
+    os: [linux]
+    libc: [glibc]
+
+  '@mariozechner/clipboard-linux-x64-gnu@0.3.9':
+    resolution: {integrity: sha512-WORrMLd6EpElEME7JRKfSaY34nW1P5LbdgK5YNCS1ncG2LqmITsSMEJ8nh2mpvxb3TxqbOOKgY7k9eMJYlW9Mw==}
+    engines: {node: '>= 10'}
+    cpu: [x64]
+    os: [linux]
+    libc: [glibc]
+
+  '@mariozechner/clipboard-linux-x64-musl@0.3.9':
+    resolution: {integrity: sha512-/DHn+1DrfL6oRaPPWXaOKvonFFrni666fxd+zFqiQEfvBH0tsHVWjq9iqBk0oDp0qaPA72lIMy5BptxISBEhZQ==}
+    engines: {node: '>= 10'}
+    cpu: [x64]
+    os: [linux]
+    libc: [musl]
+
+  '@mariozechner/clipboard-win32-arm64-msvc@0.3.9':
+    resolution: {integrity: sha512-O5FHD3ErkMwMhNzAfu3ggy0ug4z7btZuoQgwwxlzPrwV2bxlD6WDpqBY4NCgICAgZdDKdp+loUEKVAVt8aYnhQ==}
+    engines: {node: '>= 10'}
+    cpu: [arm64]
+    os: [win32]
+
+  '@mariozechner/clipboard-win32-x64-msvc@0.3.9':
+    resolution: {integrity: sha512-ihQC3EufqEY81vhXBgVBtK4prL+wc62zJsSvxrgz7K1hsdt6OObz6v9p3Rn1OG3GJksTTKMJF0u/guMISHPhSA==}
+    engines: {node: '>= 10'}
+    cpu: [x64]
+    os: [win32]
+
+  '@mariozechner/clipboard@0.3.9':
+    resolution: {integrity: sha512-ABnA53mdfkGZwOFUdZNv2S0CWGO/EIuPj8Vv9xmBFmSYg/qFc7ihO6q5FcQjvoE67kZpWkEc4AhD6B/os04yuA==}
+    engines: {node: '>= 10'}
+
+  '@mistralai/mistralai@2.2.1':
+    resolution: {integrity: sha512-uKU8CZmL2RzYKmplsU01hii4p3pe4HqJefpWNRWXm1Tcm0Sm4xXfwSLIy4k7ZCPlbETCGcp69E7hZs+WOJ5itQ==}
+
+  '@nodable/entities@2.2.0':
+    resolution: {integrity: sha512-9uGyhaQavEUMC8AIddIjau4NsnsXhou+j5sBAGojCM1oxmQpVKTWR/9JxABD6UAv12vpIms55fPZKFQEhG6uBg==}
+
+  '@opentelemetry/api-logs@0.54.0':
+    resolution: {integrity: sha512-9HhEh5GqFrassUndqJsyW7a0PzfyWr2eV2xwzHLIS+wX3125+9HE9FMRAKmJRwxZhgZGwH3HNQQjoMGZqmOeVA==}
+    engines: {node: '>=14'}
+
+  '@opentelemetry/api@1.9.0':
+    resolution: {integrity: sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==}
+    engines: {node: '>=8.0.0'}
+
+  '@opentelemetry/context-async-hooks@1.28.0':
+    resolution: {integrity: sha512-igcl4Ve+F1N2063PJUkesk/GkYyuGIWinYkSyAFTnIj3gzrOgvOA4k747XNdL47HRRL1w/qh7UW8NDuxOLvKFA==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/core@1.27.0':
+    resolution: {integrity: sha512-yQPKnK5e+76XuiqUH/gKyS8wv/7qITd5ln56QkBTf3uggr0VkXOXfcaAuG330UfdYu83wsyoBwqwxigpIG+Jkg==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/core@1.28.0':
+    resolution: {integrity: sha512-ZLwRMV+fNDpVmF2WYUdBHlq0eOWtEaUJSusrzjGnBt7iSRvfjFE3RXYUZJrqou/wIDWV0DwQ5KIfYe9WXg9Xqw==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/exporter-trace-otlp-proto@0.54.0':
+    resolution: {integrity: sha512-cpDQj5wl7G8pLu3lW94SnMpn0C85A9Ehe7+JBow2IL5DGPWXTkynFngMtCC3PpQzQgzlyOVe0MVZfoBB3M5ECA==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
+  '@opentelemetry/otlp-exporter-base@0.54.0':
+    resolution: {integrity: sha512-g+H7+QleVF/9lz4zhaR9Dt4VwApjqG5WWupy5CTMpWJfHB/nLxBbX73GBZDgdiNfh08nO3rNa6AS7fK8OhgF5g==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
+  '@opentelemetry/otlp-transformer@0.54.0':
+    resolution: {integrity: sha512-jRexIASQQzdK4AjfNIBfn94itAq4Q8EXR9d3b/OVbhd3kKQKvMr7GkxYDjbeTbY7hHCOLcLfJ3dpYQYGOe8qOQ==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
+  '@opentelemetry/propagator-b3@1.28.0':
+    resolution: {integrity: sha512-Q7HVDIMwhN5RxL4bECMT4BdbyYSAKkC6U/RGn4NpO/cbqP6ZRg+BS7fPo/pGZi2w8AHfpIGQFXQmE8d2PC5xxQ==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/propagator-jaeger@1.28.0':
+    resolution: {integrity: sha512-wKJ94+s8467CnIRgoSRh0yXm/te0QMOwTq9J01PfG/RzYZvlvN8aRisN2oZ9SznB45dDGnMj3BhUlchSA9cEKA==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/resources@1.27.0':
+    resolution: {integrity: sha512-jOwt2VJ/lUD5BLc+PMNymDrUCpm5PKi1E9oSVYAvz01U/VdndGmrtV3DU1pG4AwlYhJRHbHfOUIlpBeXCPw6QQ==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/resources@1.28.0':
+    resolution: {integrity: sha512-cIyXSVJjGeTICENN40YSvLDAq4Y2502hGK3iN7tfdynQLKWb3XWZQEkPc+eSx47kiy11YeFAlYkEfXwR1w8kfw==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/sdk-logs@0.54.0':
+    resolution: {integrity: sha512-HeWvOPiWhEw6lWvg+lCIi1WhJnIPbI4/OFZgHq9tKfpwF3LX6/kk3+GR8sGUGAEZfbjPElkkngzvd2s03zbD7Q==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.4.0 <1.10.0'
+
+  '@opentelemetry/sdk-metrics@1.27.0':
+    resolution: {integrity: sha512-JzWgzlutoXCydhHWIbLg+r76m+m3ncqvkCcsswXAQ4gqKS+LOHKhq+t6fx1zNytvLuaOUBur7EvWxECc4jPQKg==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.3.0 <1.10.0'
+
+  '@opentelemetry/sdk-trace-base@1.27.0':
+    resolution: {integrity: sha512-btz6XTQzwsyJjombpeqCX6LhiMQYpzt2pIYNPnw0IPO/3AhT6yjnf8Mnv3ZC2A4eRYOjqrg+bfaXg9XHDRJDWQ==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/sdk-trace-base@1.28.0':
+    resolution: {integrity: sha512-ceUVWuCpIao7Y5xE02Xs3nQi0tOGmMea17ecBdwtCvdo9ekmO+ijc9RFDgfifMl7XCBf41zne/1POM3LqSTZDA==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/sdk-trace-node@1.28.0':
+    resolution: {integrity: sha512-N0sYfYXvHpP0FNIyc+UfhLnLSTOuZLytV0qQVrDWIlABeD/DWJIGttS7nYeR14gQLXch0M1DW8zm3VeN6Opwtg==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/semantic-conventions@1.27.0':
+    resolution: {integrity: sha512-sAay1RrB+ONOem0OZanAR1ZI/k7yDpnOQSQmTMuGImUQb2y8EbSaCJ94FQluM74xoU03vlb2d2U90hZluL6nQg==}
+    engines: {node: '>=14'}
+
+  '@opentelemetry/semantic-conventions@1.28.0':
+    resolution: {integrity: sha512-lp4qAiMTD4sNWW4DbKLBkfiMZ4jbAboJIGOQr5DvciMRI494OapieI9qiODpOt0XBr1LjIDy1xAGAnVs5supTA==}
+    engines: {node: '>=14'}
+
+  '@protobufjs/aspromise@1.1.2':
+    resolution: {integrity: sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==}
+
+  '@protobufjs/base64@1.1.2':
+    resolution: {integrity: sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==}
+
+  '@protobufjs/codegen@2.0.5':
+    resolution: {integrity: sha512-zgXFLzW3Ap33e6d0Wlj4MGIm6Ce8O89n/apUaGNB/jx+hw+ruWEp7EwGUshdLKVRCxZW12fp9r40E1mQrf/34g==}
+
+  '@protobufjs/eventemitter@1.1.1':
+    resolution: {integrity: sha512-vW1GmwMZNnL+gMRaovlh9yZX74kc+TTU3FObkkurpMaRtBfLP3ldjS9KQWlwZgraRE0+dheEEoAxdzcJQ8eXZg==}
+
+  '@protobufjs/fetch@1.1.1':
+    resolution: {integrity: sha512-GpptLrs57adMSuHi3VNj0mAF8dwh36LMaYF6XyJ6JMWlVsc+t42tm1HSEDmOs3A8fC9yyeisgLhsTVQokOZ0zw==}
+
+  '@protobufjs/float@1.0.2':
+    resolution: {integrity: sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==}
+
+  '@protobufjs/path@1.1.2':
+    resolution: {integrity: sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==}
+
+  '@protobufjs/pool@1.1.0':
+    resolution: {integrity: sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==}
+
+  '@protobufjs/utf8@1.1.1':
+    resolution: {integrity: sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg==}
+
+  '@silvia-odwyer/photon-node@0.3.4':
+    resolution: {integrity: sha512-bnly4BKB3KDTFxrUIcgCLbaeVVS8lrAkri1pEzskpmxu9MdfGQTy8b8EgcD83ywD3RPMsIulY8xJH5Awa+t9fA==}
+
+  '@smithy/core@3.24.7':
+    resolution: {integrity: sha512-KoUi4M1f3BG6kzN1FnCwL7oyFptTbyBJKjR6yhSib+JHRdUmM1o+VwsFtJ66NZCkCzVfJMWRHJNo0R0jznp0Pg==}
+    engines: {node: '>=18.0.0'}
+
+  '@smithy/credential-provider-imds@4.3.9':
+    resolution: {integrity: sha512-ZlfJ/4Fa3jYb+3eaohPfG9utX9HmdhFNcFtpoGAhUhdynAOmGXtmigbi7eEiONKM+ykHw8RwKuDEb85Lx7t7fA==}
+    engines: {node: '>=18.0.0'}
+
+  '@smithy/fetch-http-handler@5.4.7':
+    resolution: {integrity: sha512-NslaM2ir0N2hisDmzXLstPaVINZheh8SokyOC++kzFPloZucL2R7Y7bS57mSzx/1Fc/fqmn7twjkeezTTrV0EA==}
+    engines: {node: '>=18.0.0'}
+
+  '@smithy/is-array-buffer@2.2.0':
+    resolution: {integrity: sha512-GGP3O9QFD24uGeAXYUjwSTXARoqpZykHadOmA8G5vfJPK0/DC67qa//0qvqrJzL1xc8WQWX7/yc7fwudjPHPhA==}
+    engines: {node: '>=14.0.0'}
+
+  '@smithy/node-http-handler@4.7.3':
+    resolution: {integrity: sha512-/jPhevcTFPMVl6KNjbaI47iOg1zxC7IsnX4PQDGVZKMFceOXtB8IEYaB7a9VvkP/3oC60WzTeKocvSI7vLT0vA==}
+    engines: {node: '>=18.0.0'}
+
+  '@smithy/node-http-handler@4.7.8':
+    resolution: {integrity: sha512-f+DbsWUwSbtMu1a/j8Y93KiU1SRg9nyzfjereqn1BJ33QOTUXxdlYvVXMhAYl1vuR1Kmna5aIJe09KSIfyFNYw==}
+    engines: {node: '>=18.0.0'}
+
+  '@smithy/signature-v4@5.4.7':
+    resolution: {integrity: sha512-LwQZazFayImv+IOm0S0enoLeUJwmAlhGC5O6YCcLWezyu08dF46GOxPOq35OpBIHkgd7OvNvBStIFwVNyrvoBw==}
+    engines: {node: '>=18.0.0'}
+
+  '@smithy/types@4.14.4':
+    resolution: {integrity: sha512-B2S9+UGm1+/pHkcx3ZoLVX1a+pmSk8rqxRR+ZsNqZaJ5q9FWX9AFGQVM4qG5+OBeQUZVy99HY8HqW8gK/wgXzQ==}
+    engines: {node: '>=18.0.0'}
+
+  '@smithy/util-buffer-from@2.2.0':
+    resolution: {integrity: sha512-IJdWBbTcMQ6DA0gdNhh/BwrLkDR+ADW5Kr1aZmd4k3DIF6ezMV4R2NIAmT08wQJ3yUK82thHWmC/TnK/wpMMIA==}
+    engines: {node: '>=14.0.0'}
+
+  '@smithy/util-utf8@2.3.0':
+    resolution: {integrity: sha512-R8Rdn8Hy72KKcebgLiv8jQcQkXoLMOGGv5uI1/k0l+snqkOzQ1R0ChUBCxWMlBsFMekWjq0wRudIweFs7sKT5A==}
+    engines: {node: '>=14.0.0'}
+
+  '@types/node@22.10.2':
+    resolution: {integrity: sha512-Xxr6BBRCAOQixvonOye19wnzyDiUtTeqldOOmj3CkeblonbccA12PFwlufvRdrpjXxqnmUaeiU5EOA+7s5diUQ==}
+
+  '@types/retry@0.12.0':
+    resolution: {integrity: sha512-wWKOClTTiizcZhXnPY4wikVAwmdYHp8q6DmC+EJUzAMsycb7HB32Kh9RN4+0gExjmPmZSAQjgURXIGATPegAvA==}
+
+  agent-base@7.1.4:
+    resolution: {integrity: sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==}
+    engines: {node: '>= 14'}
+
+  anynum@1.0.0:
+    resolution: {integrity: sha512-xjR9/zBVnUOP6ztMIIgShjsxui80nQUQH+5xJnvrYLs+90bF25/KJqaAi8mk+B4RDtX1Nspi6fmp4YTEts8SfA==}
+
+  balanced-match@4.0.4:
+    resolution: {integrity: sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA==}
+    engines: {node: 18 || 20 || >=22}
+
+  base64-js@1.5.1:
+    resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==}
+
+  bignumber.js@9.3.1:
+    resolution: {integrity: sha512-Ko0uX15oIUS7wJ3Rb30Fs6SkVbLmPBAKdlm7q9+ak9bbIeFf0MwuBsQV6z7+X768/cHsfg+WlysDWJcmthjsjQ==}
+
+  bowser@2.14.1:
+    resolution: {integrity: sha512-tzPjzCxygAKWFOJP011oxFHs57HzIhOEracIgAePE4pqB3LikALKnSzUyU4MGs9/iCEUuHlAJTjTc5M+u7YEGg==}
+
+  brace-expansion@5.0.6:
+    resolution: {integrity: sha512-kLpxurY4Z4r9sgMsyG0Z9uzsBlgiU/EFKhj/h91/8yHu0edo7XuixOIH3VcJ8kkxs6/jPzoI6U9Vj3WqbMQ94g==}
+    engines: {node: 18 || 20 || >=22}
+
+  buffer-equal-constant-time@1.0.1:
+    resolution: {integrity: sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==}
+
+  chalk@5.6.2:
+    resolution: {integrity: sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA==}
+    engines: {node: ^12.17.0 || ^14.13 || >=16.0.0}
+
+  cross-spawn@7.0.6:
+    resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==}
+    engines: {node: '>= 8'}
+
+  data-uri-to-buffer@4.0.1:
+    resolution: {integrity: sha512-0R9ikRb668HB7QDxT1vkpuUBtqc53YyAwMwGeUFKRojY/NWKvdZ+9UYtRfGmhqNbRkTSVpMbmyhXipFFv2cb/A==}
+    engines: {node: '>= 12'}
+
+  debug@4.4.3:
+    resolution: {integrity: sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==}
+    engines: {node: '>=6.0'}
+    peerDependencies:
+      supports-color: '*'
+    peerDependenciesMeta:
+      supports-color:
+        optional: true
+
+  diff@8.0.4:
+    resolution: {integrity: sha512-DPi0FmjiSU5EvQV0++GFDOJ9ASQUVFh5kD+OzOnYdi7n3Wpm9hWWGfB/O2blfHcMVTL5WkQXSnRiK9makhrcnw==}
+    engines: {node: '>=0.3.1'}
+
+  ecdsa-sig-formatter@1.0.11:
+    resolution: {integrity: sha512-nagl3RYrbNv6kQkeJIpt6NJZy8twLB/2vtz6yN9Z4vRKHN4/QZJIEbqohALSgwKdnksuY3k5Addp5lg8sVoVcQ==}
+
+  esbuild@0.23.1:
+    resolution: {integrity: sha512-VVNz/9Sa0bs5SELtn3f7qhJCDPCF5oMEl5cO9/SSinpE9hbPVvxbd572HH5AKiP7WD8INO53GgfDDhRjkylHEg==}
+    engines: {node: '>=18'}
+    hasBin: true
+
+  extend@3.0.2:
+    resolution: {integrity: sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==}
+
+  fast-xml-builder@1.2.0:
+    resolution: {integrity: sha512-00aAWieqff+ZJhsXA4g1g7M8k+7AYoMUUHF+/zFb5U6Uv/P0Vl4QZo84/IcufzYalLuEj9928bXN9PbbFzMF0Q==}
+
+  fast-xml-parser@5.7.3:
+    resolution: {integrity: sha512-C0AaNuC+mscy6vrAQKAc/rMq+zAPHodfHGZu4sGVehvAQt/JLG1O5zEcYcXSY5zSqr4YVgxsB+pHXTq0i7eDlg==}
+    hasBin: true
+
+  fetch-blob@3.2.0:
+    resolution: {integrity: sha512-7yAQpD2UMJzLi1Dqv7qFYnPbaPx7ZfFK6PiIxQ4PfkGPyNyl2Ugx+a/umUonmKqjhM4DnfbMvdX6otXq83soQQ==}
+    engines: {node: ^12.20 || >= 14.13}
+
+  formdata-polyfill@4.0.10:
+    resolution: {integrity: sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g==}
+    engines: {node: '>=12.20.0'}
+
+  fsevents@2.3.3:
+    resolution: {integrity: sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==}
+    engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0}
+    os: [darwin]
+
+  gaxios@7.1.5:
+    resolution: {integrity: sha512-5FZy72Rh8LhtjmvDrKkI+lVhrsQrVKVsItxMoDm5mNQE+xR0WVIIs+jzPSJgBvKVsLi24fZhXJIsNI0bihDzFg==}
+    engines: {node: '>=18'}
+
+  gcp-metadata@8.1.2:
+    resolution: {integrity: sha512-zV/5HKTfCeKWnxG0Dmrw51hEWFGfcF2xiXqcA3+J90WDuP0SvoiSO5ORvcBsifmx/FoIjgQN3oNOGaQ5PhLFkg==}
+    engines: {node: '>=18'}
+
+  get-east-asian-width@1.6.0:
+    resolution: {integrity: sha512-QRbvDIbx6YklUe6RxeTeleMR0yv3cYH6PsPZHcnVn7xv7zO1BHN8r0XETu8n6Ye3Q+ahtSarc3WgtNWmehIBfA==}
+    engines: {node: '>=18'}
+
+  get-tsconfig@4.14.0:
+    resolution: {integrity: sha512-yTb+8DXzDREzgvYmh6s9vHsSVCHeC0G3PI5bEXNBHtmshPnO+S5O7qgLEOn0I5QvMy6kpZN8K1NKGyilLb93wA==}
+
+  glob@13.0.6:
+    resolution: {integrity: sha512-Wjlyrolmm8uDpm/ogGyXZXb1Z+Ca2B8NbJwqBVg0axK9GbBeoS7yGV6vjXnYdGm6X53iehEuxxbyiKp8QmN4Vw==}
+    engines: {node: 18 || 20 || >=22}
+
+  google-auth-library@10.7.0:
+    resolution: {integrity: sha512-QpTAbNJ36TliZLx3TTtahR8HG0hN9RllL1e3FymOvQSIKK8JmgV58H924ub2wa2DsS3ANjjP1Aw1N+Ramc8hqQ==}
+    engines: {node: '>=18'}
+
+  google-logging-utils@1.1.3:
+    resolution: {integrity: sha512-eAmLkjDjAFCVXg7A1unxHsLf961m6y17QFqXqAXGj/gVkKFrEICfStRfwUlGNfeCEjNRa32JEWOUTlYXPyyKvA==}
+    engines: {node: '>=14'}
+
+  graceful-fs@4.2.11:
+    resolution: {integrity: sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==}
+
+  highlight.js@10.7.3:
+    resolution: {integrity: sha512-tzcUFauisWKNHaRkN4Wjl/ZA07gENAjFl3J/c480dprkGTg5EQstgaNFqBfUqCq54kZRIEcreTsAgF/m2quD7A==}
+
+  hosted-git-info@9.0.3:
+    resolution: {integrity: sha512-Hc+ghLoSt6QaYZUv0WBiIvmMDZuZZ7oaDvdH8MbfOO4lOsxdXLEvuC6ePoGs9H1X9oCLyq6+NVN0MKqD+ydxyg==}
+    engines: {node: ^20.17.0 || >=22.9.0}
+
+  http-proxy-agent@7.0.2:
+    resolution: {integrity: sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==}
+    engines: {node: '>= 14'}
+
+  https-proxy-agent@7.0.6:
+    resolution: {integrity: sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==}
+    engines: {node: '>= 14'}
+
+  ignore@7.0.5:
+    resolution: {integrity: sha512-Hs59xBNfUIunMFgWAbGX5cq6893IbWg4KnrjbYwX3tx0ztorVgTDA6B2sxf8ejHJ4wz8BqGUMYlnzNBer5NvGg==}
+    engines: {node: '>= 4'}
+
+  isexe@2.0.0:
+    resolution: {integrity: sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==}
+
+  jiti@2.7.0:
+    resolution: {integrity: sha512-AC/7JofJvZGrrneWNaEnJeOLUx+JlGt7tNa0wZiRPT4MY1wmfKjt2+6O2p2uz2+skll8OZZmJMNqeke7kKbNgQ==}
+    hasBin: true
+
+  json-bigint@1.0.0:
+    resolution: {integrity: sha512-SiPv/8VpZuWbvLSMtTDU8hEfrZWg/mH/nV/b4o0CYbSxu1UIQPLdwKOCIyLQX+VIPO5vrLX3i8qtqFyhdPSUSQ==}
+
+  json-schema-to-ts@3.1.1:
+    resolution: {integrity: sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g==}
+    engines: {node: '>=16'}
+
+  jwa@2.0.1:
+    resolution: {integrity: sha512-hRF04fqJIP8Abbkq5NKGN0Bbr3JxlQ+qhZufXVr0DvujKy93ZCbXZMHDL4EOtodSbCWxOqR8MS1tXA5hwqCXDg==}
+
+  jws@4.0.1:
+    resolution: {integrity: sha512-EKI/M/yqPncGUUh44xz0PxSidXFr/+r0pA70+gIYhjv+et7yxM+s29Y+VGDkovRofQem0fs7Uvf4+YmAdyRduA==}
+
+  long@5.3.2:
+    resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==}
+
+  lru-cache@11.5.1:
+    resolution: {integrity: sha512-RPimw/7aMdv2oqRrxKwvZXcPfwBrn/JZ2xYcY9Hus/6LaS3VOAKVWKWgNLCFSiOm1ESXinjsDlidVU7JlnCN2A==}
+    engines: {node: 20 || >=22}
+
+  marked@15.0.12:
+    resolution: {integrity: sha512-8dD6FusOQSrpv9Z1rdNMdlSgQOIP880DHqnohobOmYLElGEqAL/JvxvuxZO16r4HtjTlfPRDC1hbvxC9dPN2nA==}
+    engines: {node: '>= 18'}
+    hasBin: true
+
+  minimatch@10.2.5:
+    resolution: {integrity: sha512-MULkVLfKGYDFYejP07QOurDLLQpcjk7Fw+7jXS2R2czRQzR56yHRveU5NDJEOviH+hETZKSkIk5c+T23GjFUMg==}
+    engines: {node: 18 || 20 || >=22}
+
+  minipass@7.1.3:
+    resolution: {integrity: sha512-tEBHqDnIoM/1rXME1zgka9g6Q2lcoCkxHLuc7ODJ5BxbP5d4c2Z5cGgtXAku59200Cx7diuHTOYfSBD8n6mm8A==}
+    engines: {node: '>=16 || 14 >=14.17'}
+
+  ms@2.1.3:
+    resolution: {integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==}
+
+  node-domexception@1.0.0:
+    resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==}
+    engines: {node: '>=10.5.0'}
+    deprecated: Use your platform's native DOMException instead
+
+  node-fetch@3.3.2:
+    resolution: {integrity: sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA==}
+    engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0}
+
+  openai@6.26.0:
+    resolution: {integrity: sha512-zd23dbWTjiJ6sSAX6s0HrCZi41JwTA1bQVs0wLQPZ2/5o2gxOJA5wh7yOAUgwYybfhDXyhwlpeQf7Mlgx8EOCA==}
+    hasBin: true
+    peerDependencies:
+      ws: ^8.18.0
+      zod: ^3.25 || ^4.0
+    peerDependenciesMeta:
+      ws:
+        optional: true
+      zod:
+        optional: true
+
+  p-retry@4.6.2:
+    resolution: {integrity: sha512-312Id396EbJdvRONlngUx0NydfrIQ5lsYu0znKVUzVvArzEIt08V1qhtyESbGVd1FGX7UKtiFp5uwKZdM8wIuQ==}
+    engines: {node: '>=8'}
+
+  partial-json@0.1.7:
+    resolution: {integrity: sha512-Njv/59hHaokb/hRUjce3Hdv12wd60MtM9Z5Olmn+nehe0QDAsRtRbJPvJ0Z91TusF0SuZRIvnM+S4l6EIP8leA==}
+
+  path-expression-matcher@1.5.0:
+    resolution: {integrity: sha512-cbrerZV+6rvdQrrD+iGMcZFEiiSrbv9Tfdkvnusy6y0x0GKBXREFg/Y65GhIfm0tnLntThhzCnfKwp1WRjeCyQ==}
+    engines: {node: '>=14.0.0'}
+
+  path-key@3.1.1:
+    resolution: {integrity: sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==}
+    engines: {node: '>=8'}
+
+  path-scurry@2.0.2:
+    resolution: {integrity: sha512-3O/iVVsJAPsOnpwWIeD+d6z/7PmqApyQePUtCndjatj/9I5LylHvt5qluFaBT3I5h3r1ejfR056c+FCv+NnNXg==}
+    engines: {node: 18 || 20 || >=22}
+
+  proper-lockfile@4.1.2:
+    resolution: {integrity: sha512-TjNPblN4BwAWMXU8s9AEz4JmQxnD1NNL7bNOY/AKUzyamc379FWASUhc/K1pL2noVb+XmZKLL68cjzLsiOAMaA==}
+
+  protobufjs@7.6.4:
+    resolution: {integrity: sha512-RJJPTTpvFfHcWLkIa2JFWK4XvtSzS0yEWDmunqHXli1h3JlkbcQZXDZdcWxv+JK3Xsl5/UFDPZ0iGm7DAengYw==}
+    engines: {node: '>=12.0.0'}
+
+  resolve-pkg-maps@1.0.0:
+    resolution: {integrity: sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==}
+
+  retry@0.12.0:
+    resolution: {integrity: sha512-9LkiTwjUh6rT555DtE9rTX+BKByPfrMzEAtnlEtdEwr3Nkffwiihqe2bWADg+OQRjt9gl6ICdmB/ZFDCGAtSow==}
+    engines: {node: '>= 4'}
+
+  retry@0.13.1:
+    resolution: {integrity: sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==}
+    engines: {node: '>= 4'}
+
+  safe-buffer@5.2.1:
+    resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==}
+
+  semver@7.8.0:
+    resolution: {integrity: sha512-AcM7dV/5ul4EekoQ29Agm5vri8JNqRyj39o0qpX6vDF2GZrtutZl5RwgD1XnZjiTAfncsJhMI48QQH3sN87YNA==}
+    engines: {node: '>=10'}
+    hasBin: true
+
+  shebang-command@2.0.0:
+    resolution: {integrity: sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==}
+    engines: {node: '>=8'}
+
+  shebang-regex@3.0.0:
+    resolution: {integrity: sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==}
+    engines: {node: '>=8'}
+
+  signal-exit@3.0.7:
+    resolution: {integrity: sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==}
+
+  strnum@2.4.0:
+    resolution: {integrity: sha512-sHrVyWWdq28RbhjuJdZsA1SnGRJV6NiXbk6AXBxDOsgAcA+lmpUZCYjOdLBxkXMwis6RRe7dlZt4VlIWFVzkmg==}
+
+  ts-algebra@2.0.0:
+    resolution: {integrity: sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw==}
+
+  tslib@2.8.1:
+    resolution: {integrity: sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==}
+
+  tsx@4.19.2:
+    resolution: {integrity: sha512-pOUl6Vo2LUq/bSa8S5q7b91cgNSjctn9ugq/+Mvow99qW6x/UZYwzxy/3NmqoT66eHYfCVvFvACC58UBPFf28g==}
+    engines: {node: '>=18.0.0'}
+    hasBin: true
+
+  typebox@1.1.38:
+    resolution: {integrity: sha512-pZ0aQPmMmXoUvSbeuWf/Hzsc+avNw/Zd6VeE8CFgkVGWyuHPJvqeJJDeJqLve+K70LvjYIoleGcoJHPT17cWoA==}
+
+  undici-types@6.20.0:
+    resolution: {integrity: sha512-Ny6QZ2Nju20vw1SRHe3d9jVu6gJ+4e3+MMpqu7pqE5HT6WsTSlce++GQmK5UXS8mzV8DSYHrQH+Xrf2jVcuKNg==}
+
+  undici@8.3.0:
+    resolution: {integrity: sha512-TkUDgb6tl7KOGZ+7e8E3d2FYgUQgF6z5YypqjWmixVQSQERFcVrVg0ySADm2LVLRh5ljAaHTCR5Fmz3Q34rB7Q==}
+    engines: {node: '>=22.19.0'}
+
+  web-streams-polyfill@3.3.3:
+    resolution: {integrity: sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==}
+    engines: {node: '>= 8'}
+
+  which@2.0.2:
+    resolution: {integrity: sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==}
+    engines: {node: '>= 8'}
+    hasBin: true
+
+  ws@8.21.0:
+    resolution: {integrity: sha512-Vsp28b7DRcimFQvrqu2Wek3z1iYxDCWqHYB8Qsnk/S4RfaCQzPGPyBNuVjJV3cd6UiKtUtp6sNM77gWvzcCH+g==}
+    engines: {node: '>=10.0.0'}
+    peerDependencies:
+      bufferutil: ^4.0.1
+      utf-8-validate: '>=5.0.2'
+    peerDependenciesMeta:
+      bufferutil:
+        optional: true
+      utf-8-validate:
+        optional: true
+
+  xml-naming@0.1.0:
+    resolution: {integrity: sha512-k8KO9hrMyNk6tUWqUfkTEZbezRRpONVOzUTnc97VnCvyj6Tf9lyUR9EDAIeiVLv56jsMcoXEwjW8Kv5yPY52lw==}
+    engines: {node: '>=16.0.0'}
+
+  yaml@2.9.0:
+    resolution: {integrity: sha512-2AvhNX3mb8zd6Zy7INTtSpl1F15HW6Wnqj0srWlkKLcpYl/gMIMJiyuGq2KeI2YFxUPjdlB+3Lc10seMLtL4cA==}
+    engines: {node: '>= 14.6'}
+    hasBin: true
+
+  zod-to-json-schema@3.25.2:
+    resolution: {integrity: sha512-O/PgfnpT1xKSDeQYSCfRI5Gy3hPf91mKVDuYLUHZJMiDFptvP41MSnWofm8dnCm0256ZNfZIM7DSzuSMAFnjHA==}
+    peerDependencies:
+      zod: ^3.25.28 || ^4
+
+  zod@4.4.3:
+    resolution: {integrity: sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ==}
+
+snapshots:
+
+  '@anthropic-ai/sdk@0.91.1(zod@4.4.3)':
+    dependencies:
+      json-schema-to-ts: 3.1.1
+    optionalDependencies:
+      zod: 4.4.3
+
+  '@aws-crypto/crc32@5.2.0':
+    dependencies:
+      '@aws-crypto/util': 5.2.0
+      '@aws-sdk/types': 3.973.12
+      tslib: 2.8.1
+
+  '@aws-crypto/sha256-browser@5.2.0':
+    dependencies:
+      '@aws-crypto/sha256-js': 5.2.0
+      '@aws-crypto/supports-web-crypto': 5.2.0
+      '@aws-crypto/util': 5.2.0
+      '@aws-sdk/types': 3.973.12
+      '@aws-sdk/util-locate-window': 3.965.7
+      '@smithy/util-utf8': 2.3.0
+      tslib: 2.8.1
+
+  '@aws-crypto/sha256-js@5.2.0':
+    dependencies:
+      '@aws-crypto/util': 5.2.0
+      '@aws-sdk/types': 3.973.12
+      tslib: 2.8.1
+
+  '@aws-crypto/supports-web-crypto@5.2.0':
+    dependencies:
+      tslib: 2.8.1
+
+  '@aws-crypto/util@5.2.0':
+    dependencies:
+      '@aws-sdk/types': 3.973.12
+      '@smithy/util-utf8': 2.3.0
+      tslib: 2.8.1
+
+  '@aws-sdk/client-bedrock-runtime@3.1048.0':
+    dependencies:
+      '@aws-crypto/sha256-browser': 5.2.0
+      '@aws-crypto/sha256-js': 5.2.0
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/credential-provider-node': 3.972.55
+      '@aws-sdk/eventstream-handler-node': 3.972.21
+      '@aws-sdk/middleware-eventstream': 3.972.17
+      '@aws-sdk/middleware-websocket': 3.972.28
+      '@aws-sdk/token-providers': 3.1048.0
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/fetch-http-handler': 5.4.7
+      '@smithy/node-http-handler': 4.7.3
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/core@3.974.20':
+    dependencies:
+      '@aws-sdk/types': 3.973.12
+      '@aws-sdk/xml-builder': 3.972.29
+      '@aws/lambda-invoke-store': 0.2.4
+      '@smithy/core': 3.24.7
+      '@smithy/signature-v4': 5.4.7
+      '@smithy/types': 4.14.4
+      bowser: 2.14.1
+      tslib: 2.8.1
+
+  '@aws-sdk/credential-provider-env@3.972.46':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/credential-provider-http@3.972.48':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/fetch-http-handler': 5.4.7
+      '@smithy/node-http-handler': 4.7.8
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/credential-provider-ini@3.972.53':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/credential-provider-env': 3.972.46
+      '@aws-sdk/credential-provider-http': 3.972.48
+      '@aws-sdk/credential-provider-login': 3.972.52
+      '@aws-sdk/credential-provider-process': 3.972.46
+      '@aws-sdk/credential-provider-sso': 3.972.52
+      '@aws-sdk/credential-provider-web-identity': 3.972.52
+      '@aws-sdk/nested-clients': 3.997.20
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/credential-provider-imds': 4.3.9
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/credential-provider-login@3.972.52':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/nested-clients': 3.997.20
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/credential-provider-node@3.972.55':
+    dependencies:
+      '@aws-sdk/credential-provider-env': 3.972.46
+      '@aws-sdk/credential-provider-http': 3.972.48
+      '@aws-sdk/credential-provider-ini': 3.972.53
+      '@aws-sdk/credential-provider-process': 3.972.46
+      '@aws-sdk/credential-provider-sso': 3.972.52
+      '@aws-sdk/credential-provider-web-identity': 3.972.52
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/credential-provider-imds': 4.3.9
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/credential-provider-process@3.972.46':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/credential-provider-sso@3.972.52':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/nested-clients': 3.997.20
+      '@aws-sdk/token-providers': 3.1066.0
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/credential-provider-web-identity@3.972.52':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/nested-clients': 3.997.20
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/eventstream-handler-node@3.972.21':
+    dependencies:
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/middleware-eventstream@3.972.17':
+    dependencies:
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/middleware-websocket@3.972.28':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/fetch-http-handler': 5.4.7
+      '@smithy/signature-v4': 5.4.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/nested-clients@3.997.20':
+    dependencies:
+      '@aws-crypto/sha256-browser': 5.2.0
+      '@aws-crypto/sha256-js': 5.2.0
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/signature-v4-multi-region': 3.996.34
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/fetch-http-handler': 5.4.7
+      '@smithy/node-http-handler': 4.7.8
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/signature-v4-multi-region@3.996.34':
+    dependencies:
+      '@aws-sdk/types': 3.973.12
+      '@smithy/signature-v4': 5.4.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/token-providers@3.1048.0':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/nested-clients': 3.997.20
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/token-providers@3.1066.0':
+    dependencies:
+      '@aws-sdk/core': 3.974.20
+      '@aws-sdk/nested-clients': 3.997.20
+      '@aws-sdk/types': 3.973.12
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/types@3.973.12':
+    dependencies:
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@aws-sdk/util-locate-window@3.965.7':
+    dependencies:
+      tslib: 2.8.1
+
+  '@aws-sdk/xml-builder@3.972.29':
+    dependencies:
+      '@smithy/types': 4.14.4
+      fast-xml-parser: 5.7.3
+      tslib: 2.8.1
+
+  '@aws/lambda-invoke-store@0.2.4': {}
+
+  '@babel/runtime@7.29.7': {}
+
+  '@earendil-works/pi-agent-core@0.79.4(ws@8.21.0)(zod@4.4.3)':
+    dependencies:
+      '@earendil-works/pi-ai': 0.79.4(ws@8.21.0)(zod@4.4.3)
+      ignore: 7.0.5
+      typebox: 1.1.38
+      yaml: 2.9.0
+    transitivePeerDependencies:
+      - '@modelcontextprotocol/sdk'
+      - bufferutil
+      - supports-color
+      - utf-8-validate
+      - ws
+      - zod
+
+  '@earendil-works/pi-ai@0.79.4(ws@8.21.0)(zod@4.4.3)':
+    dependencies:
+      '@anthropic-ai/sdk': 0.91.1(zod@4.4.3)
+      '@aws-sdk/client-bedrock-runtime': 3.1048.0
+      '@google/genai': 1.52.0
+      '@mistralai/mistralai': 2.2.1
+      '@smithy/node-http-handler': 4.7.3
+      http-proxy-agent: 7.0.2
+      https-proxy-agent: 7.0.6
+      openai: 6.26.0(ws@8.21.0)(zod@4.4.3)
+      partial-json: 0.1.7
+      typebox: 1.1.38
+    transitivePeerDependencies:
+      - '@modelcontextprotocol/sdk'
+      - bufferutil
+      - supports-color
+      - utf-8-validate
+      - ws
+      - zod
+
+  '@earendil-works/pi-coding-agent@0.79.4(ws@8.21.0)(zod@4.4.3)':
+    dependencies:
+      '@earendil-works/pi-agent-core': 0.79.4(ws@8.21.0)(zod@4.4.3)
+      '@earendil-works/pi-ai': 0.79.4(ws@8.21.0)(zod@4.4.3)
+      '@earendil-works/pi-tui': 0.79.4
+      '@silvia-odwyer/photon-node': 0.3.4
+      chalk: 5.6.2
+      cross-spawn: 7.0.6
+      diff: 8.0.4
+      glob: 13.0.6
+      highlight.js: 10.7.3
+      hosted-git-info: 9.0.3
+      ignore: 7.0.5
+      jiti: 2.7.0
+      minimatch: 10.2.5
+      proper-lockfile: 4.1.2
+      semver: 7.8.0
+      typebox: 1.1.38
+      undici: 8.3.0
+      yaml: 2.9.0
+    optionalDependencies:
+      '@mariozechner/clipboard': 0.3.9
+    transitivePeerDependencies:
+      - '@modelcontextprotocol/sdk'
+      - bufferutil
+      - supports-color
+      - utf-8-validate
+      - ws
+      - zod
+
+  '@earendil-works/pi-tui@0.79.4':
+    dependencies:
+      get-east-asian-width: 1.6.0
+      marked: 15.0.12
+
+  '@esbuild/aix-ppc64@0.23.1':
+    optional: true
+
+  '@esbuild/android-arm64@0.23.1':
+    optional: true
+
+  '@esbuild/android-arm@0.23.1':
+    optional: true
+
+  '@esbuild/android-x64@0.23.1':
+    optional: true
+
+  '@esbuild/darwin-arm64@0.23.1':
+    optional: true
+
+  '@esbuild/darwin-x64@0.23.1':
+    optional: true
+
+  '@esbuild/freebsd-arm64@0.23.1':
+    optional: true
+
+  '@esbuild/freebsd-x64@0.23.1':
+    optional: true
+
+  '@esbuild/linux-arm64@0.23.1':
+    optional: true
+
+  '@esbuild/linux-arm@0.23.1':
+    optional: true
+
+  '@esbuild/linux-ia32@0.23.1':
+    optional: true
+
+  '@esbuild/linux-loong64@0.23.1':
+    optional: true
+
+  '@esbuild/linux-mips64el@0.23.1':
+    optional: true
+
+  '@esbuild/linux-ppc64@0.23.1':
+    optional: true
+
+  '@esbuild/linux-riscv64@0.23.1':
+    optional: true
+
+  '@esbuild/linux-s390x@0.23.1':
+    optional: true
+
+  '@esbuild/linux-x64@0.23.1':
+    optional: true
+
+  '@esbuild/netbsd-x64@0.23.1':
+    optional: true
+
+  '@esbuild/openbsd-arm64@0.23.1':
+    optional: true
+
+  '@esbuild/openbsd-x64@0.23.1':
+    optional: true
+
+  '@esbuild/sunos-x64@0.23.1':
+    optional: true
+
+  '@esbuild/win32-arm64@0.23.1':
+    optional: true
+
+  '@esbuild/win32-ia32@0.23.1':
+    optional: true
+
+  '@esbuild/win32-x64@0.23.1':
+    optional: true
+
+  '@google/genai@1.52.0':
+    dependencies:
+      google-auth-library: 10.7.0
+      p-retry: 4.6.2
+      protobufjs: 7.6.4
+      ws: 8.21.0
+    transitivePeerDependencies:
+      - bufferutil
+      - supports-color
+      - utf-8-validate
+
+  '@mariozechner/clipboard-darwin-arm64@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard-darwin-universal@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard-darwin-x64@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard-linux-arm64-gnu@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard-linux-arm64-musl@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard-linux-riscv64-gnu@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard-linux-x64-gnu@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard-linux-x64-musl@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard-win32-arm64-msvc@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard-win32-x64-msvc@0.3.9':
+    optional: true
+
+  '@mariozechner/clipboard@0.3.9':
+    optionalDependencies:
+      '@mariozechner/clipboard-darwin-arm64': 0.3.9
+      '@mariozechner/clipboard-darwin-universal': 0.3.9
+      '@mariozechner/clipboard-darwin-x64': 0.3.9
+      '@mariozechner/clipboard-linux-arm64-gnu': 0.3.9
+      '@mariozechner/clipboard-linux-arm64-musl': 0.3.9
+      '@mariozechner/clipboard-linux-riscv64-gnu': 0.3.9
+      '@mariozechner/clipboard-linux-x64-gnu': 0.3.9
+      '@mariozechner/clipboard-linux-x64-musl': 0.3.9
+      '@mariozechner/clipboard-win32-arm64-msvc': 0.3.9
+      '@mariozechner/clipboard-win32-x64-msvc': 0.3.9
+    optional: true
+
+  '@mistralai/mistralai@2.2.1':
+    dependencies:
+      ws: 8.21.0
+      zod: 4.4.3
+      zod-to-json-schema: 3.25.2(zod@4.4.3)
+    transitivePeerDependencies:
+      - bufferutil
+      - utf-8-validate
+
+  '@nodable/entities@2.2.0': {}
+
+  '@opentelemetry/api-logs@0.54.0':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+
+  '@opentelemetry/api@1.9.0': {}
+
+  '@opentelemetry/context-async-hooks@1.28.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+
+  '@opentelemetry/core@1.27.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/semantic-conventions': 1.27.0
+
+  '@opentelemetry/core@1.28.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/semantic-conventions': 1.27.0
+
+  '@opentelemetry/exporter-trace-otlp-proto@0.54.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-exporter-base': 0.54.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-transformer': 0.54.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-base': 1.27.0(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/otlp-exporter-base@0.54.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-transformer': 0.54.0(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/otlp-transformer@0.54.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api-logs': 0.54.0
+      '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-logs': 0.54.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-metrics': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-base': 1.27.0(@opentelemetry/api@1.9.0)
+      protobufjs: 7.6.4
+
+  '@opentelemetry/propagator-b3@1.28.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/propagator-jaeger@1.28.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/resources@1.27.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions': 1.27.0
+
+  '@opentelemetry/resources@1.28.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions': 1.27.0
+
+  '@opentelemetry/sdk-logs@0.54.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api-logs': 0.54.0
+      '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/sdk-metrics@1.27.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/sdk-trace-base@1.27.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions': 1.27.0
+
+  '@opentelemetry/sdk-trace-base@1.28.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions': 1.27.0
+
+  '@opentelemetry/sdk-trace-node@1.28.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/context-async-hooks': 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/propagator-b3': 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/propagator-jaeger': 1.28.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-base': 1.28.0(@opentelemetry/api@1.9.0)
+      semver: 7.8.0
+
+  '@opentelemetry/semantic-conventions@1.27.0': {}
+
+  '@opentelemetry/semantic-conventions@1.28.0': {}
+
+  '@protobufjs/aspromise@1.1.2': {}
+
+  '@protobufjs/base64@1.1.2': {}
+
+  '@protobufjs/codegen@2.0.5': {}
+
+  '@protobufjs/eventemitter@1.1.1': {}
+
+  '@protobufjs/fetch@1.1.1':
+    dependencies:
+      '@protobufjs/aspromise': 1.1.2
+
+  '@protobufjs/float@1.0.2': {}
+
+  '@protobufjs/path@1.1.2': {}
+
+  '@protobufjs/pool@1.1.0': {}
+
+  '@protobufjs/utf8@1.1.1': {}
+
+  '@silvia-odwyer/photon-node@0.3.4': {}
+
+  '@smithy/core@3.24.7':
+    dependencies:
+      '@aws-crypto/crc32': 5.2.0
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@smithy/credential-provider-imds@4.3.9':
+    dependencies:
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@smithy/fetch-http-handler@5.4.7':
+    dependencies:
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@smithy/is-array-buffer@2.2.0':
+    dependencies:
+      tslib: 2.8.1
+
+  '@smithy/node-http-handler@4.7.3':
+    dependencies:
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@smithy/node-http-handler@4.7.8':
+    dependencies:
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@smithy/signature-v4@5.4.7':
+    dependencies:
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
+  '@smithy/types@4.14.4':
+    dependencies:
+      tslib: 2.8.1
+
+  '@smithy/util-buffer-from@2.2.0':
+    dependencies:
+      '@smithy/is-array-buffer': 2.2.0
+      tslib: 2.8.1
+
+  '@smithy/util-utf8@2.3.0':
+    dependencies:
+      '@smithy/util-buffer-from': 2.2.0
+      tslib: 2.8.1
+
+  '@types/node@22.10.2':
+    dependencies:
+      undici-types: 6.20.0
+
+  '@types/retry@0.12.0': {}
+
+  agent-base@7.1.4: {}
+
+  anynum@1.0.0: {}
+
+  balanced-match@4.0.4: {}
+
+  base64-js@1.5.1: {}
+
+  bignumber.js@9.3.1: {}
+
+  bowser@2.14.1: {}
+
+  brace-expansion@5.0.6:
+    dependencies:
+      balanced-match: 4.0.4
+
+  buffer-equal-constant-time@1.0.1: {}
+
+  chalk@5.6.2: {}
+
+  cross-spawn@7.0.6:
+    dependencies:
+      path-key: 3.1.1
+      shebang-command: 2.0.0
+      which: 2.0.2
+
+  data-uri-to-buffer@4.0.1: {}
+
+  debug@4.4.3:
+    dependencies:
+      ms: 2.1.3
+
+  diff@8.0.4: {}
+
+  ecdsa-sig-formatter@1.0.11:
+    dependencies:
+      safe-buffer: 5.2.1
+
+  esbuild@0.23.1:
+    optionalDependencies:
+      '@esbuild/aix-ppc64': 0.23.1
+      '@esbuild/android-arm': 0.23.1
+      '@esbuild/android-arm64': 0.23.1
+      '@esbuild/android-x64': 0.23.1
+      '@esbuild/darwin-arm64': 0.23.1
+      '@esbuild/darwin-x64': 0.23.1
+      '@esbuild/freebsd-arm64': 0.23.1
+      '@esbuild/freebsd-x64': 0.23.1
+      '@esbuild/linux-arm': 0.23.1
+      '@esbuild/linux-arm64': 0.23.1
+      '@esbuild/linux-ia32': 0.23.1
+      '@esbuild/linux-loong64': 0.23.1
+      '@esbuild/linux-mips64el': 0.23.1
+      '@esbuild/linux-ppc64': 0.23.1
+      '@esbuild/linux-riscv64': 0.23.1
+      '@esbuild/linux-s390x': 0.23.1
+      '@esbuild/linux-x64': 0.23.1
+      '@esbuild/netbsd-x64': 0.23.1
+      '@esbuild/openbsd-arm64': 0.23.1
+      '@esbuild/openbsd-x64': 0.23.1
+      '@esbuild/sunos-x64': 0.23.1
+      '@esbuild/win32-arm64': 0.23.1
+      '@esbuild/win32-ia32': 0.23.1
+      '@esbuild/win32-x64': 0.23.1
+
+  extend@3.0.2: {}
+
+  fast-xml-builder@1.2.0:
+    dependencies:
+      path-expression-matcher: 1.5.0
+      xml-naming: 0.1.0
+
+  fast-xml-parser@5.7.3:
+    dependencies:
+      '@nodable/entities': 2.2.0
+      fast-xml-builder: 1.2.0
+      path-expression-matcher: 1.5.0
+      strnum: 2.4.0
+
+  fetch-blob@3.2.0:
+    dependencies:
+      node-domexception: 1.0.0
+      web-streams-polyfill: 3.3.3
+
+  formdata-polyfill@4.0.10:
+    dependencies:
+      fetch-blob: 3.2.0
+
+  fsevents@2.3.3:
+    optional: true
+
+  gaxios@7.1.5:
+    dependencies:
+      extend: 3.0.2
+      https-proxy-agent: 7.0.6
+      node-fetch: 3.3.2
+    transitivePeerDependencies:
+      - supports-color
+
+  gcp-metadata@8.1.2:
+    dependencies:
+      gaxios: 7.1.5
+      google-logging-utils: 1.1.3
+      json-bigint: 1.0.0
+    transitivePeerDependencies:
+      - supports-color
+
+  get-east-asian-width@1.6.0: {}
+
+  get-tsconfig@4.14.0:
+    dependencies:
+      resolve-pkg-maps: 1.0.0
+
+  glob@13.0.6:
+    dependencies:
+      minimatch: 10.2.5
+      minipass: 7.1.3
+      path-scurry: 2.0.2
+
+  google-auth-library@10.7.0:
+    dependencies:
+      base64-js: 1.5.1
+      ecdsa-sig-formatter: 1.0.11
+      gaxios: 7.1.5
+      gcp-metadata: 8.1.2
+      google-logging-utils: 1.1.3
+      jws: 4.0.1
+    transitivePeerDependencies:
+      - supports-color
+
+  google-logging-utils@1.1.3: {}
+
+  graceful-fs@4.2.11: {}
+
+  highlight.js@10.7.3: {}
+
+  hosted-git-info@9.0.3:
+    dependencies:
+      lru-cache: 11.5.1
+
+  http-proxy-agent@7.0.2:
+    dependencies:
+      agent-base: 7.1.4
+      debug: 4.4.3
+    transitivePeerDependencies:
+      - supports-color
+
+  https-proxy-agent@7.0.6:
+    dependencies:
+      agent-base: 7.1.4
+      debug: 4.4.3
+    transitivePeerDependencies:
+      - supports-color
+
+  ignore@7.0.5: {}
+
+  isexe@2.0.0: {}
+
+  jiti@2.7.0: {}
+
+  json-bigint@1.0.0:
+    dependencies:
+      bignumber.js: 9.3.1
+
+  json-schema-to-ts@3.1.1:
+    dependencies:
+      '@babel/runtime': 7.29.7
+      ts-algebra: 2.0.0
+
+  jwa@2.0.1:
+    dependencies:
+      buffer-equal-constant-time: 1.0.1
+      ecdsa-sig-formatter: 1.0.11
+      safe-buffer: 5.2.1
+
+  jws@4.0.1:
+    dependencies:
+      jwa: 2.0.1
+      safe-buffer: 5.2.1
+
+  long@5.3.2: {}
+
+  lru-cache@11.5.1: {}
+
+  marked@15.0.12: {}
+
+  minimatch@10.2.5:
+    dependencies:
+      brace-expansion: 5.0.6
+
+  minipass@7.1.3: {}
+
+  ms@2.1.3: {}
+
+  node-domexception@1.0.0: {}
+
+  node-fetch@3.3.2:
+    dependencies:
+      data-uri-to-buffer: 4.0.1
+      fetch-blob: 3.2.0
+      formdata-polyfill: 4.0.10
+
+  openai@6.26.0(ws@8.21.0)(zod@4.4.3):
+    optionalDependencies:
+      ws: 8.21.0
+      zod: 4.4.3
+
+  p-retry@4.6.2:
+    dependencies:
+      '@types/retry': 0.12.0
+      retry: 0.13.1
+
+  partial-json@0.1.7: {}
+
+  path-expression-matcher@1.5.0: {}
+
+  path-key@3.1.1: {}
+
+  path-scurry@2.0.2:
+    dependencies:
+      lru-cache: 11.5.1
+      minipass: 7.1.3
+
+  proper-lockfile@4.1.2:
+    dependencies:
+      graceful-fs: 4.2.11
+      retry: 0.12.0
+      signal-exit: 3.0.7
+
+  protobufjs@7.6.4:
+    dependencies:
+      '@protobufjs/aspromise': 1.1.2
+      '@protobufjs/base64': 1.1.2
+      '@protobufjs/codegen': 2.0.5
+      '@protobufjs/eventemitter': 1.1.1
+      '@protobufjs/fetch': 1.1.1
+      '@protobufjs/float': 1.0.2
+      '@protobufjs/path': 1.1.2
+      '@protobufjs/pool': 1.1.0
+      '@protobufjs/utf8': 1.1.1
+      '@types/node': 22.10.2
+      long: 5.3.2
+
+  resolve-pkg-maps@1.0.0: {}
+
+  retry@0.12.0: {}
+
+  retry@0.13.1: {}
+
+  safe-buffer@5.2.1: {}
+
+  semver@7.8.0: {}
+
+  shebang-command@2.0.0:
+    dependencies:
+      shebang-regex: 3.0.0
+
+  shebang-regex@3.0.0: {}
+
+  signal-exit@3.0.7: {}
+
+  strnum@2.4.0:
+    dependencies:
+      anynum: 1.0.0
+
+  ts-algebra@2.0.0: {}
+
+  tslib@2.8.1: {}
+
+  tsx@4.19.2:
+    dependencies:
+      esbuild: 0.23.1
+      get-tsconfig: 4.14.0
+    optionalDependencies:
+      fsevents: 2.3.3
+
+  typebox@1.1.38: {}
+
+  undici-types@6.20.0: {}
+
+  undici@8.3.0: {}
+
+  web-streams-polyfill@3.3.3: {}
+
+  which@2.0.2:
+    dependencies:
+      isexe: 2.0.0
+
+  ws@8.21.0: {}
+
+  xml-naming@0.1.0: {}
+
+  yaml@2.9.0: {}
+
+  zod-to-json-schema@3.25.2(zod@4.4.3):
+    dependencies:
+      zod: 4.4.3
+
+  zod@4.4.3: {}
diff --git a/services/agent/scripts/register_agent_app.py b/services/agent/scripts/register_agent_app.py
new file mode 100644
index 0000000000..1e73c0515f
--- /dev/null
+++ b/services/agent/scripts/register_agent_app.py
@@ -0,0 +1,166 @@
+# /// script
+# requires-python = ">=3.11"
+# dependencies = ["requests"]
+# ///
+"""Register the agent as an app in a running Agenta stack, pointing at the dockerized
+agent service. Run it, then open the app in the playground and chat.
+
+It creates a workflow + default variant and commits a revision whose `data.url` points
+at the agent service and whose `data.schemas` is the chat interface the agent serves
+from /inspect (so the playground renders a chat box). This is the "custom workflow"
+path: no static SDK interface, the agent self-describes.
+
+Env:
+  AGENTA_HOST     base host (default http://144.76.237.122:8280)
+  AGENTA_API_KEY  api key for that stack (Authorization: ApiKey ...)
+  AGENT_URL       agent service invoke base (default http://144.76.237.122:8092/agent/v0)
+  PROJECT_ID      optional; defaults to the stack's default project
+  APP_SLUG        optional; defaults to wp2-agent-<6 random hex chars>
+
+Usage:
+  AGENTA_API_KEY=... uv run services/agent/scripts/register_agent_app.py
+"""
+
+import os
+import secrets
+import sys
+
+import requests
+
+HOST = os.environ.get("AGENTA_HOST", "http://144.76.237.122:8280").rstrip("/")
+API = HOST + "/api"  # prefix of every API URL this script calls
+KEY = os.environ.get("AGENTA_API_KEY")
+AGENT_URL = os.environ.get("AGENT_URL", "http://144.76.237.122:8092/agent/v0").rstrip(
+    "/"
+)
+PROJECT_ID = os.environ.get("PROJECT_ID")  # None -> main() looks one up
+APP_SLUG = os.environ.get("APP_SLUG") or f"wp2-agent-{secrets.token_hex(3)}"  # 6 hex chars
+
+if not KEY:  # checked at module load, before any request is made
+    sys.exit("Set AGENTA_API_KEY")
+
+H = {"Authorization": f"ApiKey {KEY}", "Content-Type": "application/json"}  # sent on every request
+
+# The chat interface the agent advertises via /inspect (kept in sync with
+# services/oss/src/agent_pi/schemas.py).
+SCHEMA = "https://json-schema.org/draft/2020-12/schema"  # "$schema" value of each schema below
+AGENT_SCHEMAS = {
+    "inputs": {
+        "$schema": SCHEMA,
+        "type": "object",
+        "additionalProperties": True,
+        "properties": {
+            "messages": {
+                "x-ag-type-ref": "messages",
+                "type": "array",
+                "description": "Ordered list of normalized chat messages.",
+            }
+        },
+    },
+    "parameters": {
+        "$schema": SCHEMA,
+        "type": "object",
+        "additionalProperties": True,
+        "properties": {"model": {"type": "string", "description": "Model override."}},
+    },
+    "outputs": {
+        "$schema": SCHEMA,
+        "x-ag-type-ref": "message",
+        "type": "object",
+        "description": "Final assistant message returned by the agent.",
+    },
+}
+
+
+def _id() -> str:  # fresh random identifier; main() uses it as a revision slug
+    return secrets.token_hex(6)  # 6 random bytes -> 12 hex characters
+
+
+def post(path: str, body: dict) -> dict:  # POST `body` as JSON to API + path, return the decoded JSON reply
+    r = requests.post(
+        f"{API}{path}",
+        json=body,
+        headers=H,
+        params={"project_id": PROJECT_ID},  # module global, read at call time
+        timeout=60,
+    )
+    if r.status_code >= 300:  # any status >= 300 aborts the whole script
+        sys.exit(f"POST {path} -> {r.status_code}: {r.text[:600]}")  # response text cut to 600 chars
+    return r.json()
+
+
+def main() -> None:  # create workflow -> variant -> seed revision -> revision with the agent data
+    global PROJECT_ID  # rebound below so post() sends the resolved id
+    if not PROJECT_ID:  # PROJECT_ID not given in the environment: ask the API
+        projects = requests.get(f"{API}/projects", headers=H, timeout=30).json()  # NOTE(review): no status check
+        default = next(
+            (p for p in projects if p.get("is_default_project")), projects[0]  # NOTE(review): IndexError if the list is empty
+        )
+        PROJECT_ID = default["project_id"]
+    print(f"project_id={PROJECT_ID}  app_slug={APP_SLUG}  agent_url={AGENT_URL}")
+
+    wf = post(
+        "/workflows/",
+        {
+            "workflow": {
+                "slug": APP_SLUG,
+                "name": APP_SLUG,
+                "flags": {"is_application": True},
+            }
+        },
+    )
+    workflow_id = wf["workflow"]["id"]  # reused by the variant and both revisions
+
+    var = post(
+        "/workflows/variants/",
+        {
+            "workflow_variant": {
+                "workflow_id": workflow_id,
+                "slug": f"{APP_SLUG}.default",
+                "name": "default",
+            }
+        },
+    )
+    variant_id = var["workflow_variant"]["id"]
+
+    # Seed v0 (tables dismiss v0), then commit v1 with the real data.
+    post(  # first commit carries no "data"; its response is discarded
+        "/workflows/revisions/commit",
+        {
+            "workflow_revision": {
+                "workflow_id": workflow_id,
+                "workflow_variant_id": variant_id,
+                "slug": _id(),
+                "name": "default",
+                "message": "Initial commit",
+            }
+        },
+    )
+    rev = post(  # second commit stores the service URL, parameters and schemas
+        "/workflows/revisions/commit",
+        {
+            "workflow_revision": {
+                "workflow_id": workflow_id,
+                "workflow_variant_id": variant_id,
+                "slug": _id(),
+                "name": "default",
+                "message": "Agent service",
+                "flags": {"is_chat": True},
+                "data": {
+                    "url": AGENT_URL,  # points the revision at the agent service
+                    "parameters": {"model": "gpt-5.5"},
+                    "schemas": AGENT_SCHEMAS,
+                },
+            }
+        },
+    )
+    revision = rev["workflow_revision"]
+    print(f"workflow_id={workflow_id}")
+    print(f"variant_id={variant_id}")
+    print(f"revision_id={revision['id']}  flags={revision.get('flags')}")
+    print(f"stored url={revision.get('data', {}).get('url')}")  # echoes what the API returned
+    print(f"\nOpen the playground: {HOST}/apps/{workflow_id}/playground")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/services/agent/src/agenta-otel.ts b/services/agent/src/agenta-otel.ts
new file mode 100644
index 0000000000..3d838329a1
--- /dev/null
+++ b/services/agent/src/agenta-otel.ts
@@ -0,0 +1,551 @@
+/**
+ * agenta-otel — a Pi extension that turns Pi's `pi.on(...)` lifecycle events into
+ * OpenTelemetry spans and exports them (OTLP/HTTP protobuf) to Agenta.
+ *
+ * This is the service build of the WP-1 POC extension
+ * (docs/design/agent-workflows/wp-1-pi-tracing/poc/agenta-otel.ts). It keeps the
+ * span tree and the load-bearing attribute choices identical, and adds three
+ * things the service needs that the single-run POC did not:
+ *
+ *   1. Per-run state. The POC kept span state in module globals because it ran one
+ *      prompt at a time. The service may drive several runs in one process (the
+ *      HTTP sidecar), so all per-run state lives in the closure returned by
+ *      `createAgentaOtel`. The shared tracer/provider/exporters stay module-level.
+ *   2. Cross-boundary trace context. The caller (the Agenta Python service) passes a
+ *      W3C `traceparent`. When present, `invoke_agent` is started as a CHILD of that
+ *      remote span, so the whole agent run joins the same trace as the `/invoke`
+ *      request — the agent's work becomes part of the response trace, the way
+ *      completion/chat nest their LLM spans under the workflow span.
+ *   3. Per-trace export target. The OTLP endpoint and `Authorization` header come
+ *      from the run config (the caller's host + credentials), falling back to env.
+ *      Each trace is exported with its own target, so a shared process can serve
+ *      more than one project.
+ *
+ * Span tree (per user prompt), unchanged from the POC:
+ *   invoke_agent            (openinference.span.kind = AGENT)
+ *     turn N                (CHAIN)
+ *       chat <model>        (LLM)   — the provider request for that turn
+ *       execute_tool <name> (TOOL)  — each tool the turn ran
+ *
+ * Config (read lazily from the environment for the fallback target):
+ *   AGENTA_HOST, AGENTA_API_KEY  — fallback exporter endpoint + auth
+ *   OTEL_SERVICE_NAME            — resource service.name (default "pi-agent")
+ */
+import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
+import {
+  context,
+  ROOT_CONTEXT,
+  trace,
+  TraceFlags,
+  SpanStatusCode,
+  type Context,
+  type Span,
+  type SpanContext,
+} from "@opentelemetry/api";
+import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-proto";
+import { Resource } from "@opentelemetry/resources";
+import type {
+  ReadableSpan,
+  SpanExporter,
+  SpanProcessor,
+} from "@opentelemetry/sdk-trace-base";
+import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
+import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
+
+// ---------------------------------------------------------------------------
+// Shared, process-wide tracing infrastructure
+// ---------------------------------------------------------------------------
+
+/** Where a trace's spans are shipped: an OTLP endpoint and an Authorization header. */
+interface ExportTarget {
+  endpoint: string;
+  authorization?: string;
+}
+
+/** traceId (hex) -> where that trace's spans should be exported. Set on agent_start. */
+const traceTargets = new Map<string, ExportTarget>();
+
+/** Cache one exporter per distinct endpoint+auth so we do not rebuild per export. */
+const exporterCache = new Map<string, OTLPTraceExporter>();
+
+function targetKey(target: ExportTarget): string {
+  return `${target.endpoint}\n${target.authorization ?? ""}`;
+}
+
+/** Return the exporter cached for `target`, creating and caching it on first use. */
+function getExporter(target: ExportTarget): OTLPTraceExporter {
+  const cacheKey = targetKey(target);
+  const cached = exporterCache.get(cacheKey);
+  if (cached) return cached;
+  const headers: Record<string, string> = {};
+  if (target.authorization) headers.Authorization = target.authorization;
+  const created = new OTLPTraceExporter({
+    url: target.endpoint,
+    headers,
+    timeoutMillis: 10_000,
+  });
+  exporterCache.set(cacheKey, created);
+  return created;
+}
+
+/** Env-derived export target, used when a trace was started without an explicit one. */
+function defaultTarget(): ExportTarget {
+  const rawHost = process.env.AGENTA_HOST || "https://cloud.agenta.ai";
+  // Strip trailing slashes so the path below is joined with a single "/".
+  const baseUrl = rawHost.replace(/\/+$/, "");
+  const key = process.env.AGENTA_API_KEY || "";
+  const authorization = key ? `ApiKey ${key}` : undefined;
+  return {
+    endpoint: `${baseUrl}/api/otlp/v1/traces`,
+    authorization,
+  };
+}
+
+/**
+ * Buffer a trace's spans and export them in ONE OTLP batch: Agenta rolls up
+ * token/cost metrics per ingest batch, so a split trace loses the root
+ * aggregation. Flush happens when the local root span ends (standalone run) or
+ * when the run flushes by trace id (remote parent: root-end never fires).
+ */
+class TraceBatchProcessor implements SpanProcessor {
+  private readonly buffers = new Map<string, ReadableSpan[]>();
+  private readonly inflight = new Map<string, Promise<void>>(); // traceId -> export in flight
+
+  onStart(): void {}
+  onEnd(span: ReadableSpan): void {
+    const traceId = span.spanContext().traceId;
+    const spans = this.buffers.get(traceId) ?? [];
+    spans.push(span);
+    this.buffers.set(traceId, spans);
+    // No parent in this process => this is the local root and the trace is done.
+    if (!span.parentSpanId) void this.flush(traceId);
+  }
+
+  /** Export and drop one trace's spans; resolves once all its exports returned. */
+  flush(traceId: string): Promise<void> {
+    // A root-end flush may still be exporting; return it rather than resolve early.
+    const pending = this.inflight.get(traceId) ?? Promise.resolve();
+    const spans = this.buffers.get(traceId);
+    if (!spans || spans.length === 0) return pending;
+    this.buffers.delete(traceId);
+    const target = traceTargets.get(traceId) ?? defaultTarget();
+    traceTargets.delete(traceId);
+    const exported = new Promise<void>((resolve) =>
+      getExporter(target).export(orderParentFirst(spans), () => resolve()),
+    );
+    const done: Promise<void> = Promise.all([pending, exported]).then(() => {
+      if (this.inflight.get(traceId) === done) this.inflight.delete(traceId);
+    });
+    this.inflight.set(traceId, done);
+    return done;
+  }
+
+  /** Flush every buffered trace and await exports that are already in flight. */
+  forceFlush(): Promise<void> {
+    const ids = new Set([...this.buffers.keys(), ...this.inflight.keys()]);
+    return Promise.all([...ids].map((id) => this.flush(id))).then(() => undefined);
+  }
+
+  async shutdown(): Promise<void> {
+    await this.forceFlush();
+    await Promise.all([...exporterCache.values()].map((exporter) => exporter.shutdown()));
+  }
+}
+
+let provider: NodeTracerProvider | undefined;
+let processor: TraceBatchProcessor | undefined;
+
+function ensureProvider(): void {
+  if (provider) return;
+  processor = new TraceBatchProcessor();
+  provider = new NodeTracerProvider({
+    resource: new Resource({
+      [ATTR_SERVICE_NAME]: process.env.OTEL_SERVICE_NAME || "pi-agent",
+    }),
+  });
+  provider.addSpanProcessor(processor);
+  provider.register();
+}
+
+/** Flush one trace's spans to Agenta. Call after a run whose root has a remote parent. */
+export async function flushTrace(traceId?: string): Promise<void> {
+  if (!processor || !traceId) return;
+  await processor.flush(traceId);
+}
+
+/** Flush and shut down all exporters. Call once on process exit, not per run. */
+export async function shutdownTracing(): Promise<void> {
+  if (!provider) return;
+  try {
+    await provider.forceFlush();
+    await provider.shutdown();
+  } finally {
+    provider = undefined;
+    processor = undefined;
+    exporterCache.clear();
+  }
+}
+
+/**
+ * Order spans parent-before-child (preorder DFS). Agenta stores timestamps at
+ * millisecond resolution and builds its roll-up tree by sorting on start_time,
+ * attaching a span only if its parent is already seen. A parent-first request
+ * order keeps parents ahead of children on same-millisecond ties.
+ */
+function orderParentFirst(spans: ReadableSpan[]): ReadableSpan[] {
+  const byId = new Map(spans.map((s) => [s.spanContext().spanId, s]));
+  const childrenOf = new Map<string, ReadableSpan[]>();
+  const roots: ReadableSpan[] = [];
+  for (const s of spans) {
+    const parentId = s.parentSpanId;
+    if (parentId && byId.has(parentId)) {
+      const list = childrenOf.get(parentId) ?? [];
+      list.push(s);
+      childrenOf.set(parentId, list);
+    } else {
+      roots.push(s);
+    }
+  }
+  const ordered: ReadableSpan[] = [];
+  const visit = (s: ReadableSpan) => {
+    ordered.push(s);
+    for (const child of childrenOf.get(s.spanContext().spanId) ?? []) visit(child);
+  };
+  roots.forEach(visit);
+  // Any spans not reached (defensive) get appended so nothing is dropped.
+  if (ordered.length !== spans.length) {
+    const seen = new Set(ordered);
+    for (const s of spans) if (!seen.has(s)) ordered.push(s);
+  }
+  return ordered;
+}
+
+/** Build a parent Context from a version-00 W3C traceparent, or undefined if absent/malformed. */
+function parentContext(traceparent?: string): Context | undefined {
+  if (!traceparent) return undefined;
+  const match = /^00-([0-9a-f]{32})-([0-9a-f]{16})-([0-9a-f]{2})$/.exec(
+    traceparent.trim(),
+  );
+  if (!match) return undefined;
+  const [, traceId, spanId, flags] = match;
+  const spanContext: SpanContext = {
+    traceId,
+    spanId,
+    // Honor the incoming sampled bit: bit 0 set -> SAMPLED, otherwise NONE (no default).
+    traceFlags: (parseInt(flags, 16) & 1) === 1 ? TraceFlags.SAMPLED : TraceFlags.NONE,
+    isRemote: true,
+  };
+  return trace.setSpanContext(ROOT_CONTEXT, spanContext);
+}
+
+// ---------------------------------------------------------------------------
+// Per-run config + content helpers
+// ---------------------------------------------------------------------------
+
+/** One run's tracing config. Mutated by the runner after the session is created. */
+export interface RunConfig {
+  /** OTLP traces endpoint for this run's trace (falls back to env). */
+  endpoint?: string;
+  /** Authorization header value for the OTLP export (falls back to env ApiKey). */
+  authorization?: string;
+  /** W3C traceparent from the caller; nests invoke_agent under that span. */
+  traceparent?: string;
+  /** Drop prompt/completion/tool I/O from spans when false. */
+  captureContent: boolean;
+  /** Pi session id, set after createAgentSession so spans carry session.id. */
+  sessionId?: string;
+  /** Resolved provider, set after the model is picked. */
+  provider?: string;
+  /** Resolved model id, set after the model is picked. */
+  requestModel?: string;
+  /** Filled by the extension on agent_start so the runner can flush/return it. */
+  traceId?: string;
+}
+
+/** A string output → ag.data.outputs (any type is valid there). */
+function setOutput(span: Span, value: unknown, capture: boolean): void {
+  if (!capture || value == null) return;
+  const text = typeof value === "string" ? value : JSON.stringify(value);
+  if (text.length > 0) span.setAttribute("output.value", text);
+}
+
+/**
+ * ag.data.inputs must be a dict, so emit input.value as a JSON object string.
+ * A non-object (raw string) would be relocated to ag.unsupported by Agenta.
+ */
+function setInputs(
+  span: Span,
+  obj: Record<string, unknown>,
+  capture: boolean,
+): void {
+  if (!capture) return;
+  span.setAttribute("input.value", JSON.stringify(obj));
+  span.setAttribute("input.mime_type", "application/json");
+}
+
+function oiRole(role: string): string {
+  return role === "toolResult" ? "tool" : role; // user | assistant | system | tool
+}
+
+/** Plain text of a message: string content as-is, else its `text` blocks joined. */
+function messageText(msg: any): string {
+  const content = msg?.content;
+  if (typeof content === "string") return content;
+  if (!Array.isArray(content)) return "";
+  const texts: unknown[] = [];
+  for (const block of content) {
+    if (block?.type === "text") texts.push(block.text);
+  }
+  return texts.join("");
+}
+
+/**
+ * Emit OpenInference structured messages so Agenta renders a proper message
+ * thread. `llm.input_messages.*` -> ag.data.inputs.prompt.*,
+ * `llm.output_messages.*` -> ag.data.outputs.completion.*.
+ */
+function emitMessages(
+  span: Span,
+  prefix: string,
+  messages: any[],
+  capture: boolean,
+): void {
+  if (!capture || !Array.isArray(messages)) return;
+  messages.forEach((m, i) => {
+    const base = `${prefix}.${i}.message`;
+    span.setAttribute(`${base}.role`, oiRole(m.role));
+    const text = messageText(m);
+    if (text) span.setAttribute(`${base}.content`, text);
+    if (m.role === "toolResult" && m.toolCallId)
+      span.setAttribute(`${base}.tool_call_id`, m.toolCallId);
+    if (Array.isArray(m.content)) {
+      m.content
+        .filter((b: any) => b?.type === "toolCall")
+        .forEach((call: any, j: number) => {
+          const tc = `${base}.tool_calls.${j}.tool_call`;
+          if (call.id) span.setAttribute(`${tc}.id`, call.id);
+          span.setAttribute(`${tc}.function.name`, call.name);
+          span.setAttribute(
+            `${tc}.function.arguments`,
+            JSON.stringify(call.arguments ?? {}),
+          );
+        });
+    }
+  });
+}
+
+/** Flatten a tool result to text: strings, text-block arrays, `{ content }` wrappers, else JSON. */
+function toolResultText(result: any): string {
+  if (!result) return "";
+  if (typeof result === "string") return result;
+  if (!Array.isArray(result)) {
+    // Unwrap `{ content }` recursively; anything else is serialized whole.
+    return result.content ? toolResultText(result.content) : JSON.stringify(result);
+  }
+  const texts: unknown[] = [];
+  for (const part of result) if (part?.type === "text") texts.push(part.text);
+  return texts.join("");
+}
+
+/** Text of the most recent assistant message, or "" when there is none. */
+function lastAssistantText(messages: any): string {
+  if (!Array.isArray(messages)) return "";
+  const newestFirst = [...messages].reverse();
+  const latest = newestFirst.find((m: any) => m?.role === "assistant");
+  return latest ? messageText(latest) : "";
+}
+
+/** Fill an LLM span from a finished assistant message (model, tokens, finish, output). */
+function applyAssistant(span: Span, msg: any, capture: boolean): void {
+  if (msg.provider) span.setAttribute("gen_ai.system", msg.provider);
+  if (msg.model) span.setAttribute("gen_ai.request.model", msg.model);
+  if (msg.responseModel || msg.model)
+    span.setAttribute("gen_ai.response.model", msg.responseModel ?? msg.model);
+  if (msg.responseId) span.setAttribute("gen_ai.response.id", msg.responseId);
+  if (msg.stopReason)
+    span.setAttribute("gen_ai.response.finish_reasons", [String(msg.stopReason)]);
+
+  const u = msg.usage;
+  if (u) {
+    // Current GenAI names (mapped by Agenta's logfire adapter) ...
+    span.setAttribute("gen_ai.usage.input_tokens", u.input ?? 0);
+    span.setAttribute("gen_ai.usage.output_tokens", u.output ?? 0);
+    // ... and legacy names (mapped by Agenta's semconv.py). Emit both so token
+    // usage is never silently dropped regardless of which adapter wins.
+    span.setAttribute("gen_ai.usage.prompt_tokens", u.input ?? 0);
+    span.setAttribute("gen_ai.usage.completion_tokens", u.output ?? 0);
+    span.setAttribute(
+      "gen_ai.usage.total_tokens",
+      u.totalTokens ?? (u.input ?? 0) + (u.output ?? 0),
+    );
+    if (u.cacheRead)
+      span.setAttribute("gen_ai.usage.cache_read_input_tokens", u.cacheRead);
+    if (u.cacheWrite)
+      span.setAttribute("gen_ai.usage.cache_creation_input_tokens", u.cacheWrite);
+    if (u.cost?.total != null) span.setAttribute("gen_ai.usage.cost", u.cost.total);
+  }
+
+  emitMessages(span, "llm.output_messages", [msg], capture);
+  if (msg.stopReason === "error" || msg.errorMessage) {
+    span.setStatus({ code: SpanStatusCode.ERROR, message: msg.errorMessage });
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Extension factory (one per run; state is closure-scoped)
+// ---------------------------------------------------------------------------
+
+export interface AgentaOtel {
+  /** Register with DefaultResourceLoader.extensionFactories. */
+  register: (pi: ExtensionAPI) => void;
+  /** Mutable config; set sessionId/provider/requestModel after the session exists. */
+  config: RunConfig;
+  /** Flush this run's trace to Agenta. Await before the process/response ends. */
+  flush: () => Promise<void>;
+}
+
+/**
+ * Build a tracing extension scoped to a single agent run. Pass `register` to the
+ * resource loader, fill in `config.sessionId`/`provider`/`requestModel` once the
+ * session and model are resolved, then `await flush()` after the prompt completes.
+ */
+export function createAgentaOtel(
+  init: Partial<RunConfig> & { captureContent?: boolean },
+): AgentaOtel {
+  ensureProvider();
+
+  const config: RunConfig = {
+    endpoint: init.endpoint,
+    authorization: init.authorization,
+    traceparent: init.traceparent,
+    captureContent: init.captureContent !== false,
+    sessionId: init.sessionId,
+    provider: init.provider,
+    requestModel: init.requestModel,
+  };
+
+  const tracer = trace.getTracer("agenta-pi-otel", "0.1.0");
+
+  // Per-run span state — closure-scoped so concurrent runs never collide.
+  let agentSpan: Span | undefined;
+  let agentCtx: Context | undefined;
+  let pendingPrompt: string | undefined;
+  let currentTurn: { span: Span; ctx: Context; index?: number } | undefined;
+  let llmSpan: Span | undefined;
+  let lastContextMessages: any[] | undefined;
+  const toolSpans = new Map<string, Span>();
+
+  const register = (pi: ExtensionAPI): void => {
+    pi.on("before_agent_start", async (event: any) => {
+      pendingPrompt = event?.prompt;
+    });
+
+    pi.on("agent_start", async () => {
+      // Nest under the caller's workflow span when a traceparent was supplied,
+      // so the whole run joins the /invoke trace; otherwise start a fresh root.
+      const parent = parentContext(config.traceparent);
+      agentSpan = tracer.startSpan("invoke_agent", undefined, parent);
+      agentSpan.setAttribute("openinference.span.kind", "AGENT");
+      agentSpan.setAttribute("gen_ai.operation.name", "invoke_agent");
+      agentSpan.setAttribute("gen_ai.agent.name", "pi");
+      if (config.sessionId) {
+        agentSpan.setAttribute("session.id", config.sessionId);
+        agentSpan.setAttribute("gen_ai.conversation.id", config.sessionId);
+      }
+      setInputs(agentSpan, { prompt: pendingPrompt ?? "" }, config.captureContent);
+
+      const traceId = agentSpan.spanContext().traceId;
+      config.traceId = traceId;
+      traceTargets.set(traceId, {
+        endpoint: config.endpoint ?? defaultTarget().endpoint,
+        authorization: config.authorization ?? defaultTarget().authorization,
+      });
+      agentCtx = trace.setSpan(parent ?? context.active(), agentSpan);
+    });
+
+    // The messages handed to the next LLM call — the chat span's input.
+    pi.on("context", async (event: any) => {
+      if (Array.isArray(event?.messages)) lastContextMessages = event.messages;
+    });
+
+    pi.on("turn_start", async (event: any) => {
+      const parent = agentCtx ?? context.active();
+      const name = event?.turnIndex != null ? `turn ${event.turnIndex}` : "turn";
+      const span = tracer.startSpan(name, undefined, parent);
+      span.setAttribute("openinference.span.kind", "CHAIN");
+      if (event?.turnIndex != null) span.setAttribute("pi.turn.index", event.turnIndex);
+      currentTurn = { span, ctx: trace.setSpan(parent, span), index: event?.turnIndex };
+    });
+
+    pi.on("before_provider_request", async (_event: any, ctx: any) => {
+      const parent = currentTurn?.ctx ?? agentCtx ?? context.active();
+      const modelId = config.requestModel ?? ctx?.model?.id;
+      const providerName = config.provider ?? ctx?.model?.provider;
+      llmSpan = tracer.startSpan(modelId ? `chat ${modelId}` : "chat", undefined, parent);
+      llmSpan.setAttribute("openinference.span.kind", "LLM");
+      llmSpan.setAttribute("gen_ai.operation.name", "chat");
+      if (providerName) llmSpan.setAttribute("gen_ai.system", providerName);
+      if (modelId) llmSpan.setAttribute("gen_ai.request.model", modelId);
+      if (lastContextMessages)
+        emitMessages(llmSpan, "llm.input_messages", lastContextMessages, config.captureContent);
+    });
+
+    pi.on("message_end", async (event: any) => {
+      const msg = event?.message;
+      if (!msg || msg.role !== "assistant" || !llmSpan) return;
+      applyAssistant(llmSpan, msg, config.captureContent);
+      llmSpan.end();
+      llmSpan = undefined;
+    });
+
+    pi.on("tool_execution_start", async (event: any) => {
+      const parent = currentTurn?.ctx ?? agentCtx ?? context.active();
+      const name = event?.toolName ? `execute_tool ${event.toolName}` : "execute_tool";
+      const span = tracer.startSpan(name, undefined, parent);
+      span.setAttribute("openinference.span.kind", "TOOL");
+      span.setAttribute("gen_ai.operation.name", "execute_tool");
+      if (event?.toolName) span.setAttribute("gen_ai.tool.name", event.toolName);
+      if (event?.toolCallId) span.setAttribute("gen_ai.tool.call.id", event.toolCallId);
+      setInputs(span, (event?.args as Record<string, unknown>) ?? {}, config.captureContent);
+      if (event?.toolCallId) toolSpans.set(event.toolCallId, span);
+    });
+
+    pi.on("tool_execution_end", async (event: any) => {
+      const span = event?.toolCallId ? toolSpans.get(event.toolCallId) : undefined;
+      if (!span) return;
+      setOutput(span, toolResultText(event?.result), config.captureContent);
+      if (event?.isError) span.setStatus({ code: SpanStatusCode.ERROR });
+      span.end();
+      toolSpans.delete(event.toolCallId);
+    });
+
+    pi.on("turn_end", async (event: any) => {
+      // Safety net: if the LLM span is still open (no assistant message_end seen),
+      // close it from the turn's assistant message.
+      if (llmSpan && event?.message) {
+        applyAssistant(llmSpan, event.message, config.captureContent);
+        llmSpan.end();
+        llmSpan = undefined;
+      }
+      if (currentTurn) {
+        currentTurn.span.end();
+        currentTurn = undefined;
+      }
+    });
+
+    pi.on("agent_end", async (event: any) => {
+      if (!agentSpan) return;
+      setOutput(agentSpan, lastAssistantText(event?.messages), config.captureContent);
+      agentSpan.end();
+      agentSpan = undefined;
+      agentCtx = undefined;
+      lastContextMessages = undefined;
+    });
+  };
+
+  return {
+    register,
+    config,
+    flush: () => flushTrace(config.traceId),
+  };
+}
diff --git a/services/agent/src/cli.ts b/services/agent/src/cli.ts
new file mode 100644
index 0000000000..ed8b99c3ae
--- /dev/null
+++ b/services/agent/src/cli.ts
@@ -0,0 +1,44 @@
+/**
+ * WP-2 Pi wrapper CLI: the JSON transport for the Harness port.
+ *
+ * Reads one JSON `AgentRunRequest` from stdin, runs Pi once, and writes one JSON
+ * `AgentRunResult` to stdout. stdout carries the result and nothing else; logs go
+ * to stderr. This is the one-shot "json adapter" the design doc describes; a
+ * long-lived RPC adapter can replace it later behind the same Python-side port.
+ */
+import { runPi, type AgentRunRequest, type AgentRunResult } from "./runPi.ts";
+
+async function readStdin(): Promise<string> {
+  const chunks: Buffer[] = [];
+  for await (const chunk of process.stdin) {
+    chunks.push(chunk as Buffer);
+  }
+  return Buffer.concat(chunks).toString("utf8");
+}
+
+// Resolves once stdout has taken the payload: process.exit() may cut off a pending write.
+function emit(result: AgentRunResult): Promise<void> {
+  return new Promise((done) => process.stdout.write(JSON.stringify(result), () => done()));
+}
+
+async function main(): Promise<void> {
+  const raw = await readStdin();
+  let request: AgentRunRequest;
+  try {
+    request = raw.trim() ? (JSON.parse(raw) as AgentRunRequest) : {};
+  } catch (err) {
+    await emit({ ok: false, error: `Invalid JSON on stdin: ${String(err)}` });
+    process.exit(1);
+  }
+
+  try {
+    const result = await runPi(request);
+    await emit(result);
+    process.exit(result.ok ? 0 : 1);
+  } catch (err) {
+    await emit({ ok: false, error: err instanceof Error ? err.stack ?? err.message : String(err) });
+    process.exit(1);
+  }
+}
+
+main();
diff --git a/services/agent/src/runPi.ts b/services/agent/src/runPi.ts
new file mode 100644
index 0000000000..cabf603701
--- /dev/null
+++ b/services/agent/src/runPi.ts
@@ -0,0 +1,231 @@
+/**
+ * WP-2 Pi harness driver.
+ *
+ * This is the concrete "harness" behind the service's Harness port. It drives the
+ * Pi SDK (`createAgentSession`) for a single run: it injects the agent's AGENTS.md
+ * in memory, resolves the model, sends one user turn, and returns the final
+ * assistant text. No streaming, no tools by default, no session persistence. Those
+ * are later work packages.
+ *
+ * Auth: uses `AuthStorage.create()`, which reads ~/.pi/agent/auth.json (the local
+ * Pi login). Set OPENAI_API_KEY / ANTHROPIC_API_KEY in the environment as an
+ * alternative. Nothing invocation-specific is written to a persistent disk: the
+ * session is in-memory and the working dir is a throwaway temp dir.
+ *
+ * Important: stdout is reserved for the JSON result (see cli.ts). Everything here
+ * logs to stderr so it never pollutes the result channel.
+ */
+import { mkdtempSync, rmSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+
+import {
+  AuthStorage,
+  createAgentSession,
+  DefaultResourceLoader,
+  getAgentDir,
+  ModelRegistry,
+  SessionManager,
+  SettingsManager,
+} from "@earendil-works/pi-coding-agent";
+
+import { createAgentaOtel } from "./agenta-otel.ts";
+
+export interface ChatMessage {
+  role: string;
+  content: string;
+}
+
+/**
+ * Trace context threaded in from the Agenta service so the agent run joins the
+ * caller's /invoke trace instead of starting its own. All fields are optional;
+ * with none set the run is traced standalone (or not at all) using env config.
+ */
+export interface TraceContext {
+  /** W3C traceparent of the caller's workflow span. Nests invoke_agent under it. */
+  traceparent?: string;
+  /** W3C baggage from the caller (carried for future use). */
+  baggage?: string;
+  /** OTLP traces endpoint (e.g. https://host/api/otlp/v1/traces). */
+  endpoint?: string;
+  /** Full Authorization header for the OTLP export (e.g. "ApiKey ..." / "Secret ..."). */
+  authorization?: string;
+  /** Drop prompt/completion/tool I/O from spans when false. Default true. */
+  captureContent?: boolean;
+}
+
+export interface AgentRunRequest {
+  /** AGENTS.md text injected as the agent's instructions (in memory). */
+  agentsMd?: string;
+  /** Model id ("gpt-5.5") or "provider/id" ("openai-codex/gpt-5.5"). */
+  model?: string;
+  /** The user turn to send. Falls back to the last user message. */
+  prompt?: string;
+  /** Optional prior message history. MVP sends the latest user turn only. */
+  messages?: ChatMessage[];
+  /** Built-in tools to enable. MVP default: none. */
+  tools?: string[];
+  /** Tracing: thread the Agenta trace context across the boundary. */
+  trace?: TraceContext;
+}
+
+export interface AgentRunResult {
+  ok: boolean;
+  output?: string;
+  sessionId?: string;
+  model?: string;
+  /** Trace id of the run (the caller's trace when a traceparent was passed). */
+  traceId?: string;
+  error?: string;
+}
+
+function log(message: string): void {
+  process.stderr.write(`[pi-wrapper] ${message}\n`);
+}
+
+/** Pick the requested model, else gpt-5.5, else the first non-spark/mini model, else the first. */
+function pickModel(available: any[], wanted?: string): any {
+  const matchesWanted = (m: any): boolean =>
+    m.id === wanted || `${m.provider}/${m.id}` === wanted;
+  const requested = wanted ? available.find(matchesWanted) : undefined;
+  if (requested) return requested;
+  const preferred = available.find((m) => m.id === "gpt-5.5");
+  if (preferred) return preferred;
+  const fullSize = available.find((m) => !/spark|mini/i.test(m.id));
+  if (fullSize) return fullSize;
+  return available[0];
+}
+
+/** Resolve the text to send: a non-blank `prompt`, else the newest non-empty user message. */
+function resolvePrompt(request: AgentRunRequest): string {
+  if (request.prompt?.trim()) return request.prompt;
+  const history = request.messages ?? [];
+  let index = history.length;
+  while (index-- > 0) {
+    const { role, content } = history[index];
+    if (role === "user" && content) return content;
+  }
+  return "";
+}
+
+/** Text of the newest assistant message that has string content or non-empty text blocks. */
+function extractAssistantText(messages: any[]): string {
+  let index = messages.length;
+  while (index-- > 0) {
+    const candidate = messages[index];
+    if (candidate?.role !== "assistant") continue;
+    const { content } = candidate;
+    if (typeof content === "string") return content;
+    if (!Array.isArray(content)) continue;
+    // Keep only truthy text blocks; a turn without any falls through to older ones.
+    const pieces: unknown[] = [];
+    for (const block of content) if (block?.type === "text" && block.text) pieces.push(block.text);
+    const joined = pieces.join("");
+    if (joined) return joined;
+  }
+  return "";
+}
+
+/** Run one Pi prompt for `request` in a throwaway cwd; returns its text or an error result. */
+export async function runPi(request: AgentRunRequest): Promise<AgentRunResult> {
+  const prompt = resolvePrompt(request);
+  if (!prompt) {
+    return { ok: false, error: "No user message to send (prompt/messages empty)." };
+  }
+  const cwd = mkdtempSync(join(tmpdir(), "agenta-agent-"));
+
+  try {
+    const authStorage = AuthStorage.create();
+    const modelRegistry = ModelRegistry.create(authStorage);
+    const available = await modelRegistry.getAvailable();
+    if (!available || available.length === 0) {
+      return {
+        ok: false,
+        error:
+          "No model available. Log in with `pnpm exec pi` -> /login, or set OPENAI_API_KEY / ANTHROPIC_API_KEY.",
+      };
+    }
+
+    const model = pickModel(available, request.model);
+    log(`model: ${model.provider}/${model.id}`);
+
+    // Tracing: turn this run into OTel spans. When the caller passed a
+    // traceparent, invoke_agent nests under their /invoke span so the whole
+    // agent run is part of the same trace (just like completion/chat).
+    const otel = createAgentaOtel({
+      traceparent: request.trace?.traceparent,
+      // trace.baggage is intentionally not passed: RunConfig declares no baggage field.
+      endpoint: request.trace?.endpoint,
+      authorization: request.trace?.authorization,
+      captureContent: request.trace?.captureContent,
+    });
+
+    // Inject AGENTS.md in memory and keep on-disk context files out of the run.
+    const agentsMd = request.agentsMd?.trim();
+    const loader = new DefaultResourceLoader({
+      cwd,
+      agentDir: getAgentDir(),
+      noContextFiles: true,
+      appendSystemPromptOverride: () => [],
+      agentsFilesOverride: () => ({
+        agentsFiles: agentsMd
+          ? [{ path: "/virtual/AGENTS.md", content: agentsMd }]
+          : [],
+      }),
+      extensionFactories: [otel.register],
+    });
+    await loader.reload();
+
+    const { session } = await createAgentSession({
+      cwd,
+      model,
+      authStorage,
+      modelRegistry,
+      tools: request.tools ?? [],
+      sessionManager: SessionManager.inMemory(cwd),
+      settingsManager: SettingsManager.inMemory(),
+      resourceLoader: loader,
+    });
+
+    // Hand the session id + model to the extension so spans carry them.
+    otel.config.sessionId = session.sessionId;
+    otel.config.provider = model.provider;
+    otel.config.requestModel = model.id;
+
+    // Accumulate streamed text as the primary output channel.
+    let streamed = "";
+    session.subscribe((event: any) => {
+      if (
+        event.type === "message_update" &&
+        event.assistantMessageEvent?.type === "text_delta"
+      ) {
+        streamed += event.assistantMessageEvent.delta ?? "";
+      }
+    });
+
+    let output: string;
+    try {
+      await session.prompt(prompt);
+      output = streamed.trim() || extractAssistantText(session.messages);
+    } finally {
+      // Runs on failure too: release the session and ship the spans recorded so far
+      // (with a remote parent only this per-trace flush exports them).
+      session.dispose();
+      await otel.flush().catch((err) => log(`trace flush failed: ${String(err)}`));
+    }
+
+    return {
+      ok: true,
+      output,
+      sessionId: otel.config.sessionId,
+      model: `${model.provider}/${model.id}`,
+      traceId: otel.config.traceId,
+    };
+  } finally {
+    try {
+      rmSync(cwd, { recursive: true, force: true });
+    } catch {
+      // best-effort cleanup of the throwaway working dir
+    }
+  }
+}
diff --git a/services/agent/src/server.ts b/services/agent/src/server.ts
new file mode 100644
index 0000000000..968ff6be4e
--- /dev/null
+++ b/services/agent/src/server.ts
@@ -0,0 +1,85 @@
+/**
+ * WP-2 Pi wrapper HTTP server: the HTTP transport for the Harness port.
+ *
+ * Same contract as the CLI, exposed over HTTP so the wrapper can run as its own
+ * container (a sidecar) that the Python service calls in-network:
+ *
+ *   GET  /health -> { status: "ok" }
+ *   POST /run    -> body is an AgentRunRequest, response is an AgentRunResult
+ *
+ * A /run body must be a JSON object (or empty, which is treated as `{}`); anything
+ * else is answered with 400 before the harness is invoked.
+ *
+ * Uses Node's built-in http server (no framework dependency). Pi auth comes from
+ * PI_CODING_AGENT_DIR / ~/.pi/agent, mounted into the container.
+ */
+import { createServer, type IncomingMessage, type ServerResponse } from "node:http";
+
+import { runPi, type AgentRunRequest } from "./runPi.ts";
+
+// Listening port; defaults to 8765 when PORT is unset.
+const PORT = Number(process.env.PORT ?? 8765);
+
+/** Serialize `body` as JSON and send it as the complete response. */
+function send(res: ServerResponse, status: number, body: unknown): void {
+  const payload = JSON.stringify(body);
+  res.writeHead(status, {
+    "content-type": "application/json",
+    "content-length": Buffer.byteLength(payload),
+  });
+  res.end(payload);
+}
+
+/** Buffer the whole request body and decode it as UTF-8. */
+async function readBody(req: IncomingMessage): Promise<string> {
+  const chunks: Buffer[] = [];
+  for await (const chunk of req) {
+    chunks.push(chunk as Buffer);
+  }
+  return Buffer.concat(chunks).toString("utf8");
+}
+
+/** True for a plain JSON object: not null, not an array, not a scalar. */
+function isJsonObject(value: unknown): boolean {
+  return typeof value === "object" && value !== null && !Array.isArray(value);
+}
+
+const server = createServer(async (req, res) => {
+  try {
+    // Route on the path alone so a query string cannot turn a valid call into a 404.
+    const path = (req.url ?? "").split("?", 1)[0];
+
+    if (req.method === "GET" && path === "/health") {
+      return send(res, 200, { status: "ok" });
+    }
+
+    if (req.method === "POST" && path === "/run") {
+      const raw = await readBody(req);
+      let parsed: unknown;
+      try {
+        // An empty body is treated as an empty request object.
+        parsed = raw.trim() ? JSON.parse(raw) : {};
+      } catch (err) {
+        return send(res, 400, { ok: false, error: `Invalid JSON: ${String(err)}` });
+      }
+      // JSON.parse also accepts null, arrays and scalars. None of those is a valid
+      // AgentRunRequest, so answer 400 here instead of handing them to runPi.
+      if (!isJsonObject(parsed)) {
+        return send(res, 400, { ok: false, error: "Request body must be a JSON object" });
+      }
+
+      const result = await runPi(parsed as AgentRunRequest);
+      return send(res, result.ok ? 200 : 500, result);
+    }
+
+    return send(res, 404, { ok: false, error: "Not found" });
+  } catch (err) {
+    // Anything that escapes the handlers above is reported as a 500.
+    const message = err instanceof Error ? err.stack ?? err.message : String(err);
+    return send(res, 500, { ok: false, error: message });
+  }
+});
+
+server.listen(PORT, () => {
+  process.stderr.write(`[pi-wrapper] http server listening on :${PORT}\n`);
+});
diff --git a/services/agent/tsconfig.json b/services/agent/tsconfig.json
new file mode 100644
index 0000000000..b8314675f3
--- /dev/null
+++ b/services/agent/tsconfig.json
@@ -0,0 +1,16 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "module": "ESNext",
+    "moduleResolution": "Bundler",
+    "lib": ["ES2023"],
+    "types": ["node"],
+    "strict": true,
+    "noEmit": true,
+    "esModuleInterop": true,
+    "skipLibCheck": true,
+    "resolveJsonModule": true,
+    "allowImportingTsExtensions": true
+  },
+  "include": ["src/**/*.ts"]
+}
diff --git a/services/entrypoints/agent_main.py b/services/entrypoints/agent_main.py
new file mode 100644
index 0000000000..595e60ad27
--- /dev/null
+++ b/services/entrypoints/agent_main.py
@@ -0,0 +1,47 @@
+"""Standalone entrypoint for the agent service (WP-2 local verification).
+
+Mounts only the agent app plus a health check, so the agent ``/invoke`` can be
+exercised with curl without bringing up the full services app. The real integration
+point is ``entrypoints/main.py`` (one import + one mount), kept separate so this
+isolated runner stays light.
+
+Run locally (auth disabled for curl):
+
+    cd services
+    AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED=false \\
+        uv run uvicorn entrypoints.agent_main:app --host 0.0.0.0 --port 8090
+"""
+
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+
+import agenta as ag
+from oss.src.agent import agent_v0_app
+
+ag.init()
+
+app = FastAPI(
+    openapi_url=None,
+    docs_url=None,
+    redoc_url=None,
+)
+
+app.add_middleware(
+    CORSMiddleware,
+    # The playground invokes cross-origin (web on a different port) with credentials
+    # (cookies + Authorization). Browsers reject a "*" origin on credentialed requests,
+    # so echo the specific origin and allow credentials. Matches the dev box on any
+    # port and localhost. Same-origin (served under /services) would avoid CORS entirely.
+    allow_origin_regex=r"https?://(144\.76\.237\.122|localhost|0\.0\.0\.0)(:\d+)?",
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+@app.get("/health")
+async def health():
+    return {"status": "ok"}
+
+
+app.mount("/agent/v0", agent_v0_app)
diff --git a/services/entrypoints/main.py b/services/entrypoints/main.py
index 72cc291dfb..f52ac69ed8 100644
--- a/services/entrypoints/main.py
+++ b/services/entrypoints/main.py
@@ -43,6 +43,7 @@
 )
 from oss.src.chat import chat_v0_app
 from oss.src.completion import completion_v0_app
+from oss.src.agent import agent_v0_app
 from entrypoints.legacy import register_legacy_routes
 
 
@@ -134,6 +135,7 @@ async def health():
 
 app.mount("/chat/v0", chat_v0_app)
 app.mount("/completion/v0", completion_v0_app)
+app.mount("/agent/v0", agent_v0_app)
 
 register_legacy_routes(
     app=app,
diff --git a/services/oss/src/agent.py b/services/oss/src/agent.py
new file mode 100644
index 0000000000..1203f1560a
--- /dev/null
+++ b/services/oss/src/agent.py
@@ -0,0 +1,171 @@
+"""Agent workflow service (WP-2).
+
+Mirrors the chat/completion services: an Agenta app exposing ``/invoke`` and
+``/inspect`` through ``ag.create_app`` + ``ag.workflow`` + ``ag.route``, so the
+backend and playground treat an agent like the other workflow types. The handler
+builds the user turn from the request and runs it through the Harness port, whose Pi
+adapter drives the TypeScript wrapper in ``services/agent``.
+
+MVP: hardcoded config (AGENTS.md text, model) read from files, a single
+non-streaming reply, no tools. Streaming, multi-message output, tools, and Daytona
+are later work packages.
+"""
+
+import os
+from typing import Any, Dict, List, Optional
+
+import agenta as ag
+from agenta.sdk.engines.tracing.propagation import inject
+from agenta.sdk.utils.logging import get_module_logger
+
+from oss.src.agent_pi.config import load_config, wrapper_dir
+from oss.src.agent_pi.local_runtime import LocalRuntime
+from oss.src.agent_pi.pi_harness import PiHarness
+from oss.src.agent_pi.pi_http_harness import PiHttpHarness
+from oss.src.agent_pi.ports import Harness, HarnessRequest, TraceContext
+from oss.src.agent_pi.schemas import AGENT_SCHEMAS
+
+log = get_module_logger(__name__)
+
+_CAPTURE_CONTENT = os.getenv("AGENTA_AGENT_CAPTURE_CONTENT", "true").lower() not in (
+    "0",
+    "false",
+    "no",
+)
+
+
+def _build_harness() -> Harness:
+    """Pick the harness adapter for the current deployment.
+
+    - ``AGENTA_AGENT_PI_URL`` set (docker): call the Pi sidecar over HTTP.
+    - otherwise (local): spawn the TS wrapper as a subprocess.
+    """
+    pi_url = os.getenv("AGENTA_AGENT_PI_URL")
+    if pi_url:
+        return PiHttpHarness(pi_url)
+    return PiHarness(LocalRuntime(), wrapper_dir=str(wrapper_dir()))
+
+
+def _content_text(content: Any) -> str:
+    """Flatten a message ``content`` value to plain text.
+
+    A string is returned unchanged. A list of content parts is reduced to its
+    text, joined with newlines: string parts as-is and dict parts via their
+    ``"text"`` value; parts without text (e.g. images) are skipped. Any other
+    value falls back to ``str(content)``.
+    """
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        texts = []
+        for part in content:
+            if isinstance(part, str):
+                texts.append(part)
+            elif isinstance(part, dict) and isinstance(part.get("text"), str):
+                texts.append(part["text"])
+        return "\n".join(texts)
+    return str(content)
+
+
+def _latest_user_message(messages: Optional[List[Any]]) -> str:
+    """Return the text of the most recent user message with content, or ``""``."""
+    for message in reversed(messages or []):
+        if not isinstance(message, dict):
+            continue
+        if message.get("role") == "user" and message.get("content"):
+            return _content_text(message["content"])
+    return ""
+
+
+def _trace_context() -> Optional[TraceContext]:
+    """Capture the active workflow span's trace context for the harness.
+
+    This runs inside the instrumented handler, so the current OTel span is the
+    ``/invoke`` workflow span. Threading its ``traceparent`` into the Pi run makes
+    the agent's spans children of that span, in the same trace, so the agent's
+    whole run shows up under the response's ``trace_id`` the way completion/chat
+    nest their LLM spans. Best-effort: any failure returns ``None`` and the run is
+    simply traced standalone (or not at all) using the wrapper's env config.
+    """
+    try:
+        headers = inject({})
+
+        traceparent = headers.get("traceparent")
+        if not traceparent:
+            return None
+
+        endpoint = None
+        try:
+            endpoint = ag.tracing.otlp_url
+        except Exception:  # pylint: disable=broad-except
+            endpoint = None
+
+        return TraceContext(
+            traceparent=traceparent,
+            baggage=headers.get("baggage"),
+            endpoint=endpoint,
+            authorization=headers.get("Authorization"),
+            capture_content=_CAPTURE_CONTENT,
+        )
+    except Exception:  # pylint: disable=broad-except
+        log.warning("agent: failed to capture trace context", exc_info=True)
+        return None
+
+
+async def _agent(
+    inputs: Optional[Dict[str, Any]] = None,
+    messages: Optional[List[Any]] = None,
+    parameters: Optional[Dict] = None,
+):
+    """Handle one ``/invoke`` call: run the latest user turn through the harness.
+
+    ``model`` and ``agents_md`` are taken from ``parameters`` when set there and
+    from the file config otherwise; the message list comes from ``messages`` or,
+    failing that, ``inputs["messages"]``.
+
+    Returns:
+        The assistant message, ``{"role": "assistant", "content": <output>}``.
+    """
+    config = load_config()
+
+    # Config (model + AGENTS.md instructions) comes from parameters when the
+    # playground/caller sets it, falling back to the service's file config.
+    params = parameters or {}
+    model = params.get("model") or config.model
+    agents_md = params.get("agents_md") or config.agents_md
+
+    msgs = messages or (inputs or {}).get("messages") or []
+    prompt = _latest_user_message(msgs)
+
+    harness = _build_harness()
+
+    await harness.setup()
+    try:
+        result = await harness.invoke(
+            HarnessRequest(
+                agents_md=agents_md,
+                model=model,
+                prompt=prompt,
+                messages=msgs,
+                tools=config.tools,
+                trace=_trace_context(),
+            )
+        )
+    finally:
+        await harness.shutdown()
+
+    return {"role": "assistant", "content": result.output}
+
+
+def create_agent_app():
+    """Create the agent app and register ``_agent`` as its chat workflow at ``/``."""
+    app = ag.create_app()
+    # No builtin URI yet: registering the agent as a first-class workflow type
+    # (`agenta:builtin:agent:v0`) and its interface is WP-6. Here we register the
+    # handler directly, so it gets an auto URI (`user:custom:...`) and runs locally.
+    routed = ag.workflow(schemas=AGENT_SCHEMAS)(_agent)
+    ag.route("/", app=app, flags={"is_chat": True})(routed)
+    return app
+
+
+agent_v0_app = create_agent_app()
diff --git a/services/oss/src/agent_pi/__init__.py b/services/oss/src/agent_pi/__init__.py
new file mode 100644
index 0000000000..91ee583c51
--- /dev/null
+++ b/services/oss/src/agent_pi/__init__.py
@@ -0,0 +1,11 @@
+"""Agent runtime: ports and adapters for the WP-2 agent service.
+
+The Python service is "our agent implementation". It owns two ports the design doc
+calls out:
+
+- ``Harness``: the seam between our service and the agent engine. ``PiHarness`` is the
+  Pi implementation; it drives the TypeScript Pi wrapper in ``services/agent``.
+- ``Runtime``: the seam for the run environment (start, shutdown, pause, connect
+  volume). ``LocalRuntime`` runs the harness as a local subprocess. A Daytona adapter
+  lands later behind the same port.
+"""
diff --git a/services/oss/src/agent_pi/config.py b/services/oss/src/agent_pi/config.py
new file mode 100644
index 0000000000..b630a3063e
--- /dev/null
+++ b/services/oss/src/agent_pi/config.py
@@ -0,0 +1,87 @@
+"""Hardcoded MVP agent config, read from ``services/agent/config``.
+
+The config (AGENTS.md text, model, tools) lives in editable files so changing the
+agent does not need a code change. Paths can be overridden with env vars for Docker
+or alternate layouts.
+"""
+
+import json
+import os
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import List, Optional
+
+# services/oss/src/agent_pi/config.py -> parents[3] == services/
+_SERVICES_DIR = Path(__file__).resolve().parents[3]
+_DEFAULT_AGENT_DIR = _SERVICES_DIR / "agent"
+
+# Fallback config used when the editable files are missing or a field is absent.
+# Kept in sync with the catalog template and the `/inspect` schema defaults
+# (schemas.py: _DEFAULT_MODEL / _DEFAULT_AGENTS_MD).
+DEFAULT_MODEL = "gpt-5.5"
+DEFAULT_AGENTS_MD = (
+    "You are a friendly hello-world agent running on the Agenta agent service.\n\n"
+    "- Greet the user warmly.\n"
+    "- Answer the user's message in one or two short sentences."
+)
+
+
+@dataclass
+class AgentConfig:
+    """Resolved agent configuration: instructions, model id, and tool names."""
+
+    agents_md: str
+    model: Optional[str] = None
+    tools: List[str] = field(default_factory=list)
+
+
+def wrapper_dir() -> Path:
+    """Directory of the TypeScript Pi wrapper (where the command runs)."""
+    override = os.getenv("AGENTA_AGENT_WRAPPER_DIR")
+    return Path(override) if override else _DEFAULT_AGENT_DIR
+
+
+def config_dir() -> Path:
+    """Directory holding AGENTS.md and agent.json."""
+    override = os.getenv("AGENTA_AGENT_CONFIG_DIR")
+    return Path(override) if override else (_DEFAULT_AGENT_DIR / "config")
+
+
+def load_config() -> AgentConfig:
+    """Load the agent config from ``config_dir()``, falling back to the defaults.
+
+    Returns:
+        The instructions from ``AGENTS.md`` and the model/tools from ``agent.json``;
+        a missing file, empty instructions, or a missing/empty field yields the
+        module-level default (and no tools).
+
+    Raises:
+        ValueError: ``agent.json`` exists but is not valid JSON
+            (``json.JSONDecodeError``) or does not contain a JSON object.
+    """
+    base = config_dir()
+
+    # Read the editable AGENTS.md when present; otherwise fall back to the default
+    # instructions so a fresh checkout (or Docker layout) still runs.
+    agents_md = DEFAULT_AGENTS_MD
+    agents_path = base / "AGENTS.md"
+    if agents_path.exists():
+        text = agents_path.read_text(encoding="utf-8").strip()
+        if text:
+            agents_md = text
+
+    model: str = DEFAULT_MODEL
+    tools: List[str] = []
+    meta_path = base / "agent.json"
+    if meta_path.exists():
+        meta = json.loads(meta_path.read_text(encoding="utf-8"))
+        # Valid JSON need not be an object; fail with a message that names the
+        # file instead of an AttributeError on ``.get`` below.
+        if not isinstance(meta, dict):
+            raise ValueError(
+                f"{meta_path} must contain a JSON object, got {type(meta).__name__}"
+            )
+        model = meta.get("model") or DEFAULT_MODEL
+        tools = meta.get("tools", []) or []
+
+    return AgentConfig(agents_md=agents_md, model=model, tools=tools)
diff --git a/services/oss/src/agent_pi/local_runtime.py b/services/oss/src/agent_pi/local_runtime.py
new file mode 100644
index 0000000000..d50d97edd8
--- /dev/null
+++ b/services/oss/src/agent_pi/local_runtime.py
@@ -0,0 +1,84 @@
+"""Local runtime adapter: runs the harness as a subprocess on this host.
+
+This is the parity baseline for the design doc. The Node process is the run
+environment. A Daytona adapter (WP-3) implements the same port by running the command
+inside a sandbox instead.
+"""
+
+import asyncio
+from typing import Dict, Optional, Sequence
+
+from agenta.sdk.utils.logging import get_module_logger
+
+from .ports import ExecResult, Runtime
+
+log = get_module_logger(__name__)
+
+
+class LocalRuntime(Runtime):
+    """Runtime adapter that runs each command as a child process of this service."""
+
+    async def start(self) -> None:
+        """No-op: a local process needs no environment to be brought up."""
+        return None
+
+    async def shutdown(self) -> None:
+        """No-op: ``exec`` cleans up its own child process."""
+        return None
+
+    async def exec(
+        self,
+        command: Sequence[str],
+        input_bytes: bytes,
+        *,
+        cwd: Optional[str] = None,
+        env: Optional[Dict[str, str]] = None,
+        timeout: Optional[float] = None,
+    ) -> ExecResult:
+        """Run ``command`` once, feeding ``input_bytes`` to its stdin.
+
+        Args:
+            command: Program and arguments, executed without a shell.
+            input_bytes: Bytes written to the child's stdin before it is closed.
+            cwd: Working directory for the child, or ``None`` to inherit.
+            env: Environment for the child, or ``None`` to inherit.
+            timeout: Seconds to wait for completion; ``None`` waits indefinitely.
+
+        Returns:
+            The exit code plus stdout/stderr decoded as UTF-8, with undecodable
+            bytes replaced.
+
+        Raises:
+            RuntimeError: The command did not finish within ``timeout`` seconds.
+        """
+        proc = await asyncio.create_subprocess_exec(
+            *command,
+            cwd=cwd,
+            env=env,
+            stdin=asyncio.subprocess.PIPE,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+
+        try:
+            stdout, stderr = await asyncio.wait_for(
+                proc.communicate(input=input_bytes),
+                timeout=timeout,
+            )
+        except asyncio.TimeoutError as exc:
+            raise RuntimeError(
+                f"Harness process timed out after {timeout}s: {' '.join(command)}"
+            ) from exc
+        finally:
+            # The child is still alive here only if the wait was cut short: on
+            # timeout, or when this task is cancelled. Kill and reap it in both
+            # cases so an abandoned run never leaves a harness process behind.
+            if proc.returncode is None:
+                proc.kill()
+                await proc.wait()
+
+        return ExecResult(
+            code=proc.returncode if proc.returncode is not None else 0,
+            stdout=stdout.decode("utf-8", "replace"),
+            stderr=stderr.decode("utf-8", "replace"),
+        )
diff --git a/services/oss/src/agent_pi/pi_harness.py b/services/oss/src/agent_pi/pi_harness.py
new file mode 100644
index 0000000000..f4c5fc3e5c
--- /dev/null
+++ b/services/oss/src/agent_pi/pi_harness.py
@@ -0,0 +1,103 @@
+"""Pi harness adapter: drives the TypeScript Pi wrapper in ``services/agent``.
+
+The transport is a one-shot JSON-over-stdio call: we send the run request as JSON on
+the wrapper's stdin and read its JSON result from stdout. This is the "json adapter"
+the design doc describes. A long-lived RPC adapter (``pi --mode rpc``) can replace it
+later behind this same Harness port without touching the service.
+"""
+
+import json
+import os
+from typing import List, Optional, Sequence
+
+from agenta.sdk.utils.logging import get_module_logger
+
+from .ports import Harness, HarnessRequest, HarnessResult, Runtime
+
+log = get_module_logger(__name__)
+
+_DEFAULT_TIMEOUT = float(os.getenv("AGENTA_AGENT_TIMEOUT", "180"))
+_DEFAULT_COMMAND = ["pnpm", "exec", "tsx", "src/cli.ts"]
+
+
+class PiHarness(Harness):
+    """Harness adapter that runs the Pi wrapper CLI once per turn via a Runtime."""
+
+    def __init__(
+        self,
+        runtime: Runtime,
+        *,
+        wrapper_dir: str,
+        command: Optional[Sequence[str]] = None,
+        timeout: float = _DEFAULT_TIMEOUT,
+    ) -> None:
+        self._runtime = runtime
+        self._wrapper_dir = wrapper_dir
+        self._command: List[str] = list(command or _DEFAULT_COMMAND)
+        self._timeout = timeout
+
+    async def setup(self) -> None:
+        """Start the underlying runtime."""
+        await self._runtime.start()
+
+    async def shutdown(self) -> None:
+        """Shut the underlying runtime down."""
+        await self._runtime.shutdown()
+
+    async def invoke(self, request: HarnessRequest) -> HarnessResult:
+        """Send the run request to the wrapper on stdin and parse its JSON reply.
+
+        Raises:
+            RuntimeError: The wrapper printed nothing, printed something that is
+                not a JSON object, or reported a result whose ``ok`` is falsy.
+                Errors raised by the runtime's ``exec`` propagate unchanged.
+        """
+        payload = json.dumps(
+            {
+                "agentsMd": request.agents_md,
+                "model": request.model,
+                "prompt": request.prompt,
+                "messages": request.messages,
+                "tools": request.tools,
+                "trace": request.trace.to_wire() if request.trace else None,
+            }
+        ).encode("utf-8")
+
+        result = await self._runtime.exec(
+            self._command,
+            payload,
+            cwd=self._wrapper_dir,
+            env={**os.environ},
+            timeout=self._timeout,
+        )
+
+        if not result.stdout.strip():
+            raise RuntimeError(
+                "Pi wrapper returned no output. "
+                f"exit={result.code} stderr={result.stderr[-2000:]}"
+            )
+
+        try:
+            data = json.loads(result.stdout)
+        except json.JSONDecodeError as exc:
+            raise RuntimeError(
+                "Pi wrapper returned invalid JSON. "
+                f"stdout={result.stdout[:500]} stderr={result.stderr[-1000:]}"
+            ) from exc
+
+        # Valid JSON is not necessarily an object (``null``, a list, a number);
+        # report that explicitly instead of failing on ``.get`` below.
+        if not isinstance(data, dict):
+            raise RuntimeError(
+                "Pi wrapper returned a non-object result. "
+                f"exit={result.code} stdout={result.stdout[:500]}"
+            )
+
+        if not data.get("ok"):
+            raise RuntimeError(f"Pi run failed: {data.get('error')}")
+
+        return HarnessResult(
+            output=data.get("output", ""),
+            session_id=data.get("sessionId"),
+            model=data.get("model"),
+        )
diff --git a/services/oss/src/agent_pi/pi_http_harness.py b/services/oss/src/agent_pi/pi_http_harness.py
new file mode 100644
index 0000000000..1e4b8a0d2e
--- /dev/null
+++ b/services/oss/src/agent_pi/pi_http_harness.py
@@ -0,0 +1,88 @@
+"""Pi harness adapter over HTTP.
+
+Same Harness port as ``PiHarness`` (the local subprocess one), but talks to the Pi
+wrapper running as a separate HTTP service (a sidecar container). The transport is a
+JSON ``POST /run``. This is what the dockerized agent uses, since the Python service
+container has no Node; the Pi wrapper runs in its own container.
+"""
+
+import os
+
+import httpx
+
+from agenta.sdk.utils.logging import get_module_logger
+
+from .ports import Harness, HarnessRequest, HarnessResult
+
+log = get_module_logger(__name__)
+
+_DEFAULT_TIMEOUT = float(os.getenv("AGENTA_AGENT_TIMEOUT", "180"))
+
+
+class PiHttpHarness(Harness):
+    """Harness adapter that runs a turn by POSTing it to the Pi wrapper's ``/run``."""
+
+    def __init__(
+        self,
+        base_url: str,
+        *,
+        timeout: float = _DEFAULT_TIMEOUT,
+    ) -> None:
+        # Drop trailing slashes so ``f"{base}/run"`` never yields a double slash.
+        self._base_url = base_url.rstrip("/")
+        self._timeout = timeout
+
+    async def setup(self) -> None:
+        """No-op: nothing to prepare on this side of the HTTP call."""
+        return None
+
+    async def shutdown(self) -> None:
+        """No-op: the HTTP client is opened and closed per ``invoke`` call."""
+        return None
+
+    async def invoke(self, request: HarnessRequest) -> HarnessResult:
+        """POST the run request to the wrapper and map its reply to a result.
+
+        Raises:
+            RuntimeError: The wrapper answered with a 5xx status, a body that is
+                not a JSON object, or a result whose ``ok`` flag is falsy.
+            httpx.HTTPError: The request itself failed (connection error,
+                timeout, ...); propagated unchanged from httpx.
+        """
+        payload = {
+            "agentsMd": request.agents_md,
+            "model": request.model,
+            "prompt": request.prompt,
+            "messages": request.messages,
+            "tools": request.tools,
+            "trace": request.trace.to_wire() if request.trace else None,
+        }
+
+        async with httpx.AsyncClient(timeout=self._timeout) as client:
+            response = await client.post(f"{self._base_url}/run", json=payload)
+
+        if response.status_code >= 500:
+            raise RuntimeError(
+                f"Pi wrapper HTTP {response.status_code}: {response.text[:1000]}"
+            )
+
+        # A non-JSON body (e.g. a proxy error page) must surface as the same
+        # RuntimeError the subprocess adapter raises, not as a bare decode error.
+        try:
+            data = response.json()
+        except ValueError as exc:
+            raise RuntimeError(
+                f"Pi wrapper returned invalid JSON (HTTP {response.status_code}): "
+                f"{response.text[:500]}"
+            ) from exc
+
+        if not isinstance(data, dict):
+            raise RuntimeError(f"Pi wrapper returned a non-object result: {data!r}")
+        if not data.get("ok"):
+            raise RuntimeError(f"Pi run failed: {data.get('error')}")
+
+        return HarnessResult(
+            output=data.get("output", ""),
+            session_id=data.get("sessionId"),
+            model=data.get("model"),
+        )
diff --git a/services/oss/src/agent_pi/ports.py b/services/oss/src/agent_pi/ports.py
new file mode 100644
index 0000000000..f556de8cf7
--- /dev/null
+++ b/services/oss/src/agent_pi/ports.py
@@ -0,0 +1,121 @@
+"""Ports for the agent service: the Harness seam and the Runtime (environment) seam.
+
+These interfaces keep the service harness-agnostic and environment-agnostic. The MVP
+ships Pi adapters only (local subprocess and HTTP sidecar), but the boundaries are where
+Codex/Claude Code (other harnesses) and Daytona (other environments) slot in later.
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Sequence
+
+
+@dataclass
+class ExecResult:
+    """Result of running a command through a Runtime."""
+
+    code: int
+    stdout: str
+    stderr: str
+
+
+class Runtime(ABC):
+    """Port for the run environment: where and how the harness process runs.
+
+    The local adapter runs it as a subprocess on this host. A sandbox adapter (WP-3)
+    runs it inside Daytona. ``pause`` and ``connect_volume`` are lifecycle hooks the
+    design doc calls out; the local adapter no-ops them.
+    """
+
+    @abstractmethod
+    async def start(self) -> None:
+        """Bring the environment up (no-op for a local process)."""
+
+    @abstractmethod
+    async def shutdown(self) -> None:
+        """Tear the environment down (no-op for a local process)."""
+
+    async def pause(self) -> None:
+        """Pause the environment. Optional; no-op by default."""
+        return None
+
+    async def connect_volume(self, *args: Any, **kwargs: Any) -> None:
+        """Attach a volume to the environment. Optional; no-op by default."""
+        return None
+
+    @abstractmethod
+    async def exec(
+        self,
+        command: Sequence[str],
+        input_bytes: bytes,
+        *,
+        cwd: Optional[str] = None,
+        env: Optional[Dict[str, str]] = None,
+        timeout: Optional[float] = None,
+    ) -> ExecResult:
+        """Run ``command`` in the environment, feeding ``input_bytes`` to stdin."""
+
+
+@dataclass
+class TraceContext:
+    """Agenta trace context threaded into the harness run.
+
+    Lets the harness nest its spans under the caller's workflow span (same
+    ``trace_id``) and ship them to the same Agenta backend with the same auth, so
+    the agent's whole run becomes part of the ``/invoke`` trace the way
+    completion/chat nest their LLM spans. All fields optional; with none set the
+    harness traces standalone (or not at all).
+    """
+
+    traceparent: Optional[str] = None
+    baggage: Optional[str] = None
+    endpoint: Optional[str] = None  # OTLP traces URL
+    authorization: Optional[str] = None  # full Authorization header value
+    capture_content: bool = True
+
+    def to_wire(self) -> Dict[str, Any]:
+        """Serialize to the camelCase shape the TS wrapper expects on the wire."""
+        return {
+            "traceparent": self.traceparent,
+            "baggage": self.baggage,
+            "endpoint": self.endpoint,
+            "authorization": self.authorization,
+            "captureContent": self.capture_content,
+        }
+
+
+@dataclass
+class HarnessRequest:
+    """One agent run: instructions, model, the user turn, and optional history."""
+
+    agents_md: Optional[str] = None
+    model: Optional[str] = None
+    prompt: Optional[str] = None
+    messages: List[Any] = field(default_factory=list)
+    tools: List[str] = field(default_factory=list)
+    trace: Optional[TraceContext] = None
+
+
+@dataclass
+class HarnessResult:
+    """The agent's reply plus run metadata."""
+
+    output: str
+    session_id: Optional[str] = None
+    model: Optional[str] = None
+
+
+class Harness(ABC):
+    """Port between our service and the agent engine. Pi is one implementation."""
+
+    @abstractmethod
+    async def setup(self) -> None:
+        """Prepare the harness for a run."""
+
+    @abstractmethod
+    async def invoke(self, request: HarnessRequest) -> HarnessResult:
+        """Run one turn and return the agent's reply."""
+
+    @abstractmethod
+    async def shutdown(self) -> None:
+        """Release any harness resources."""
diff --git a/services/oss/src/agent_pi/schemas.py b/services/oss/src/agent_pi/schemas.py
new file mode 100644
index 0000000000..93a22c6532
--- /dev/null
+++ b/services/oss/src/agent_pi/schemas.py
@@ -0,0 +1,71 @@
+"""JSON schemas the agent workflow advertises via ``/inspect``.
+
+The agent self-describes its interface here instead of registering a static SDK
+interface. The shape mirrors the chat workflow (messages in, a single assistant
+message out) so the playground renders a chat box and POSTs `data.inputs.messages`.
+
+Kept in its own module so it composes into the workflow registration with a one-line
+change and stays out of the handler logic.
+"""
+
+_SCHEMA = "https://json-schema.org/draft/2020-12/schema"
+
+# Default config the playground pre-fills and the agent falls back to. Kept in sync
+# with the catalog template and ``config.py`` (DEFAULT_MODEL / DEFAULT_AGENTS_MD).
+_DEFAULT_MODEL = "gpt-5.5"
+_DEFAULT_AGENTS_MD = (
+    "You are a friendly hello-world agent running on the Agenta agent service.\n\n"
+    "- Greet the user warmly.\n"
+    "- Answer the user's message in one or two short sentences."
+)
+
+# Inputs: a chat-style message list. `x-ag-type-ref: messages` is what marks the
+# workflow as chat to the playground (same marker the builtin chat service uses).
+AGENT_INPUTS_SCHEMA = {
+    "$schema": _SCHEMA,
+    "type": "object",
+    "additionalProperties": True,
+    "properties": {
+        "messages": {
+            "x-ag-type-ref": "messages",
+            "type": "array",
+            "description": "Ordered list of normalized chat messages.",
+        },
+    },
+}
+
+# Parameters: the agent config the playground renders as editable fields. Exposes
+# the two values that actually drive a run: the model and the AGENTS.md instructions.
+# `x-parameters.multiline` is the hint the playground honors to render a textarea.
+AGENT_PARAMETERS_SCHEMA = {
+    "$schema": _SCHEMA,
+    "type": "object",
+    "additionalProperties": True,
+    "properties": {
+        "model": {
+            "type": "string",
+            "default": _DEFAULT_MODEL,
+            "description": "Model the agent runs on.",
+        },
+        "agents_md": {
+            "type": "string",
+            "default": _DEFAULT_AGENTS_MD,
+            "description": "The agent's instructions (AGENTS.md).",
+            "x-parameters": {"multiline": True},
+        },
+    },
+}
+
+# Outputs: the final assistant message.
+AGENT_OUTPUTS_SCHEMA = {
+    "$schema": _SCHEMA,
+    "x-ag-type-ref": "message",
+    "type": "object",
+    "description": "Final assistant message returned by the agent.",
+}
+
+AGENT_SCHEMAS = {
+    "inputs": AGENT_INPUTS_SCHEMA,
+    "parameters": AGENT_PARAMETERS_SCHEMA,
+    "outputs": AGENT_OUTPUTS_SCHEMA,
+}
diff --git a/web/oss/src/components/pages/app-management/components/CreateAppDropdown/index.tsx b/web/oss/src/components/pages/app-management/components/CreateAppDropdown/index.tsx
index fdc6300665..93a0d0f6e0 100644
--- a/web/oss/src/components/pages/app-management/components/CreateAppDropdown/index.tsx
+++ b/web/oss/src/components/pages/app-management/components/CreateAppDropdown/index.tsx
@@ -34,6 +34,12 @@ const ITEMS: CreateAppDropdownItem[] = [
         description: "Single-shot prompt completion.",
         testId: "create-app-dropdown-completion",
     },
+    {
+        type: "agent",
+        label: "Agent",
+        description: "Agent that uses tools over multiple turns.",
+        testId: "create-app-dropdown-agent",
+    },
 ]
 
 interface CreateAppDropdownProps {
diff --git a/web/oss/src/components/pages/app-management/modals/CreateAppTypeModal/index.tsx b/web/oss/src/components/pages/app-management/modals/CreateAppTypeModal/index.tsx
index 9f07b6b354..beeda459ff 100644
--- a/web/oss/src/components/pages/app-management/modals/CreateAppTypeModal/index.tsx
+++ b/web/oss/src/components/pages/app-management/modals/CreateAppTypeModal/index.tsx
@@ -51,6 +51,12 @@ const OPTIONS: CreateAppTypeOption[] = [
         description: "Single-shot prompt completion.",
         testId: "create-app-type-modal-completion",
     },
+    {
+        type: "agent",
+        label: "Agent",
+        description: "Agent that uses tools over multiple turns.",
+        testId: "create-app-type-modal-agent",
+    },
 ]
 
 interface CreateAppTypeModalProps {
diff --git a/web/oss/src/components/pages/prompts/assets/iconHelpers.tsx b/web/oss/src/components/pages/prompts/assets/iconHelpers.tsx
index 1902e21c55..2864a5c011 100644
--- a/web/oss/src/components/pages/prompts/assets/iconHelpers.tsx
+++ b/web/oss/src/components/pages/prompts/assets/iconHelpers.tsx
@@ -1,6 +1,6 @@
 import React from "react"
 
-import {ChatDotsIcon, NoteIcon} from "@phosphor-icons/react"
+import {ChatDotsIcon, NoteIcon, RobotIcon} from "@phosphor-icons/react"
 
 import CompletionAppIcon from "../components/CompletionAppIcon"
 import SetupWorkflowIcon from "../components/SetupWorkflowIcon"
@@ -8,6 +8,8 @@ import SetupWorkflowIcon from "../components/SetupWorkflowIcon"
 export const getAppTypeIcon = (appType?: string) => {
     const normalizedType = appType?.toLowerCase()
 
+    if (normalizedType?.includes("agent"))
+        return <RobotIcon size={16} className="text-zinc-9 dark:text-white" />
     if (normalizedType?.includes("chat"))
         return <ChatDotsIcon size={16} className="text-zinc-9 dark:text-white" />
     if (normalizedType?.includes("completion"))
diff --git a/web/packages/agenta-entities/src/workflow/state/appUtils.ts b/web/packages/agenta-entities/src/workflow/state/appUtils.ts
index de72d61b38..6216e2acf6 100644
--- a/web/packages/agenta-entities/src/workflow/state/appUtils.ts
+++ b/web/packages/agenta-entities/src/workflow/state/appUtils.ts
@@ -64,7 +64,7 @@ export const appTemplatesDataAtom = atom<WorkflowCatalogTemplate[]>((get) => {
  * App types supported by the drawer flow. "custom" routes through the
  * existing CustomWorkflowModal and does NOT use this factory.
  */
-export type AppType = "chat" | "completion"
+export type AppType = "chat" | "completion" | "agent"
 
 export interface CreateEphemeralAppFromTemplateParams {
     type: AppType
@@ -206,7 +206,9 @@ export async function createEphemeralAppFromTemplate({
             is_code: false,
             is_match: false,
             is_feedback: false,
-            is_chat: type === "chat",
+            // Agent takes messages-in / returns a final message, so it runs in
+            // chat mode like `chat` (backend infers is_chat from messages-in too).
+            is_chat: type === "chat" || type === "agent",
             has_url: false,
             has_script: false,
             has_handler: false,

From 59bf20a98fda115c3cb96a0de197b140faae1587 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Tue, 16 Jun 2026 20:17:40 +0200
Subject: [PATCH 02/10] feat(agent): runnable tools as agent configuration
 (WP-7)

---
 api/oss/src/apis/fastapi/tools/models.py      |  18 ++
 api/oss/src/apis/fastapi/tools/router.py      |  89 ++++---
 api/oss/src/core/tools/dtos.py                |  62 ++++-
 api/oss/src/core/tools/exceptions.py          |  18 ++
 api/oss/src/core/tools/service.py             | 159 ++++++++++++
 .../tracing-in-the-agent-service.md           |   5 +-
 .../wp-2-agent-service/implementation-plan.md |   8 +
 .../agent-workflows/wp-2-agent-service/qa.md  | 176 +++++++++++++
 .../agent-workflows/wp-7-tools/README.md      |  97 ++++++-
 .../docker-compose/ee/docker-compose.dev.yml  |   8 +-
 services/agent/README.md                      |  38 ++-
 services/agent/docker-compose.agent.yml       |  98 -------
 services/agent/docker-compose.stack.yml       |  86 -------
 services/agent/scripts/register_agent_app.py  | 166 ------------
 services/agent/src/runPi.ts                   | 165 +++++++++++-
 services/entrypoints/agent_main.py            |  47 ----
 services/oss/src/agent.py                     | 241 +++++++++++++++++-
 services/oss/src/agent_pi/config.py           |   8 +-
 services/oss/src/agent_pi/pi_harness.py       |   4 +
 services/oss/src/agent_pi/pi_http_harness.py  |   4 +
 services/oss/src/agent_pi/ports.py            |  26 ++
 services/oss/src/agent_pi/schemas.py          |  32 ++-
 22 files changed, 1089 insertions(+), 466 deletions(-)
 create mode 100644 docs/design/agent-workflows/wp-2-agent-service/qa.md
 delete mode 100644 services/agent/docker-compose.agent.yml
 delete mode 100644 services/agent/docker-compose.stack.yml
 delete mode 100644 services/agent/scripts/register_agent_app.py
 delete mode 100644 services/entrypoints/agent_main.py

diff --git a/api/oss/src/apis/fastapi/tools/models.py b/api/oss/src/apis/fastapi/tools/models.py
index 891b276c22..768574f23c 100644
--- a/api/oss/src/apis/fastapi/tools/models.py
+++ b/api/oss/src/apis/fastapi/tools/models.py
@@ -15,6 +15,9 @@
     ToolConnectionCreate,
     # Tool Calls
     ToolResult,
+    # Agent tools
+    AgentToolReference,
+    ResolvedAgentTool,
 )
 
 
@@ -87,3 +90,18 @@ class ToolConnectionsResponse(BaseModel):
 
 class ToolCallResponse(BaseModel):
     call: ToolResult
+
+
+# ---------------------------------------------------------------------------
+# Agent tool resolution
+# ---------------------------------------------------------------------------
+
+
+class ToolResolveRequest(BaseModel):
+    tools: List[AgentToolReference] = []
+
+
+class ToolResolveResponse(BaseModel):
+    count: int = 0
+    builtins: List[str] = []
+    custom: List[ResolvedAgentTool] = []
diff --git a/api/oss/src/apis/fastapi/tools/router.py b/api/oss/src/apis/fastapi/tools/router.py
index 043d114fa7..3cc689a055 100644
--- a/api/oss/src/apis/fastapi/tools/router.py
+++ b/api/oss/src/apis/fastapi/tools/router.py
@@ -29,6 +29,9 @@
     ToolConnectionsResponse,
     #
     ToolCallResponse,
+    #
+    ToolResolveRequest,
+    ToolResolveResponse,
 )
 
 from oss.src.core.shared.dtos import Status
@@ -42,10 +45,12 @@
     ToolResultData,
 )
 from oss.src.core.tools.exceptions import (
+    ActionNotFoundError,
     AdapterError,
     ConnectionInactiveError,
     ConnectionInvalidError,
     ConnectionNotFoundError,
+    ToolSlugInvalidError,
 )
 from oss.src.core.tools.service import (
     ToolsService,
@@ -208,6 +213,14 @@ def __init__(
         )
 
         # --- Tool operations ---
+        self.router.add_api_route(
+            "/resolve",
+            self.resolve_tools,
+            methods=["POST"],
+            operation_id="resolve_agent_tools",
+            response_model=ToolResolveResponse,
+            response_model_exclude_none=True,
+        )
         self.router.add_api_route(
             "/call",
             self.call_tool,
@@ -886,6 +899,51 @@ async def callback_connection(
     # Tool Calls
     # -----------------------------------------------------------------------
 
+    @intercept_exceptions()
+    @handle_adapter_exceptions()
+    async def resolve_tools(
+        self,
+        request: Request,
+        *,
+        body: ToolResolveRequest,
+    ) -> ToolResolveResponse:
+        """Resolve an agent's tool references into model-ready specs.
+
+        Validates Composio connections up front and enriches each action from the
+        catalog, so a running agent (e.g. Pi) gets ``customTools`` whose ``execute``
+        routes back through ``POST /tools/call`` — provider keys stay server-side.
+        """
+        if is_ee():
+            has_permission = await check_action_access(
+                user_uid=request.state.user_id,
+                project_id=request.state.project_id,
+                permission=Permission.VIEW_TOOLS,
+            )
+            if not has_permission:
+                raise FORBIDDEN_EXCEPTION
+
+        try:
+            resolution = await self.tools_service.resolve_agent_tools(
+                project_id=UUID(request.state.project_id),
+                tools=body.tools,
+            )
+        except ConnectionNotFoundError as e:
+            raise HTTPException(status_code=404, detail=e.message) from e
+        except ConnectionInactiveError as e:
+            raise HTTPException(status_code=400, detail=e.message) from e
+        except ConnectionInvalidError as e:
+            raise HTTPException(status_code=400, detail=e.message) from e
+        except ToolSlugInvalidError as e:
+            raise HTTPException(status_code=400, detail=e.message) from e
+        except ActionNotFoundError as e:
+            raise HTTPException(status_code=404, detail=e.message) from e
+
+        return ToolResolveResponse(
+            count=len(resolution.builtins) + len(resolution.custom),
+            builtins=resolution.builtins,
+            custom=resolution.custom,
+        )
+
     @intercept_exceptions()
     @handle_adapter_exceptions()
     async def call_tool(
@@ -931,39 +989,12 @@ async def call_tool(
         connection_slug = slug_parts[4]
 
         try:
-            connections = await self.tools_service.query_connections(
+            connection = await self.tools_service.resolve_connection_by_slug(
                 project_id=UUID(request.state.project_id),
                 provider_key=provider_key,
                 integration_key=integration_key,
+                connection_slug=connection_slug,
             )
-
-            connection = next(
-                (c for c in connections if c.slug == connection_slug), None
-            )
-
-            if not connection:
-                raise ConnectionNotFoundError(
-                    connection_slug=connection_slug,
-                    provider_key=provider_key,
-                    integration_key=integration_key,
-                )
-
-            if not connection.is_active:
-                raise ConnectionInactiveError(connection_id=connection_slug)
-
-            if not connection.is_valid:
-                raise ConnectionInvalidError(
-                    connection_slug=connection_slug,
-                    detail="Please refresh the connection.",
-                )
-
-            if not connection.provider_connection_id:
-                raise ConnectionNotFoundError(
-                    connection_slug=connection_slug,
-                    provider_key=provider_key,
-                    integration_key=integration_key,
-                )
-
         except ConnectionNotFoundError as e:
             raise HTTPException(status_code=404, detail=e.message) from e
         except ConnectionInactiveError as e:
diff --git a/api/oss/src/core/tools/dtos.py b/api/oss/src/core/tools/dtos.py
index a588965f61..3c3f0ec53e 100644
--- a/api/oss/src/core/tools/dtos.py
+++ b/api/oss/src/core/tools/dtos.py
@@ -1,8 +1,8 @@
 from enum import Enum
-from typing import Any, Dict, List, Optional
+from typing import Annotated, Any, Dict, List, Literal, Optional, Union
 
 from agenta.sdk.models.workflows import JsonSchemas
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 from oss.src.core.shared.dtos import (
     Header,
@@ -238,3 +238,61 @@ class ToolExecutionResponse(BaseModel):
     data: Optional[Json] = None
     error: Optional[str] = None
     successful: bool = False
+
+
+# ---------------------------------------------------------------------------
+# Agent tools (config references + resolution)
+# ---------------------------------------------------------------------------
+
+# A provider-agnostic list of tool references lives under an agent revision's
+# ``parameters["tools"]``. Each entry is a discriminated union on ``type``: config
+# holds references and display metadata only, never secrets. The backend resolves
+# them into model-ready specs at invoke time (see ToolsService.resolve_agent_tools).
+
+
+class AgentBuiltinTool(BaseModel):
+    """A Pi built-in tool, referenced by name (e.g. ``read``, ``bash``)."""
+
+    type: Literal["builtin"] = "builtin"
+    name: str
+
+
+class AgentComposioTool(BaseModel):
+    """A Composio action, carrying the slug segments ``/tools/call`` parses."""
+
+    type: Literal["composio"] = "composio"
+    integration: str
+    action: str
+    connection: str
+    # Function name shown to the model. Defaults to ``{integration}__{action}``.
+    name: Optional[str] = None
+
+
+AgentToolReference = Annotated[
+    Union[AgentBuiltinTool, AgentComposioTool],
+    Field(discriminator="type"),
+]
+
+
+class ResolvedAgentTool(BaseModel):
+    """A runnable reference resolved into a model-ready tool spec.
+
+    ``call_ref`` is the ``tools.{provider}.{integration}.{action}.{connection}`` slug
+    the execution bridge sends back to ``POST /tools/call``.
+    """
+
+    name: str
+    description: Optional[str] = None
+    input_schema: Optional[Dict[str, Any]] = None
+    call_ref: str
+
+
+class AgentToolsResolution(BaseModel):
+    """Outcome of resolving an agent's ``tools`` list.
+
+    ``builtins`` pass straight into Pi's ``tools: string[]``; ``custom`` become Pi
+    ``customTools`` whose ``execute`` routes through ``/tools/call``.
+    """
+
+    builtins: List[str] = []
+    custom: List[ResolvedAgentTool] = []
diff --git a/api/oss/src/core/tools/exceptions.py b/api/oss/src/core/tools/exceptions.py
index f46c08b6cd..e9dbd54f3f 100644
--- a/api/oss/src/core/tools/exceptions.py
+++ b/api/oss/src/core/tools/exceptions.py
@@ -40,6 +40,24 @@ def __init__(
         super().__init__(msg)
 
 
+class ActionNotFoundError(ToolsError):
+    """Raised when a catalog action cannot be found for an integration."""
+
+    def __init__(
+        self,
+        *,
+        provider_key: str,
+        integration_key: str,
+        action_key: str,
+    ):
+        self.provider_key = provider_key
+        self.integration_key = integration_key
+        self.action_key = action_key
+        super().__init__(
+            f"Action not found: {provider_key}/{integration_key}/{action_key}"
+        )
+
+
 class ConnectionSlugConflictError(ToolsError):
     """Raised when a connection slug already exists for the integration."""
 
diff --git a/api/oss/src/core/tools/service.py b/api/oss/src/core/tools/service.py
index f603bc4d42..a9e1e4c779 100644
--- a/api/oss/src/core/tools/service.py
+++ b/api/oss/src/core/tools/service.py
@@ -1,3 +1,4 @@
+import re
 from typing import Any, Dict, List, Optional, Tuple
 from uuid import UUID
 
@@ -6,6 +7,11 @@
 from oss.src.core.tools.utils import make_oauth_state
 
 from oss.src.core.tools.dtos import (
+    AgentBuiltinTool,
+    AgentComposioTool,
+    AgentToolReference,
+    AgentToolsResolution,
+    ResolvedAgentTool,
     ToolCatalogAction,
     ToolCatalogActionDetails,
     ToolCatalogIntegration,
@@ -15,17 +21,27 @@
     ToolConnectionRequest,
     ToolExecutionRequest,
     ToolExecutionResponse,
+    ToolProviderKind,
 )
 from oss.src.core.tools.interfaces import (
     ToolsDAOInterface,
 )
 from oss.src.core.tools.registry import ToolsGatewayRegistry
 from oss.src.core.tools.exceptions import (
+    ActionNotFoundError,
     ConnectionInactiveError,
+    ConnectionInvalidError,
     ConnectionNotFoundError,
+    ToolSlugInvalidError,
 )
 
 
+# A slug segment is safe for the ``tools.{provider}.{integration}.{action}.{connection}``
+# call-ref. ``__`` is forbidden because ``/tools/call`` round-trips ``__`` <-> ``.`` when
+# parsing function names, so a ``__`` inside a segment would corrupt the split.
+_SLUG_SEGMENT_RE = re.compile(r"^[a-zA-Z0-9-]+(?:_[a-zA-Z0-9-]+)*$")
+
+
 log = get_module_logger(__name__)
 
 
@@ -408,3 +424,146 @@ async def execute_tool(
                 arguments=arguments,
             ),
         )
+
+    # -----------------------------------------------------------------------
+    # Connection resolution (shared by the call endpoint and the agent resolver)
+    # -----------------------------------------------------------------------
+
+    async def resolve_connection_by_slug(
+        self,
+        *,
+        project_id: UUID,
+        provider_key: str,
+        integration_key: str,
+        connection_slug: str,
+    ) -> ToolConnection:
+        """Resolve a project-scoped connection slug to a usable connection row.
+
+        Raises a domain exception when the connection is missing, inactive, invalid,
+        or never finished its provider handshake. Shared by ``call_tool`` (execution)
+        and ``resolve_agent_tools`` (up-front validation).
+        """
+        # Query all (not active-only) so an inactive connection yields a precise
+        # "inactive" error instead of an indistinguishable "not found".
+        connections = await self.query_connections(
+            project_id=project_id,
+            provider_key=provider_key,
+            integration_key=integration_key,
+            is_active=None,
+        )
+
+        connection = next(
+            (c for c in connections if c.slug == connection_slug),
+            None,
+        )
+
+        if not connection:
+            raise ConnectionNotFoundError(
+                provider_key=provider_key,
+                integration_key=integration_key,
+                connection_slug=connection_slug,
+            )
+
+        if not connection.is_active:
+            raise ConnectionInactiveError(connection_id=connection_slug)
+
+        if not connection.is_valid:
+            raise ConnectionInvalidError(
+                connection_slug=connection_slug,
+                detail="Please refresh the connection.",
+            )
+
+        if not connection.provider_connection_id:
+            raise ConnectionNotFoundError(
+                provider_key=provider_key,
+                integration_key=integration_key,
+                connection_slug=connection_slug,
+            )
+
+        return connection
+
+    # -----------------------------------------------------------------------
+    # Agent tool resolution
+    # -----------------------------------------------------------------------
+
+    async def resolve_agent_tools(
+        self,
+        *,
+        project_id: UUID,
+        tools: List[AgentToolReference],
+    ) -> AgentToolsResolution:
+        """Resolve an agent's tool references into model-ready specs.
+
+        ``builtin`` references pass through as names. ``composio`` references are
+        validated against the project's connections up front and enriched from the
+        catalog (description + input schema), so the model never sees a stale schema
+        and the invoke fails fast on a missing/invalid connection rather than mid-loop.
+        """
+        builtins: List[str] = []
+        custom: List[ResolvedAgentTool] = []
+
+        for ref in tools:
+            if isinstance(ref, AgentBuiltinTool):
+                if ref.name:
+                    builtins.append(ref.name)
+                continue
+
+            if isinstance(ref, AgentComposioTool):
+                custom.append(
+                    await self._resolve_composio_tool(
+                        project_id=project_id,
+                        ref=ref,
+                    )
+                )
+
+        return AgentToolsResolution(builtins=builtins, custom=custom)
+
+    async def _resolve_composio_tool(
+        self,
+        *,
+        project_id: UUID,
+        ref: AgentComposioTool,
+    ) -> ResolvedAgentTool:
+        """Validate one Composio reference and resolve it into a model-ready spec."""
+        provider_key = ToolProviderKind.COMPOSIO.value
+
+        for segment in (ref.integration, ref.action, ref.connection):
+            # fullmatch, not match: ``$`` also matches before a trailing newline.
+            if not _SLUG_SEGMENT_RE.fullmatch(segment):
+                raise ToolSlugInvalidError(
+                    slug=f"{provider_key}.{ref.integration}.{ref.action}.{ref.connection}",
+                    detail=f"Invalid slug segment: {segment!r}",
+                )
+
+        # Fail fast if the connection is missing/inactive/invalid for this project.
+        await self.resolve_connection_by_slug(
+            project_id=project_id,
+            provider_key=provider_key,
+            integration_key=ref.integration,
+            connection_slug=ref.connection,
+        )
+
+        action = await self.get_action(
+            provider_key=provider_key,
+            integration_key=ref.integration,
+            action_key=ref.action,
+        )
+        if not action:
+            raise ActionNotFoundError(
+                provider_key=provider_key,
+                integration_key=ref.integration,
+                action_key=ref.action,
+            )
+
+        input_schema = (action.schemas.inputs or None) if action.schemas else None
+        name = ref.name or f"{ref.integration}__{ref.action}"
+        call_ref = (
+            f"tools.{provider_key}.{ref.integration}.{ref.action}.{ref.connection}"
+        )
+
+        return ResolvedAgentTool(
+            name=name,
+            description=action.description,
+            input_schema=input_schema,
+            call_ref=call_ref,
+        )
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/tracing-in-the-agent-service.md b/docs/design/agent-workflows/wp-1-pi-tracing/tracing-in-the-agent-service.md
index 0bb4b12777..977427469c 100644
--- a/docs/design/agent-workflows/wp-1-pi-tracing/tracing-in-the-agent-service.md
+++ b/docs/design/agent-workflows/wp-1-pi-tracing/tracing-in-the-agent-service.md
@@ -92,8 +92,9 @@ network the internal hostname resolves from both; if it does not, the sidecar's
 
 ## How to verify
 
-1. Start the service (`entrypoints.agent_main:app`) with `AGENTA_HOST` and
-   `AGENTA_API_KEY` set and a Pi login or provider key available.
+1. Start the services app (`entrypoints.main:app`, which mounts the agent at
+   `/agent/v0`) with `AGENTA_HOST` and `AGENTA_API_KEY` set and a Pi login or provider
+   key available.
 2. POST a chat-style body to `/agent/v0/invoke` and read `x-ag-trace-id` from the
    response headers (it equals `trace_id` in the body).
 3. Fetch the trace and confirm the merged tree and the totals:
diff --git a/docs/design/agent-workflows/wp-2-agent-service/implementation-plan.md b/docs/design/agent-workflows/wp-2-agent-service/implementation-plan.md
index 81f8cb6e88..f905fd6d7a 100644
--- a/docs/design/agent-workflows/wp-2-agent-service/implementation-plan.md
+++ b/docs/design/agent-workflows/wp-2-agent-service/implementation-plan.md
@@ -4,6 +4,14 @@ Status: MVP built and verified by curl (2026-06-15). Decisions below were taken;
 "Implemented" section records what shipped. Original decision points are kept marked
 **[DECISION]** for history.
 
+> Note (current state): the sections below describe the iterative MVP, including a
+> standalone entrypoint (`agent_main.py`) and dedicated composes
+> (`docker-compose.agent.yml`, `docker-compose.stack.yml`). Those were **removed** in
+> favor of the integrated path only: the agent is mounted in `entrypoints/main.py` at
+> `/agent/v0` and the `agent-pi` sidecar lives in
+> `hosting/docker-compose/ee/docker-compose.dev.yml`. The standalone run commands below
+> are historical. See `qa.md` for the rationale.
+
 ## Implemented (MVP, verified by curl)
 
 Per the decisions: a Python service exposes the Agenta `/invoke` contract (auth,
diff --git a/docs/design/agent-workflows/wp-2-agent-service/qa.md b/docs/design/agent-workflows/wp-2-agent-service/qa.md
new file mode 100644
index 0000000000..b7d25221d9
--- /dev/null
+++ b/docs/design/agent-workflows/wp-2-agent-service/qa.md
@@ -0,0 +1,176 @@
+# Agent service: Q&A
+
+Running notes answering review questions about the agent workflow implementation
+(branch `feat/agent-workflows`). Questions are in no particular order.
+
+---
+
+## Q: Why a separate entrypoint `agent_main.py` instead of `main.py`?
+
+Short answer: `agent_main.py` is not a replacement for `main.py`. It is an extra,
+lightweight runner for testing the agent in isolation. The real integration lives in
+`main.py`, and that is what the 8280 stack actually runs.
+
+The two entrypoints:
+
+- `services/entrypoints/main.py` is the full services app. It mounts every service
+  (chat, completion, all the managed evaluators, and now the agent at `/agent/v0`). This
+  is the production/dev container entrypoint and the path the playground uses
+  (`/services/agent/v0/...`). The agent is a first-class part of it:
+  `app.mount("/agent/v0", agent_v0_app)`.
+
+- `services/entrypoints/agent_main.py` mounts only the agent app plus `/health`.
+
+Why we added `agent_main.py`:
+
+1. Isolated, fast iteration. Early on the deliverable was "a standalone agent service
+   verified by curl" (no full stack). Running `main.py` pulls in the whole managed
+   evaluator surface (litellm, all the builtins) and `ag.init()` for the full app.
+   `agent_main.py` lets you run just the agent:
+   `uv run uvicorn entrypoints.agent_main:app --port 8090` and curl it, without the rest.
+
+2. The dedicated `:8092` Docker compose. Before the agent was integrated into the real
+   stack, it ran standalone in its own compose. That container ran `agent_main.py`.
+
+3. A place for cross-origin CORS. When the playground had to call the agent on a
+   different port (`:8092` vs the web on `:8280`), the browser needed a credentialed CORS
+   policy (echo the specific origin + allow credentials). `agent_main.py` set that
+   (`allow_origin_regex` + `allow_credentials=True`). `main.py` keeps the stricter
+   shared services CORS, which is fine for it because, once integrated, the agent is
+   served same-origin (`/services/agent/v0`) so there is no CORS at all.
+
+Net: `main.py` is the real, integrated path (same-origin, used by the 8280 stack).
+`agent_main.py` was a convenience runner for isolated local/standalone testing and the
+old dedicated compose.
+
+**Update (decision): dropped.** We removed `agent_main.py` and the two standalone
+composes (`docker-compose.agent.yml`, `docker-compose.stack.yml`) to keep only the
+integrated path: the agent mounted in `entrypoints/main.py` at `/agent/v0`, served by
+the normal services container, with the `agent-pi` sidecar wired into
+`hosting/docker-compose/ee/docker-compose.dev.yml`. If we ever want isolated runs again,
+the cleaner approach is a profile/override on the real compose rather than a parallel
+entrypoint.
+
+---
+
+## Q: How does the agent service use the workflow middleware? Which parts does it have access to (secrets, invoke, inspect, ...)?
+
+The agent gets the whole Agenta workflow machinery "for free" because it is built the
+same way as chat and completion: `ag.create_app()` + `ag.workflow(schemas=...)` +
+`ag.route("/", flags={"is_chat": True})` in `services/oss/src/agent.py`. That was the
+point of the Python-front decision: the Python layer provides auth, middleware,
+tracing, secrets, and the invoke/inspect contract; the Node wrapper only runs Pi.
+
+There are **two middleware layers**.
+
+### Layer 1 — HTTP/ASGI middleware (per request)
+
+Added by `ag.create_app()` (`sdks/.../decorators/routing.py:64`). Outermost first:
+
+- **CORSMiddleware** — cross-origin headers. Irrelevant on the integrated same-origin
+  path; it mattered only for the old cross-port setup.
+- **AuthMiddleware** — verifies the caller against `{host}/api/access/permissions/check`
+  and puts the resolved credential on `request.state.auth["credentials"]` (a signed
+  `Secret`). With `AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED=false` it passes the raw
+  `Authorization` through without a remote check. This is the credential everything
+  downstream uses.
+- **OTelMiddleware** — opens the request's tracing context, i.e. the workflow span the
+  whole run nests under.
+
+### Layer 2 — Workflow middleware (inside `wf.invoke`)
+
+Set on the workflow object (`decorators/running.py:197`), run in order around the
+handler:
+
+- **VaultMiddleware** — resolves secrets for the credential: it fetches the project's
+  vault secrets from `{api_url}/secrets/`, combines them with any local secrets, checks
+  access, and exposes them on the running context. (More on "access" below.)
+- **ResolverMiddleware** — resolves which handler to run from the revision URI, hydrates
+  references / revision / config from the backend when needed, and resolves embeds in
+  parameters.
+- **NormalizerMiddleware** — maps the request to the handler's arguments by inspecting
+  its signature (`inputs`, `messages`, `parameters` pulled from `data`), calls
+  `_agent(...)`, and wraps the return value into the response envelope, attaching
+  `trace_id` / `span_id`.
+
+### What the agent actually has access to / uses
+
+- **invoke** — yes, fully. `POST /services/agent/v0/invoke` runs the entire chain
+  (auth -> vault -> resolver -> normalizer -> `_agent`). `_agent` receives `inputs`,
+  `messages`, and `parameters` already mapped for it.
+- **inspect** — yes. `POST /services/agent/v0/inspect` returns the agent's interface,
+  i.e. `AGENT_SCHEMAS` (chat `messages` in, `message` out, config = `model` +
+  `agents_md`). This is what tells the playground to render a chat box and the two
+  config fields. (Known bug: inspect currently returns HTTP 500 under session-cookie
+  auth; this did not block the playground because the create flow takes the schema
+  from the catalog template.)
+- **auth / credentials** — yes. The resolved `Secret` credential is available to the
+  handler and to tracing export.
+- **tracing** — yes. `_agent` reads the active workflow span via `_trace_context()` and
+  threads the `traceparent` (plus endpoint/auth) to the Pi sidecar, so the Pi spans
+  nest under the `/invoke` span in one trace.
+- **secrets** — available but **not consumed yet**. VaultMiddleware resolves the
+  project's secrets on every invoke and exposes them on the running context. Chat and
+  completion use them automatically because litellm reads them. The agent handler does
+  not read them today; the Pi model auth currently comes from the mounted
+  `~/.pi/agent` (Codex login) or `AGENTA_API_KEY`/provider env on the sidecar. Wiring
+  the resolved secrets into the Pi run (the "startup hook injects the provider/tool
+  keys" step) is exactly where this plugs in: read the secrets in `_agent`, pass them in
+  the harness request, and have the wrapper inject them (`setRuntimeApiKey` / env). That
+  is the planned secrets work, not yet built.
+
+One detail: the route passes `secrets=None` into `wf.invoke`, so the agent does not
+hand secrets in; VaultMiddleware fetches them itself from the credential. The gap is
+only on the consuming side (the handler), not the resolving side.
+
+---
+
+## Q: Why does tracing look different / broken now vs the old trace?
+
+Reference old trace `6ab51033...`: root `invoke_agent`, four `turn`s, several
+`chat gpt-5.5` spans, and `execute_tool ls/read/bash/write` — 14 spans, with
+cumulative token + cost rolled up onto the `turn` and `invoke_agent` spans.
+
+Current trace (e.g. `329698f7...`): `_agent -> invoke_agent -> turn 0 -> chat` — 4
+spans; the `chat` span has tokens + cost, the parents do not.
+
+Tracing is **not broken** (spans land, nest correctly, the `chat` span carries model,
+tokens, cost). Two things changed:
+
+### 1. Different agent and task (the big, expected difference)
+
+The old trace is the WP-1 POC: tools enabled (`read/bash/edit/write/ls`) and a task
+that needs them ("read notes.txt, write greeting.txt"). That drives a multi-turn loop
+with tool calls, so you get many turns, many `chat` spans, and `execute_tool` spans.
+
+The current app is the hello-world chat agent: `tools=[]` and "answer in one or two
+short sentences". So it does exactly one turn, no tools, one `chat`. Same
+instrumentation, a trivial run. To get a rich trace again, give the agent tools
+(built-in `read/bash/...` or the WP-7 runnable tools) and a task that uses them.
+
+### 2. Cumulative token/cost rollup is lost across the process boundary (a real regression)
+
+In the old (standalone) trace, all spans were exported by one process in one batch, so
+Agenta's per-ingest-batch cumulative computation could build the roll-up tree and put
+cumulative tokens/cost on `turn` and `invoke_agent`.
+
+Now the trace is split across **two exporters**:
+- Python (services container) exports `_agent` (the workflow span).
+- Node (`agent-pi`) exports `invoke_agent -> turn -> chat` (the Pi spans), where
+  `invoke_agent`'s parent is the **remote** `_agent`.
+
+Agenta builds the cumulative tree per ingest batch and "attaches a span only if its
+parent is already seen" (see the `orderParentFirst` comment in `agenta-otel.ts`). In the
+Node batch, `invoke_agent`'s parent (`_agent`) is in the **other** (Python) batch, so the
+Pi subtree is dropped from the cumulative tree. Result: the leaf `chat` keeps its raw
+`incremental` tokens, but `cumulative` is missing on `chat` and there is no token/cost
+rollup on `turn` / `invoke_agent` / `_agent`. (Duration still rolls up because it is
+computed differently.)
+
+So the agent- and turn-level token/cost totals you used to see are gone. This is a
+side effect of nesting the agent under the Agenta workflow span (the integration goal).
+The fix belongs on the tracing side (owned by the instrumentation work): compute the
+cumulative roll-up across the whole trace by `trace_id` rather than per ingest batch, so
+a trace split between the Python workflow span and the Node Pi spans still aggregates.
+Until then, per-span (leaf `chat`) tokens/cost are correct; the rolled-up agent totals
+are not.
diff --git a/docs/design/agent-workflows/wp-7-tools/README.md b/docs/design/agent-workflows/wp-7-tools/README.md
index 225c77eb26..483f5dc688 100644
--- a/docs/design/agent-workflows/wp-7-tools/README.md
+++ b/docs/design/agent-workflows/wp-7-tools/README.md
@@ -1,6 +1,8 @@
 # WP-7: Runnable tools as agent configuration
 
-Status: design draft. Builds on WP-2 (agent service) and WP-6 (workflow type and template).
+Status: Composio MVP implemented. Resolution lives in `api`; the bridge routes Pi tool
+calls back through `POST /tools/call`. Builds on WP-2 (agent service) and WP-6 (workflow
+type and template). See [Implementation status](#implementation-status-composio-mvp) below.
 
 ## Goal
 
@@ -205,6 +207,99 @@ dispatches purely by `provider_key` through the registry, the agent side stays p
   smoke run, with the call nested under the agent invoke span and the Composio key absent from the
   sandbox.
 
+## Implementation status (Composio MVP)
+
+What landed, by seam. WP-6 is not started, so resolution runs in `api` behind a thin
+endpoint that the agent service calls over HTTP; when WP-6 lands, its invoke path calls the
+same `ToolsService.resolve_agent_tools(...)` in-process and the HTTP hop drops out.
+
+**Backend (`api`) — the resolver and the shared connection lookup.**
+
+- `core/tools/dtos.py`: `AgentToolReference` (discriminated `builtin` | `composio`),
+  `ResolvedAgentTool` (`name`, `description`, `input_schema`, `call_ref`), and
+  `AgentToolsResolution` (`builtins`, `custom`).
+- `core/tools/service.py`: `resolve_connection_by_slug(...)` (extracted from `call_tool`, now
+  shared) and `resolve_agent_tools(...)`. Composio refs validate the connection up front,
+  enrich `description` + `input_schema` from the catalog (`get_action`), and build the
+  `call_ref` `tools.composio.{integration}.{action}.{connection}`. Slug segments are validated
+  and `__` is rejected so the `/tools/call` `__`↔`.` round-trip can't corrupt the split.
+- `apis/fastapi/tools/router.py`: `POST /tools/resolve` (project-scoped, EE `VIEW_TOOLS`)
+  returns the resolution; `call_tool` now reuses `resolve_connection_by_slug`. `call_tool` is
+  otherwise unchanged as the execution endpoint.
+
+**Agent service (`services/oss`) — thin driver.**
+
+- `agent_pi/ports.py`: `ToolCallback` (endpoint + authorization) and `custom_tools` /
+  `tool_callback` on `HarnessRequest`, serialized onto the wire by both harness adapters.
+- `agent.py`: reads `parameters["tools"]` (or the file config), POSTs them to `/tools/resolve`,
+  and threads the result plus a `/tools/call` callback into the harness. The callback endpoint
+  and credential reuse the OTLP-credential mechanism (`inject()` Authorization, API-base derived
+  from `ag.tracing.otlp_url`, with `AGENTA_AGENT_TOOLS_API_URL` / `AGENTA_API_KEY` fallbacks). An
+  agent with no tools never touches the backend, preserving the tool-less WP-2 path.
+
+**TS wrapper (`services/agent`) — the bridge.**
+
+- `runPi.ts`: `buildCustomTools(...)` turns each resolved spec into a Pi `customTool` whose
+  `execute` does one `POST {endpoint}` with the OpenAI envelope
+  `{ data: { id, type, function: { name: callRef, arguments } } }` and the callback
+  Authorization. Arguments go as an object (no double-encoding); the result `content` returns
+  verbatim; an HTTP/timeout failure throws, which Pi turns into a tool-error result rather than a
+  run failure. Custom tool names are added to the `createAgentSession` `tools` allowlist, because
+  the allowlist gates custom tools too (an empty allowlist would hide them).
+
+**Config schema as shipped.** Under the agent revision `parameters["tools"]`, each entry is a
+built-in tool name (string, normalized to `{"type": "builtin", "name": ...}`) or a discriminated
+object. Example:
+
+```json
+{
+  "model": "gpt-5.5",
+  "tools": [
+    "read_file",
+    { "type": "composio", "integration": "gmail", "action": "GMAIL_SEND_EMAIL",
+      "connection": "gmail-team", "name": "gmail__SEND_EMAIL" }
+  ]
+}
+```
+
+**Playground integration: reuse the existing tool picker.** The chat/completion tool picker
+only renders inside the prompt control, which the playground shows for a config field marked
+`x-ag-type-ref: "prompt-template"`. So the agent advertises its config as a `prompt` field of
+type prompt-template (`agent_pi/schemas.py`) instead of a bespoke form: the playground then renders
+the same model selector + system-message editor + tool picker, with no new frontend code. The
+handler (`agent.py` `_resolve_run_config`) reads the system message as the AGENTS.md, the model
+and tools from `prompt.llm_config`, and still accepts the flat `{model, agents_md, tools}` an API
+caller may send. The picker encodes a Composio action as a gateway function name,
+`tools__{provider}__{integration}__{action}__{connection}` (connection = the connection slug);
+`agent.py` `_parse_gateway_slug` turns that into the same `composio` ref the resolver already
+takes, so no backend change was needed. Non-Composio picker entries (provider built-ins, inline
+functions) are skipped.
+
+**Verified live (2026-06-16, dev stack, pi-agents project).** A real GitHub Composio connection
+(`github-tvn`) plus a `GET_THE_AUTHENTICATED_USER` reference, passed via `parameters["tools"]` to
+the agent `/invoke`, drove the whole path: `/tools/resolve` built the spec, Pi registered the
+`github_whoami` customTool, called it, and the bridge executed the real action through
+`/tools/call`. The agent answered with live data (login `mmabrouk`, follower count, public-repo
+count) that only comes from executing the action. The trace nests the tool call correctly:
+`_agent → invoke_agent → turn 0 → {chat, execute_tool github_whoami} → turn 1 → chat`. The same
+run also works end to end from the playground: the picker shows the GitHub tool as a gateway card,
+and Run returns the live answer.
+
+Earlier unit-level checks still hold: the resolver builds correct specs and raises the right
+errors for missing / inactive / invalid connections, bad slugs, and missing actions; the bridge
+sends the right envelope, forwards Authorization, sends object-form arguments, returns content
+verbatim, and throws on HTTP error; Pi's validator accepts and coerces the plain Composio JSON
+Schema.
+
+**Deployment hardening found and fixed.** The definition of done (DoD) requires the Composio key to be absent from the sandbox.
+The WP-7 *data path* already guarantees this (the key is never sent to Pi). But the dev
+`agent-pi` sidecar was loading the whole stack `env_file`, so the container inherited
+`COMPOSIO_API_KEY` and other secrets anyway. Dropping `env_file` from the `agent-pi` service in
+`hosting/docker-compose/ee/docker-compose.dev.yml` (it reads only `PORT`, `PI_CODING_AGENT_DIR`,
+`AGENTA_HOST`, `AGENTA_API_KEY`, and two optional vars; Pi auth comes from the mounted login) makes
+the property hold in the local sidecar too. A real sandbox (WP-3 Daytona) is isolated and never
+sees these secrets.
+
 ## Links
 
 - [`wp-2-agent-service/`](../wp-2-agent-service/README.md)
diff --git a/hosting/docker-compose/ee/docker-compose.dev.yml b/hosting/docker-compose/ee/docker-compose.dev.yml
index 27974d996f..e09b82b29f 100644
--- a/hosting/docker-compose/ee/docker-compose.dev.yml
+++ b/hosting/docker-compose/ee/docker-compose.dev.yml
@@ -428,8 +428,12 @@ services:
             sh -c "mkdir -p /pi-agent && cp -a /pi-agent-ro/. /pi-agent/ 2>/dev/null || true;
             exec node_modules/.bin/tsx src/server.ts"
         # === CONFIGURATION ======================================== #
-        env_file:
-            - ${ENV_FILE:-./.env.ee.dev}
+        # Deliberately NO env_file: the Pi sandbox must not inherit the stack's
+        # secrets (COMPOSIO_API_KEY, STRIPE/POSTHOG/GOOGLE/DAYTONA keys, ...). Tools
+        # run server-side via /tools/call, so the sandbox only needs its own port,
+        # the Pi login (mounted below), and the OTLP export fallback. The wrapper
+        # reads exactly: PORT, PI_CODING_AGENT_DIR, AGENTA_HOST, AGENTA_API_KEY, and
+        # the optional AGENTA_AGENT_TOOL_CALL_TIMEOUT_MS / OTEL_SERVICE_NAME.
         environment:
             PORT: "8765"
             PI_CODING_AGENT_DIR: /pi-agent
diff --git a/services/agent/README.md b/services/agent/README.md
index c920de19f9..f566acb704 100644
--- a/services/agent/README.md
+++ b/services/agent/README.md
@@ -1,4 +1,4 @@
-# Agent service: Pi wrapper (WP-2)
+# Agent service: Pi wrapper (WP-2 + WP-7)
 
 This is the TypeScript side of the agent workflow service. It is a thin wrapper that
 drives the [Pi](https://pi.dev) agent harness for a single run. The Python service
@@ -54,6 +54,34 @@ With no `trace` block the run is traced standalone using `AGENTA_HOST` /
 `AGENTA_API_KEY`, or not at all when neither is set. The extension lives in
 `src/agenta-otel.ts`.
 
+## Tools (WP-7)
+
+The agent's runnable tools are resolved in the backend (not here) and arrive on the
+request as `customTools` plus a `toolCallback`. `buildCustomTools` in `src/runPi.ts`
+turns each spec into a Pi `customTool` whose `execute` does one
+`POST {toolCallback.endpoint}` (Agenta's `/tools/call`) with the `callRef` slug and the
+threaded `authorization`. Pi drives the loop and runs each tool's `execute` in-process; the provider
+key and connection auth stay server-side behind `/tools/call` and never enter this
+sandbox. See `docs/design/agent-workflows/wp-7-tools/README.md`.
+
+```json
+{
+  "prompt": "What is my GitHub username?",
+  "customTools": [
+    {
+      "name": "github__GET_THE_AUTHENTICATED_USER",
+      "description": "Gets the authenticated GitHub user.",
+      "inputSchema": {"type": "object", "properties": {}},
+      "callRef": "tools.composio.github.GET_THE_AUTHENTICATED_USER.github-tvn"
+    }
+  ],
+  "toolCallback": {
+    "endpoint": "https://host/api/tools/call",
+    "authorization": "ApiKey ..."
+  }
+}
+```
+
 ## Auth
 
 `AuthStorage.create()` reads `~/.pi/agent/auth.json`. Log in once with `pnpm exec pi`
@@ -68,6 +96,8 @@ echo '{"agentsMd":"You are a hello-world agent.","prompt":"Hi"}' | pnpm run run:
 
 ## Config
 
-`config/AGENTS.md` and `config/agent.json` hold the hardcoded MVP config. They are read
-by the Python service and passed into the request, so editing them changes the agent
-without a code change.
+The live config comes from the agent revision in the playground: a `prompt-template`
+whose system message is the AGENTS.md, with the model and the picked tools under
+`llm_config`. The Python service (`services/oss/src/agent.py`) reads that and fills the
+request. `config/AGENTS.md` and `config/agent.json` are only the file fallback used when
+the request carries no config.
diff --git a/services/agent/docker-compose.agent.yml b/services/agent/docker-compose.agent.yml
deleted file mode 100644
index 43f733d1c7..0000000000
--- a/services/agent/docker-compose.agent.yml
+++ /dev/null
@@ -1,98 +0,0 @@
-# Dedicated, self-contained compose for the agent service (WP-2).
-#
-# Runs the agent fully in Docker, invokable by curl, without touching any other stack:
-#
-#   agent-pi   - the TypeScript Pi wrapper as an HTTP sidecar. Uses the local Pi login
-#                (~/.pi/agent) copied in at startup so token refresh never writes to the
-#                host. Reachable only on the internal network.
-#   agent-api  - the Python agent service (reuses the prebuilt services dev image). Speaks
-#                the Agenta /invoke contract and calls agent-pi over HTTP. Published on a
-#                host port for curl.
-#
-# Bring up:
-#   docker compose -f services/agent/docker-compose.agent.yml up --build
-# Verify:
-#   curl localhost:8092/health
-#   curl -X POST localhost:8092/agent/v0/invoke -H 'Content-Type: application/json' \
-#     -d '{"data":{"inputs":{"messages":[{"role":"user","content":"hi"}]}}}'
-# Tear down:
-#   docker compose -f services/agent/docker-compose.agent.yml down
-
-name: agenta-agent
-
-services:
-    agent-pi:
-        build:
-            context: .
-            dockerfile: docker/Dockerfile.dev
-        # Copy the read-only mounted login into a writable container path so OAuth token
-        # refresh works and never writes back to the host ~/.pi/agent.
-        command: >
-            sh -c "mkdir -p /pi-agent && cp -a /pi-agent-ro/. /pi-agent/ 2>/dev/null || true;
-            exec node_modules/.bin/tsx watch src/server.ts"
-        environment:
-            PORT: "8765"
-            PI_CODING_AGENT_DIR: /pi-agent
-            # Tracing export fallback when the request carries no Authorization
-            # (auth disabled locally). Must be reachable from this container.
-            AGENTA_HOST: ${AGENTA_HOST:-http://144.76.237.122:8280}
-            AGENTA_API_KEY: ${AGENTA_API_KEY:-}
-        volumes:
-            - ./src:/app/src
-            - ${HOME}/.pi/agent:/pi-agent-ro:ro
-        networks:
-            - agent-net
-        restart: unless-stopped
-
-    agent-api:
-        # Built from the current services dev Dockerfile (Python 3.13, current SDK +
-        # deps). A dedicated tag so we never clobber other stacks' images.
-        image: agenta-agent-api:dev
-        build:
-            context: ../..
-            dockerfile: services/oss/docker/Dockerfile.dev
-        command:
-            [
-                "uvicorn",
-                "entrypoints.agent_main:app",
-                "--host",
-                "0.0.0.0",
-                "--port",
-                "8080",
-                "--reload",
-                "--reload-dir",
-                "/app/oss/src",
-                "--reload-dir",
-                "/app/entrypoints",
-                "--reload-dir",
-                "/sdks/python/agenta",
-                "--reload-exclude",
-                "*.pyc",
-                "--reload-exclude",
-                "__pycache__",
-            ]
-        environment:
-            # Local curl: skip the remote credential check (the Python layer still runs
-            # its auth/middleware stack, it just passes the header through).
-            AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED: "false"
-            # Drives the harness selection: HTTP harness -> the agent-pi sidecar.
-            AGENTA_AGENT_PI_URL: http://agent-pi:8765
-            # Tracing export target. Must be reachable from this container AND from the
-            # agent-pi sidecar (the endpoint is passed across to nest the Pi spans), so
-            # use the host IP, not localhost. The API key authorizes the OTLP export.
-            AGENTA_HOST: ${AGENTA_HOST:-http://144.76.237.122:8280}
-            AGENTA_API_KEY: ${AGENTA_API_KEY:-}
-        volumes:
-            - ..:/app
-            - ../../sdks/python:/sdks/python
-            - ../../clients/python:/clients/python
-        ports:
-            - "8092:8080"
-        depends_on:
-            - agent-pi
-        networks:
-            - agent-net
-        restart: unless-stopped
-
-networks:
-    agent-net:
diff --git a/services/agent/docker-compose.stack.yml b/services/agent/docker-compose.stack.yml
deleted file mode 100644
index 774e942517..0000000000
--- a/services/agent/docker-compose.stack.yml
+++ /dev/null
@@ -1,86 +0,0 @@
-# Same-origin demo: the agent served exactly like chat and completion.
-#
-# Runs the FULL services app (entrypoints.main, which now mounts /agent/v0 next to
-# /chat/v0 and /completion/v0) behind its own traefik, so the agent answers at
-# {origin}/services/agent/v0/invoke just like {origin}/services/chat/v0/invoke. The
-# Pi sidecar is called in-network. This is the integration; a full dev stack (with the
-# web app) would serve the playground at the same origin so there is no CORS at all.
-#
-# Bring up (creds for tracing/export come from the shell):
-#   set -a && source .env.test.local && set +a
-#   docker compose -f services/agent/docker-compose.stack.yml up --build -d
-# Verify:
-#   curl -X POST localhost:8480/services/agent/v0/invoke -H 'content-type: application/json' \
-#     -d '{"data":{"inputs":{"messages":[{"role":"user","content":"hi"}]}}}'
-
-name: agenta-agent-stack
-
-services:
-    traefik:
-        image: traefik:2
-        command:
-            - --providers.docker
-            - --providers.docker.constraints=Label(`com.docker.compose.project`,`agenta-agent-stack`)
-            - --entrypoints.web.address=:80
-        volumes:
-            - /var/run/docker.sock:/var/run/docker.sock
-        ports:
-            - "8480:80"
-        networks:
-            - stack-net
-        restart: unless-stopped
-
-    services:
-        image: agenta-agent-api:dev
-        command:
-            [
-                "uvicorn",
-                "entrypoints.main:app",
-                "--host",
-                "0.0.0.0",
-                "--port",
-                "8080",
-                "--root-path",
-                "/services",
-            ]
-        environment:
-            AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED: "false"
-            AGENTA_AGENT_PI_URL: http://agent-pi:8765
-            AGENTA_HOST: ${AGENTA_HOST:-http://144.76.237.122:8280}
-            AGENTA_API_KEY: ${AGENTA_API_KEY:-}
-        volumes:
-            - ..:/app
-            - ../../sdks/python:/sdks/python
-            - ../../clients/python:/clients/python
-        networks:
-            - stack-net
-        labels:
-            - "traefik.http.routers.aservices.rule=PathPrefix(`/services/`)"
-            - "traefik.http.routers.aservices.entrypoints=web"
-            - "traefik.http.middlewares.aservices-strip.stripprefix.prefixes=/services"
-            - "traefik.http.middlewares.aservices-strip.stripprefix.forceslash=true"
-            - "traefik.http.routers.aservices.middlewares=aservices-strip"
-            - "traefik.http.services.aservices.loadbalancer.server.port=8080"
-        restart: unless-stopped
-
-    agent-pi:
-        build:
-            context: .
-            dockerfile: docker/Dockerfile.dev
-        command: >
-            sh -c "mkdir -p /pi-agent && cp -a /pi-agent-ro/. /pi-agent/ 2>/dev/null || true;
-            exec node_modules/.bin/tsx src/server.ts"
-        environment:
-            PORT: "8765"
-            PI_CODING_AGENT_DIR: /pi-agent
-            AGENTA_HOST: ${AGENTA_HOST:-http://144.76.237.122:8280}
-            AGENTA_API_KEY: ${AGENTA_API_KEY:-}
-        volumes:
-            - ./src:/app/src
-            - ${HOME}/.pi/agent:/pi-agent-ro:ro
-        networks:
-            - stack-net
-        restart: unless-stopped
-
-networks:
-    stack-net:
diff --git a/services/agent/scripts/register_agent_app.py b/services/agent/scripts/register_agent_app.py
deleted file mode 100644
index 1e73c0515f..0000000000
--- a/services/agent/scripts/register_agent_app.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# /// script
-# requires-python = ">=3.11"
-# dependencies = ["requests"]
-# ///
-"""Register the agent as an app in a running Agenta stack, pointing at the dockerized
-agent service. Run it, then open the app in the playground and chat.
-
-It creates a workflow + default variant and commits a revision whose `data.url` points
-at the agent service and whose `data.schemas` is the chat interface the agent serves
-from /inspect (so the playground renders a chat box). This is the "custom workflow"
-path: no static SDK interface, the agent self-describes.
-
-Env:
-  AGENTA_HOST     base host (default http://144.76.237.122:8280)
-  AGENTA_API_KEY  api key for that stack (Authorization: ApiKey ...)
-  AGENT_URL       agent service invoke base (default http://144.76.237.122:8092/agent/v0)
-  PROJECT_ID      optional; defaults to the stack's default project
-  APP_SLUG        optional; defaults to wp2-agent-<n>
-
-Usage:
-  AGENTA_API_KEY=... uv run services/agent/scripts/register_agent_app.py
-"""
-
-import os
-import secrets
-import sys
-
-import requests
-
-HOST = os.environ.get("AGENTA_HOST", "http://144.76.237.122:8280").rstrip("/")
-API = HOST + "/api"
-KEY = os.environ.get("AGENTA_API_KEY")
-AGENT_URL = os.environ.get("AGENT_URL", "http://144.76.237.122:8092/agent/v0").rstrip(
-    "/"
-)
-PROJECT_ID = os.environ.get("PROJECT_ID")
-APP_SLUG = os.environ.get("APP_SLUG") or f"wp2-agent-{secrets.token_hex(3)}"
-
-if not KEY:
-    sys.exit("Set AGENTA_API_KEY")
-
-H = {"Authorization": f"ApiKey {KEY}", "Content-Type": "application/json"}
-
-# The chat interface the agent advertises via /inspect (kept in sync with
-# services/oss/src/agent_pi/schemas.py).
-SCHEMA = "https://json-schema.org/draft/2020-12/schema"
-AGENT_SCHEMAS = {
-    "inputs": {
-        "$schema": SCHEMA,
-        "type": "object",
-        "additionalProperties": True,
-        "properties": {
-            "messages": {
-                "x-ag-type-ref": "messages",
-                "type": "array",
-                "description": "Ordered list of normalized chat messages.",
-            }
-        },
-    },
-    "parameters": {
-        "$schema": SCHEMA,
-        "type": "object",
-        "additionalProperties": True,
-        "properties": {"model": {"type": "string", "description": "Model override."}},
-    },
-    "outputs": {
-        "$schema": SCHEMA,
-        "x-ag-type-ref": "message",
-        "type": "object",
-        "description": "Final assistant message returned by the agent.",
-    },
-}
-
-
-def _id() -> str:
-    return secrets.token_hex(6)
-
-
-def post(path: str, body: dict) -> dict:
-    r = requests.post(
-        f"{API}{path}",
-        json=body,
-        headers=H,
-        params={"project_id": PROJECT_ID},
-        timeout=60,
-    )
-    if r.status_code >= 300:
-        sys.exit(f"POST {path} -> {r.status_code}: {r.text[:600]}")
-    return r.json()
-
-
-def main() -> None:
-    global PROJECT_ID
-    if not PROJECT_ID:
-        projects = requests.get(f"{API}/projects", headers=H, timeout=30).json()
-        default = next(
-            (p for p in projects if p.get("is_default_project")), projects[0]
-        )
-        PROJECT_ID = default["project_id"]
-    print(f"project_id={PROJECT_ID}  app_slug={APP_SLUG}  agent_url={AGENT_URL}")
-
-    wf = post(
-        "/workflows/",
-        {
-            "workflow": {
-                "slug": APP_SLUG,
-                "name": APP_SLUG,
-                "flags": {"is_application": True},
-            }
-        },
-    )
-    workflow_id = wf["workflow"]["id"]
-
-    var = post(
-        "/workflows/variants/",
-        {
-            "workflow_variant": {
-                "workflow_id": workflow_id,
-                "slug": f"{APP_SLUG}.default",
-                "name": "default",
-            }
-        },
-    )
-    variant_id = var["workflow_variant"]["id"]
-
-    # Seed v0 (tables dismiss v0), then commit v1 with the real data.
-    post(
-        "/workflows/revisions/commit",
-        {
-            "workflow_revision": {
-                "workflow_id": workflow_id,
-                "workflow_variant_id": variant_id,
-                "slug": _id(),
-                "name": "default",
-                "message": "Initial commit",
-            }
-        },
-    )
-    rev = post(
-        "/workflows/revisions/commit",
-        {
-            "workflow_revision": {
-                "workflow_id": workflow_id,
-                "workflow_variant_id": variant_id,
-                "slug": _id(),
-                "name": "default",
-                "message": "Agent service",
-                "flags": {"is_chat": True},
-                "data": {
-                    "url": AGENT_URL,
-                    "parameters": {"model": "gpt-5.5"},
-                    "schemas": AGENT_SCHEMAS,
-                },
-            }
-        },
-    )
-    revision = rev["workflow_revision"]
-    print(f"workflow_id={workflow_id}")
-    print(f"variant_id={variant_id}")
-    print(f"revision_id={revision['id']}  flags={revision.get('flags')}")
-    print(f"stored url={revision.get('data', {}).get('url')}")
-    print(f"\nOpen the playground: {HOST}/apps/{workflow_id}/playground")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/services/agent/src/runPi.ts b/services/agent/src/runPi.ts
index cabf603701..4056d0dce7 100644
--- a/services/agent/src/runPi.ts
+++ b/services/agent/src/runPi.ts
@@ -4,8 +4,9 @@
  * This is the concrete "harness" behind the service's Harness port. It drives the
  * Pi SDK (`createAgentSession`) for a single run: it injects the agent's AGENTS.md
  * in memory, resolves the model, sends one user turn, and returns the final
- * assistant text. No streaming, no tools by default, no session persistence. Those
- * are later work packages.
+ * assistant text. It also turns the backend-resolved runnable tools (WP-7) into Pi
+ * customTools that route back through Agenta's /tools/call. No streaming and no
+ * session persistence yet; those are later work packages.
  *
  * Auth: uses `AuthStorage.create()`, which reads ~/.pi/agent/auth.json (the local
  * Pi login). Set OPENAI_API_KEY / ANTHROPIC_API_KEY in the environment as an
@@ -54,6 +55,34 @@ export interface TraceContext {
   captureContent?: boolean;
 }
 
+/**
+ * A runnable tool the backend already resolved from the agent config: name +
+ * description + JSON-Schema params for the model, plus the `callRef` slug the
+ * execution bridge sends back to Agenta's /tools/call. The Composio key and the
+ * connection auth stay server-side; this sandbox never sees them.
+ */
+export interface ResolvedToolSpec {
+  /** Function name shown to the model (e.g. "gmail__SEND_EMAIL"). */
+  name: string;
+  /** Description shown to the model. Resolved live from the provider catalog. */
+  description?: string;
+  /** JSON Schema for the tool arguments. Pi accepts plain JSON Schema here. */
+  inputSchema?: Record<string, unknown> | null;
+  /** "tools.{provider}.{integration}.{action}.{connection}" — the /tools/call slug. */
+  callRef: string;
+}
+
+/**
+ * Where and how to route a tool call back through Agenta. The backend builds the
+ * full /tools/call URL and threads the same credential the OTLP export rides on.
+ */
+export interface ToolCallbackContext {
+  /** Full /tools/call URL. */
+  endpoint: string;
+  /** Authorization header value for the callback (project-scoped). */
+  authorization?: string;
+}
+
 export interface AgentRunRequest {
   /** AGENTS.md text injected as the agent's instructions (in memory). */
   agentsMd?: string;
@@ -65,6 +94,10 @@ export interface AgentRunRequest {
   messages?: ChatMessage[];
   /** Built-in tools to enable. MVP default: none. */
   tools?: string[];
+  /** Resolved runnable tools (WP-7), turned into Pi customTools below. */
+  customTools?: ResolvedToolSpec[];
+  /** Where customTools route their calls back to. Required when customTools is set. */
+  toolCallback?: ToolCallbackContext;
   /** Tracing: thread the Agenta trace context across the boundary. */
   trace?: TraceContext;
 }
@@ -126,6 +159,117 @@ function extractAssistantText(messages: any[]): string {
   return "";
 }
 
+/** Per-tool budget for the /tools/call round-trip. Surfaced as a tool error on timeout. */
+const TOOL_CALL_TIMEOUT_MS = Number(
+  process.env.AGENTA_AGENT_TOOL_CALL_TIMEOUT_MS ?? 30000,
+);
+
+/** Permissive default when a resolved tool has no input schema. */
+const EMPTY_OBJECT_SCHEMA = {
+  type: "object",
+  properties: {},
+  additionalProperties: true,
+};
+
+/**
+ * Turn resolved tool specs into Pi customTools. Each tool's `execute` does one
+ * POST back through Agenta's /tools/call, so Pi runs the loop while the Composio
+ * key and connection auth stay server-side. A failed call throws, which Pi turns
+ * into a tool-error result (the loop continues) rather than a run failure.
+ */
+export function buildCustomTools(
+  specs: ResolvedToolSpec[],
+  callback: ToolCallbackContext | undefined,
+): any[] {
+  if (specs.length === 0) return [];
+  if (!callback?.endpoint) {
+    log(`skipping ${specs.length} custom tool(s): missing toolCallback endpoint`);
+    return [];
+  }
+
+  return specs.map((spec) => ({
+    name: spec.name,
+    label: spec.name,
+    description: spec.description ?? spec.name,
+    // Pi accepts a plain JSON Schema for `parameters` (its validator has a
+    // non-TypeBox path); the schema is resolved live from the provider catalog.
+    parameters: (spec.inputSchema as any) ?? EMPTY_OBJECT_SCHEMA,
+    async execute(toolCallId: string, params: unknown, signal?: AbortSignal) {
+      const text = await callAgentaTool(
+        callback,
+        spec.callRef,
+        toolCallId,
+        params,
+        signal,
+      );
+      return {
+        content: [{ type: "text", text }],
+        details: { callRef: spec.callRef },
+      };
+    },
+  }));
+}
+
+/** One /tools/call round-trip. Returns the result string; throws on failure. */
+async function callAgentaTool(
+  callback: ToolCallbackContext,
+  callRef: string,
+  toolCallId: string,
+  params: unknown,
+  signal?: AbortSignal,
+): Promise<string> {
+  const headers: Record<string, string> = { "content-type": "application/json" };
+  if (callback.authorization) headers["authorization"] = callback.authorization;
+
+  // Combine Pi's abort signal (if any) with a per-tool timeout.
+  const timeoutSignal = AbortSignal.timeout(TOOL_CALL_TIMEOUT_MS);
+  const anyOf = (AbortSignal as any).any;
+  const combined =
+    signal && typeof anyOf === "function"
+      ? anyOf([signal, timeoutSignal])
+      : timeoutSignal;
+
+  let response: Response;
+  try {
+    response = await fetch(callback.endpoint, {
+      method: "POST",
+      headers,
+      body: JSON.stringify({
+        data: {
+          id: toolCallId,
+          type: "function",
+          // Arguments as an object (not a JSON string) to avoid double-encoding.
+          function: { name: callRef, arguments: params ?? {} },
+        },
+      }),
+      signal: combined,
+    });
+  } catch (err) {
+    throw new Error(
+      `tool call ${callRef} failed: ${err instanceof Error ? err.message : String(err)}`,
+    );
+  }
+
+  const bodyText = await response.text();
+  if (!response.ok) {
+    throw new Error(
+      `tool call ${callRef} returned HTTP ${response.status}: ${bodyText.slice(0, 500)}`,
+    );
+  }
+
+  // ToolCallResponse -> { call: { data: { content }, status } }. `content` is the
+  // execution result serialized as a JSON string; hand it to the model verbatim.
+  try {
+    const parsed = JSON.parse(bodyText);
+    const content = parsed?.call?.data?.content;
+    if (typeof content === "string") return content;
+    if (content != null) return JSON.stringify(content);
+    return bodyText;
+  } catch {
+    return bodyText;
+  }
+}
+
 export async function runPi(request: AgentRunRequest): Promise<AgentRunResult> {
   const prompt = resolvePrompt(request);
   if (!prompt) {
@@ -176,12 +320,27 @@ export async function runPi(request: AgentRunRequest): Promise<AgentRunResult> {
     });
     await loader.reload();
 
+    // Build runnable tools from the resolved specs. Pi's allowlist gates custom
+    // tools too, so their names must be in `tools` for the model to see them.
+    const customTools = buildCustomTools(
+      request.customTools ?? [],
+      request.toolCallback,
+    );
+    const toolAllowlist = [
+      ...(request.tools ?? []),
+      ...customTools.map((tool) => tool.name),
+    ];
+    if (customTools.length > 0) {
+      log(`custom tools: ${customTools.map((t) => t.name).join(", ")}`);
+    }
+
     const { session } = await createAgentSession({
       cwd,
       model,
       authStorage,
       modelRegistry,
-      tools: request.tools ?? [],
+      tools: toolAllowlist,
+      customTools,
       sessionManager: SessionManager.inMemory(cwd),
       settingsManager: SettingsManager.inMemory(),
       resourceLoader: loader,
diff --git a/services/entrypoints/agent_main.py b/services/entrypoints/agent_main.py
deleted file mode 100644
index 595e60ad27..0000000000
--- a/services/entrypoints/agent_main.py
+++ /dev/null
@@ -1,47 +0,0 @@
-"""Standalone entrypoint for the agent service (WP-2 local verification).
-
-Mounts only the agent app plus a health check, so the agent ``/invoke`` can be
-exercised with curl without bringing up the full services app. The real integration
-point is ``entrypoints/main.py`` (one import + one mount), kept separate so this
-isolated runner stays light.
-
-Run locally (auth disabled for curl):
-
-    cd services
-    AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED=false \\
-        uv run uvicorn entrypoints.agent_main:app --host 0.0.0.0 --port 8090
-"""
-
-from fastapi import FastAPI
-from fastapi.middleware.cors import CORSMiddleware
-
-import agenta as ag
-from oss.src.agent import agent_v0_app
-
-ag.init()
-
-app = FastAPI(
-    openapi_url=None,
-    docs_url=None,
-    redoc_url=None,
-)
-
-app.add_middleware(
-    CORSMiddleware,
-    # The playground invokes cross-origin (web on a different port) with credentials
-    # (cookies + Authorization). Browsers reject a "*" origin on credentialed requests,
-    # so echo the specific origin and allow credentials. Matches the dev box on any
-    # port and localhost. Same-origin (served under /services) would avoid CORS entirely.
-    allow_origin_regex=r"https?://(144\.76\.237\.122|localhost|0\.0\.0\.0)(:\d+)?",
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-
-@app.get("/health")
-async def health():
-    return {"status": "ok"}
-
-
-app.mount("/agent/v0", agent_v0_app)
diff --git a/services/oss/src/agent.py b/services/oss/src/agent.py
index 1203f1560a..42f9b1832c 100644
--- a/services/oss/src/agent.py
+++ b/services/oss/src/agent.py
@@ -1,4 +1,4 @@
-"""Agent workflow service (WP-2).
+"""Agent workflow service (WP-2 + WP-7).
 
 Mirrors the chat/completion services: an Agenta app exposing ``/invoke`` and
 ``/inspect`` through ``ag.create_app`` + ``ag.workflow`` + ``ag.route``, so the
@@ -6,13 +6,17 @@
 builds the user turn from the request and runs it through the Harness port, whose Pi
 adapter drives the TypeScript wrapper in ``services/agent``.
 
-MVP: hardcoded config (AGENTS.md text, model) read from files, a single
-non-streaming reply, no tools. Streaming, multi-message output, tools, and Daytona
-are later work packages.
+Config is a ``prompt-template`` (system message as AGENTS.md, model, and tools): the
+playground renders the same prompt control as chat/completion, including the tool
+picker. Runnable tools (WP-7) are resolved in the backend (``/tools/resolve``) and
+executed back through ``/tools/call`` while Pi drives the loop. Streaming,
+multi-message output, and the Daytona sandbox are later work packages.
 """
 
 import os
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
+
+import httpx
 
 import agenta as ag
 from agenta.sdk.engines.tracing.propagation import inject
@@ -22,7 +26,7 @@
 from oss.src.agent_pi.local_runtime import LocalRuntime
 from oss.src.agent_pi.pi_harness import PiHarness
 from oss.src.agent_pi.pi_http_harness import PiHttpHarness
-from oss.src.agent_pi.ports import Harness, HarnessRequest, TraceContext
+from oss.src.agent_pi.ports import Harness, HarnessRequest, ToolCallback, TraceContext
 from oss.src.agent_pi.schemas import AGENT_SCHEMAS
 
 log = get_module_logger(__name__)
@@ -33,6 +37,9 @@
     "no",
 )
 
+# Budget for the backend tool-resolution round-trip (catalog + connection check).
+_TOOLS_RESOLVE_TIMEOUT = float(os.getenv("AGENTA_AGENT_TOOLS_TIMEOUT", "30"))
+
 
 def _build_harness() -> Harness:
     """Pick the harness adapter for the current deployment.
@@ -46,6 +53,53 @@ def _build_harness() -> Harness:
     return PiHarness(LocalRuntime(), wrapper_dir=str(wrapper_dir()))
 
 
+def _system_text(messages: Optional[List[Any]]) -> str:
+    """Join the system-message content of a prompt-template into AGENTS.md text."""
+    parts: List[str] = []
+    for message in messages or []:
+        if not isinstance(message, dict) or message.get("role") != "system":
+            continue
+        content = message.get("content")
+        if isinstance(content, str):
+            parts.append(content)
+        elif isinstance(content, list):
+            parts.extend(
+                block.get("text", "")
+                for block in content
+                if isinstance(block, dict) and block.get("type") == "text"
+            )
+    return "\n\n".join(part for part in parts if part)
+
+
+def _resolve_run_config(
+    params: Dict[str, Any],
+    config: Any,
+) -> Tuple[str, str, Any]:
+    """Pull model, instructions, and raw tools from the request parameters.
+
+    Accepts both shapes: the playground's ``prompt`` (a ``prompt-template`` whose
+    system message is the AGENTS.md and whose ``llm_config`` carries model + picker
+    tools) and the flat ``{model, agents_md, tools}`` an API caller may send. Falls
+    back to the service file config for any unset field.
+    """
+    prompt_cfg = params.get("prompt")
+    if isinstance(prompt_cfg, dict):
+        llm_config = prompt_cfg.get("llm_config") or {}
+        model = llm_config.get("model") or config.model
+        agents_md = _system_text(prompt_cfg.get("messages")) or config.agents_md
+        raw_tools = llm_config.get("tools")
+        if raw_tools is None:
+            raw_tools = prompt_cfg.get("tools")
+    else:
+        model = params.get("model") or config.model
+        agents_md = params.get("agents_md") or config.agents_md
+        raw_tools = params.get("tools")
+
+    if raw_tools is None:
+        raw_tools = config.tools
+    return model, agents_md, raw_tools
+
+
 def _latest_user_message(messages: Optional[List[Any]]) -> str:
     for message in reversed(messages or []):
         if not isinstance(message, dict):
@@ -91,6 +145,162 @@ def _trace_context() -> Optional[TraceContext]:
         return None
 
 
+def _agenta_api_base() -> Optional[str]:
+    """Resolve the Agenta backend base URL (``.../api``) for tool calls.
+
+    Prefers the ``AGENTA_AGENT_TOOLS_API_URL`` override, then derives the base from
+    the SDK's OTLP endpoint (``{host}/api/otlp/v1/traces``), then falls back to
+    ``AGENTA_API_URL``. Returns ``None`` when none is set; only needed with tools.
+    """
+    override = os.getenv("AGENTA_AGENT_TOOLS_API_URL")
+    if override:
+        return override.rstrip("/")
+
+    try:
+        otlp_url = ag.tracing.otlp_url
+    except Exception:  # pylint: disable=broad-except
+        otlp_url = None
+    if otlp_url and "/otlp/" in otlp_url:
+        return otlp_url.split("/otlp/", 1)[0].rstrip("/")
+
+    api_url = os.getenv("AGENTA_API_URL")
+    if api_url:
+        return api_url.rstrip("/")
+
+    return None
+
+
+def _request_authorization() -> Optional[str]:
+    """The project-scoped credential to call ``/tools/resolve`` and ``/tools/call``.
+
+    Reuses the same propagation the OTLP credential rides on (the caller's
+    Authorization), falling back to the service's own API key the way the tracing
+    sidecar does. Scoping to the caller keeps an agent run from invoking tools the
+    user could not (see WP-7 risk: RUN_TOOLS scoping).
+    """
+    try:
+        authorization = inject({}).get("Authorization")
+    except Exception:  # pylint: disable=broad-except
+        authorization = None
+    if authorization:
+        return authorization
+
+    api_key = os.getenv("AGENTA_API_KEY")
+    if api_key:
+        return f"ApiKey {api_key}"
+
+    return None
+
+
+def _parse_gateway_slug(slug: Any) -> Optional[Dict[str, Any]]:
+    """Parse a gateway tool slug into a Composio reference, or ``None``.
+
+    The playground tool picker encodes a Composio action as a function name like
+    ``tools__composio__github__GET_THE_AUTHENTICATED_USER__github-tvn`` (the same
+    5-segment slug ``/tools/call`` parses; ``__`` or ``.`` separated). Anything that
+    is not a 5-segment ``tools.composio.*`` slug returns ``None`` so the caller can
+    skip it.
+    """
+    if not isinstance(slug, str):
+        return None
+    parts = slug.replace("__", ".").split(".")
+    if len(parts) == 5 and parts[0] == "tools" and parts[1] == "composio":
+        return {
+            "type": "composio",
+            "integration": parts[2],
+            "action": parts[3],
+            "connection": parts[4],
+        }
+    return None
+
+
+def _normalize_tool_ref(ref: Any) -> Optional[Dict[str, Any]]:
+    """Coerce a config entry into a discriminated tool reference the resolver parses.
+
+    Handles three shapes: a bare string (or a type-less ``{"name": ...}`` dict) is
+    a built-in tool name; a dict whose ``type`` is ``builtin`` or ``composio`` passes
+    through; and the playground picker's gateway entry (``{"function": {"name":
+    "tools__composio__..."}}``) is parsed into a ``composio`` ref. Unsupported picker
+    entries (provider built-ins, inline custom functions) return ``None`` and are
+    skipped rather than failing the run.
+    """
+    if isinstance(ref, str):
+        return {"type": "builtin", "name": ref}
+    if isinstance(ref, dict):
+        if ref.get("type") in ("builtin", "composio"):
+            return ref
+        function = ref.get("function") if isinstance(ref.get("function"), dict) else {}
+        gateway = _parse_gateway_slug(function.get("name") or ref.get("name"))
+        if gateway:
+            return gateway
+        if "type" not in ref and isinstance(ref.get("name"), str):
+            return {"type": "builtin", "name": ref["name"]}
+        return None
+    return None
+
+
+async def _resolve_tools(
+    tools: List[Any],
+) -> Tuple[List[str], List[Dict[str, Any]], Optional[ToolCallback]]:
+    """Resolve config tool references into builtins + Pi customTool specs.
+
+    Calls the backend resolver (``POST /tools/resolve``), which validates Composio
+    connections up front and enriches each action from the catalog. Returns the
+    built-in tool names, the camelCase customTool specs for the wire, and the
+    ``/tools/call`` callback. Raises on resolution failure so the invoke fails early
+    with a clear message rather than the model hitting a runtime tool error.
+    """
+    refs = [ref for ref in (_normalize_tool_ref(t) for t in tools if t) if ref]
+    if not refs:
+        return [], [], None
+
+    api_base = _agenta_api_base()
+    if not api_base:
+        raise RuntimeError(
+            "Agent has tools configured but the Agenta API base URL is unknown. "
+            "Set AGENTA_AGENT_TOOLS_API_URL or AGENTA_API_URL."
+        )
+
+    authorization = _request_authorization()
+    headers = {"Content-Type": "application/json"}
+    if authorization:
+        headers["Authorization"] = authorization
+
+    async with httpx.AsyncClient(timeout=_TOOLS_RESOLVE_TIMEOUT) as client:
+        response = await client.post(
+            f"{api_base}/tools/resolve",
+            json={"tools": refs},
+            headers=headers,
+        )
+
+    if response.status_code >= 400:
+        raise RuntimeError(
+            f"Tool resolution failed (HTTP {response.status_code}): "
+            f"{response.text[:500]}"
+        )
+
+    data = response.json()
+    builtins = data.get("builtins") or []
+    custom = data.get("custom") or []
+
+    custom_tools = [
+        {
+            "name": spec["name"],
+            "description": spec.get("description"),
+            "inputSchema": spec.get("input_schema"),
+            "callRef": spec["call_ref"],
+        }
+        for spec in custom
+    ]
+
+    callback = ToolCallback(
+        endpoint=f"{api_base}/tools/call",
+        authorization=authorization,
+    )
+
+    return builtins, custom_tools, callback
+
+
 async def _agent(
     inputs: Optional[Dict[str, Any]] = None,
     messages: Optional[List[Any]] = None,
@@ -98,15 +308,22 @@ async def _agent(
 ):
     config = load_config()
 
-    # Config (model + AGENTS.md instructions) comes from parameters when the
-    # playground/caller sets it, falling back to the service's file config.
+    # Config comes from parameters when the playground/caller sets it, falling back
+    # to the service file config. Accepts both the playground prompt-template shape
+    # and a flat {model, agents_md, tools} (see _resolve_run_config).
     params = parameters or {}
-    model = params.get("model") or config.model
-    agents_md = params.get("agents_md") or config.agents_md
+    model, agents_md, tools_config = _resolve_run_config(params, config)
+
+    if isinstance(tools_config, dict):
+        tools_config = [tools_config]
+    elif not isinstance(tools_config, list):
+        tools_config = []
 
     msgs = messages or (inputs or {}).get("messages") or []
     prompt = _latest_user_message(msgs)
 
+    builtins, custom_tools, tool_callback = await _resolve_tools(tools_config)
+
     harness = _build_harness()
 
     await harness.setup()
@@ -117,7 +334,9 @@ async def _agent(
                 model=model,
                 prompt=prompt,
                 messages=msgs,
-                tools=config.tools,
+                tools=builtins,
+                custom_tools=custom_tools,
+                tool_callback=tool_callback,
                 trace=_trace_context(),
             )
         )
diff --git a/services/oss/src/agent_pi/config.py b/services/oss/src/agent_pi/config.py
index b630a3063e..8c2f5bf660 100644
--- a/services/oss/src/agent_pi/config.py
+++ b/services/oss/src/agent_pi/config.py
@@ -9,7 +9,7 @@
 import os
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import List, Optional
+from typing import Any, List, Optional
 
 # services/oss/src/agent_pi/config.py -> parents[3] == services/
 _SERVICES_DIR = Path(__file__).resolve().parents[3]
@@ -30,7 +30,11 @@
 class AgentConfig:
     agents_md: str
     model: Optional[str] = None
-    tools: List[str] = field(default_factory=list)
+    # Provider-agnostic tool references (WP-7). Each entry is either a plain string
+    # (a Pi built-in name, normalized to a ``builtin`` ref downstream) or a
+    # discriminated dict (``{"type": "composio", ...}``). Resolution happens in the
+    # backend at invoke time; the service just forwards the list.
+    tools: List[Any] = field(default_factory=list)
 
 
 def wrapper_dir() -> Path:
diff --git a/services/oss/src/agent_pi/pi_harness.py b/services/oss/src/agent_pi/pi_harness.py
index f4c5fc3e5c..266e9cb9a0 100644
--- a/services/oss/src/agent_pi/pi_harness.py
+++ b/services/oss/src/agent_pi/pi_harness.py
@@ -48,6 +48,10 @@ async def invoke(self, request: HarnessRequest) -> HarnessResult:
                 "prompt": request.prompt,
                 "messages": request.messages,
                 "tools": request.tools,
+                "customTools": request.custom_tools,
+                "toolCallback": request.tool_callback.to_wire()
+                if request.tool_callback
+                else None,
                 "trace": request.trace.to_wire() if request.trace else None,
             }
         ).encode("utf-8")
diff --git a/services/oss/src/agent_pi/pi_http_harness.py b/services/oss/src/agent_pi/pi_http_harness.py
index 1e4b8a0d2e..0435319011 100644
--- a/services/oss/src/agent_pi/pi_http_harness.py
+++ b/services/oss/src/agent_pi/pi_http_harness.py
@@ -42,6 +42,10 @@ async def invoke(self, request: HarnessRequest) -> HarnessResult:
             "prompt": request.prompt,
             "messages": request.messages,
             "tools": request.tools,
+            "customTools": request.custom_tools,
+            "toolCallback": request.tool_callback.to_wire()
+            if request.tool_callback
+            else None,
             "trace": request.trace.to_wire() if request.trace else None,
         }
 
diff --git a/services/oss/src/agent_pi/ports.py b/services/oss/src/agent_pi/ports.py
index f556de8cf7..4b436db6c6 100644
--- a/services/oss/src/agent_pi/ports.py
+++ b/services/oss/src/agent_pi/ports.py
@@ -84,6 +84,28 @@ def to_wire(self) -> Dict[str, Any]:
         }
 
 
+@dataclass
+class ToolCallback:
+    """How the harness routes a tool call back through Agenta's ``/tools/call``.
+
+    The backend resolves runnable tool references into specs and hands the harness
+    this callback. The TS wrapper turns each spec into a Pi ``customTool`` whose
+    ``execute`` POSTs the OpenAI-style envelope to ``endpoint`` with
+    ``authorization``. The provider key and connection auth never enter the sandbox;
+    they stay behind ``/tools/call``. Same mechanism that threads the OTLP credential.
+    """
+
+    endpoint: str  # full ``/tools/call`` URL
+    authorization: Optional[str] = None  # full Authorization header value
+
+    def to_wire(self) -> Dict[str, Any]:
+        """Serialize to the camelCase shape the TS wrapper expects on the wire."""
+        return {
+            "endpoint": self.endpoint,
+            "authorization": self.authorization,
+        }
+
+
 @dataclass
 class HarnessRequest:
     """One agent run: instructions, model, the user turn, and optional history."""
@@ -93,6 +115,10 @@ class HarnessRequest:
     prompt: Optional[str] = None
     messages: List[Any] = field(default_factory=list)
     tools: List[str] = field(default_factory=list)
+    # Resolved runnable tool specs, already in the camelCase wire shape the TS
+    # wrapper turns into Pi customTools: {name, description, inputSchema, callRef}.
+    custom_tools: List[Dict[str, Any]] = field(default_factory=list)
+    tool_callback: Optional[ToolCallback] = None
     trace: Optional[TraceContext] = None
 
 
diff --git a/services/oss/src/agent_pi/schemas.py b/services/oss/src/agent_pi/schemas.py
index 93a22c6532..cef2440679 100644
--- a/services/oss/src/agent_pi/schemas.py
+++ b/services/oss/src/agent_pi/schemas.py
@@ -34,24 +34,30 @@
     },
 }
 
-# Parameters: the agent config the playground renders as editable fields. Exposes
-# the two values that actually drive a run: the model and the AGENTS.md instructions.
-# `x-parameters.multiline` is the hint the playground honors to render a textarea.
+# Parameters: the agent config the playground renders. We reuse the existing
+# `prompt-template` control (model selector + tool picker + message editor) instead
+# of a bespoke agent form: the `x-ag-type-ref: prompt-template` marker makes the
+# playground render the same prompt UI chat/completion use, so the tool picker comes
+# for free. The agent reads the system message as its AGENTS.md, `llm_config.model`
+# as the model, and `llm_config.tools` (the picker output) as its runnable tools.
 AGENT_PARAMETERS_SCHEMA = {
     "$schema": _SCHEMA,
     "type": "object",
     "additionalProperties": True,
     "properties": {
-        "model": {
-            "type": "string",
-            "default": _DEFAULT_MODEL,
-            "description": "Model the agent runs on.",
-        },
-        "agents_md": {
-            "type": "string",
-            "default": _DEFAULT_AGENTS_MD,
-            "description": "The agent's instructions (AGENTS.md).",
-            "x-parameters": {"multiline": True},
+        "prompt": {
+            "x-ag-type-ref": "prompt-template",
+            "type": "object",
+            "description": (
+                "The agent's instructions (system message), model, and tools. Tools "
+                "are picked from connected providers (e.g. Composio) and run "
+                "server-side via /tools/call."
+            ),
+            "default": {
+                "messages": [{"role": "system", "content": _DEFAULT_AGENTS_MD}],
+                "template_format": "mustache",
+                "llm_config": {"model": _DEFAULT_MODEL, "tools": []},
+            },
         },
     },
 }

From 9c3d14118483aa6bd5ca72b959671de0cda8ee8b Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Wed, 17 Jun 2026 11:59:05 +0200
Subject: [PATCH 03/10] feat(agent): drive harnesses over ACP via rivet
 sandbox-agent (WP-8)

Re-platform the agent workflow service to drive coding harnesses (Pi, Claude
Code) over the Agent Client Protocol through a rivet sandbox-agent daemon,
behind the unchanged Harness port and /invoke contract. The harness (pi/claude)
and sandbox (local/daytona) are editable playground config; tracing nests under
the /invoke span; tools are delivered Pi-native via a bundled extension; and the
model provider key resolves from the project vault.
---
 .gitignore                                    |    2 +
 docs/design/agent-workflows/README.md         |    3 +
 .../wp-8-rivet-acp-runtime/README.md          |   80 +
 .../wp-8-rivet-acp-runtime/architecture.md    |  176 ++
 .../wp-8-rivet-acp-runtime/context.md         |   89 +
 .../isolation-and-fork.md                     |   76 +
 .../wp-8-rivet-acp-runtime/plan.md            |  110 +
 .../poc/build_rivet_snapshot.py               |   75 +
 .../poc/commit_agent_config.py                |   75 +
 .../poc/debug-events.ts                       |   29 +
 .../wp-8-rivet-acp-runtime/poc/dump-full.ts   |   30 +
 .../wp-8-rivet-acp-runtime/poc/package.json   |   14 +
 .../wp-8-rivet-acp-runtime/poc/spike.ts       |  103 +
 .../wp-8-rivet-acp-runtime/research.md        |  147 ++
 .../wp-8-rivet-acp-runtime/status.md          |  160 ++
 .../docker-compose/ee/docker-compose.dev.yml  |   25 +-
 services/agent/docker/Dockerfile.dev          |   13 +
 services/agent/package.json                   |   21 +-
 services/agent/pnpm-lock.yaml                 | 1890 ++++++++++++++++-
 services/agent/scripts/build-extension.mjs    |   30 +
 services/agent/src/agenta-otel.ts             |  302 +++
 services/agent/src/cli.ts                     |   10 +-
 services/agent/src/piExtension.ts             |  171 ++
 services/agent/src/runPi.ts                   |   11 +
 services/agent/src/runRivet.ts                |  698 ++++++
 services/agent/src/server.ts                  |   31 +-
 services/agent/src/toolBridge.ts              |   76 +
 services/agent/src/toolBridgeServer.ts        |  170 ++
 services/oss/src/agent.py                     |  123 +-
 services/oss/src/agent_pi/ports.py            |   10 +
 services/oss/src/agent_pi/rivet_harness.py    |  143 ++
 services/oss/src/agent_pi/schemas.py          |   17 +
 32 files changed, 4893 insertions(+), 17 deletions(-)
 create mode 100644 docs/design/agent-workflows/wp-8-rivet-acp-runtime/README.md
 create mode 100644 docs/design/agent-workflows/wp-8-rivet-acp-runtime/architecture.md
 create mode 100644 docs/design/agent-workflows/wp-8-rivet-acp-runtime/context.md
 create mode 100644 docs/design/agent-workflows/wp-8-rivet-acp-runtime/isolation-and-fork.md
 create mode 100644 docs/design/agent-workflows/wp-8-rivet-acp-runtime/plan.md
 create mode 100644 docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/build_rivet_snapshot.py
 create mode 100644 docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/commit_agent_config.py
 create mode 100644 docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/debug-events.ts
 create mode 100644 docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/dump-full.ts
 create mode 100644 docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/package.json
 create mode 100644 docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/spike.ts
 create mode 100644 docs/design/agent-workflows/wp-8-rivet-acp-runtime/research.md
 create mode 100644 docs/design/agent-workflows/wp-8-rivet-acp-runtime/status.md
 create mode 100644 services/agent/scripts/build-extension.mjs
 create mode 100644 services/agent/src/piExtension.ts
 create mode 100644 services/agent/src/runRivet.ts
 create mode 100644 services/agent/src/toolBridge.ts
 create mode 100644 services/agent/src/toolBridgeServer.ts
 create mode 100644 services/oss/src/agent_pi/rivet_harness.py

diff --git a/.gitignore b/.gitignore
index 48e7b32614..6c91758e28 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,8 @@
 **/*dont_commit_me*
 web/packages/agenta-api-client/dist/
 web/tsconfig.tsbuildinfo
+# Agent Pi extension bundle, built by `pnpm run build:extension` and in the Docker image.
+services/agent/dist/
 
 __pycache__/
 **/__pycache__/
diff --git a/docs/design/agent-workflows/README.md b/docs/design/agent-workflows/README.md
index 7d5784dfc2..d8ae4537ce 100644
--- a/docs/design/agent-workflows/README.md
+++ b/docs/design/agent-workflows/README.md
@@ -116,6 +116,9 @@ running agent.
 - [`wp-7-tools/`](wp-7-tools/README.md) — make runnable tools part of the agent config; resolve
   Composio actions into Pi tools and route tool calls back through the existing
   `POST /tools/call`, with MCP and workflow-as-tool as future adapters.
+- [`wp-8-rivet-acp-runtime/`](wp-8-rivet-acp-runtime/README.md) — re-platform the service onto
+  `rivet-dev/sandbox-agent` so the agent is driven over ACP and the harness (Pi, Claude Code,
+  Codex) becomes a config value, running locally first; tools, Daytona, and the folder jail deferred.
 
 ## Related work
 
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/README.md b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/README.md
new file mode 100644
index 0000000000..716a97d60e
--- /dev/null
+++ b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/README.md
@@ -0,0 +1,80 @@
+# WP-8: Rivet + ACP agent runtime
+
+Status: design ready to implement. Start at [`plan.md`](plan.md). Decisions and open
+items are in [`status.md`](status.md).
+
+This folder is self-contained. A new engineer should be able to read it and implement the
+work end to end without prior context. Read in this order: this README, then
+[`context.md`](context.md) (the code that exists today), [`research.md`](research.md)
+(verified facts about rivet, ACP, and the pattern we copy), [`architecture.md`](architecture.md)
+(the target design), and [`plan.md`](plan.md) (the phased build).
+
+## Summary
+
+Re-platform the agent workflow service (`services/oss/src/agent.py`) so it drives the
+agent over the **Agent Client Protocol (ACP)** through [`rivet-dev/sandbox-agent`](https://github.com/rivet-dev/sandbox-agent),
+instead of the bespoke Pi JSON protocol it uses today.
+
+The `/invoke` contract does not change. The handler still builds a user turn and returns
+`{"role": "assistant", "content": ...}`. What changes is the transport behind the existing
+`Harness` port: rivet runs the chosen harness (Pi, Claude Code) as an ACP session and
+streams the reply back. Picking a different harness becomes a config value, not new code.
+
+## The four requirements
+
+1. **Drive the agent over ACP**, not the Pi JSON protocol. Rivet speaks ACP to the
+   harness; our service drives rivet.
+2. **Swap harness as config.** The same agent config runs on Pi or Claude Code by setting
+   one value.
+3. **Run locally.** The same path runs on a dev machine with no container, using rivet's
+   `local` provider. The rivet server is open source, so running it locally is normal.
+4. **Defer tools.** Ship with no tools. The tool model is fixed (definition plus swappable
+   body, delivered per-harness over MCP), but nothing is built here.
+
+## The design in five lines
+
+- Keep `agent.py`, the `/invoke` contract, and the `Harness` port unchanged.
+- Add a `RivetHarness` adapter behind the port, plus a small TypeScript runner that wraps
+  the rivet SDK.
+- Run **one rivet daemon and one sandbox per invoke** (cold), then tear it down. This
+  copies the pattern Agenta already ships for code evaluators.
+- Inject the trace context as an environment variable **at the daemon's birth** (the
+  sandbox `env_vars` on Daytona, the SDK `env` option locally). No fork of rivet or the
+  adapters is needed under this per-invoke model.
+- Two axes swap independently: **sandbox** (local, daytona) and **harness** (pi, claude).
+
+## Agent configuration (the contract to rivet: filesystem plus config)
+
+- **AGENTS.md** — instructions, after variable substitution.
+- **Input variables** — substituted into AGENTS.md, like prompt-template variables.
+- **Skills** — laid into the workspace as files (path and format are per-harness).
+- **Tool definitions** — schema only, separate from bodies. Empty here.
+- **Harness** — `pi` / `claude`.
+- **Sandbox** — `local` / `daytona`.
+- **Secrets** — harness and LLM auth, passed as launch env, never written into the
+  agent-visible filesystem.
+
+## In scope
+
+ACP transport via rivet, harness swap (Pi and Claude Code), local run, and **tracing**
+(the agent's spans must nest under the `/invoke` span; standalone traces are not
+acceptable). Daytona and concurrency are described as the immediate follow-on phases.
+
+## Deferred (each its own follow-on)
+
+- **Tools** ([WP-7](../wp-7-tools/README.md)): the definition-plus-body model over MCP.
+- **Folder isolation (the jail)**: rivet has no filesystem confinement. Needed only when a
+  single warm daemon hosts many agents at once. A TypeScript-or-Rust change, deferred. See
+  [`isolation-and-fork.md`](isolation-and-fork.md).
+- **Multi-turn and streaming to the client** ([WP-4](../wp-4-multi-message-output/README.md)):
+  one turn in, one message out, matching today's behavior. A session is a persisted
+  message history, replayed via ACP `session/load`.
+- **Standalone SDK runner**: run an agent from the SDK with a config. The adapters are
+  written to live in the SDK so this is a packaging step later, not a rewrite.
+
+## Why rivet
+
+Rivet is the thing we were about to hand-build in the `Harness` and `Runtime` ports: an
+ACP daemon that drives several harnesses, keyed by session, over a swappable sandbox
+(local, daytona) with an HTTP and SSE control plane. We adopt it unmodified (Apache-2.0).
+The one capability it lacks, filesystem confinement, we are deferring.
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/architecture.md b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/architecture.md
new file mode 100644
index 0000000000..a9e71321f3
--- /dev/null
+++ b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/architecture.md
@@ -0,0 +1,176 @@
+# Architecture
+
+## Principle
+
+Keep the `Harness` port and the `/invoke` contract. Add one adapter behind the port that
+runs the agent through rivet over ACP, and a small TypeScript runner that wraps the rivet
+SDK. Everything Pi-specific moves below the port and becomes one harness choice.
+
+```
+                 unchanged
+  ┌───────────────────────────────────────────────┐
+  │ agent.py  (/invoke, /inspect, ag.create_app)   │
+  │   _resolve_run_config / _latest_user_message   │
+  │   _build_harness()  ── selects adapter by env  │
+  └───────────────────────────────────────────────┘
+                      │  Harness port (setup / invoke / shutdown)
+                      ▼
+  ┌───────────────────────────────────────────────┐
+  │ RivetHarness (new, Python)                     │   PiHarness / PiHttpHarness
+  │  maps HarnessRequest + {harness, sandbox} →    │   (kept; legacy path)
+  │  a one-shot rivet run; passes trace + secrets  │
+  └───────────────────────────────────────────────┘
+                      │  /run (HTTP or stdio), same contract family as runPi
+                      ▼
+  ┌───────────────────────────────────────────────┐
+  │ runRivet.ts  (services/agent, wraps rivet SDK) │
+  │  start({ sandbox, env }) → createSession({     │
+  │  agent, cwd }) → write AGENTS.md → prompt →     │
+  │  collect chunks → destroy                       │
+  └───────────────────────────────────────────────┘
+                      │  spawns the daemon (local subprocess, or in Daytona)
+                      ▼
+  ┌───────────────────────────────────────────────┐
+  │ sandbox-agent daemon (Rust, one per invoke)    │
+  └───────────────────────────────────────────────┘
+                      │  ACP (JSON-RPC: session/prompt, session/update)
+                      ▼
+  ┌───────────────────────────────────────────────┐
+  │ harness ACP adapter subprocess in cwd          │
+  │  pi-acp │ claude-code-acp                       │
+  └───────────────────────────────────────────────┘
+```
+
+The ACP boundary sits between the daemon and the harness. That is the requirement: the
+agent loop runs over ACP, not over the Pi JSON envelope. The service-to-rivet hop is
+rivet's own control surface and stays harness-agnostic behind the port.
+
+## Two orthogonal swap axes
+
+These swap independently. Do not bundle them.
+
+- **Sandbox (where the daemon runs):** `local`, `daytona`. A config value passed to
+  `runRivet`, which selects the rivet provider.
+- **Harness (which engine):** `pi`, `claude`. A config value passed as the rivet `agent`.
+
+The demo proves each separately: swap `local` and `daytona` with the harness fixed, and
+swap `pi` and `claude` with the sandbox fixed.
+
+## Lifecycle: one daemon and one sandbox per invoke (cold)
+
+Each `/invoke` brings up its own daemon and sandbox, runs, and tears down. This copies the
+shipped code-evaluator pattern (`DaytonaRunner`: an ephemeral sandbox per execution from a
+snapshot, deleted in a `finally`). Two reasons it is the right default:
+
+- It makes the daemon's environment **per-invoke**, which is what makes tracing work
+  without forking anything (see below).
+- It needs no filesystem jail, because agents never share a daemon.
+
+The cost is acceptable. Locally the daemon is a Rust binary that boots in tens of
+milliseconds, so the per-invoke cost is dominated by the Node adapter spawn (~0.2 to 0.5s).
+On Daytona, creating the sandbox adds ~1s. Concurrency is bounded the way evaluations
+already bound it (see Concurrency).
+
+## Tracing: inject at the daemon's birth
+
+The agent's spans must nest under the `/invoke` span. Standalone traces are not
+acceptable. The mechanism is uniform across sandboxes because each invoke owns its daemon:
+
+- The static OTLP target and auth (`OTEL_*`, the Agenta endpoint and `Authorization`) and
+  the per-invoke `traceparent` go into the daemon's environment when it is created.
+  - **Local:** the SDK `env` option on `start({ sandbox: local(), env })`.
+  - **Daytona:** the sandbox `env_vars`, exactly like `DaytonaRunner` injects `AGENTA_*`.
+- The daemon passes its env to the adapter subprocess, which passes it to the harness.
+- **Pi:** install the `agenta-otel` logic as a Pi extension in the environment (global
+  `~/.pi/agent/extensions`, or baked into the Daytona snapshot). Pi loads it and emits
+  spans under the injected `traceparent`.
+- **Claude Code:** set `CLAUDE_CODE_ENABLE_TELEMETRY=1`, `OTEL_*`, and `TRACEPARENT`, and
+  run it in `-p` / Agent-SDK mode.
+
+No fork of rivet or the adapters is needed under the per-invoke model. A fork (the
+TypeScript adapter reading ACP `_meta.traceparent`, not Rust) is only needed if a later
+phase shares one warm daemon across concurrent invokes.
+
+## Components
+
+### `RivetHarness` (Python, new)
+
+`services/oss/src/agent_pi/rivet_harness.py`, implements the `Harness` ABC. It holds the
+harness id and sandbox choice (from config) and the trace/secret context, and maps a
+`HarnessRequest` onto a `runRivet` `/run` call. Field mapping:
+
+| `HarnessRequest` | Becomes |
+| --- | --- |
+| `agents_md` | written as `AGENTS.md` into the session `cwd` |
+| `model` | session model where the harness honors it (the adapter normalizes this) |
+| `prompt` | the ACP prompt text |
+| `messages` | MVP uses the latest user turn; history replay is later |
+| `tools` etc. | unused (empty) in WP-8 |
+| `trace` | injected as daemon env (`traceparent`, OTLP endpoint, auth) |
+
+### `runRivet.ts` (TypeScript, in `services/agent`)
+
+Wraps the rivet SDK. Selected by env (`AGENT_BACKEND=rivet`) and serves the same `/run`
+contract `runPi.ts` serves, so the Python side stays thin. Per invoke:
+
+1. `start({ sandbox: local() | daytona({...}), env })` (env carries trace + secrets).
+2. `createSession({ agent: <harness>, cwd })`.
+3. Write `AGENTS.md` (and later skills) into `cwd`.
+4. `prompt(sessionId, prompt)`, accumulate `agent_message_chunk` into the output.
+5. `destroy()`.
+6. Return `{ ok, output, sessionId, model }`.
+
+### `agent.py` selection
+
+Extend `_build_harness()` with `AGENTA_AGENT_RUNTIME=rivet` to return `RivetHarness`
+(harness from `AGENTA_AGENT_HARNESS`, sandbox from config, default `local`). Keep the Pi
+path as default so nothing regresses.
+
+## Agent configuration (the contract: filesystem plus config)
+
+Resolved before each run: AGENTS.md, input variables (substituted into AGENTS.md), skills
+(files in the workspace), tool definitions (empty here), harness, sandbox, secrets. The
+contract handed to rivet is files in `cwd` plus the session/daemon config. Secrets go as
+launch env, never as files, because there is no jail.
+
+## Tools: definition vs body (deferred, but shapes the seam)
+
+A tool splits into a **definition** (the schema the model sees, stored in a neutral
+OpenAI-function shape) and a **body** (the execution). The body is swappable: real,
+service-backed, or mock. A test variant of an agent swaps bodies without touching
+definitions. Delivery is per-harness over **MCP** (rivet's per-directory MCP config), not a
+raw OpenAI array. The body model is general and not Agenta-specific: a self-contained body
+runs in-process, a service-backed body (for example a Composio tool calling Agenta's
+`/tools/call`) needs its service reachable (a local or remote Agenta), and a mock needs
+nothing. WP-8 ships no tools; this is the shape to preserve, not build.
+
+## Sessions and state
+
+A session is the **stored message history**, not a kept-alive sandbox. Because we offer no
+persistent file writes, nothing on disk is worth keeping. So each turn gets an ephemeral
+sandbox, messages are persisted, and a session continues by replaying history with ACP
+`session/load` (Pi `resumeSession`, Claude Code `loadSession`). Nothing is kept running at
+rest, so idle sessions cost nothing. The history store is the backend DB on the platform and
+a local file when running standalone. Tradeoff: replaying a long history re-sends its
+tokens, so cap it. Paused or FS-persisted sessions wait until we offer durable writes.
+
+## Concurrency
+
+Mirror evaluations. Do not run the agent inside the API request if a background path is
+available; dispatch it like an evaluation (taskiq worker on a Redis stream) and bound
+concurrency with a shared semaphore. Each concurrent slot is one ephemeral sandbox, so the
+semaphore caps how many sandboxes (and how much Daytona cost) run at once. Extra invokes
+queue. Locally a slot is a cheap subprocess.
+
+## Running standalone via the SDK (later)
+
+The harness and sandbox adapters are written to live in the SDK, so the backend service
+and a standalone run share one implementation. Running locally is not special: the rivet
+server is open source (Apache-2.0, a static binary), so a local run runs that server
+locally and the SDK wraps the rivet client. A standalone run fetches or loads a config,
+then calls the SDK runner.
+
+## What this does not change
+
+No new endpoints. No change to `/invoke` or `/inspect` shapes. No tools, no jail, no
+multi-turn, no client-side streaming. Each is its own follow-on.
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/context.md b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/context.md
new file mode 100644
index 0000000000..fe7d1ecac0
--- /dev/null
+++ b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/context.md
@@ -0,0 +1,89 @@
+# Context: the code that exists today
+
+Read this to orient yourself on the current service before changing it. All paths are
+relative to the root of this repo (the `agenta` checkout).
+
+## The agent service (WP-2)
+
+`services/oss/src/agent.py` is an Agenta app exposing `/invoke` and `/inspect`, like the
+chat and completion services. The handler `_agent(...)`:
+
+1. Resolves config with `_resolve_run_config(...)`: model, AGENTS.md (the system text),
+   and tools, from the request `parameters` or the file config.
+2. Builds the latest user turn with `_latest_user_message(...)`.
+3. Picks a harness adapter with `_build_harness()` and calls the `Harness` port
+   (`setup` / `invoke` / `shutdown`).
+4. Returns `{"role": "assistant", "content": result.output}`.
+
+Trace context is captured in `_trace_context()` and threaded into the harness so the
+agent's spans nest under the `/invoke` span.
+
+## The ports (the seam we keep)
+
+`services/oss/src/agent_pi/ports.py`:
+
+- `Harness` (ABC): `setup()`, `invoke(HarnessRequest) -> HarnessResult`, `shutdown()`.
+- `HarnessRequest`: `agents_md`, `model`, `prompt`, `messages`, `tools`, `custom_tools`,
+  `tool_callback`, `trace`.
+- `HarnessResult`: `output`, `session_id`, `model`.
+- `TraceContext`: `traceparent`, `baggage`, `endpoint` (OTLP), `authorization`,
+  `capture_content`. Has `to_wire()` (camelCase).
+- `Runtime` (ABC): the sandbox/environment seam for the legacy Pi path (`start`,
+  `shutdown`, `exec`). The rivet path does not use `Runtime.exec`; it selects a rivet
+  provider instead (see architecture).
+
+## The current Pi adapters (legacy, keep working)
+
+- `services/oss/src/agent_pi/pi_harness.py` (`PiHarness`): spawns the TypeScript Pi
+  wrapper as a subprocess, one JSON object over stdio.
+- `services/oss/src/agent_pi/pi_http_harness.py` (`PiHttpHarness`): POSTs the same JSON to
+  the wrapper running as an HTTP sidecar.
+- Both send a Pi-shaped envelope (`{agentsMd, model, prompt, messages, tools, customTools,
+  toolCallback, trace}`).
+
+## The TypeScript wrapper
+
+`services/agent/` is a small Node service.
+
+- `src/runPi.ts`: turns the envelope into direct Pi SDK calls (`createAgentSession`, ...).
+- `src/agenta-otel.ts`: a Pi OTel helper. Today `runPi.ts` imports it in-process and emits
+  `invoke_agent` as a child of the incoming `traceparent`. Under rivet this logic must
+  become a Pi **extension** installed in the environment (see architecture, tracing).
+- `src/server.ts` (HTTP `/run`) and `src/cli.ts` (stdio) are the two transports.
+
+## The pattern we copy: how code evaluators run in Daytona
+
+This is the shipped precedent for "ephemeral sandbox per execution", and the agent service
+mirrors it.
+
+- `sdks/python/agenta/sdk/engines/running/runners/` holds `base.py` (`CodeRunner`),
+  `local.py` (`LocalRunner`, in-process `exec`), `daytona.py` (`DaytonaRunner`, remote
+  sandbox), and `registry.py` (`get_runner()`).
+- Selection: env `AGENTA_SERVICES_CODE_SANDBOX_RUNNER` (`local` default, `daytona` in
+  cloud).
+- `DaytonaRunner.run()` creates an `ephemeral=True` sandbox from a snapshot
+  (`DAYTONA_SNAPSHOT`), runs, and deletes it in a `finally`. **One sandbox per execution.**
+  No warm pool, no shared instance. It injects `AGENTA_HOST`, `AGENTA_API_KEY`, and the
+  user's provider keys as the sandbox `env_vars`.
+- Concurrency is bounded by the evaluation engine, not the runner: a shared
+  `asyncio.Semaphore(batch_size)` (default 10) in
+  `sdks/python/agenta/sdk/evaluations/runtime/processor.py`. So at most ~10 ephemeral
+  sandboxes exist at once.
+- Daytona config lives in `api/oss/src/utils/env.py` (`DaytonaConfig`:
+  `DAYTONA_API_KEY`, `DAYTONA_API_URL`, `DAYTONA_SNAPSHOT`, `DAYTONA_TARGET`).
+
+## What we change and what we keep
+
+Change: the transport behind the `Harness` port becomes rivet over ACP, with harness and
+sandbox as config values.
+
+Keep: the `/invoke` and `/inspect` contract, the `Harness` port and its dataclasses, the
+config resolution in `agent.py`, and the env-driven adapter selection in
+`_build_harness()` (extended with a rivet branch). The legacy Pi adapters keep working so
+nothing regresses.
+
+## Conventions
+
+- Standalone scripts run with `uv run` and inline `# /// script` dependencies.
+- Python edits: `ruff format` then `ruff check --fix` before committing.
+- Local-server parity is a first-class requirement carried from WP-2.
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/isolation-and-fork.md b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/isolation-and-fork.md
new file mode 100644
index 0000000000..3f219acebb
--- /dev/null
+++ b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/isolation-and-fork.md
@@ -0,0 +1,76 @@
+# Isolation and when a fork is needed
+
+This is deferred for WP-8. It matters only if a later phase runs **one warm daemon hosting
+many agents at once**. The WP-8 model (one daemon and one sandbox per invoke) avoids it: a
+single agent owns its sandbox, so there is nothing to isolate it from. Read this only when
+you move to a shared warm daemon, or when you want many agents inside one long-lived
+sandbox each confined to its own folder.
+
+Note on language: "fork" here can mean two things. The **jail** below is new code we would
+add inside rivet itself. Separately, the tracing discussion mentions forking an ACP
+**adapter**, a small TypeScript package, not the Rust daemon. Neither is needed for WP-8.
+
+## The gap
+
+Rivet has no filesystem isolation (see [`research.md`](research.md#filesystem-no-jail-exists)).
+A session's `cwd` is advisory and the file API resolves absolute paths verbatim. So if many
+agents share one daemon, each can read and write the whole host, including other agents'
+folders. Confining them to their own folders is then the load-bearing new capability.
+
+## What rivet gives for free vs what we build
+
+| Capability | Status in rivet |
+| --- | --- |
+| One daemon, many agents/sessions | done (`AcpProxyRuntime` instance map) |
+| Multiple harnesses incl. Pi | done (`AgentId`, ACP adapters) |
+| Per-session working directory | done (`cwd` plumbed end to end) |
+| Per-directory tool config | done (MCP / skills) |
+| HTTP + SSE streaming | done |
+| **Folder jail (the agent sees only its folder)** | **missing; we add it (needs a fork)** |
+
+## How the jail would work (deferred)
+
+The field has converged on this for confining a coding agent to one folder without a
+container per agent:
+
+- **Linux, preferred:** bubblewrap (mount namespace, bind-mount only the folder so
+  nothing else exists) + Landlock (VFS-level deny as a backstop) + seccomp (trim escape
+  syscalls). This is what Codex CLI and Anthropic's `srt` do.
+- **Caveat:** bubblewrap needs unprivileged user namespaces, which are disabled on
+  hardened or managed distros. Fallback is **Landlock-only**: no root, no namespaces,
+  still confines file access, but outside paths stay visible (EACCES on access) rather
+  than invisible. Detect user namespaces at startup and degrade gracefully.
+- **macOS:** no Landlock or namespaces. Use `sandbox-exec` / Seatbelt with a
+  `(deny default)(allow file-* (subpath "<folder>"))` profile.
+- Do not rely on the harness: opencode and Pi do no FS sandboxing; they trust the caller.
+
+Threat model sets the bar. For self-hosted single-org, Landlock plus per-session `cwd` is
+likely enough, which also sidesteps the user-namespace problem. For multi-tenant cloud,
+you want the full bubblewrap + seccomp stack or genuine containers.
+
+## Where the fork would touch rivet
+
+If and when we add the jail, the changes are localized (paths inside the rivet repo):
+
+1. **Subprocess confinement** — wrap the harness launch with bwrap / a Landlock helper.
+   Easiest at the generated launcher in `agent-management/src/agents.rs` (`write_launcher`),
+   threading a per-instance root through `acp_proxy_runtime.rs::create_instance` and
+   `acp-http-adapter/src/process.rs` (`AdapterRuntime::start`, which today never even sets
+   `current_dir`).
+2. **File API jail** — `router/support.rs::resolve_fs_path`: add a configured root and
+   reject absolute paths outside it.
+3. **Process runtime jail** — `process_runtime.rs`: same confinement, or the jail leaks
+   via `/v1/process`.
+4. **Config** — `cli.rs` + `daemon.rs`: a `--root` / per-server root option (none exists).
+5. (Optional) a TS provider that maps each agent to its own root folder, copying
+   `providers/local.ts`.
+
+Effort: the multi-agent / multi-harness / streaming half is inherited. The jail itself is
+medium-to-large because it is platform-specific and has three escape surfaces with no
+existing isolation code to build on. A soft jail (path-prefix checks + `cwd`, no kernel
+enforcement) is small-to-medium but is not a real "cannot see outside" guarantee.
+
+## Decision for now
+
+Use rivet unmodified for WP-8 (ACP + harness swap + local, tools deferred). Fork only
+when we need the jail, and keep the fork minimal and rebaseable against upstream.
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/plan.md b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/plan.md
new file mode 100644
index 0000000000..17a0051827
--- /dev/null
+++ b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/plan.md
@@ -0,0 +1,110 @@
+# Plan
+
+The work is phased so each phase is demonstrable and reversible. Phases 0 to 2 deliver the
+four requirements (ACP, harness swap, local run, tools deferred) plus tracing. Phase 3 adds
+Daytona. Phase 4 adds concurrency. Keep the legacy Pi adapters working throughout; the
+rivet path is selected by environment variable.
+
+Read [`context.md`](context.md) and [`architecture.md`](architecture.md) first.
+
+## Demo targets (what success looks like)
+
+1. **Sandbox swap:** the same agent on `local` and `daytona`, harness fixed.
+2. **Harness swap:** the same agent on `pi` and `claude`, sandbox fixed.
+3. **Tracing:** the agent's spans nest under the `/invoke` span in Agenta, for both
+   harnesses.
+
+## Phase 0 — Spike: rivet + local + Pi + ACP + tracing (throwaway)
+
+Goal: prove the path end to end before touching the service.
+
+1. Install locally: the rivet SDK and `sandbox-agent` binary (check the package name on
+   rivet.dev), the Pi CLI, and the `pi-acp` adapter. Verify the SDK API names against the
+   installed version.
+2. Write `services/agent/src/runRivet.ts`: `start({ sandbox: local(), env })`,
+   `createSession({ agent: "pi", cwd })`, write `AGENTS.md` into `cwd`, `prompt(...)`,
+   accumulate `agent_message_chunk` into a string, `destroy()`. Return `{ ok, output,
+   sessionId, model }`.
+3. Package the `agenta-otel` logic (from `services/agent/src/agenta-otel.ts`) as a Pi
+   extension and install it at `~/.pi/agent/extensions`. Pass `traceparent`, the Agenta
+   OTLP endpoint, and auth in the `start({ env })` map.
+4. Write a `uv run` showcase script (inline `# /// script` deps) that calls `runRivet`
+   with a fixed config (AGENTS.md, model), prints the reply, then re-runs with
+   `agent: "claude"`.
+
+Done when: Pi answers a prompt locally through rivet over ACP, Claude Code answers the same
+config, and Pi's spans show up in Agenta nested under a parent trace.
+
+## Phase 1 — `RivetHarness` behind the port
+
+Goal: wire rivet into the service with no change to `/invoke`.
+
+1. `services/oss/src/agent_pi/rivet_harness.py`: `RivetHarness(Harness)`. Map
+   `HarnessRequest` plus `{harness, sandbox}` config and `TraceContext` to a `runRivet`
+   `/run` call (reuse the `PiHttpHarness` HTTP-client shape, or stdio).
+2. `services/agent/src/server.ts`: route `/run` to `runRivet` when `AGENT_BACKEND=rivet`.
+3. `agent.py` `_build_harness()`: add `AGENTA_AGENT_RUNTIME=rivet` to return
+   `RivetHarness` (harness from `AGENTA_AGENT_HARNESS`, sandbox `local`). Keep the Pi
+   default.
+4. Pass `_trace_context()` through `RivetHarness` to `runRivet`, which injects it into
+   `start({ env })`.
+
+Done when: `/invoke` returns the same `{"role": "assistant", "content": ...}` for a
+no-tools agent via rivet, spans nest under `/invoke`, and flipping `AGENTA_AGENT_RUNTIME`
+switches between the rivet and Pi paths with no other change.
+
+## Phase 2 — Harness swap as config
+
+Goal: one config, two harnesses.
+
+1. Thread `AGENTA_AGENT_HARNESS` (`pi` / `claude`) through `RivetHarness` to `runRivet`'s
+   `agent` value.
+2. Pass harness auth as launch env: Pi's LLM key; Claude Code's Anthropic auth plus
+   `CLAUDE_CODE_ENABLE_TELEMETRY=1`, `OTEL_*`, `TRACEPARENT`, run in `-p`/SDK mode.
+3. The `RivetHarness` (the adapter) normalizes `model` per harness (Pi takes the id;
+   Claude Code uses its own).
+
+Done when: the same agent config runs on Pi and Claude Code by changing one value, and both
+nest spans under `/invoke`. This completes the four requirements.
+
+## Phase 3 — Daytona sandbox (mirror the code evaluator)
+
+Goal: swap `local` for `daytona`, same agent.
+
+1. Build a Daytona snapshot with the rivet binary, the Pi and Claude CLIs, both ACP
+   adapters, and the `agenta-otel` Pi extension preinstalled. Record the snapshot id.
+2. `runRivet`: when `sandbox=daytona`, `start({ sandbox: daytona({ snapshot, target }),
+   env })`. Create ephemeral per invoke, inject `traceparent` and secrets as `env_vars`,
+   `destroy()` after. Reuse the config keys `DaytonaRunner` uses (`DAYTONA_API_KEY`,
+   `DAYTONA_API_URL`, `DAYTONA_SNAPSHOT`, `DAYTONA_TARGET` in `api/oss/src/utils/env.py`).
+
+Done when: the same agent runs on `local` and `daytona` by changing the sandbox value, with
+one ephemeral sandbox per invoke and spans nested.
+
+## Phase 4 — Concurrency and background dispatch
+
+Goal: bound concurrent sandboxes the way evaluations do.
+
+1. Dispatch agent invokes through the existing taskiq worker + Redis-stream pattern if the
+   `/invoke` caller allows async; otherwise bound the synchronous path with a shared
+   semaphore. Size it to the max concurrent ephemeral sandboxes (mirror
+   `DEFAULT_BATCH_SIZE = 10`).
+2. Confirm Daytona cost and quota stay within the cap under load; extra invokes queue.
+
+Done when: N concurrent invokes never exceed the configured number of live sandboxes.
+
+## Deferred (own work packages)
+
+- Tools, definition plus body over MCP ([WP-7](../wp-7-tools/README.md)).
+- Folder jail ([`isolation-and-fork.md`](isolation-and-fork.md)), needed only with a warm
+  shared daemon.
+- Multi-turn and client streaming ([WP-4](../wp-4-multi-message-output/README.md)).
+- Standalone SDK runner (packaging the adapters into the SDK).
+
+## Validation
+
+- Behavior parity: reuse the WP-2 manual `/invoke` curl check against both the Pi and rivet
+  paths.
+- Tracing: confirm in Agenta that the agent run appears under the `/invoke` `trace_id`.
+- Python edits: `ruff format` then `ruff check --fix` before committing.
+- Add unit coverage for `RivetHarness` request mapping once it grows past a thin client.
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/build_rivet_snapshot.py b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/build_rivet_snapshot.py
new file mode 100644
index 0000000000..5e85e7b491
--- /dev/null
+++ b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/build_rivet_snapshot.py
@@ -0,0 +1,75 @@
+# /// script
+# requires-python = ">=3.11"
+# dependencies = ["daytona"]
+# ///
+"""Build a Daytona snapshot for the WP-8 rivet agent runtime.
+
+Bakes the `pi` CLI into rivet's `-full` image (which already ships the sandbox-agent
+daemon, the Claude CLI, and CA certs) so Daytona runs don't pay a ~150s per-invoke
+`npm install pi`. Set the agent service to use it:
+
+    AGENTA_RIVET_DAYTONA_SNAPSHOT=agenta-rivet-pi
+    AGENTA_RIVET_DAYTONA_INSTALL_PI=false
+
+Run: DAYTONA_API_KEY=... DAYTONA_TARGET=eu uv run build_rivet_snapshot.py [--force]
+"""
+
+import sys
+import time
+
+from daytona import (
+    CreateSnapshotParams,
+    Daytona,
+    DaytonaConfig,
+    Image,
+    Resources,
+)
+
+SNAPSHOT_NAME = "agenta-rivet-pi"  # name to set in AGENTA_RIVET_DAYTONA_SNAPSHOT (see docstring)
+RIVET_IMAGE = "rivetdev/sandbox-agent:0.5.0-rc.2-full"  # base image the snapshot is built on
+PI_PACKAGE = "@earendil-works/pi-coding-agent@0.79.4"  # pinned npm spec installed globally in main()
+
+
+def main() -> None:  # entry point: create the snapshot, or rebuild it when --force is passed
+    force = "--force" in sys.argv
+    daytona = Daytona(DaytonaConfig())  # no explicit args; presumably reads DAYTONA_* env — confirm
+
+    try:
+        existing = daytona.snapshot.get(SNAPSHOT_NAME)
+    except Exception:  # NOTE(review): any failure, not only "not found", is treated as "no snapshot"
+        existing = None
+
+    if existing and not force:
+        print(f"snapshot '{SNAPSHOT_NAME}' already exists; pass --force to rebuild.")
+        return
+    if existing:  # only reached with --force: drop the old snapshot before rebuilding
+        print(f"deleting existing snapshot '{SNAPSHOT_NAME}'...")
+        daytona.snapshot.delete(existing)
+
+    # Base on rivet's -full image (daemon + claude + certs) and add the pi CLI globally
+    # so it is on PATH for the sandbox user the daemon runs as. The image's default user
+    # is the non-root `sandbox`, so switch to root for the global install, then back.
+    image = Image.base(RIVET_IMAGE).dockerfile_commands(
+        [
+            "USER root",
+            f"RUN npm install -g --ignore-scripts {PI_PACKAGE}",
+            "RUN pi --version || true",  # version probe only; `|| true` keeps the build going if it fails
+            "USER sandbox",
+        ]
+    )
+
+    print(f"building snapshot '{SNAPSHOT_NAME}' from {RIVET_IMAGE} (+ pi)...")
+    started = time.monotonic()  # monotonic clock, used only for the elapsed-time report below
+    daytona.snapshot.create(
+        CreateSnapshotParams(
+            name=SNAPSHOT_NAME,
+            image=image,
+            resources=Resources(cpu=2, memory=4, disk=8),  # units not shown here; presumably vCPU/GiB/GiB — confirm
+        ),
+        on_logs=print,
+    )
+    print(f"\nsnapshot '{SNAPSHOT_NAME}' built in {time.monotonic() - started:.1f}s")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/commit_agent_config.py b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/commit_agent_config.py
new file mode 100644
index 0000000000..7b6db094de
--- /dev/null
+++ b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/commit_agent_config.py
@@ -0,0 +1,75 @@
+# /// script
+# requires-python = ">=3.11"
+# dependencies = ["httpx"]
+# ///
+"""Commit an agent revision that exposes harness + sandbox as editable playground config.
+
+Adds two enum string params (harness: pi/claude, sandbox: local/daytona) to the agent
+workflow's parameters schema, alongside the existing model + agents_md, so the playground
+renders them as dropdowns (SchemaPropertyRenderer -> EnumSelectControl). WP-8 point 4.
+"""
+
+import os
+import httpx
+
+BASE = os.getenv("AGENTA_HOST", "http://144.76.237.122:8280").rstrip("/")  # NOTE(review): hard-coded default host; set AGENTA_HOST elsewhere
+KEY = os.environ["AGENTA_API_KEY"]  # required: KeyError at startup when unset
+PROJ = os.getenv("AGENTA_PROJECT_ID", "019ecbaf-5f3f-7d12-9aef-f49272dfd82e")  # NOTE(review): default is one specific project id
+REV = os.getenv("AGENT_REVISION_ID", "019ecfc9-1ea0-7293-aa1c-350c029cb118")  # revision that main() reads; default is one specific id
+
+H = {"Authorization": f"ApiKey {KEY}", "Content-Type": "application/json"}  # sent with both the GET and the commit POST
+
+
+def main() -> None:
+    with httpx.Client(timeout=30) as client:
+        r = client.get(
+            f"{BASE}/api/workflows/revisions/{REV}",
+            params={"project_id": PROJ},
+            headers=H,
+        )
+        r.raise_for_status()
+        wr = r.json()["workflow_revision"]
+        variant_id = wr["workflow_variant_id"]
+        data = dict(wr["data"])
+
+        props = data["schemas"]["parameters"]["properties"]
+        props["harness"] = {
+            "type": "string",
+            "title": "Harness",
+            "enum": ["pi", "claude"],
+            "default": "pi",
+            "description": "Coding agent engine to drive over ACP.",
+        }
+        props["sandbox"] = {
+            "type": "string",
+            "title": "Sandbox",
+            "enum": ["local", "daytona"],
+            "default": "local",
+            "description": "Where the agent runs.",
+        }
+        params = dict(data["parameters"])
+        params.setdefault("harness", "pi")
+        params.setdefault("sandbox", "local")
+        data["parameters"] = params
+
+        body = {
+            "workflow_revision": {
+                "workflow_variant_id": variant_id,
+                "message": "WP-8: expose harness + sandbox as editable config",
+                "data": data,
+            }
+        }
+        resp = client.post(
+            f"{BASE}/api/workflows/revisions/commit",
+            params={"project_id": PROJ},
+            headers=H,
+            json=body,
+        )
+        print("commit status:", resp.status_code)
+        out = resp.json()
+        new = out.get("workflow_revision") or out
+        print("new revision id:", new.get("id"), "version:", new.get("version"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/debug-events.ts b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/debug-events.ts
new file mode 100644
index 0000000000..a3db5da87f
--- /dev/null
+++ b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/debug-events.ts
@@ -0,0 +1,29 @@
+import { mkdtempSync, writeFileSync, rmSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
+import { SandboxAgent } from "sandbox-agent";
+import { local } from "sandbox-agent/local";
+
+const AGENT = process.env.SPIKE_AGENT ?? "claude"; // harness to drive; override with SPIKE_AGENT
+const here = dirname(fileURLToPath(import.meta.url)); // directory containing this script
+const binDir = join(here, "node_modules", ".bin");
+const BIN = join(here, "node_modules/.pnpm/@sandbox-agent+cli-linux-x64@0.4.2/node_modules/@sandbox-agent/cli-linux-x64/bin/sandbox-agent"); // NOTE(review): pinned to the linux-x64 0.4.2 package in pnpm's layout
+
+const cwd = mkdtempSync(join(tmpdir(), "wp8-dbg-")); // throwaway session directory, removed on the last line
+writeFileSync(join(cwd, "AGENTS.md"), "You are concise.\n", "utf-8");
+const env: Record<string,string> = { PATH: `${binDir}:/home/mahmoud/.local/bin:${process.env.PATH ?? ""}`, PI_ACP_PI_COMMAND: join(binDir,"pi"), PI_CODING_AGENT_DIR: join(process.env.HOME??"",".pi/agent"), SANDBOX_AGENT_BIN: BIN, HOME: process.env.HOME??"" }; // NOTE(review): PATH embeds a developer-specific directory
+const sandbox = await SandboxAgent.start({ sandbox: local({ env, binaryPath: BIN, log: "silent" }) });
+const session = await sandbox.createSession({ agent: AGENT, cwd, model: process.env.SPIKE_MODEL || undefined }); // empty or unset SPIKE_MODEL becomes undefined
+let n = 0; // running event index used in the log prefix
+session.onEvent((event: any) => {
+  const p = event?.payload;
+  const u = p?.params?.update ?? p?.update; // the update may sit under params or directly on the payload
+  const su = u?.sessionUpdate;
+  if (su) console.error(`[ev ${n++}] sender=${event.sender} sessionUpdate=${su} text=${JSON.stringify(u?.content?.text ?? u?.content)}`);
+  else console.error(`[ev ${n++}] sender=${event.sender} method=${p?.method} keys=${Object.keys(p||{})}`);
+});
+await session.prompt([{ type: "text", text: "Count from 1 to 5, one number per line" }]); // NOTE(review): no try/finally, so a throwing prompt skips the teardown below
+await sandbox.destroySandbox().catch(()=>{}); // best-effort teardown: errors are ignored
+await sandbox.dispose().catch(()=>{});
+rmSync(cwd, { recursive: true, force: true });
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/dump-full.ts b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/dump-full.ts
new file mode 100644
index 0000000000..1cff6fcfa1
--- /dev/null
+++ b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/dump-full.ts
@@ -0,0 +1,54 @@
+import { mkdtempSync, writeFileSync, rmSync } from "node:fs";
+import { homedir, tmpdir } from "node:os";
+import { join, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
+import { SandboxAgent } from "sandbox-agent";
+import { local } from "sandbox-agent/local";
+
+/**
+ * WP-8 debug helper: send one prompt through a local sandbox-agent daemon and log the
+ * `usage_update` / `tool_call` / `tool_call_update` session updates, any payload that
+ * carries a `result`, and the final prompt response (all truncated) to stderr.
+ *
+ * Env: SPIKE_AGENT (harness id, default "pi"), SPIKE_MODEL (optional model id).
+ */
+const AGENT = process.env.SPIKE_AGENT ?? "pi";
+const HOME = process.env.HOME ?? "";
+const here = dirname(fileURLToPath(import.meta.url));
+const binDir = join(here, "node_modules", ".bin");
+const BIN = join(here, "node_modules/.pnpm/@sandbox-agent+cli-linux-x64@0.4.2/node_modules/@sandbox-agent/cli-linux-x64/bin/sandbox-agent");
+const cwd = mkdtempSync(join(tmpdir(), "wp8-dump-"));
+
+// Assigned inside the try so the finally still runs (and removes cwd) when start fails.
+let sandbox: Awaited<ReturnType<typeof SandboxAgent.start>> | undefined;
+try {
+  writeFileSync(join(cwd, "AGENTS.md"), "You are concise.\n", "utf-8");
+  const env: Record<string, string> = {
+    // The current user's ~/.local/bin stays on PATH; no developer-specific home directory.
+    PATH: `${binDir}:${join(homedir(), ".local/bin")}:${process.env.PATH ?? ""}`,
+    PI_ACP_PI_COMMAND: join(binDir, "pi"),
+    PI_CODING_AGENT_DIR: join(HOME, ".pi/agent"),
+    SANDBOX_AGENT_BIN: BIN,
+    HOME,
+  };
+  sandbox = await SandboxAgent.start({ sandbox: local({ env, binaryPath: BIN, log: "silent" }) });
+  const session = await sandbox.createSession({ agent: AGENT, cwd, model: process.env.SPIKE_MODEL || undefined });
+  session.onEvent((event: any) => {
+    const p = event?.payload;
+    // The update is read from `params.update`, falling back to a top-level `update`.
+    const u = p?.params?.update ?? p?.update;
+    const su = u?.sessionUpdate;
+    if (su === "usage_update" || su === "tool_call" || su === "tool_call_update") {
+      console.error(`[${su}] ${JSON.stringify(u).slice(0,500)}`);
+    } else if (!su && p?.result) {
+      console.error(`[result] ${JSON.stringify(p.result).slice(0,400)}`);
+    }
+  });
+  const res = await session.prompt([{ type: "text", text: "What is 2+2? Answer in one word." }]);
+  console.error(`[promptResponse] ${JSON.stringify(res).slice(0,400)}`);
+} finally {
+  // Best-effort teardown: errors from the daemon are ignored so the temp dir is always removed.
+  await sandbox?.destroySandbox().catch(()=>{});
+  await sandbox?.dispose().catch(()=>{});
+  rmSync(cwd, { recursive: true, force: true });
+}
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/package.json b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/package.json
new file mode 100644
index 0000000000..c491095f12
--- /dev/null
+++ b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/package.json
@@ -0,0 +1,14 @@
+{
+  "name": "wp8-rivet-spike",
+  "private": true,
+  "type": "module",
+  "version": "0.0.0",
+  "dependencies": {
+    "@earendil-works/pi-coding-agent": "0.79.4",
+    "pi-acp": "0.0.29",
+    "sandbox-agent": "0.4.2"
+  },
+  "devDependencies": {
+    "tsx": "4.19.2"
+  }
+}
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/spike.ts b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/spike.ts
new file mode 100644
index 0000000000..bd792fe210
--- /dev/null
+++ b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/spike.ts
@@ -0,0 +1,103 @@
+/**
+ * WP-8 Phase 0 spike: drive Pi over ACP through a local rivet daemon.
+ *
+ * Verifies the whole chain end to end before touching the service:
+ *   SandboxAgent.start({ sandbox: local({ env }) })  // spawns `sandbox-agent server`
+ *     -> createSession({ agent: "pi", cwd })          // opens an ACP session
+ *       -> write AGENTS.md into cwd
+ *       -> prompt([{ type: "text", text }])            // sends the user turn
+ *         -> collect `agent_message_chunk` text from session events
+ *           -> dispose()                               // tears the daemon down
+ *
+ * Run: pnpm exec tsx spike.ts "<prompt>"
+ */
+import { mkdtempSync, writeFileSync, rmSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
+
+import { SandboxAgent } from "sandbox-agent";
+import { local } from "sandbox-agent/local";
+
+// Harness id and model are read from the environment; the prompt is the first CLI argument.
+const { SPIKE_AGENT: AGENT = "pi", SPIKE_MODEL: MODEL = "gpt-5.5" } = process.env;
+const PROMPT = process.argv[2] ?? "Say hello in one short sentence and tell me what 2+2 is.";
+
+// Paths are resolved relative to this file, not the process working directory.
+const here = dirname(fileURLToPath(import.meta.url));
+const binDir = join(here, "node_modules", ".bin");
+// Daemon binary inside the pnpm store (linux-x64 build of sandbox-agent 0.4.2).
+const cliPkg = "@sandbox-agent+cli-linux-x64@0.4.2/node_modules/@sandbox-agent/cli-linux-x64";
+const BIN = join(here, "node_modules/.pnpm", cliPkg, "bin/sandbox-agent");
+
+/** Text of a content value: a string as-is, a `text` block's `text`, otherwise "". */
+function textOf(block: any): string {
+  if (typeof block === "string") return block;
+  const isTextBlock = block?.type === "text" && typeof block.text === "string";
+  return isTextBlock ? block.text : "";
+}
+
+/** Run one prompt through a local sandbox-agent daemon and print the assistant reply on stdout. */
+async function main() {
+  const cwd = mkdtempSync(join(tmpdir(), "wp8-spike-"));
+  // Assigned inside the try so a failed daemon start still reaches the cleanup in `finally`.
+  let sandbox: Awaited<ReturnType<typeof SandboxAgent.start>> | undefined;
+  let output = "";
+  try {
+    writeFileSync(
+      join(cwd, "AGENTS.md"),
+      "You are a concise assistant. Answer in one or two short sentences.\n",
+      "utf-8",
+    );
+    // Env handed to the daemon at birth. The local provider merges this into the
+    // `sandbox-agent server` subprocess, which passes it to the pi-acp adapter and
+    // then to `pi`. PI_ACP_PI_COMMAND points pi-acp at the local pi bin; PATH lets
+    // the daemon resolve the pi-acp adapter binary.
+    const env: Record<string, string> = {
+      PATH: `${binDir}:${process.env.PATH ?? ""}`,
+      PI_ACP_PI_COMMAND: join(binDir, "pi"),
+      PI_CODING_AGENT_DIR: join(process.env.HOME ?? "", ".pi/agent"),
+      SANDBOX_AGENT_BIN: BIN,
+      HOME: process.env.HOME ?? "",
+    };
+
+    console.error(`[spike] starting daemon, agent=${AGENT} model=${MODEL}`);
+    sandbox = await SandboxAgent.start({
+      sandbox: local({ env, binaryPath: BIN, log: "silent" }),
+    });
+    console.error(`[spike] creating session in ${cwd}`);
+    const session = await sandbox.createSession({ agent: AGENT, cwd, model: MODEL });
+
+    session.onEvent((event: any) => {
+      const payload = event?.payload;
+      // ACP session/update notifications carry the streamed assistant text.
+      const update = payload?.params?.update ?? payload?.update;
+      if (!update) return;
+      if (update.sessionUpdate === "agent_message_chunk") {
+        const t = textOf(update.content);
+        if (!t) return;
+        // Harnesses differ: Pi streams pure deltas, Claude streams deltas plus a
+        // cumulative full snapshot. Replace when a chunk is a superset of what we
+        // have (snapshot), append otherwise (delta). Unifies both without doubling.
+        if (t.startsWith(output)) output = t;
+        else output += t;
+      }
+    });
+
+    console.error(`[spike] prompting...`);
+    const res = await session.prompt([{ type: "text", text: PROMPT }]);
+    console.error(`[spike] prompt returned stopReason=${(res as any)?.stopReason}`);
+    console.error("[spike] OUTPUT >>>");
+    console.log(output.trim());
+    console.error("[spike] <<< OUTPUT");
+  } finally {
+    await sandbox?.destroySandbox().catch(() => {});
+    await sandbox?.dispose().catch(() => {});
+    rmSync(cwd, { recursive: true, force: true });
+  }
+}
+
+main().catch((failure) => {
+  console.error("[spike] FAILED:", failure?.stack ?? failure); // stack if present, else raw value
+  process.exit(1); // non-zero exit code marks the spike as failed
+});
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/research.md b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/research.md
new file mode 100644
index 0000000000..f7d276806c
--- /dev/null
+++ b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/research.md
@@ -0,0 +1,147 @@
+# Research (verified facts)
+
+Source-verified June 2026 against a clone of `rivet-dev/sandbox-agent` (Rust daemon plus
+TypeScript SDK), the ACP spec and adapters, the Pi and Claude Code docs, and the Agenta
+codebase. Rivet file paths below are inside the rivet repo. Agenta paths are in this repo.
+
+## Rivet, in one paragraph
+
+`sandbox-agent` is a daemon that runs **inside** a sandbox and drives coding harnesses
+over ACP. Where it runs (local, Docker, E2B, Daytona, Vercel, Cloudflare) is decided by
+the **TypeScript SDK** providers, not the Rust core. License: Apache-2.0. We adopt it as a
+dependency and do not fork it for this WP.
+
+## Licensing (verified, safe to adopt commercially)
+
+Confirmed against the actual LICENSE files and package manifests June 2026.
+
+- **rivet-dev/sandbox-agent is Apache-2.0 throughout** (root LICENSE, Rust crates, TS SDK).
+  OSI-open, no BSL/SSPL/Elastic/non-commercial clause. Compatible with Agenta's MIT OSS
+  core and the commercial EE.
+- **The server binary is open and self-buildable** (`cargo run -p sandbox-agent
+  --release`, ~15MB static binary). The `curl | sh` installer pulls a prebuilt from Rivet's
+  CDN (`releases.rivet.dev`), the same source compiled, with no key/auth/telemetry. Build
+  from source if you want zero external dependency.
+- **No phone-home.** No Rivet account, API key to rivet.dev, or license server. Runs
+  offline and air-gapped. `$SANDBOX_TOKEN` is local auth, disable with `--no-token`.
+  Session persistence is pluggable (Postgres / in-memory; Rivet Actors optional).
+- **Everything we ship or link is permissive:** pi-acp (MIT), claude-code-acp
+  (`@zed-industries/claude-agent-acp`, Apache-2.0), Daytona SDK (Apache-2.0), E2B (MIT),
+  Pi / Codex / opencode (MIT or Apache-2.0). No GPL/AGPL/SSPL/BSL in the bundled path.
+- **Two restrictive pieces, both user-brought (weak coupling):** Claude Code is
+  proprietary (Anthropic Commercial ToS); the user installs it and brings their own
+  Anthropic auth, and we only shell out to it over ACP. Never bundle, auto-download, or
+  repackage it. Daytona's *server* is AGPL-3.0, but its client SDK is Apache-2.0 and the
+  AGPL binds whoever operates/modifies the server, not an API consumer; Agenta already
+  depends on the Daytona SDK for code evaluators.
+
+## The SDK shape (what the TypeScript runner calls)
+
+Approximate API (verify exact names against the installed SDK version from rivet.dev):
+
+- `SandboxAgent.start({ sandbox: local() })` or `{ sandbox: daytona({...}), env: {...} }`
+  brings up a daemon and returns a handle. The `local` provider spawns
+  `sandbox-agent server` as a host subprocess; the SDK merges `{...process.env,
+  ...options.env}` into that process. The `daytona` provider creates a Daytona sandbox and
+  starts the daemon inside it.
+- `createSession({ agent, cwd })` opens an ACP session and returns a `serverId`. `agent`
+  is the harness id.
+- `prompt(sessionId, text)` sends the turn; the daemon streams events (SSE), assistant
+  text arrives as `agent_message_chunk`. Accumulate the chunks into the final string.
+- `destroy()` / `pauseSandbox()` tear down. On the Daytona provider, both delete the
+  sandbox (it implements only create/destroy; no stop/pause is wired).
+
+Harness ids (`AgentId` enum in `server/packages/agent-management/src/agents.rs`):
+`Claude, Codex, Opencode, Amp, Pi, Cursor`. **Pi is first-class.**
+
+## One daemon hosts many sessions
+
+The core is `AcpProxyRuntime` with `instances: HashMap<server_id, ProxyInstance>`
+(`server/packages/sandbox-agent/src/acp_proxy_runtime.rs`). Each session spawns its own
+ACP adapter subprocess with its own `cwd`. We do **not** rely on this multiplexing for the
+MVP; we run one daemon and one session per invoke (see the lifecycle decision below).
+
+## Harnesses are ACP adapters, resolved from a registry
+
+Each harness maps to an ACP adapter program. Rivet builds a `LaunchSpec {program, args,
+env}` from a registry (`acp-http-adapter/src/registry.rs`); the canonical registry is the
+ACP one, with a pinned audit list in `scripts/audit-acp-deps/adapters.json` (e.g.
+`pi-acp@0.0.23`, `@zed-industries/claude-agent-acp@0.20.0`). The adapters are small
+TypeScript npm packages:
+
+- **pi-acp** (svkozak/pi-acp, MIT, TypeScript): spawns `pi --mode rpc`, passes its env
+  through to `pi`. Pi auto-loads extensions from `~/.pi/agent/extensions` and global
+  settings.
+- **claude-code-acp** (`@zed-industries/claude-agent-acp`, Apache-2.0, TypeScript): wraps
+  the Claude Agent SDK.
+
+To use a forked adapter, point the launch command at it (npm package, local path, or your
+own registry json). The adapter runs wherever the daemon runs. We do **not** need a fork
+for this WP (see tracing).
+
+## Environment injection (how trace context and secrets reach the harness)
+
+`AdapterRuntime::start` (`acp-http-adapter/src/process.rs`) inherits the **daemon's env**
+and overlays the static registry `LaunchSpec.env`. There is **no per-session env channel**
+from the create-session HTTP path. Consequence:
+
+- A value set in the daemon's env is inherited by the adapter and the harness.
+- Because we run **one daemon per invoke**, the daemon's env is per-invoke. So we set the
+  `traceparent`, OTLP config, and secrets in the daemon's env at its birth: the SDK `env`
+  option locally, the sandbox `env_vars` on Daytona. This is exactly how `DaytonaRunner`
+  already injects `AGENTA_*` and provider keys for code evaluators.
+- The per-session-env gap only bites if you later share one warm daemon across concurrent
+  invokes. Then you would carry the traceparent in ACP `_meta` (a spec-blessed reserved
+  key, RFD completed 2026-06-03) plus a small adapter read, or patch rivet. Not now.
+
+## ACP facts
+
+- ACP is Zed's **Agent Client Protocol** (editor to coding-agent), JSON-RPC. Flow:
+  `initialize`, then `session/new` or `session/load`, then `session/prompt`, with streamed
+  `session/update` notifications. Not IBM's Agent Communication Protocol, not Google A2A.
+- `session/load` replays the conversation via `session/update`, an optional capability
+  advertised in `initialize`. Pi exposes `resumeSession`; Claude Code `loadSession` (with
+  limits reconstructing old tool calls). This backs message-history continuation without
+  any persisted filesystem.
+
+## The pattern we mirror: code evaluators in Daytona
+
+Verified in the Agenta SDK. `DaytonaRunner`
+(`sdks/python/agenta/sdk/engines/running/runners/daytona.py`) runs each code evaluator in
+**one ephemeral Daytona sandbox per execution**: it creates an `ephemeral=True` sandbox
+from a snapshot (`DAYTONA_SNAPSHOT`), runs, and deletes it in a `finally`. No warm pool, no
+shared instance. It injects `AGENTA_HOST`, `AGENTA_API_KEY`, and provider keys as the
+sandbox `env_vars`. Concurrency is bounded by the evaluation engine's shared
+`asyncio.Semaphore(batch_size)` (default 10), not by the runner. Selected by env
+`AGENTA_SERVICES_CODE_SANDBOX_RUNNER=daytona`. The agent service copies this shape.
+
+## Sessions and Daytona cost
+
+Daytona bills compute while a sandbox runs, storage while stopped, cheapest when archived.
+An idle-but-running sandbox keeps billing. Rivet's Daytona provider only does
+create/destroy, so "keep it warm and resume" is both unbuilt and costly. With no
+persistent file writes there is nothing on disk to keep. So a session is stored message
+history plus an ephemeral sandbox per turn (~1s Daytona cold start per the WP-3 POC, plus
+history replay). Tradeoff: replaying long histories re-sends tokens, so cap with
+truncation or summarization.
+
+## Tracing per harness
+
+- **Pi:** reuse the existing `agenta-otel` logic, but install it as a Pi extension in the
+  environment (global `~/.pi/agent/extensions`, or baked into the Daytona snapshot). Feed
+  `AGENTA_*` / `OTEL_*` / `traceparent` as env. pi-acp passes env through to `pi`, Pi loads
+  the extension, and spans nest under the parent.
+- **Claude Code:** OTel is first-party. Set `CLAUDE_CODE_ENABLE_TELEMETRY=1`, `OTEL_*`
+  (endpoint and `Authorization` header for Agenta's OTLP), and `TRACEPARENT`, and run it in
+  `-p` / Agent-SDK mode (interactive mode ignores inbound traceparent). A known beta bug
+  may drop some spans in streaming ACP mode; verify before relying on it.
+- The dominant way people instrument Claude Code is this built-in OTel exporter into a
+  collector or platform. Our wiring uses the same channel.
+
+## Filesystem: no jail exists
+
+Grep for `chroot|landlock|bubblewrap|seccomp|namespace|unshare|jail` across rivet's
+`server/` returns zero hits. `cwd` is advisory; the file HTTP API (`resolve_fs_path`)
+returns absolute paths verbatim. An agent can read and write anywhere the daemon can. This
+only matters when many agents share one daemon, which the per-invoke model avoids.
+Confinement is deferred to [`isolation-and-fork.md`](isolation-and-fork.md).
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/status.md b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/status.md
new file mode 100644
index 0000000000..836d60f6ee
--- /dev/null
+++ b/docs/design/agent-workflows/wp-8-rivet-acp-runtime/status.md
@@ -0,0 +1,160 @@
+# Status
+
+Source of truth for this WP. Keep it current.
+
+## Current state
+
+IMPLEMENTED and verified end to end (2026-06-17). The agent service drives the harness
+over ACP through a rivet `sandbox-agent` daemon, behind the unchanged `Harness` port and
+`/invoke` contract. Verified: Pi and Claude Code locally; harness swap as one config
+value; the full UI playground run through the live dev stack; message history; and the
+agent's spans nested under the `/invoke` workflow span. Tools are Pi-native for Pi and go
+over MCP for other MCP-capable harnesses; Daytona runs from a pre-baked snapshot with a
+documented model-selection limitation (both below).
+
+### What was verified
+
+| Requirement | Status | How |
+| --- | --- | --- |
+| Drive the harness over ACP via rivet | done | `runRivet.ts` + `sandbox-agent@0.4.2`; Pi & Claude answer over ACP |
+| Harness swap as config | done | `AGENTA_AGENT_HARNESS=pi\|claude`; same config, both answer (host) |
+| Run locally (self-hosted) | done | rivet `local` provider; host CLI + dockerized sidecar in the dev stack |
+| Tracing nested under `/invoke` | done | live: `_agent`→`invoke_agent`→`turn 0`→`chat <model>` in one trace, span_ids chained |
+| End to end from the UI | done | playground run in pi-agents → "Success" reply via the rivet path |
+| Message history | done | prior turns replayed as transcript context (client/playground holds history) |
+| Tools | done (Pi) | Pi-native `pi.registerTool` → `/tools/call`, verified live; MCP bridge built & bridge-verified for MCP-capable harnesses (below) |
+| Daytona sandbox | done | ephemeral sandbox per invoke from the pre-baked `agenta-rivet-pi` snapshot (~10s); `model` choice not honored yet (below) |
+
+### Implementation map
+
+- `services/agent/src/runRivet.ts` — the rivet driver (same `/run` contract as `runPi`).
+- `services/agent/src/agenta-otel.ts` — added `createRivetOtel` (ACP-event-stream tracer).
+- `services/agent/src/toolBridge.ts` + `toolBridgeServer.ts` — tools over MCP → `/tools/call`.
+- `services/agent/src/{server,cli}.ts` — route `/run` to `runRivet` (`AGENT_BACKEND`, or auto by request shape).
+- `services/oss/src/agent_pi/rivet_harness.py` — `RivetHarness` (HTTP sidecar or subprocess).
+- `services/oss/src/agent.py` — `_build_harness()` rivet branch (`AGENTA_AGENT_RUNTIME=rivet`).
+- `hosting/docker-compose/ee/docker-compose.dev.yml` — rivet env on the `services` container.
+
+### Tracing: propagate trace context into the harness (the WP-1/WP-2 mechanism)
+
+For Pi we DON'T build spans in the runner. We propagate the caller's trace context into
+Pi and let Pi emit its real span tree (`invoke_agent` → `turn N` → `chat <model>` /
+`execute_tool`, with real token usage), via the `agenta` Pi extension. The extension is
+bundled self-contained with esbuild (`scripts/build-extension.mjs` →
+`dist/extensions/agenta.js`), installed into Pi's agent dir (local: copied; Daytona: uploaded
+via the sandbox FS API), and reads everything from env (`AGENTA_TRACEPARENT`, `AGENTA_OTLP_*`,
+`AGENTA_TOOL_*`). It is inert when no Agenta env is set, so a global install is safe.
+Verified live: `chat gpt-5.5` carries `input_tokens`/`cost` and nests under the caller's
+`/invoke` span, in both REST (ApiKey) and the browser playground (session JWT).
+
+Cumulative roll-up: the harness span tree and the `_agent` workflow span are exported in
+separate OTLP batches (different processes), so Agenta's per-batch cumulative roll-up
+cannot bridge them. We close that by passing the run's token/cost totals back (Pi writes
+them on `agent_end` to `AGENTA_USAGE_OUT`; `runRivet` returns them; `agent.py` stamps
+`gen_ai.usage.*` on the workflow span in-process). Verified: `_agent` shows
+`ag.metrics.tokens.cumulative` and the trace list Usage/Cost columns populate.
+
+For non-Pi harnesses (e.g. Claude) the runner still builds the span tree from the ACP
+event stream (`createRivetOtel`, `emitSpans:true`) as a uniform fallback.
+
+The runner-built chat span is named from the model the harness actually resolved, not the
+requested one: `runRivet` creates the tracer after `applyModel`, so when a harness rejects
+the requested id and keeps its own default (Claude ignores `gpt-5.5`; the in-sandbox Pi on
+Daytona only advertises `default`), the span is `chat` rather than falsely `chat gpt-5.5`.
+Pi-local sets the requested model and the Pi extension emits the real `chat <model>`.
+
+### Tools: Pi-native (no MCP)
+
+Pi tools are delivered the Pi-native way: the same extension calls `pi.registerTool` for
+each backend-resolved spec, and each tool's `execute` POSTs back to Agenta's
+`/tools/call` (the WP-7 envelope; the provider key + connection auth stay server-side).
+Verified live: a real Composio `github_whoami` tool runs in the dockerized playground and
+shows an `execute_tool` span. Other (MCP-capable) harnesses get tools over ACP MCP via
+`toolBridge.ts` instead.
+
+### Daytona status — working (fast, traced)
+
+`sandbox=daytona` runs Pi end to end in ~10s (verified live via `/invoke`), with the full
+trace tree. Per invoke runRivet creates an ephemeral sandbox from the pre-baked snapshot
+`agenta-rivet-pi` (rivet `-full` image + `pi` baked in, built by
+`poc/build_rivet_snapshot.py`), uploads AGENTS.md, runs the ACP session, and destroys it
+in `finally`. The earlier ~150s came from a per-invoke `npm install pi`; the snapshot
+removes it (`AGENTA_RIVET_DAYTONA_INSTALL_PI=false`).
+
+Credentials: the `agent-pi` sidecar gets scoped `DAYTONA_API_KEY`/`API_URL`/`TARGET`. The
+model provider key (OpenAI/Anthropic) is resolved from the project vault and injected as
+the sandbox env var, so no Codex/OAuth subscription token leaves the box (OAuth upload
+remains a fallback only when no key exists).
+
+Tracing: the in-sandbox harness can't reach Agenta's OTLP, so on Daytona the **runner**
+builds the span tree from the ACP event stream (reliable export from the sidecar) and the
+token total is passed back onto the `_agent` workflow span. Verified: 4-span tree
+`_agent → invoke_agent → turn → chat`, `_agent` tokens populated.
+
+Known limitation: the freshly-provisioned Pi inside the Daytona snapshot advertises only
+`model: default` over ACP (it lacks the model catalog the dev's local Pi loads from its
+`auth.json`), so a playground `model` choice is not honored on Daytona — Pi runs its
+default with whatever provider key the vault supplied. The model axis is honored on
+Pi-local. Settling the in-sandbox Pi model config is follow-up.
+
+Notable fixes: the rivet daytona provider's default `image` conflicts with `snapshot`
+("Cannot specify a snapshot when using a build info entry") — suppressed by passing
+`image: undefined` in the create opts. The Daytona preview proxy uses cookie auth — a
+cookie-persisting `fetch` is passed to `SandboxAgent.start`. Unhandled rejections from the
+rivet SDK are caught in `server.ts` so one bad run can't crash the sidecar.
+
+### Credentials (API key or OAuth)
+
+Auth is a resolved credential, not hardcoded. The agent fetches the project vault's
+`provider_key` secrets and injects each as its env var (`OPENAI_API_KEY`,
+`ANTHROPIC_API_KEY`, …) into the harness; the harness uses whichever its model needs. With
+no key the harness falls back to its own login (OAuth): local Pi uses the Codex login;
+Claude needs an Anthropic key (verified: with credit, `/invoke` returns a clean reply; the
+guardrail surfaces "insufficient credit"/"authentication failed" as one line).
+
+## Decisions
+
+| Decision | Rationale |
+| --- | --- |
+| Adopt rivet unmodified (no Rust fork) | It gives ACP, harness swap, local, and streaming. The only gap (the jail) is deferred. |
+| Licensing is clear for commercial use | rivet is Apache-2.0 (binary self-buildable, no phone-home); all shipped deps are MIT/Apache-2.0. Claude Code (proprietary) and Daytona's AGPL server are user-brought, weak coupling. Never bundle Claude Code. See [`research.md`](research.md#licensing-verified-safe-to-adopt-commercially). |
+| Drive the harness over ACP via rivet | Satisfies "ACP, not Pi JSON". |
+| Keep the `Harness` port and `/invoke` unchanged | The seam is right; only the adapter below it changes. Keep the legacy Pi adapters working. |
+| Add `RivetHarness` (Python) + `runRivet.ts` (wraps the rivet SDK) | Thin Python adapter over a TS runner; reuse the `/run` contract. |
+| Sandbox and harness are two orthogonal config axes | Swap each independently; matches rivet (provider vs `agent`). |
+| One daemon and one sandbox per invoke (cold) | Mirrors the shipped code-evaluator `DaytonaRunner` (ephemeral per execution). Makes daemon env per-invoke and needs no jail. |
+| Inject trace + secrets at the daemon's birth (SDK `env` local, sandbox `env_vars` Daytona) | Per-invoke daemon means per-invoke env. No fork of rivet or adapters needed. |
+| Tracing is in scope; standalone traces are not acceptable | Pi reuses `agenta-otel` as a Pi extension; Claude Code uses `CLAUDE_CODE_ENABLE_TELEMETRY` + `OTEL_*` + `TRACEPARENT` in `-p` mode. |
+| Local run = run the open-source rivet server locally; Python wraps the client | Rivet is Apache-2.0. Not a special case. |
+| Session = persisted message history + ephemeral sandbox; continue via ACP `session/load` | No persistent FS writes, so nothing on disk to keep. Zero at-rest cost. |
+| Concurrency mirrors evaluations (taskiq + Redis + shared semaphore) | Each slot = one ephemeral sandbox; the semaphore caps Daytona cost/quota. |
+| Tools split into definition + swappable body, per-harness over MCP; deferred build | Enables test variants with mock bodies; body model is general, not Agenta-specific. |
+| Input variables substituted into AGENTS.md | Mirrors prompt-template variables. |
+| Secrets via launch env, never in the agent-visible filesystem | No jail. |
+| `model` semantics owned by the harness adapter | The adapter normalizes per harness. Not an open question. |
+| Adapters live in the SDK | Backend and standalone share one implementation. |
+
+## Open questions
+
+1. **SDK API names.** Verify the exact rivet SDK package name and method signatures
+   (`start` / `createSession` / `prompt` / event names) against the installed version
+   during Phase 0.
+2. **Message-history store and truncation.** Backend DB on the platform, local file
+   standalone; pick a truncation or summarization policy so replay does not grow tokens
+   unbounded.
+3. **Concurrency placement.** Dispatch agent invokes through the taskiq worker, or bound a
+   synchronous `/invoke` with a semaphore? Depends on what the playground expects.
+4. **Claude Code ACP-mode span completeness.** Confirm the current beta behavior before
+   relying on Claude Code traces; a known bug may drop `interaction`/`tool` spans.
+5. **Daytona snapshot contents.** Settle exactly what the snapshot pre-installs (rivet,
+   both harness CLIs, both adapters, the `agenta-otel` extension) and how it is built.
+
+## Future (returns only if we change the lifecycle)
+
+- A warm shared daemon multiplexing concurrent invokes would re-introduce the per-session
+  env problem (fork an adapter to read ACP `_meta.traceparent`, TypeScript not Rust) and
+  the need for a filesystem jail. The per-invoke model avoids both.
+
+## Next step
+
+Work through the open questions above; the Phase 0 spike is done. See [`plan.md`](plan.md).
diff --git a/hosting/docker-compose/ee/docker-compose.dev.yml b/hosting/docker-compose/ee/docker-compose.dev.yml
index e09b82b29f..b1f9a2d773 100644
--- a/hosting/docker-compose/ee/docker-compose.dev.yml
+++ b/hosting/docker-compose/ee/docker-compose.dev.yml
@@ -394,8 +394,13 @@ services:
             - ${ENV_FILE:-./.env.ee.dev}
         environment:
             DOCKER_NETWORK_MODE: ${DOCKER_NETWORK_MODE:-bridge}
-            # Agent workflow (WP-2): reach the Pi harness sidecar in-network.
+            # Agent workflow (WP-2): reach the harness wrapper sidecar in-network.
             AGENTA_AGENT_PI_URL: http://agent-pi:8765
+            # Agent runtime (WP-8): drive the harness over ACP via a rivet daemon.
+            # Harness (pi/claude) and sandbox (local/daytona) are independent axes.
+            AGENTA_AGENT_RUNTIME: ${AGENTA_AGENT_RUNTIME:-rivet}
+            AGENTA_AGENT_HARNESS: ${AGENTA_AGENT_HARNESS:-pi}
+            AGENTA_AGENT_SANDBOX: ${AGENTA_AGENT_SANDBOX:-local}
         # === NETWORK ============================================== #
         networks:
             - agenta-network
@@ -429,11 +434,10 @@ services:
             exec node_modules/.bin/tsx src/server.ts"
         # === CONFIGURATION ======================================== #
         # Deliberately NO env_file: the Pi sandbox must not inherit the stack's
-        # secrets (COMPOSIO_API_KEY, STRIPE/POSTHOG/GOOGLE/DAYTONA keys, ...). Tools
-        # run server-side via /tools/call, so the sandbox only needs its own port,
-        # the Pi login (mounted below), and the OTLP export fallback. The wrapper
-        # reads exactly: PORT, PI_CODING_AGENT_DIR, AGENTA_HOST, AGENTA_API_KEY, and
-        # the optional AGENTA_AGENT_TOOL_CALL_TIMEOUT_MS / OTEL_SERVICE_NAME.
+        # secrets (COMPOSIO_API_KEY, STRIPE/POSTHOG/GOOGLE keys, ...). Tools run
+        # server-side via /tools/call, so the sandbox only needs its own port, the Pi
+        # login (mounted below), the OTLP export fallback, and — for the rivet `daytona`
+        # sandbox axis (WP-8) — the Daytona credentials the SDK reads to create sandboxes.
         environment:
             PORT: "8765"
             PI_CODING_AGENT_DIR: /pi-agent
@@ -441,6 +445,15 @@ services:
             # credential). Must be reachable from this container.
             AGENTA_HOST: ${AGENTA_HOST:-http://144.76.237.122:8280}
             AGENTA_API_KEY: ${AGENTA_API_KEY:-}
+            # Daytona sandbox axis: the rivet daytona provider's `new Daytona()` reads
+            # these. Scoped to Daytona only (not the full stack secret set).
+            DAYTONA_API_KEY: ${DAYTONA_API_KEY:-}
+            DAYTONA_API_URL: ${DAYTONA_API_URL:-}
+            DAYTONA_TARGET: ${DAYTONA_TARGET:-}
+            # Pre-baked snapshot (rivet daemon + Pi + Claude + certs) so Daytona runs skip
+            # the ~150s per-invoke `npm install pi`. Built by poc/build_rivet_snapshot.py.
+            AGENTA_RIVET_DAYTONA_SNAPSHOT: ${AGENTA_RIVET_DAYTONA_SNAPSHOT:-agenta-rivet-pi}
+            AGENTA_RIVET_DAYTONA_INSTALL_PI: ${AGENTA_RIVET_DAYTONA_INSTALL_PI:-false}
         # === STORAGE ============================================== #
         volumes:
             - ../../../services/agent/src:/app/src
diff --git a/services/agent/docker/Dockerfile.dev b/services/agent/docker/Dockerfile.dev
index 2b2320600e..4f2f64f126 100644
--- a/services/agent/docker/Dockerfile.dev
+++ b/services/agent/docker/Dockerfile.dev
@@ -8,6 +8,13 @@ FROM node:24-slim
 
 WORKDIR /app
 
+# CA certificates: the rivet daemon (Rust) downloads harness CLIs (e.g. Claude Code) over
+# HTTPS using the system trust store, which node:*-slim omits — without this the daemon's
+# `install-agent claude` fails TLS verification. git lets npm/installers fetch git deps.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends ca-certificates git \
+    && rm -rf /var/lib/apt/lists/*
+
 RUN corepack enable
 
 # Install deps as a cached layer (manifest + lockfile only).
@@ -16,8 +23,14 @@ RUN pnpm install --frozen-lockfile
 
 # Fallback copy for non-mounted runs; in dev these are bind-mounted over.
 COPY tsconfig.json ./
+COPY scripts ./scripts
 COPY src ./src
 
+# Bundle the Agenta Pi extension (tracing + tools) into dist/. dist/ is NOT bind-mounted
+# in dev, so this baked copy is what runRivet installs into Pi's agent dir. Rebuild the
+# image after editing src/piExtension.ts or src/agenta-otel.ts.
+RUN pnpm run build:extension
+
 ENV NODE_ENV=development \
     PORT=8765
 
diff --git a/services/agent/package.json b/services/agent/package.json
index 5f2a39fb88..231b6ff5f6 100644
--- a/services/agent/package.json
+++ b/services/agent/package.json
@@ -9,19 +9,34 @@
     "run:cli": "tsx src/cli.ts",
     "serve": "tsx src/server.ts",
     "serve:watch": "tsx watch src/server.ts",
+    "build:extension": "node scripts/build-extension.mjs",
     "login": "pi"
   },
   "dependencies": {
+    "@daytonaio/sdk": "^0.187.0",
     "@earendil-works/pi-coding-agent": "0.79.4",
     "@opentelemetry/api": "1.9.0",
     "@opentelemetry/exporter-trace-otlp-proto": "0.54.0",
     "@opentelemetry/resources": "1.28.0",
     "@opentelemetry/sdk-trace-base": "1.28.0",
     "@opentelemetry/sdk-trace-node": "1.28.0",
-    "@opentelemetry/semantic-conventions": "1.28.0"
+    "@opentelemetry/semantic-conventions": "1.28.0",
+    "@zed-industries/claude-agent-acp": "^0.23.1",
+    "pi-acp": "0.0.29",
+    "sandbox-agent": "0.4.2"
   },
   "devDependencies": {
-    "tsx": "4.19.2",
-    "@types/node": "22.10.2"
+    "@types/node": "22.10.2",
+    "esbuild": "0.23.1",
+    "tsx": "4.19.2"
+  },
+  "pnpm": {
+    "onlyBuiltDependencies": [
+      "@sandbox-agent/cli-linux-x64",
+      "@sandbox-agent/cli-darwin-arm64",
+      "@sandbox-agent/cli-darwin-x64",
+      "@sandbox-agent/cli-linux-arm64",
+      "esbuild"
+    ]
   }
 }
diff --git a/services/agent/pnpm-lock.yaml b/services/agent/pnpm-lock.yaml
index eab8e5fb3a..7bd7134915 100644
--- a/services/agent/pnpm-lock.yaml
+++ b/services/agent/pnpm-lock.yaml
@@ -8,6 +8,9 @@ importers:
 
   .:
     dependencies:
+      '@daytonaio/sdk':
+        specifier: ^0.187.0
+        version: 0.187.0(ws@8.21.0)
       '@earendil-works/pi-coding-agent':
         specifier: 0.79.4
         version: 0.79.4(ws@8.21.0)(zod@4.4.3)
@@ -29,16 +32,49 @@ importers:
       '@opentelemetry/semantic-conventions':
         specifier: 1.28.0
         version: 1.28.0
+      '@zed-industries/claude-agent-acp':
+        specifier: ^0.23.1
+        version: 0.23.1
+      pi-acp:
+        specifier: 0.0.29
+        version: 0.0.29
+      sandbox-agent:
+        specifier: 0.4.2
+        version: 0.4.2(@daytonaio/sdk@0.187.0(ws@8.21.0))(zod@4.4.3)
     devDependencies:
       '@types/node':
         specifier: 22.10.2
         version: 22.10.2
+      esbuild:
+        specifier: 0.23.1
+        version: 0.23.1
       tsx:
         specifier: 4.19.2
         version: 4.19.2
 
 packages:
 
+  '@agentclientprotocol/sdk@0.16.1':
+    resolution: {integrity: sha512-1ad+Sc/0sCtZGHthxxvgEUo5Wsbw16I+aF+YwdiLnPwkZG8KAGUEAPK6LM6Pf69lCyJPt1Aomk1d+8oE3C4ZEw==}
+    peerDependencies:
+      zod: ^3.25.0 || ^4.0.0
+
+  '@agentclientprotocol/sdk@0.17.0':
+    resolution: {integrity: sha512-inBMYAEd9t4E+ULZK2os9kmLG5jbPvMLbPvY71XDDem1YteW/uDwkahg6OwsGR3tvvgVhYbRJ9mJCp2VXqG4xQ==}
+    peerDependencies:
+      zod: ^3.25.0 || ^4.0.0
+
+  '@agentclientprotocol/sdk@0.26.0':
+    resolution: {integrity: sha512-ialrcI+RzKOYe+fw+TfpyTdRmEoqIkXLlwbTi6XgaXXfdhNcdod7TmE1VsTnG3yTlox8TMTSMQgWbLLbz3r86Q==}
+    peerDependencies:
+      zod: ^3.25.0 || ^4.0.0
+
+  '@anthropic-ai/claude-agent-sdk@0.2.83':
+    resolution: {integrity: sha512-O8g56htGMxrwbjCbqUqRBMNC0O98B7SkPnfQC7vmo3w2DVnUrBj3qat/IBLB8SI4sjVSZHeJrcK7+ozsCzStSw==}
+    engines: {node: '>=18.0.0'}
+    peerDependencies:
+      zod: ^4.0.0
+
   '@anthropic-ai/sdk@0.91.1':
     resolution: {integrity: sha512-LAmu761tSN9r66ixvmciswUj/ZC+1Q4iAfpedTfSVLeswRwnY3n2Nb6Tsk+cLPP28aLOPWeMgIuTuCcMC6W/iw==}
     hasBin: true
@@ -52,6 +88,12 @@ packages:
     resolution: {integrity: sha512-nLbCWqQNgUiwwtFsen1AdzAtvuLRsQS8rYgMuxCrdKf9kOssamGLuPwyTY9wyYblNr9+1XM8v6zoDTPPSIeANg==}
     engines: {node: '>=16.0.0'}
 
+  '@aws-crypto/crc32c@5.2.0':
+    resolution: {integrity: sha512-+iWb8qaHLYKrNvGRbiYRHSdKRWhto5XlZUEBwDjYNf+ly5SVYG6zEoYIdxvf5R3zyeP16w4PLBn3rH1xc74Rag==}
+
+  '@aws-crypto/sha1-browser@5.2.0':
+    resolution: {integrity: sha512-OH6lveCFfcDjX4dbAvCFSYUjJZjDr/3XJ3xHtjn3Oj5b9RjojQo8npoLeA/bNwkOkrSQ0wgrHzXk4tDRxGKJeg==}
+
   '@aws-crypto/sha256-browser@5.2.0':
     resolution: {integrity: sha512-AXfN/lGotSQwu6HNcEsIASo7kWXZ5HYWvfOmSNKDsEqC4OashTp8alTmaz+F7TC2L083SFv5RdB+qU3Vs1kZqw==}
 
@@ -65,54 +107,112 @@ packages:
   '@aws-crypto/util@5.2.0':
     resolution: {integrity: sha512-4RkU9EsI6ZpBve5fseQlGNUWKMa1RLPQ1dnjnQoe07ldfIzcsGb5hC5W0Dm7u423KWzawlrpbjXBrXCEv9zazQ==}
 
+  '@aws-sdk/checksums@3.1000.6':
+    resolution: {integrity: sha512-RMCrCteiUwYTEv2G9zfP/BEuKHv57665vVieJyp9cf8VgilWxP/KrWVtMdfdDlIH8nFhvu3rIMc29z3ebGEZ1w==}
+    engines: {node: '>=20.0.0'}
+
   '@aws-sdk/client-bedrock-runtime@3.1048.0':
     resolution: {integrity: sha512-u+NT61JZEkRFtpL0CAw1N1dwxnaLgwVXQl/zjJxTGgLyS/jTIdg2SdoEoCTHxgDyCnqa1HEi9QOoE9/pYRNpOQ==}
     engines: {node: '>=20.0.0'}
 
+  '@aws-sdk/client-s3@3.1070.0':
+    resolution: {integrity: sha512-B/OUiCqGQ4Zr7v9gFFyiuitKN2c0PIgvOlQb5bYg1SM2y0F8a5JQ7FNsjRcl+d2PqYWLHwHx12CvZDyLn4KxIw==}
+    engines: {node: '>=20.0.0'}
+
   '@aws-sdk/core@3.974.20':
     resolution: {integrity: sha512-7sDi2B2N3mc3nf1nz6FyEx/FCrJ1N1QnBmraHHQNabFaeAh2IaOOLml48/rHOD1bICHgTRkbBgNTvUzEr5Z35g==}
     engines: {node: '>=20.0.0'}
 
+  '@aws-sdk/core@3.974.21':
+    resolution: {integrity: sha512-P5JAHvn4dTi96UsAGS67LVOqqpUNNRhnfFXqzCYtdBIGZtqBue4CXvRr9YenOO7PALj/Pn8uuyw53FBCiCYw8w==}
+    engines: {node: '>=20.0.0'}
+
   '@aws-sdk/credential-provider-env@3.972.46':
     resolution: {integrity: sha512-+GPXVS2srMOlH74S+SmC1gVuP2TvUZ0siuC0onKO93q+udP+M72dmY8wJfVQ5CX9z/9X5A1HHwz5yRIGBtskvQ==}
     engines: {node: '>=20.0.0'}
 
+  '@aws-sdk/credential-provider-env@3.972.47':
+    resolution: {integrity: sha512-3YoPwJczcc+MtX2xxXaYaOOWO6xKUJr1ZIIDIFuninr51BYONVVcF/CP8K2xfVRC/PztJjqKWxNGFH7BWQAw1Q==}
+    engines: {node: '>=20.0.0'}
+
   '@aws-sdk/credential-provider-http@3.972.48':
     resolution: {integrity: sha512-fA5loSdlocacRxyUXtpoHSMuk5rsIKRDzQYVMnMxjcmFeZshaJlJ8lymy/hYKji6sne/UmNGj5pxuEs6kq/Qcg==}
     engines: {node: '>=20.0.0'}
 
+  '@aws-sdk/credential-provider-http@3.972.49':
+    resolution: {integrity: sha512-2UtGUPy+x3lqyceHrtC1uEuVxBZbDalPF6KAFqBwYgm4edWdBrZKNnCqzDs7KynWUvEC6mrR+ojRk+ZgQz9C2w==}
+    engines: {node: '>=20.0.0'}
+
   '@aws-sdk/credential-provider-ini@3.972.53':
     resolution: {integrity: sha512-ZfdhIOR41q8TcWEnUac+gCOb+O2LBWdHLmjedXpXz4IEFW2ppNuFcm6p0sMTavpM+zD5TYfpH5Gp7guRyqSgsQ==}
     engines: {node: '>=20.0.0'}
 
+  '@aws-sdk/credential-provider-ini@3.972.54':
+    resolution: {integrity: sha512-Hx4gO4YRjFwitf3MVl3cDwYe1aryJthC4txVl9b+JAURovA50M2ywf9r8j1E/Q6SCTPT4qQpjOAbKYIC9CG+Vw==}
+    engines: {node: '>=20.0.0'}
+
   '@aws-sdk/credential-provider-login@3.972.52':
     resolution: {integrity: sha512-9hu2oR0qH7Fst5Tzdx+UWxm+w5zCXtErTLtOOW5hwwQc170CLwOeniRxyFY6s9mHfGEfC5zFukNBdKBwJR8mhQ==}
     engines: {node: '>=20.0.0'}
 
+  '@aws-sdk/credential-provider-login@3.972.53':
+    resolution: {integrity: sha512-+71sluhkgPqdhbbD3UDwUpj24GCkng9HQx6z7qoBFb8dwkF4ktpOcVKDeHpgg8PvBgLYwAnUYLTEGRC/PniCiQ==}
+    engines: {node: '>=20.0.0'}
+
   '@aws-sdk/credential-provider-node@3.972.55':
     resolution: {integrity: sha512-zMGLa/dhESVqmCD7mmIFFKSwSFrJGScvCXcjvBZEVOOMauFS5JRQvLTMukFpMEFWiV6dTAlsen2ATDBulLPtbg==}
     engines: {node: '>=20.0.0'}
 
+  '@aws-sdk/credential-provider-node@3.972.56':
+    resolution: {integrity: sha512-iI+4o0dvQQ4NHel4FMDiFy5q2gaU/ryLK3niOsoPccAt9WLFRkV4XTYPWRr9XvmBUqEzXG73S4p/8gm0Lu/W3A==}
+    engines: {node: '>=20.0.0'}
+
   '@aws-sdk/credential-provider-process@3.972.46':
     resolution: {integrity: sha512-VUoNFBIjWrUN8NbFiQiuxQEgFjvziAlBRPK+ddh27aj65gk0BYu6bLZnrdrNZwpW6vAihtSUtEMQ1PUJ32QRPA==}
     engines: {node: '>=20.0.0'}
 
+  '@aws-sdk/credential-provider-process@3.972.47':
+    resolution: {integrity: sha512-tAizPm9IFo/PHn06c+LQJlzfY2AGOlyF0CUljFejrU6LcZBjnk8pmbZK3/xoIDdnIzjEdbClfvY3mXfr818ZEg==}
+    engines: {node: '>=20.0.0'}
+
   '@aws-sdk/credential-provider-sso@3.972.52':
     resolution: {integrity: sha512-nb2/n4o/HQf+FVpVbZe9vCTFngmuDoIsltMgLAtjixaKzvzhB4J8WSDFyWgnErgLHk55ctWH+I4PU+LIHhyffg==}
     engines: {node: '>=20.0.0'}
 
+  '@aws-sdk/credential-provider-sso@3.972.53':
+    resolution: {integrity: sha512-pUXE3fu4tfEDV8BksIgf4dXvuIH10FhwHMl/wu8rBD5T1sMpryQWFVitH3kdPS90wlgrGYJQ/meQTSPacyZfeg==}
+    engines: {node: '>=20.0.0'}
+
   '@aws-sdk/credential-provider-web-identity@3.972.52':
     resolution: {integrity: sha512-lKj6aRSGbqLmpYmM24bY7a1Xmfcq2vkE3hv8CSPYfc1yCu0BPu/XEJ1L4Fm61MsU6ULLNSG8UGsffNoFUBjESA==}
     engines: {node: '>=20.0.0'}
 
+  '@aws-sdk/credential-provider-web-identity@3.972.53':
+    resolution: {integrity: sha512-JmMGlhVvSj8uSG9CpeDkJAXT35H89tc6v84iMgEIE75q4yp1MKVVKvopv6Gg28HJIR7hMNkojRF8H2m5W44wyg==}
+    engines: {node: '>=20.0.0'}
+
   '@aws-sdk/eventstream-handler-node@3.972.21':
     resolution: {integrity: sha512-mVC0hOmwGJmNFezZ+wM8Sqfap/LjsMavEf2Evl0YWrLAcrdZOEdjnY8nRvgakVViWJSGm2eJxLuPVHGdeV06kA==}
     engines: {node: '>=20.0.0'}
 
+  '@aws-sdk/lib-storage@3.1070.0':
+    resolution: {integrity: sha512-TMfkkBaLIlHhqt28wJp14EhATO9WbFwEheCi5K5gahYKQNWCUE4l4CmuWl1Wi8j0ZeVs/vCaSWxHv6DahrHOzQ==}
+    engines: {node: '>=20.0.0'}
+    peerDependencies:
+      '@aws-sdk/client-s3': ^3.1070.0
+
   '@aws-sdk/middleware-eventstream@3.972.17':
     resolution: {integrity: sha512-tdbnXbw73ww62ABWP0G0Z/euvFowEEvAoi/zG4NaZo7HJFpfGho/Z65HyVzkJLT1cMsUregr4pTyxljlarT0wA==}
     engines: {node: '>=20.0.0'}
 
+  '@aws-sdk/middleware-flexible-checksums@3.974.31':
+    resolution: {integrity: sha512-Yzj6NRYVZdBaCp7o1BwHGyeDBfixdeToLIAMprshIITEdl9wKVSiidVOfeaiH8FyeC1hBmBfDZFvs/aH1Y3xpw==}
+    engines: {node: '>=20.0.0'}
+
+  '@aws-sdk/middleware-sdk-s3@3.972.52':
+    resolution: {integrity: sha512-rerjP08onRqkBh0AcCqip6GkKvESapmLoTgi1xysZ4C6a1xMrIMtTBcEbUb6EY71oeajnigeUD4KwZjtIO+aWQ==}
+    engines: {node: '>=20.0.0'}
+
   '@aws-sdk/middleware-websocket@3.972.28':
     resolution: {integrity: sha512-SCW06Zjugn86pq7+dxGnFcyWJuEWHT753HTU/Vj/OzVxP+NoShwdAr4ynxAcvWL883OgRVbSqW3ohnjIxwXjjw==}
     engines: {node: '>= 14.0.0'}
@@ -121,10 +221,18 @@ packages:
     resolution: {integrity: sha512-IYJuLpXp2DEILVQpQOy0PMpkftv0AHEOCn52o0atyOaumA0CdWQ3klPyXdViGYLbNpESsVFMVybvHUeZAuiGxA==}
     engines: {node: '>=20.0.0'}
 
+  '@aws-sdk/nested-clients@3.997.21':
+    resolution: {integrity: sha512-eC7Vl7Qom/BGhZjG9GEqPwdQ/fk45hg1t5LP4EUxG5d1fdshLbaxCiwh/tszUzDX/4mW40mu2QsbeJJRPBbqUw==}
+    engines: {node: '>=20.0.0'}
+
   '@aws-sdk/signature-v4-multi-region@3.996.34':
     resolution: {integrity: sha512-mx1L5qlumSOt/nKM3BFaHE2HVkWwz0i4Bw0pyYO42FfX/FeLlo8YI6csC0gSPprEk6fTIqI+CZN9RwUwKd5krQ==}
     engines: {node: '>=20.0.0'}
 
+  '@aws-sdk/signature-v4-multi-region@3.996.35':
+    resolution: {integrity: sha512-6L/VWs+Wch2stHemCGTmUNqKLMzURxQDK5boNG3Jn3kAOp71meDUuS5sbObpEvFxHDq0uWeSLFDNSYsjNt+Dlg==}
+    engines: {node: '>=20.0.0'}
+
   '@aws-sdk/token-providers@3.1048.0':
     resolution: {integrity: sha512-k0y/GcuesuSfWyUM0WamrGyeZmltRYaPbHO82UDA6mZ/doB+FOHKutikPAtSXMn/hDz970cF+iRuuiYO9VEbAA==}
     engines: {node: '>=20.0.0'}
@@ -133,10 +241,18 @@ packages:
     resolution: {integrity: sha512-UqEUJq7dqa44hneLDUcX7UJy95cg8YqEWyakRpvIPnrNS3Mq+UlQHgCDGu5pvwAPtlIW4qcYbvW6reG6++FyvA==}
     engines: {node: '>=20.0.0'}
 
+  '@aws-sdk/token-providers@3.1069.0':
+    resolution: {integrity: sha512-ks4X+kngC3PA5howV7Qu1TgG4bfC4jPykKdvw3nmBSXR9yZxRJouBholFSNQ5kY3L+Fgwyw+LCjzQmNi+KR91g==}
+    engines: {node: '>=20.0.0'}
+
   '@aws-sdk/types@3.973.12':
     resolution: {integrity: sha512-43ajd1NF0RMgX5k0hxCNUyEdrtFUsb2aHT2QvpktSC/2Eyb2Jr/JPVqdp0XIoaHWikZJq5tNWSLO6kB5q2eMCA==}
     engines: {node: '>=20.0.0'}
 
+  '@aws-sdk/types@3.973.13':
+    resolution: {integrity: sha512-pEHZqRkAlHfnfAU9tK+WpKv/gBNjGJrHMgA3A0iYRGyswBS2t0pfez+lWlwktb3Bqa0ovh7w/QJTFwp3fDxLNg==}
+    engines: {node: '>=20.0.0'}
+
   '@aws-sdk/util-locate-window@3.965.7':
     resolution: {integrity: sha512-M0D6oIpohdNHjc7udzTHEQyot0+0iuA36jc2I9Hps+f/GtKi2HO/pyijQnCnNcwZqLB5+rtn81z3eZK/GyjAmA==}
     engines: {node: '>=20.0.0'}
@@ -145,6 +261,10 @@ packages:
     resolution: {integrity: sha512-fk0niuGFxfi8yIJuMVM4mhwObkiQSuwZFj3tAPrLVx64Pk3BkrEIpqjzHKY4hKoEBUD6Jg/S74Zj9jy+5F3DnQ==}
     engines: {node: '>=20.0.0'}
 
+  '@aws-sdk/xml-builder@3.972.30':
+    resolution: {integrity: sha512-StElZPEoBquWwNqw1AcfpzEyZqJvFxouG+mpDNYlcH6ZOrqd2CuIryv+8LV8gNHZUOyKyJF3Dq9vxaXEmDR9TQ==}
+    engines: {node: '>=20.0.0'}
+
   '@aws/lambda-invoke-store@0.2.4':
     resolution: {integrity: sha512-iY8yvjE0y651BixKNPgmv1WrQc+GZ142sb0z4gYnChDDY2YqI4P/jsSopBWrKfAt7LOJAkOXt7rC/hms+WclQQ==}
     engines: {node: '>=18.0.0'}
@@ -153,6 +273,16 @@ packages:
     resolution: {integrity: sha512-Nq8OhGWiZIZGV6hLHoyAKLLcJihP/xFeBMGJoUrxTX2psI8dCifzLhZISFb+VWS3wFMRDmCGw5R+dOySCqPLhw==}
     engines: {node: '>=6.9.0'}
 
+  '@daytona/api-client@0.187.0':
+    resolution: {integrity: sha512-riKOJ6eSuy67DL6iJlAa3Bfjnm4iQmkOdJk0B5hqrYMZeZmVDsgdiZtYvFpyoa+2KCZFNb0Gs5dQwO1d6NhGCw==}
+
+  '@daytona/toolbox-api-client@0.187.0':
+    resolution: {integrity: sha512-T5F+++cakH5Nl67fR53SLkEeTgayEmw5JFXhdMKRgk/mUf6IL30nHC/2kIbc4yK8Iol6YVo9vlG4cLk+4x8y1A==}
+
+  '@daytonaio/sdk@0.187.0':
+    resolution: {integrity: sha512-j6PfT6735Uu34t4JoxBi4IMh1JLNrEDg5w3ZUaT0Mgkas2UfoAAhQ2Eg1LqMhy4n1CTffvCyJID9W6Ldi4xEGQ==}
+    deprecated: 'Moved to @daytona/sdk, same API, no breaking changes. Please update: npm uninstall @daytonaio/sdk && npm i @daytona/sdk'
+
   '@earendil-works/pi-agent-core@0.79.4':
     resolution: {integrity: sha512-xkaZ3yK2XbP9HYdHrrdj/6HqZPM0o/mwbjMSU4RTJyR3HjDG0ZrPz76Hg6s0W+G4u6PpJr1mGx/srCG+3eQA8A==}
     engines: {node: '>=22.19.0'}
@@ -324,6 +454,124 @@ packages:
       '@modelcontextprotocol/sdk':
         optional: true
 
+  '@grpc/grpc-js@1.14.4':
+    resolution: {integrity: sha512-k9Dj3DV/itK9D06Y8f190Qgop7/Ui+D0njFV3LHMPwPT75DpXLQohE9Wmz0QElrJnzsjB7KPWiKJbOl7IPDArQ==}
+    engines: {node: '>=12.10.0'}
+
+  '@grpc/proto-loader@0.8.1':
+    resolution: {integrity: sha512-wtF6h+DY6M3YaDBPAmvuuA6jV8Sif9MjtOI5euKFWRgCDl5PeDpPsHR9u2l6St5ceY8AZgoNDww5+HvEsXFsGg==}
+    engines: {node: '>=6'}
+    hasBin: true
+
+  '@iarna/toml@2.2.5':
+    resolution: {integrity: sha512-trnsAYxU3xnS1gPHPyU961coFyLkh4gAD/0zQ5mymY4yOZ+CYvsPqUbOFSw0aDM4y0tV7tiFxL/1XfXPNC6IPg==}
+
+  '@img/sharp-darwin-arm64@0.34.5':
+    resolution: {integrity: sha512-imtQ3WMJXbMY4fxb/Ndp6HBTNVtWCUI0WdobyheGf5+ad6xX8VIDO8u2xE4qc/fr08CKG/7dDseFtn6M6g/r3w==}
+    engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0}
+    cpu: [arm64]
+    os: [darwin]
+
+  '@img/sharp-darwin-x64@0.34.5':
+    resolution: {integrity: sha512-YNEFAF/4KQ/PeW0N+r+aVVsoIY0/qxxikF2SWdp+NRkmMB7y9LBZAVqQ4yhGCm/H3H270OSykqmQMKLBhBJDEw==}
+    engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0}
+    cpu: [x64]
+    os: [darwin]
+
+  '@img/sharp-libvips-darwin-arm64@1.2.4':
+    resolution: {integrity: sha512-zqjjo7RatFfFoP0MkQ51jfuFZBnVE2pRiaydKJ1G/rHZvnsrHAOcQALIi9sA5co5xenQdTugCvtb1cuf78Vf4g==}
+    cpu: [arm64]
+    os: [darwin]
+
+  '@img/sharp-libvips-darwin-x64@1.2.4':
+    resolution: {integrity: sha512-1IOd5xfVhlGwX+zXv2N93k0yMONvUlANylbJw1eTah8K/Jtpi15KC+WSiaX/nBmbm2HxRM1gZ0nSdjSsrZbGKg==}
+    cpu: [x64]
+    os: [darwin]
+
+  '@img/sharp-libvips-linux-arm64@1.2.4':
+    resolution: {integrity: sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw==}
+    cpu: [arm64]
+    os: [linux]
+    libc: [glibc]
+
+  '@img/sharp-libvips-linux-arm@1.2.4':
+    resolution: {integrity: sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A==}
+    cpu: [arm]
+    os: [linux]
+    libc: [glibc]
+
+  '@img/sharp-libvips-linux-x64@1.2.4':
+    resolution: {integrity: sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw==}
+    cpu: [x64]
+    os: [linux]
+    libc: [glibc]
+
+  '@img/sharp-libvips-linuxmusl-arm64@1.2.4':
+    resolution: {integrity: sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw==}
+    cpu: [arm64]
+    os: [linux]
+    libc: [musl]
+
+  '@img/sharp-libvips-linuxmusl-x64@1.2.4':
+    resolution: {integrity: sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg==}
+    cpu: [x64]
+    os: [linux]
+    libc: [musl]
+
+  '@img/sharp-linux-arm64@0.34.5':
+    resolution: {integrity: sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg==}
+    engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0}
+    cpu: [arm64]
+    os: [linux]
+    libc: [glibc]
+
+  '@img/sharp-linux-arm@0.34.5':
+    resolution: {integrity: sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw==}
+    engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0}
+    cpu: [arm]
+    os: [linux]
+    libc: [glibc]
+
+  '@img/sharp-linux-x64@0.34.5':
+    resolution: {integrity: sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ==}
+    engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0}
+    cpu: [x64]
+    os: [linux]
+    libc: [glibc]
+
+  '@img/sharp-linuxmusl-arm64@0.34.5':
+    resolution: {integrity: sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg==}
+    engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0}
+    cpu: [arm64]
+    os: [linux]
+    libc: [musl]
+
+  '@img/sharp-linuxmusl-x64@0.34.5':
+    resolution: {integrity: sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q==}
+    engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0}
+    cpu: [x64]
+    os: [linux]
+    libc: [musl]
+
+  '@img/sharp-win32-arm64@0.34.5':
+    resolution: {integrity: sha512-WQ3AgWCWYSb2yt+IG8mnC6Jdk9Whs7O0gxphblsLvdhSpSTtmu69ZG1Gkb6NuvxsNACwiPV6cNSZNzt0KPsw7g==}
+    engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0}
+    cpu: [arm64]
+    os: [win32]
+
+  '@img/sharp-win32-x64@0.34.5':
+    resolution: {integrity: sha512-+29YMsqY2/9eFEiW93eqWnuLcWcufowXewwSNIT6UwZdUUCrM3oFjMWH/Z6/TMmb4hlFenmfAVbpWeup2jryCw==}
+    engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0}
+    cpu: [x64]
+    os: [win32]
+
+  '@isaacs/fs-minipass@4.0.1':
+    resolution: {integrity: sha512-wgm9Ehl2jpeqP3zw/7mo3kRHFp5MEDhqAdwy1fTGkHAwnkGOVsgpvQhL8B5n1qlb01jV3n/bI0ZfZp5lWA1k4w==}
+    engines: {node: '>=18.0.0'}
+
+  '@js-sdsl/ordered-map@4.4.2':
+    resolution: {integrity: sha512-iUKgm52T8HOE/makSxjqoWhe95ZJA1/G1sYsGev2JDKUSS14KAgg1LHb+Ba+IPow0xflbnSkOsZcO08C7w1gYw==}
+
   '@mariozechner/clipboard-darwin-arm64@0.3.9':
     resolution: {integrity: sha512-BfgV7vCEWZwJwZJw03r6bP5+tf0iI/ANuQYCxi9RNn7FrWB3yzGuMKCrNLRl6V761vXRdL8+OqZ0wd4TqlsNOQ==}
     engines: {node: '>= 10'}
@@ -398,6 +646,22 @@ packages:
   '@nodable/entities@2.2.0':
     resolution: {integrity: sha512-9uGyhaQavEUMC8AIddIjau4NsnsXhou+j5sBAGojCM1oxmQpVKTWR/9JxABD6UAv12vpIms55fPZKFQEhG6uBg==}
 
+  '@nodelib/fs.scandir@2.1.5':
+    resolution: {integrity: sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==}
+    engines: {node: '>= 8'}
+
+  '@nodelib/fs.stat@2.0.5':
+    resolution: {integrity: sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==}
+    engines: {node: '>= 8'}
+
+  '@nodelib/fs.walk@1.2.8':
+    resolution: {integrity: sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==}
+    engines: {node: '>= 8'}
+
+  '@opentelemetry/api-logs@0.217.0':
+    resolution: {integrity: sha512-Cdq0jW2lknrNfrAm92MyEAvpe2cRsKjdnQLHUL6xRA4IVUnsWx6P65E7NcUO0Y+L4w1Aee5iV8FvjSwd+lrs9A==}
+    engines: {node: '>=8.0.0'}
+
   '@opentelemetry/api-logs@0.54.0':
     resolution: {integrity: sha512-9HhEh5GqFrassUndqJsyW7a0PzfyWr2eV2xwzHLIS+wX3125+9HE9FMRAKmJRwxZhgZGwH3HNQQjoMGZqmOeVA==}
     engines: {node: '>=14'}
@@ -406,12 +670,24 @@ packages:
     resolution: {integrity: sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==}
     engines: {node: '>=8.0.0'}
 
+  '@opentelemetry/configuration@0.217.0':
+    resolution: {integrity: sha512-xCtrYOhBqdy6ZOMfe0Oa73ZKF+2LMhoOv4L5vmwAHVvOXUg+V3fvKuEIr9ZyD0Ow+vxllEjWO6PV1wd0DOtyvw==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': ^1.9.0
+
   '@opentelemetry/context-async-hooks@1.28.0':
     resolution: {integrity: sha512-igcl4Ve+F1N2063PJUkesk/GkYyuGIWinYkSyAFTnIj3gzrOgvOA4k747XNdL47HRRL1w/qh7UW8NDuxOLvKFA==}
     engines: {node: '>=14'}
     peerDependencies:
       '@opentelemetry/api': '>=1.0.0 <1.10.0'
 
+  '@opentelemetry/context-async-hooks@2.7.1':
+    resolution: {integrity: sha512-OPFBYuXEn1E4ja3Y6eeA7O+ZnLBNcXTV5Cgsn1VaqBZ6hC5FnpZPLBNme1LJY8ZtF4aOujPKFoeWN4ik487KuQ==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
   '@opentelemetry/core@1.27.0':
     resolution: {integrity: sha512-yQPKnK5e+76XuiqUH/gKyS8wv/7qITd5ln56QkBTf3uggr0VkXOXfcaAuG330UfdYu83wsyoBwqwxigpIG+Jkg==}
     engines: {node: '>=14'}
@@ -424,18 +700,126 @@ packages:
     peerDependencies:
       '@opentelemetry/api': '>=1.0.0 <1.10.0'
 
+  '@opentelemetry/core@2.7.1':
+    resolution: {integrity: sha512-QAqIj32AtK6+pEVNG7EOVxHdE06RP+FM5qpiEJ4RtDcFIqKUZHYhl7/7UY5efhwmwNAg7j8QbJVBLxMerc0+gw==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/core@2.8.0':
+    resolution: {integrity: sha512-hd1Lfh8p545nNz+jq1Ejfz+Mn1hyLuxYn1YzTfFNrxr8urEWMNQLPf1Th8kjOH+HxwawCrtgBp8JpBUR4ZSgww==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/exporter-logs-otlp-grpc@0.217.0':
+    resolution: {integrity: sha512-vC5S0Dc+noxD86CVtNu1+awCHPA5Kewi1Sg23ps+9lh4YifwsKXh3pe4XTNEKtUJiAcjpJ5dqStGakLbrSE+YQ==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
+  '@opentelemetry/exporter-logs-otlp-http@0.217.0':
+    resolution: {integrity: sha512-KfLAdt1uilVE+3FxbgVnp2ZrzqbIawzcesnRoi+Kh9ckB5Ld5D8btUgoBvwTbdmuNx1j6b132Wsh72azq+pPNQ==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
+  '@opentelemetry/exporter-logs-otlp-proto@0.217.0':
+    resolution: {integrity: sha512-Se0GG/ZO24mQTlQj7zprR4pNI0nKe4lPDPBsuJmi6508b9TlZEuUd3EfyuHk6oJxzL7fGyDFYAbxNigQvRP2ZQ==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
+  '@opentelemetry/exporter-metrics-otlp-grpc@0.217.0':
+    resolution: {integrity: sha512-0GpJKnCoVaVA1rKBMVPHziznfOQlXgH72S9ktjBAF1AnAVPzX7vVEBGrhwiSxxHDAiefXk+J8znApsMb/K6Z3w==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
+  '@opentelemetry/exporter-metrics-otlp-http@0.217.0':
+    resolution: {integrity: sha512-1zkMzzhiNJdVmLxuwkltqWGw4fOOam47bqRxmuQNjyKJe/9NmY5cIrZ4kiQV7sVGxoOgT0ZvGUfLcjvtpC/b9Q==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
+  '@opentelemetry/exporter-metrics-otlp-proto@0.217.0':
+    resolution: {integrity: sha512-nfxt/KxVGFkjkO/M+58y1ugHu/dwPtxG4eYq0KApcQ7xk5CHzhdn+IuLZfDSvNDrJ3Uy5q++Fj/wbK7i8yryfQ==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
+  '@opentelemetry/exporter-prometheus@0.217.0':
+    resolution: {integrity: sha512-U9MCXxJu0sBCh5aEkylYRR4xVIL8D1CW6dGwvYXbfFr0qveSorfD0XJchCAWoW6QfAAIcY/yxjf4Dj8OgkHBPw==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
+  '@opentelemetry/exporter-trace-otlp-grpc@0.217.0':
+    resolution: {integrity: sha512-fPZs2fw7veLH3pEKu8vSepUa2fQpAE2P7al6qU10aH9GrEJJ8YaPgsd5xON7by5rbcEVS71FOU2aWyK6nzB7VQ==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
+  '@opentelemetry/exporter-trace-otlp-http@0.217.0':
+    resolution: {integrity: sha512-38YQoqtYjglz2GV94LGUN/djLvxtvGIQO68o6qAFPVshjmwSdX1F2i0c7vn3lEl1L5B/YqjB/bgKXaVx7KO+RQ==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
+  '@opentelemetry/exporter-trace-otlp-proto@0.217.0':
+    resolution: {integrity: sha512-nPV8gKHUiSuTZpQcnZU3/pBlK7crSyEGpZuh5MtWySB0vv6NNG0QvvfKitQt+Fc2Mc6qfyU54KlZcurwoTbrVg==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
   '@opentelemetry/exporter-trace-otlp-proto@0.54.0':
     resolution: {integrity: sha512-cpDQj5wl7G8pLu3lW94SnMpn0C85A9Ehe7+JBow2IL5DGPWXTkynFngMtCC3PpQzQgzlyOVe0MVZfoBB3M5ECA==}
     engines: {node: '>=14'}
     peerDependencies:
       '@opentelemetry/api': ^1.3.0
 
+  '@opentelemetry/exporter-zipkin@2.7.1':
+    resolution: {integrity: sha512-mfsD9bKAxcKrh5+y08TPodvClBO0CznBE3p79YAGnO81WI4LrdsGA65T53e4iTSbCalW4WaUpkbeJcbpyIUHfg==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': ^1.0.0
+
+  '@opentelemetry/instrumentation-http@0.217.0':
+    resolution: {integrity: sha512-B88Y7k5A9a60pHUboFoeJlgVwXq2T0rsZKj6dTwzSMKSOsNXR4Jz5ovwprVn3kHLAZrkyLEjQtBJ34DYHs1U4Q==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
+  '@opentelemetry/instrumentation@0.217.0':
+    resolution: {integrity: sha512-24ucQMjz7Y34Kw3trbxL2ZrssbtgWnR+Clpaa+YdeWuuyH3Cvk23Q03PcQvqiZrDvt8AmQmjgg9v6Y9PHoxG7w==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
+  '@opentelemetry/otlp-exporter-base@0.217.0':
+    resolution: {integrity: sha512-eYfqnB3UhKu/5frhd1R6+FprKygbhkomuaceMXDyzxbfXB9tKgZOVmjaJ02CkLA6Tdzumxl+e2H+vo2a8jiMPQ==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
   '@opentelemetry/otlp-exporter-base@0.54.0':
     resolution: {integrity: sha512-g+H7+QleVF/9lz4zhaR9Dt4VwApjqG5WWupy5CTMpWJfHB/nLxBbX73GBZDgdiNfh08nO3rNa6AS7fK8OhgF5g==}
     engines: {node: '>=14'}
     peerDependencies:
       '@opentelemetry/api': ^1.3.0
 
+  '@opentelemetry/otlp-grpc-exporter-base@0.217.0':
+    resolution: {integrity: sha512-7RTAdZuOsCDnsyqTCG4+bDzrfnsWdzkRs7z0AVi/V3tEQx0oKeyc+OuRWYxnRsmaJXgxcmB8vb/lfxn58Dj6Ag==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
+  '@opentelemetry/otlp-transformer@0.217.0':
+    resolution: {integrity: sha512-MKK8UHKFUOGAvbZRWh90MhwHG+Fxm6OROBdjKPCF+HQobjuJ/Kuf8Chs8CR45X1aqotxrMj7OxTdsXe8sXuGVA==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': ^1.3.0
+
   '@opentelemetry/otlp-transformer@0.54.0':
     resolution: {integrity: sha512-jRexIASQQzdK4AjfNIBfn94itAq4Q8EXR9d3b/OVbhd3kKQKvMr7GkxYDjbeTbY7hHCOLcLfJ3dpYQYGOe8qOQ==}
     engines: {node: '>=14'}
@@ -448,12 +832,24 @@ packages:
     peerDependencies:
       '@opentelemetry/api': '>=1.0.0 <1.10.0'
 
+  '@opentelemetry/propagator-b3@2.7.1':
+    resolution: {integrity: sha512-RJid6E2CKyeGfKBzXKF21ejabGMHypFkPAh3qZ+NvI+SGjuIye79t3PmiqcDgtRzdKH6ynXzbfslQ8DfpRUg2A==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
   '@opentelemetry/propagator-jaeger@1.28.0':
     resolution: {integrity: sha512-wKJ94+s8467CnIRgoSRh0yXm/te0QMOwTq9J01PfG/RzYZvlvN8aRisN2oZ9SznB45dDGnMj3BhUlchSA9cEKA==}
     engines: {node: '>=14'}
     peerDependencies:
       '@opentelemetry/api': '>=1.0.0 <1.10.0'
 
+  '@opentelemetry/propagator-jaeger@2.7.1':
+    resolution: {integrity: sha512-KMjVBHzP4N60bOzxja76M1F1hZZ43lGPga5ix+mkv9+kk1nx9SbkxSvJsMbuVUxdPQmsPTqGShmhN8ulrMOg6Q==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
   '@opentelemetry/resources@1.27.0':
     resolution: {integrity: sha512-jOwt2VJ/lUD5BLc+PMNymDrUCpm5PKi1E9oSVYAvz01U/VdndGmrtV3DU1pG4AwlYhJRHbHfOUIlpBeXCPw6QQ==}
     engines: {node: '>=14'}
@@ -466,6 +862,24 @@ packages:
     peerDependencies:
       '@opentelemetry/api': '>=1.0.0 <1.10.0'
 
+  '@opentelemetry/resources@2.7.1':
+    resolution: {integrity: sha512-DeT6KKolmC4e/dRQvMQ/RwlnzhaqeiFOXY5ngoOPJ07GgVVKxZOg9EcrNZb5aTzUn+iCrJldAgOfQm1O/QfPAQ==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.3.0 <1.10.0'
+
+  '@opentelemetry/resources@2.8.0':
+    resolution: {integrity: sha512-qmXQ27ilDbUK/vGMqwL8D4/rhn76C+sherM4wTbjlfknR8Nvfc/hCxjRJPhkzZzUsPiNg16SA31NxMabwttRjg==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.3.0 <1.10.0'
+
+  '@opentelemetry/sdk-logs@0.217.0':
+    resolution: {integrity: sha512-BB+PcHItcZDL63dPMW+mJvwN9rk37wuIDjRxbVlg6pPDvDR/7GL7UJHbGsllgoggOoTimsKgENaWPoGch/oE1A==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.4.0 <1.10.0'
+
   '@opentelemetry/sdk-logs@0.54.0':
     resolution: {integrity: sha512-HeWvOPiWhEw6lWvg+lCIi1WhJnIPbI4/OFZgHq9tKfpwF3LX6/kk3+GR8sGUGAEZfbjPElkkngzvd2s03zbD7Q==}
     engines: {node: '>=14'}
@@ -478,6 +892,18 @@ packages:
     peerDependencies:
       '@opentelemetry/api': '>=1.3.0 <1.10.0'
 
+  '@opentelemetry/sdk-metrics@2.7.1':
+    resolution: {integrity: sha512-MpDJdkiFDs3Pm1RHO3KByuZbuBdJEXEAkiC0+yJdsZGVCdf1RpHR6n+LHDcS7ffmfrt5kVCzJSCfm4z2C7v0uQ==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.9.0 <1.10.0'
+
+  '@opentelemetry/sdk-node@0.217.0':
+    resolution: {integrity: sha512-K/60pSv42+NQiZKy1pAH18nYDkxltsDV4O3SJ233J0E9raU1ksyL9gsKuS8p30bYBb4AMPCfDuutHQaHYpcv0Q==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.3.0 <1.10.0'
+
   '@opentelemetry/sdk-trace-base@1.27.0':
     resolution: {integrity: sha512-btz6XTQzwsyJjombpeqCX6LhiMQYpzt2pIYNPnw0IPO/3AhT6yjnf8Mnv3ZC2A4eRYOjqrg+bfaXg9XHDRJDWQ==}
     engines: {node: '>=14'}
@@ -490,12 +916,30 @@ packages:
     peerDependencies:
       '@opentelemetry/api': '>=1.0.0 <1.10.0'
 
+  '@opentelemetry/sdk-trace-base@2.7.1':
+    resolution: {integrity: sha512-NAYIlsF8MPUsKqJMiDQJTMPOmlbawC1Iz/omMLygZ1C9am8fTKYjTaI+OZM+WTY3t3Glo0wnOg/6/pac6RGPPw==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.3.0 <1.10.0'
+
+  '@opentelemetry/sdk-trace-base@2.8.0':
+    resolution: {integrity: sha512-mhU4jp+vW0mGbFRd+GeXHvmfA4aDqWjBjLC3pE5XMpLs0IE2ryYb019Ts2AQrOq67gaTF25D91+fgvEHDZEnuQ==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.3.0 <1.10.0'
+
   '@opentelemetry/sdk-trace-node@1.28.0':
     resolution: {integrity: sha512-N0sYfYXvHpP0FNIyc+UfhLnLSTOuZLytV0qQVrDWIlABeD/DWJIGttS7nYeR14gQLXch0M1DW8zm3VeN6Opwtg==}
     engines: {node: '>=14'}
     peerDependencies:
       '@opentelemetry/api': '>=1.0.0 <1.10.0'
 
+  '@opentelemetry/sdk-trace-node@2.7.1':
+    resolution: {integrity: sha512-pCpQxU68lV+I9s9svqMyVu5iHdDDUnqUpSxqwyCU8A9ejEsSnMPCbearwsUO4yk08ZJzAIUCFuReMdVQvHrdvg==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
   '@opentelemetry/semantic-conventions@1.27.0':
     resolution: {integrity: sha512-sAay1RrB+ONOem0OZanAR1ZI/k7yDpnOQSQmTMuGImUQb2y8EbSaCJ94FQluM74xoU03vlb2d2U90hZluL6nQg==}
     engines: {node: '>=14'}
@@ -504,6 +948,10 @@ packages:
     resolution: {integrity: sha512-lp4qAiMTD4sNWW4DbKLBkfiMZ4jbAboJIGOQr5DvciMRI494OapieI9qiODpOt0XBr1LjIDy1xAGAnVs5supTA==}
     engines: {node: '>=14'}
 
+  '@opentelemetry/semantic-conventions@1.41.1':
+    resolution: {integrity: sha512-/UhIkaZgPutTFmQ7RnIJGgDXZmtEJ7Dvi86xNTFWcnRxVRNk/aotsqDJYeEvDP+FSMB2SdW+pQzNMcWP0rwuNA==}
+    engines: {node: '>=14'}
+
   '@protobufjs/aspromise@1.1.2':
     resolution: {integrity: sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==}
 
@@ -522,6 +970,9 @@ packages:
   '@protobufjs/float@1.0.2':
     resolution: {integrity: sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==}
 
+  '@protobufjs/inquire@1.1.2':
+    resolution: {integrity: sha512-pa0vFRuws4wkvaXKK1uXZMAwAX4/t8ANaJo45iw/oQHNQ9q5xUzwgFmVJGXiga2BeN+zpX7Vf9vmsiIa2J+MUw==}
+
   '@protobufjs/path@1.1.2':
     resolution: {integrity: sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==}
 
@@ -531,6 +982,38 @@ packages:
   '@protobufjs/utf8@1.1.1':
     resolution: {integrity: sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg==}
 
+  '@sandbox-agent/cli-darwin-arm64@0.4.2':
+    resolution: {integrity: sha512-+L1O8SI7k/LLhyB4dG0ghmz1cJHa0WtVjuRTrEE2gw/5EbGLWopPBsCVCmQ7snrQ4fPwtaiZDhfExcEj1VI7aw==}
+    cpu: [arm64]
+    os: [darwin]
+
+  '@sandbox-agent/cli-darwin-x64@0.4.2':
+    resolution: {integrity: sha512-dDg/EwWsdgVVbJiiCX1scSNRRA48u77SsC7Tuqrfzx4fIJMLuLiIcmEtXQyCBWysSyQNV2Cr+PYXXQfCb3xg8g==}
+    cpu: [x64]
+    os: [darwin]
+
+  '@sandbox-agent/cli-linux-arm64@0.4.2':
+    resolution: {integrity: sha512-TGmTUexMoubmWQyTeaOJu0rDVl2h0Ifh1pZ0ceZy7u/6Eoqs2n46CbfQtasUxZJf10uxPgRyzEDhcdDrTYVQUA==}
+    cpu: [arm64]
+    os: [linux]
+
+  '@sandbox-agent/cli-linux-x64@0.4.2':
+    resolution: {integrity: sha512-H9Rbqq0DRkCHvakzefJUDrDa2y+vJjlYd5/tefzKbQ34locE13TGNygRLxdEVXpBECjK9wVdBwTVEphQNsOcjw==}
+    cpu: [x64]
+    os: [linux]
+
+  '@sandbox-agent/cli-shared@0.4.2':
+    resolution: {integrity: sha512-sjZXRkKeFXCSKR6hHzF2Af8CCRO3F3WFwVQJ22+sLTXJ2xskV8lkUE4egknQU9B5BC1Zumts/YiNCFQWG85awQ==}
+
+  '@sandbox-agent/cli-win32-x64@0.4.2':
+    resolution: {integrity: sha512-lZNfHWPwQe/VH51Yvrl/ATCUvBZ3a+c8mwovojhQcmZlv4QuUQPkuvxhPqHRh9AyBx78L5J/ha46es2doa34nQ==}
+    cpu: [x64]
+    os: [win32]
+
+  '@sandbox-agent/cli@0.4.2':
+    resolution: {integrity: sha512-trO//ypJBSt5xkewuol9LOykvDgHwUXq8R+yQVS+0CmpN3lYUtewHkb+At9RVGRhDMmJZY2oasaXDnhfurQ33w==}
+    hasBin: true
+
   '@silvia-odwyer/photon-node@0.3.4':
     resolution: {integrity: sha512-bnly4BKB3KDTFxrUIcgCLbaeVVS8lrAkri1pEzskpmxu9MdfGQTy8b8EgcD83ywD3RPMsIulY8xJH5Awa+t9fA==}
 
@@ -580,13 +1063,49 @@ packages:
   '@types/retry@0.12.0':
     resolution: {integrity: sha512-wWKOClTTiizcZhXnPY4wikVAwmdYHp8q6DmC+EJUzAMsycb7HB32Kh9RN4+0gExjmPmZSAQjgURXIGATPegAvA==}
 
+  '@zed-industries/claude-agent-acp@0.23.1':
+    resolution: {integrity: sha512-aQ1gAm1MBalwEgE/VB/m4z6sXw/fRccNOW268pNLXnWV704ZuLbbm0N+oEv8KTmd53dJ6YzMhMpD8p5ig6C+sA==}
+    deprecated: This package has been renamed to @agentclientprotocol/claude-agent-acp. Please migrate to continue receiving updates.
+    hasBin: true
+
+  acorn-import-attributes@1.9.5:
+    resolution: {integrity: sha512-n02Vykv5uA3eHGM/Z2dQrcD56kL8TyDb2p1+0P83PClMnC/nc+anbQRhIOWnSq4Ke/KvDPrY3C9hDtC/A3eHnQ==}
+    peerDependencies:
+      acorn: ^8
+
+  acorn@8.17.0:
+    resolution: {integrity: sha512-xRQbDb9BnwDafYNn6Vwl839DYVjqXYb1XVGtWAZ1kcDc6iwAL4hg3B1dZlRiuENFeO2H53gFG3in621AdERVAg==}
+    engines: {node: '>=0.4.0'}
+    hasBin: true
+
+  acp-http-client@0.4.2:
+    resolution: {integrity: sha512-3wtPieF08YIU4vNXaoL5up/1D0if4i9IX3Ye5q/bwbcwg1BKsazIK/VNNfvN4ldbPjWul69IqIOpGRS3I0qo3Q==}
+
+  agent-base@6.0.2:
+    resolution: {integrity: sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ==}
+    engines: {node: '>= 6.0.0'}
+
   agent-base@7.1.4:
     resolution: {integrity: sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==}
     engines: {node: '>= 14'}
 
+  ansi-regex@5.0.1:
+    resolution: {integrity: sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==}
+    engines: {node: '>=8'}
+
+  ansi-styles@4.3.0:
+    resolution: {integrity: sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==}
+    engines: {node: '>=8'}
+
   anynum@1.0.0:
     resolution: {integrity: sha512-xjR9/zBVnUOP6ztMIIgShjsxui80nQUQH+5xJnvrYLs+90bF25/KJqaAi8mk+B4RDtX1Nspi6fmp4YTEts8SfA==}
 
+  asynckit@0.4.0:
+    resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==}
+
+  axios@1.18.0:
+    resolution: {integrity: sha512-E32NzpYKp++W7XRe52rHiXV2ehxmh3wbdgO7MHeFM+vqxLBYHzt0ElkiImtOBxtOmyp0yoC8C6uESVV84Y2/hw==}
+
   balanced-match@4.0.4:
     resolution: {integrity: sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA==}
     engines: {node: 18 || 20 || >=22}
@@ -604,13 +1123,50 @@ packages:
     resolution: {integrity: sha512-kLpxurY4Z4r9sgMsyG0Z9uzsBlgiU/EFKhj/h91/8yHu0edo7XuixOIH3VcJ8kkxs6/jPzoI6U9Vj3WqbMQ94g==}
     engines: {node: 18 || 20 || >=22}
 
+  braces@3.0.3:
+    resolution: {integrity: sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==}
+    engines: {node: '>=8'}
+
   buffer-equal-constant-time@1.0.1:
     resolution: {integrity: sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==}
 
+  buffer@5.6.0:
+    resolution: {integrity: sha512-/gDYp/UtU0eA1ys8bOs9J6a+E/KWIY+DZ+Q2WESNUA0jFRsJOc0SNUO6xJ5SGA1xueg3NL65W6s+NY5l9cunuw==}
+
+  busboy@1.6.0:
+    resolution: {integrity: sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA==}
+    engines: {node: '>=10.16.0'}
+
+  call-bind-apply-helpers@1.0.2:
+    resolution: {integrity: sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==}
+    engines: {node: '>= 0.4'}
+
   chalk@5.6.2:
     resolution: {integrity: sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA==}
     engines: {node: ^12.17.0 || ^14.13 || >=16.0.0}
 
+  chownr@3.0.0:
+    resolution: {integrity: sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==}
+    engines: {node: '>=18'}
+
+  cjs-module-lexer@2.2.0:
+    resolution: {integrity: sha512-4bHTS2YuzUvtoLjdy+98ykbNB5jS0+07EvFNXerqZQJ89F7DI6ET7OQo/HJuW6K0aVsKA9hj9/RVb2kQVOrPDQ==}
+
+  cliui@8.0.1:
+    resolution: {integrity: sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==}
+    engines: {node: '>=12'}
+
+  color-convert@2.0.1:
+    resolution: {integrity: sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==}
+    engines: {node: '>=7.0.0'}
+
+  color-name@1.1.4:
+    resolution: {integrity: sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==}
+
+  combined-stream@1.0.8:
+    resolution: {integrity: sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==}
+    engines: {node: '>= 0.8'}
+
   cross-spawn@7.0.6:
     resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==}
     engines: {node: '>= 8'}
@@ -628,21 +1184,68 @@ packages:
       supports-color:
         optional: true
 
+  delayed-stream@1.0.0:
+    resolution: {integrity: sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==}
+    engines: {node: '>=0.4.0'}
+
   diff@8.0.4:
     resolution: {integrity: sha512-DPi0FmjiSU5EvQV0++GFDOJ9ASQUVFh5kD+OzOnYdi7n3Wpm9hWWGfB/O2blfHcMVTL5WkQXSnRiK9makhrcnw==}
     engines: {node: '>=0.3.1'}
 
+  dotenv@17.4.2:
+    resolution: {integrity: sha512-nI4U3TottKAcAD9LLud4Cb7b2QztQMUEfHbvhTH09bqXTxnSie8WnjPALV/WMCrJZ6UV/qHJ6L03OqO3LcdYZw==}
+    engines: {node: '>=12'}
+
+  dunder-proto@1.0.1:
+    resolution: {integrity: sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==}
+    engines: {node: '>= 0.4'}
+
   ecdsa-sig-formatter@1.0.11:
     resolution: {integrity: sha512-nagl3RYrbNv6kQkeJIpt6NJZy8twLB/2vtz6yN9Z4vRKHN4/QZJIEbqohALSgwKdnksuY3k5Addp5lg8sVoVcQ==}
 
+  emoji-regex@8.0.0:
+    resolution: {integrity: sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==}
+
+  es-define-property@1.0.1:
+    resolution: {integrity: sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==}
+    engines: {node: '>= 0.4'}
+
+  es-errors@1.3.0:
+    resolution: {integrity: sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==}
+    engines: {node: '>= 0.4'}
+
+  es-object-atoms@1.1.2:
+    resolution: {integrity: sha512-HWcBoN6NileqtSydK2FqHbS/LoDd2pqrnQHLyJzBj4kOp/ky2MWMN694xOfkK8/SnUsW2DH7EfyVlydKCsm1Zw==}
+    engines: {node: '>= 0.4'}
+
+  es-set-tostringtag@2.1.0:
+    resolution: {integrity: sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==}
+    engines: {node: '>= 0.4'}
+
   esbuild@0.23.1:
     resolution: {integrity: sha512-VVNz/9Sa0bs5SELtn3f7qhJCDPCF5oMEl5cO9/SSinpE9hbPVvxbd572HH5AKiP7WD8INO53GgfDDhRjkylHEg==}
     engines: {node: '>=18'}
     hasBin: true
 
+  escalade@3.2.0:
+    resolution: {integrity: sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==}
+    engines: {node: '>=6'}
+
+  events@3.3.0:
+    resolution: {integrity: sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q==}
+    engines: {node: '>=0.8.x'}
+
+  expand-tilde@2.0.2:
+    resolution: {integrity: sha512-A5EmesHW6rfnZ9ysHQjPdJRni0SRar0tjtG5MNtm9n5TUvsYU8oozprtRD4AqHxcZWWlVuAmQo2nWKfN9oyjTw==}
+    engines: {node: '>=0.10.0'}
+
   extend@3.0.2:
     resolution: {integrity: sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==}
 
+  fast-glob@3.3.3:
+    resolution: {integrity: sha512-7MptL8U0cqcFdzIzwOTHoilX9x5BrNqye7Z/LuC7kCMRio1EMSyqRK3BEAUD7sXRq4iT4AzTVuZdhgQ2TCvYLg==}
+    engines: {node: '>=8.6.0'}
+
   fast-xml-builder@1.2.0:
     resolution: {integrity: sha512-00aAWieqff+ZJhsXA4g1g7M8k+7AYoMUUHF+/zFb5U6Uv/P0Vl4QZo84/IcufzYalLuEj9928bXN9PbbFzMF0Q==}
 
@@ -650,19 +1253,45 @@ packages:
     resolution: {integrity: sha512-C0AaNuC+mscy6vrAQKAc/rMq+zAPHodfHGZu4sGVehvAQt/JLG1O5zEcYcXSY5zSqr4YVgxsB+pHXTq0i7eDlg==}
     hasBin: true
 
+  fastq@1.20.1:
+    resolution: {integrity: sha512-GGToxJ/w1x32s/D2EKND7kTil4n8OVk/9mycTc4VDza13lOvpUZTGX3mFSCtV9ksdGBVzvsyAVLM6mHFThxXxw==}
+
   fetch-blob@3.2.0:
     resolution: {integrity: sha512-7yAQpD2UMJzLi1Dqv7qFYnPbaPx7ZfFK6PiIxQ4PfkGPyNyl2Ugx+a/umUonmKqjhM4DnfbMvdX6otXq83soQQ==}
     engines: {node: ^12.20 || >= 14.13}
 
+  fill-range@7.1.1:
+    resolution: {integrity: sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==}
+    engines: {node: '>=8'}
+
+  follow-redirects@1.16.0:
+    resolution: {integrity: sha512-y5rN/uOsadFT/JfYwhxRS5R7Qce+g3zG97+JrtFZlC9klX/W5hD7iiLzScI4nZqUS7DNUdhPgw4xI8W2LuXlUw==}
+    engines: {node: '>=4.0'}
+    peerDependencies:
+      debug: '*'
+    peerDependenciesMeta:
+      debug:
+        optional: true
+
+  form-data@4.0.6:
+    resolution: {integrity: sha512-vKatAh4SlVfgbv+YtmhiRjhEMJsYpsG1Y2rMQtR+SVSbytsSD1YGzDIcrAJmdFec88u/+VoGmxnl+80gL1tRCQ==}
+    engines: {node: '>= 6'}
+
   formdata-polyfill@4.0.10:
     resolution: {integrity: sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g==}
     engines: {node: '>=12.20.0'}
 
+  forwarded-parse@2.1.2:
+    resolution: {integrity: sha512-alTFZZQDKMporBH77856pXgzhEzaUVmLCDk+egLgIgHst3Tpndzz8MnKe+GzRJRfvVdn69HhpW7cmXzvtLvJAw==}
+
   fsevents@2.3.3:
     resolution: {integrity: sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==}
     engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0}
     os: [darwin]
 
+  function-bind@1.1.2:
+    resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==}
+
   gaxios@7.1.5:
     resolution: {integrity: sha512-5FZy72Rh8LhtjmvDrKkI+lVhrsQrVKVsItxMoDm5mNQE+xR0WVIIs+jzPSJgBvKVsLi24fZhXJIsNI0bihDzFg==}
     engines: {node: '>=18'}
@@ -671,13 +1300,29 @@ packages:
     resolution: {integrity: sha512-zV/5HKTfCeKWnxG0Dmrw51hEWFGfcF2xiXqcA3+J90WDuP0SvoiSO5ORvcBsifmx/FoIjgQN3oNOGaQ5PhLFkg==}
     engines: {node: '>=18'}
 
+  get-caller-file@2.0.5:
+    resolution: {integrity: sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==}
+    engines: {node: 6.* || 8.* || >= 10.*}
+
   get-east-asian-width@1.6.0:
     resolution: {integrity: sha512-QRbvDIbx6YklUe6RxeTeleMR0yv3cYH6PsPZHcnVn7xv7zO1BHN8r0XETu8n6Ye3Q+ahtSarc3WgtNWmehIBfA==}
     engines: {node: '>=18'}
 
+  get-intrinsic@1.3.0:
+    resolution: {integrity: sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==}
+    engines: {node: '>= 0.4'}
+
+  get-proto@1.0.1:
+    resolution: {integrity: sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==}
+    engines: {node: '>= 0.4'}
+
   get-tsconfig@4.14.0:
     resolution: {integrity: sha512-yTb+8DXzDREzgvYmh6s9vHsSVCHeC0G3PI5bEXNBHtmshPnO+S5O7qgLEOn0I5QvMy6kpZN8K1NKGyilLb93wA==}
 
+  glob-parent@5.1.2:
+    resolution: {integrity: sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==}
+    engines: {node: '>= 6'}
+
   glob@13.0.6:
     resolution: {integrity: sha512-Wjlyrolmm8uDpm/ogGyXZXb1Z+Ca2B8NbJwqBVg0axK9GbBeoS7yGV6vjXnYdGm6X53iehEuxxbyiKp8QmN4Vw==}
     engines: {node: 18 || 20 || >=22}
@@ -690,12 +1335,32 @@ packages:
     resolution: {integrity: sha512-eAmLkjDjAFCVXg7A1unxHsLf961m6y17QFqXqAXGj/gVkKFrEICfStRfwUlGNfeCEjNRa32JEWOUTlYXPyyKvA==}
     engines: {node: '>=14'}
 
+  gopd@1.2.0:
+    resolution: {integrity: sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==}
+    engines: {node: '>= 0.4'}
+
   graceful-fs@4.2.11:
     resolution: {integrity: sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==}
 
+  has-symbols@1.1.0:
+    resolution: {integrity: sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==}
+    engines: {node: '>= 0.4'}
+
+  has-tostringtag@1.0.2:
+    resolution: {integrity: sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==}
+    engines: {node: '>= 0.4'}
+
+  hasown@2.0.4:
+    resolution: {integrity: sha512-T2UbfbBEF32wiepXIsMlTW9+dDYC6wMh/t/vYA4tuOMKqWz/n3vr1NFSxQiyP+zk2mXsoMA/i/7qV6LKut1t1A==}
+    engines: {node: '>= 0.4'}
+
   highlight.js@10.7.3:
     resolution: {integrity: sha512-tzcUFauisWKNHaRkN4Wjl/ZA07gENAjFl3J/c480dprkGTg5EQstgaNFqBfUqCq54kZRIEcreTsAgF/m2quD7A==}
 
+  homedir-polyfill@1.0.3:
+    resolution: {integrity: sha512-eSmmWE5bZTK2Nou4g0AI3zZ9rswp7GRKoKXS1BLUkvPviOqs4YTN1djQIqrXy9k5gEtdLPy86JjRwsNM9tnDcA==}
+    engines: {node: '>=0.10.0'}
+
   hosted-git-info@9.0.3:
     resolution: {integrity: sha512-Hc+ghLoSt6QaYZUv0WBiIvmMDZuZZ7oaDvdH8MbfOO4lOsxdXLEvuC6ePoGs9H1X9oCLyq6+NVN0MKqD+ydxyg==}
     engines: {node: ^20.17.0 || >=22.9.0}
@@ -704,17 +1369,52 @@ packages:
     resolution: {integrity: sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==}
     engines: {node: '>= 14'}
 
+  https-proxy-agent@5.0.1:
+    resolution: {integrity: sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA==}
+    engines: {node: '>= 6'}
+
   https-proxy-agent@7.0.6:
     resolution: {integrity: sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==}
     engines: {node: '>= 14'}
 
+  ieee754@1.2.1:
+    resolution: {integrity: sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==}
+
   ignore@7.0.5:
     resolution: {integrity: sha512-Hs59xBNfUIunMFgWAbGX5cq6893IbWg4KnrjbYwX3tx0ztorVgTDA6B2sxf8ejHJ4wz8BqGUMYlnzNBer5NvGg==}
     engines: {node: '>= 4'}
 
+  import-in-the-middle@3.0.2:
+    resolution: {integrity: sha512-LGLYRl0A2gtyUJb2WDliBHmk6TtlHwdDjxonacZ8QrEs/ZW+YDgNv2QAfjRQWpS8HqvNcq6GGnN6jrOa5FysDQ==}
+    engines: {node: '>=18'}
+
+  inherits@2.0.4:
+    resolution: {integrity: sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==}
+
+  is-extglob@2.1.1:
+    resolution: {integrity: sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==}
+    engines: {node: '>=0.10.0'}
+
+  is-fullwidth-code-point@3.0.0:
+    resolution: {integrity: sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==}
+    engines: {node: '>=8'}
+
+  is-glob@4.0.3:
+    resolution: {integrity: sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==}
+    engines: {node: '>=0.10.0'}
+
+  is-number@7.0.0:
+    resolution: {integrity: sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==}
+    engines: {node: '>=0.12.0'}
+
   isexe@2.0.0:
     resolution: {integrity: sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==}
 
+  isomorphic-ws@5.0.0:
+    resolution: {integrity: sha512-muId7Zzn9ywDsyXgTIafTry2sV3nySZeUDe6YedVd1Hvuuep5AsIlqK+XefWpYTyJG5e503F2xIuT2lcU6rCSw==}
+    peerDependencies:
+      ws: '*'
+
   jiti@2.7.0:
     resolution: {integrity: sha512-AC/7JofJvZGrrneWNaEnJeOLUx+JlGt7tNa0wZiRPT4MY1wmfKjt2+6O2p2uz2+skll8OZZmJMNqeke7kKbNgQ==}
     hasBin: true
@@ -732,6 +1432,9 @@ packages:
   jws@4.0.1:
     resolution: {integrity: sha512-EKI/M/yqPncGUUh44xz0PxSidXFr/+r0pA70+gIYhjv+et7yxM+s29Y+VGDkovRofQem0fs7Uvf4+YmAdyRduA==}
 
+  lodash.camelcase@4.3.0:
+    resolution: {integrity: sha512-TwuEnCnxbc3rAvhf/LbG7tJUDzhqXyFnv3dtzLOPgCG/hODL7WFnsbwktkD7yUV0RrreP/l1PALq/YSg6VvjlA==}
+
   long@5.3.2:
     resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==}
 
@@ -744,6 +1447,26 @@ packages:
     engines: {node: '>= 18'}
     hasBin: true
 
+  math-intrinsics@1.1.0:
+    resolution: {integrity: sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==}
+    engines: {node: '>= 0.4'}
+
+  merge2@1.4.1:
+    resolution: {integrity: sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==}
+    engines: {node: '>= 8'}
+
+  micromatch@4.0.8:
+    resolution: {integrity: sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==}
+    engines: {node: '>=8.6'}
+
+  mime-db@1.52.0:
+    resolution: {integrity: sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==}
+    engines: {node: '>= 0.6'}
+
+  mime-types@2.1.35:
+    resolution: {integrity: sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==}
+    engines: {node: '>= 0.6'}
+
   minimatch@10.2.5:
     resolution: {integrity: sha512-MULkVLfKGYDFYejP07QOurDLLQpcjk7Fw+7jXS2R2czRQzR56yHRveU5NDJEOviH+hETZKSkIk5c+T23GjFUMg==}
     engines: {node: 18 || 20 || >=22}
@@ -752,6 +1475,13 @@ packages:
     resolution: {integrity: sha512-tEBHqDnIoM/1rXME1zgka9g6Q2lcoCkxHLuc7ODJ5BxbP5d4c2Z5cGgtXAku59200Cx7diuHTOYfSBD8n6mm8A==}
     engines: {node: '>=16 || 14 >=14.17'}
 
+  minizlib@3.1.0:
+    resolution: {integrity: sha512-KZxYo1BUkWD2TVFLr0MQoM8vUUigWD3LlD83a/75BqC+4qE0Hb1Vo5v1FgcfaNXvfXzr+5EhQ6ing/CaBijTlw==}
+    engines: {node: '>= 18'}
+
+  module-details-from-path@1.0.4:
+    resolution: {integrity: sha512-EGWKgxALGMgzvxYF1UyGTy0HXX/2vHLkw6+NvDKW2jypWbHpjQuj4UMcqQWXHERJhVGKikolT06G3bcKe4fi7w==}
+
   ms@2.1.3:
     resolution: {integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==}
 
@@ -780,6 +1510,10 @@ packages:
     resolution: {integrity: sha512-312Id396EbJdvRONlngUx0NydfrIQ5lsYu0znKVUzVvArzEIt08V1qhtyESbGVd1FGX7UKtiFp5uwKZdM8wIuQ==}
     engines: {node: '>=8'}
 
+  parse-passwd@1.0.0:
+    resolution: {integrity: sha512-1Y1A//QUXEZK7YKz+rD9WydcE1+EuPr6ZBgKecAB8tmoW6UFv0NREVJe1p+jRxtThkcbbKkfwIbWJe/IeE6m2Q==}
+    engines: {node: '>=0.10.0'}
+
   partial-json@0.1.7:
     resolution: {integrity: sha512-Njv/59hHaokb/hRUjce3Hdv12wd60MtM9Z5Olmn+nehe0QDAsRtRbJPvJ0Z91TusF0SuZRIvnM+S4l6EIP8leA==}
 
@@ -795,6 +1529,18 @@ packages:
     resolution: {integrity: sha512-3O/iVVsJAPsOnpwWIeD+d6z/7PmqApyQePUtCndjatj/9I5LylHvt5qluFaBT3I5h3r1ejfR056c+FCv+NnNXg==}
     engines: {node: 18 || 20 || >=22}
 
+  pathe@2.0.3:
+    resolution: {integrity: sha512-WUjGcAqP1gQacoQe+OBJsFA7Ld4DyXuUIjZ5cc75cLHvJ7dtNsTugphxIADwspS+AraAUePCKrSVtPLFj/F88w==}
+
+  pi-acp@0.0.29:
+    resolution: {integrity: sha512-WL2+arwD+TFpZoXSsybopL5nOcZQSWn5W50tnXgPJeYrBBVG43afzHs7SJl1+QFNgFtKUh2xT6VqaF76Kggn3w==}
+    engines: {node: '>=20'}
+    hasBin: true
+
+  picomatch@2.3.2:
+    resolution: {integrity: sha512-V7+vQEJ06Z+c5tSye8S+nHUfI51xoXIXjHQ99cQtKUkQqqO1kO/KCJUfZXuB47h/YBlDhah2H3hdUGXn8ie0oA==}
+    engines: {node: '>=8.6'}
+
   proper-lockfile@4.1.2:
     resolution: {integrity: sha512-TjNPblN4BwAWMXU8s9AEz4JmQxnD1NNL7bNOY/AKUzyamc379FWASUhc/K1pL2noVb+XmZKLL68cjzLsiOAMaA==}
 
@@ -802,6 +1548,29 @@ packages:
     resolution: {integrity: sha512-RJJPTTpvFfHcWLkIa2JFWK4XvtSzS0yEWDmunqHXli1h3JlkbcQZXDZdcWxv+JK3Xsl5/UFDPZ0iGm7DAengYw==}
     engines: {node: '>=12.0.0'}
 
+  protobufjs@8.0.1:
+    resolution: {integrity: sha512-NWWCCscLjs+cOKF/s/XVNFRW7Yih0fdH+9brffR5NZCy8k42yRdl5KlWKMVXuI1vfCoy4o1z80XR/W/QUb3V3w==}
+    engines: {node: '>=12.0.0'}
+
+  proxy-from-env@2.1.0:
+    resolution: {integrity: sha512-cJ+oHTW1VAEa8cJslgmUZrc+sjRKgAKl3Zyse6+PV38hZe/V6Z14TbCuXcan9F9ghlz4QrFr2c92TNF82UkYHA==}
+    engines: {node: '>=10'}
+
+  queue-microtask@1.2.3:
+    resolution: {integrity: sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==}
+
+  readable-stream@3.6.2:
+    resolution: {integrity: sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==}
+    engines: {node: '>= 6'}
+
+  require-directory@2.1.1:
+    resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==}
+    engines: {node: '>=0.10.0'}
+
+  require-in-the-middle@8.0.1:
+    resolution: {integrity: sha512-QT7FVMXfWOYFbeRBF6nu+I6tr2Tf3u0q8RIEjNob/heKY/nh7drD/k7eeMFmSQgnTtCzLDcCu/XEnpW2wk4xCQ==}
+    engines: {node: '>=9.3.0 || >=8.10.0 <9.0.0'}
+
   resolve-pkg-maps@1.0.0:
     resolution: {integrity: sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==}
 
@@ -813,9 +1582,48 @@ packages:
     resolution: {integrity: sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==}
     engines: {node: '>= 4'}
 
+  reusify@1.1.0:
+    resolution: {integrity: sha512-g6QUff04oZpHs0eG5p83rFLhHeV00ug/Yf9nZM6fLeUrPguBTkTQOdpAWWspMh55TZfVQDPaN3NQJfbVRAxdIw==}
+    engines: {iojs: '>=1.0.0', node: '>=0.10.0'}
+
+  run-parallel@1.2.0:
+    resolution: {integrity: sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==}
+
   safe-buffer@5.2.1:
     resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==}
 
+  sandbox-agent@0.4.2:
+    resolution: {integrity: sha512-fH6WDQEaIrgiu93LxZcy+4Dx+t+/cslu+hzXImDyUlsaL6jV2jIv4fdxELkALlo7uzyEDVK9lmqs9qy65RHwBQ==}
+    peerDependencies:
+      '@cloudflare/sandbox': '>=0.1.0'
+      '@daytonaio/sdk': '>=0.12.0'
+      '@e2b/code-interpreter': '>=1.0.0'
+      '@fly/sprites': '>=0.0.1'
+      '@vercel/sandbox': '>=0.1.0'
+      computesdk: '>=0.1.0'
+      dockerode: '>=4.0.0'
+      get-port: '>=7.0.0'
+      modal: '>=0.1.0'
+    peerDependenciesMeta:
+      '@cloudflare/sandbox':
+        optional: true
+      '@daytonaio/sdk':
+        optional: true
+      '@e2b/code-interpreter':
+        optional: true
+      '@fly/sprites':
+        optional: true
+      '@vercel/sandbox':
+        optional: true
+      computesdk:
+        optional: true
+      dockerode:
+        optional: true
+      get-port:
+        optional: true
+      modal:
+        optional: true
+
   semver@7.8.0:
     resolution: {integrity: sha512-AcM7dV/5ul4EekoQ29Agm5vri8JNqRyj39o0qpX6vDF2GZrtutZl5RwgD1XnZjiTAfncsJhMI48QQH3sN87YNA==}
     engines: {node: '>=10'}
@@ -829,12 +1637,42 @@ packages:
     resolution: {integrity: sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==}
     engines: {node: '>=8'}
 
+  shell-quote@1.8.4:
+    resolution: {integrity: sha512-VsC6n6vz1ihYYyZZwX7YZSF5l5x36ca17OC+a69h94YqB7X6XLwf+5MOgynYir2SLFUbl8gIYvBo8K8RoNQ6bQ==}
+    engines: {node: '>= 0.4'}
+
   signal-exit@3.0.7:
     resolution: {integrity: sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==}
 
+  stream-browserify@3.0.0:
+    resolution: {integrity: sha512-H73RAHsVBapbim0tU2JwwOiXUj+fikfiaoYAKHF3VJfA0pe2BCzkhAHBlLG6REzE+2WNZcxOXjK7lkso+9euLA==}
+
+  streamsearch@1.1.0:
+    resolution: {integrity: sha512-Mcc5wHehp9aXz1ax6bZUyY5afg9u2rv5cqQI3mRrYkGC8rW2hM02jWuwjtL++LS5qinSyhj2QfLyNsuc+VsExg==}
+    engines: {node: '>=10.0.0'}
+
+  string-width@4.2.3:
+    resolution: {integrity: sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==}
+    engines: {node: '>=8'}
+
+  string_decoder@1.3.0:
+    resolution: {integrity: sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==}
+
+  strip-ansi@6.0.1:
+    resolution: {integrity: sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==}
+    engines: {node: '>=8'}
+
   strnum@2.4.0:
     resolution: {integrity: sha512-sHrVyWWdq28RbhjuJdZsA1SnGRJV6NiXbk6AXBxDOsgAcA+lmpUZCYjOdLBxkXMwis6RRe7dlZt4VlIWFVzkmg==}
 
+  tar@7.5.16:
+    resolution: {integrity: sha512-56adEpPMouktRlBLXiaYFFzZ/3+JXa8P9n7WbR+ibIjtviN55mEaOkiysCnPnWm+7kkui1Dn8J9l+g6zV8731w==}
+    engines: {node: '>=18'}
+
+  to-regex-range@5.0.1:
+    resolution: {integrity: sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==}
+    engines: {node: '>=8.0'}
+
   ts-algebra@2.0.0:
     resolution: {integrity: sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw==}
 
@@ -856,6 +1694,9 @@ packages:
     resolution: {integrity: sha512-TkUDgb6tl7KOGZ+7e8E3d2FYgUQgF6z5YypqjWmixVQSQERFcVrVg0ySADm2LVLRh5ljAaHTCR5Fmz3Q34rB7Q==}
     engines: {node: '>=22.19.0'}
 
+  util-deprecate@1.0.2:
+    resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==}
+
   web-streams-polyfill@3.3.3:
     resolution: {integrity: sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==}
     engines: {node: '>= 8'}
@@ -865,6 +1706,10 @@ packages:
     engines: {node: '>= 8'}
     hasBin: true
 
+  wrap-ansi@7.0.0:
+    resolution: {integrity: sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==}
+    engines: {node: '>=10'}
+
   ws@8.21.0:
     resolution: {integrity: sha512-Vsp28b7DRcimFQvrqu2Wek3z1iYxDCWqHYB8Qsnk/S4RfaCQzPGPyBNuVjJV3cd6UiKtUtp6sNM77gWvzcCH+g==}
     engines: {node: '>=10.0.0'}
@@ -881,21 +1726,66 @@ packages:
     resolution: {integrity: sha512-k8KO9hrMyNk6tUWqUfkTEZbezRRpONVOzUTnc97VnCvyj6Tf9lyUR9EDAIeiVLv56jsMcoXEwjW8Kv5yPY52lw==}
     engines: {node: '>=16.0.0'}
 
+  y18n@5.0.8:
+    resolution: {integrity: sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==}
+    engines: {node: '>=10'}
+
+  yallist@5.0.0:
+    resolution: {integrity: sha512-YgvUTfwqyc7UXVMrB+SImsVYSmTS8X/tSrtdNZMImM+n7+QTriRXyXim0mBrTXNeqzVF0KWGgHPeiyViFFrNDw==}
+    engines: {node: '>=18'}
+
   yaml@2.9.0:
     resolution: {integrity: sha512-2AvhNX3mb8zd6Zy7INTtSpl1F15HW6Wnqj0srWlkKLcpYl/gMIMJiyuGq2KeI2YFxUPjdlB+3Lc10seMLtL4cA==}
     engines: {node: '>= 14.6'}
     hasBin: true
 
+  yargs-parser@21.1.1:
+    resolution: {integrity: sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==}
+    engines: {node: '>=12'}
+
+  yargs@17.7.2:
+    resolution: {integrity: sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==}
+    engines: {node: '>=12'}
+
   zod-to-json-schema@3.25.2:
     resolution: {integrity: sha512-O/PgfnpT1xKSDeQYSCfRI5Gy3hPf91mKVDuYLUHZJMiDFptvP41MSnWofm8dnCm0256ZNfZIM7DSzuSMAFnjHA==}
     peerDependencies:
       zod: ^3.25.28 || ^4
 
+  zod@3.25.76:
+    resolution: {integrity: sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==}
+
   zod@4.4.3:
     resolution: {integrity: sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ==}
 
 snapshots:
 
+  '@agentclientprotocol/sdk@0.16.1(zod@4.4.3)':
+    dependencies:
+      zod: 4.4.3
+
+  '@agentclientprotocol/sdk@0.17.0(zod@4.4.3)':
+    dependencies:
+      zod: 4.4.3
+
+  '@agentclientprotocol/sdk@0.26.0(zod@3.25.76)':
+    dependencies:
+      zod: 3.25.76
+
+  '@anthropic-ai/claude-agent-sdk@0.2.83(zod@4.4.3)':
+    dependencies:
+      zod: 4.4.3
+    optionalDependencies:
+      '@img/sharp-darwin-arm64': 0.34.5
+      '@img/sharp-darwin-x64': 0.34.5
+      '@img/sharp-linux-arm': 0.34.5
+      '@img/sharp-linux-arm64': 0.34.5
+      '@img/sharp-linux-x64': 0.34.5
+      '@img/sharp-linuxmusl-arm64': 0.34.5
+      '@img/sharp-linuxmusl-x64': 0.34.5
+      '@img/sharp-win32-arm64': 0.34.5
+      '@img/sharp-win32-x64': 0.34.5
+
   '@anthropic-ai/sdk@0.91.1(zod@4.4.3)':
     dependencies:
       json-schema-to-ts: 3.1.1
@@ -908,6 +1798,21 @@ snapshots:
       '@aws-sdk/types': 3.973.12
       tslib: 2.8.1
 
+  '@aws-crypto/crc32c@5.2.0':
+    dependencies:
+      '@aws-crypto/util': 5.2.0
+      '@aws-sdk/types': 3.973.13
+      tslib: 2.8.1
+
+  '@aws-crypto/sha1-browser@5.2.0':
+    dependencies:
+      '@aws-crypto/supports-web-crypto': 5.2.0
+      '@aws-crypto/util': 5.2.0
+      '@aws-sdk/types': 3.973.13
+      '@aws-sdk/util-locate-window': 3.965.7
+      '@smithy/util-utf8': 2.3.0
+      tslib: 2.8.1
+
   '@aws-crypto/sha256-browser@5.2.0':
     dependencies:
       '@aws-crypto/sha256-js': 5.2.0
@@ -934,6 +1839,17 @@ snapshots:
       '@smithy/util-utf8': 2.3.0
       tslib: 2.8.1
 
+  '@aws-sdk/checksums@3.1000.6':
+    dependencies:
+      '@aws-crypto/crc32': 5.2.0
+      '@aws-crypto/crc32c': 5.2.0
+      '@aws-crypto/util': 5.2.0
+      '@aws-sdk/core': 3.974.21
+      '@aws-sdk/types': 3.973.13
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
   '@aws-sdk/client-bedrock-runtime@3.1048.0':
     dependencies:
       '@aws-crypto/sha256-browser': 5.2.0
@@ -951,6 +1867,23 @@ snapshots:
       '@smithy/types': 4.14.4
       tslib: 2.8.1
 
+  '@aws-sdk/client-s3@3.1070.0':
+    dependencies:
+      '@aws-crypto/sha1-browser': 5.2.0
+      '@aws-crypto/sha256-browser': 5.2.0
+      '@aws-crypto/sha256-js': 5.2.0
+      '@aws-sdk/core': 3.974.21
+      '@aws-sdk/credential-provider-node': 3.972.56
+      '@aws-sdk/middleware-flexible-checksums': 3.974.31
+      '@aws-sdk/middleware-sdk-s3': 3.972.52
+      '@aws-sdk/signature-v4-multi-region': 3.996.35
+      '@aws-sdk/types': 3.973.13
+      '@smithy/core': 3.24.7
+      '@smithy/fetch-http-handler': 5.4.7
+      '@smithy/node-http-handler': 4.7.8
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
   '@aws-sdk/core@3.974.20':
     dependencies:
       '@aws-sdk/types': 3.973.12
@@ -962,6 +1895,17 @@ snapshots:
       bowser: 2.14.1
       tslib: 2.8.1
 
+  '@aws-sdk/core@3.974.21':
+    dependencies:
+      '@aws-sdk/types': 3.973.13
+      '@aws-sdk/xml-builder': 3.972.30
+      '@aws/lambda-invoke-store': 0.2.4
+      '@smithy/core': 3.24.7
+      '@smithy/signature-v4': 5.4.7
+      '@smithy/types': 4.14.4
+      bowser: 2.14.1
+      tslib: 2.8.1
+
   '@aws-sdk/credential-provider-env@3.972.46':
     dependencies:
       '@aws-sdk/core': 3.974.20
@@ -970,6 +1914,14 @@ snapshots:
       '@smithy/types': 4.14.4
       tslib: 2.8.1
 
+  '@aws-sdk/credential-provider-env@3.972.47':
+    dependencies:
+      '@aws-sdk/core': 3.974.21
+      '@aws-sdk/types': 3.973.13
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
   '@aws-sdk/credential-provider-http@3.972.48':
     dependencies:
       '@aws-sdk/core': 3.974.20
@@ -980,6 +1932,16 @@ snapshots:
       '@smithy/types': 4.14.4
       tslib: 2.8.1
 
+  '@aws-sdk/credential-provider-http@3.972.49':
+    dependencies:
+      '@aws-sdk/core': 3.974.21
+      '@aws-sdk/types': 3.973.13
+      '@smithy/core': 3.24.7
+      '@smithy/fetch-http-handler': 5.4.7
+      '@smithy/node-http-handler': 4.7.8
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
   '@aws-sdk/credential-provider-ini@3.972.53':
     dependencies:
       '@aws-sdk/core': 3.974.20
@@ -996,6 +1958,22 @@ snapshots:
       '@smithy/types': 4.14.4
       tslib: 2.8.1
 
+  '@aws-sdk/credential-provider-ini@3.972.54':
+    dependencies:
+      '@aws-sdk/core': 3.974.21
+      '@aws-sdk/credential-provider-env': 3.972.47
+      '@aws-sdk/credential-provider-http': 3.972.49
+      '@aws-sdk/credential-provider-login': 3.972.53
+      '@aws-sdk/credential-provider-process': 3.972.47
+      '@aws-sdk/credential-provider-sso': 3.972.53
+      '@aws-sdk/credential-provider-web-identity': 3.972.53
+      '@aws-sdk/nested-clients': 3.997.21
+      '@aws-sdk/types': 3.973.13
+      '@smithy/core': 3.24.7
+      '@smithy/credential-provider-imds': 4.3.9
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
   '@aws-sdk/credential-provider-login@3.972.52':
     dependencies:
       '@aws-sdk/core': 3.974.20
@@ -1005,6 +1983,15 @@ snapshots:
       '@smithy/types': 4.14.4
       tslib: 2.8.1
 
+  '@aws-sdk/credential-provider-login@3.972.53':
+    dependencies:
+      '@aws-sdk/core': 3.974.21
+      '@aws-sdk/nested-clients': 3.997.21
+      '@aws-sdk/types': 3.973.13
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
   '@aws-sdk/credential-provider-node@3.972.55':
     dependencies:
       '@aws-sdk/credential-provider-env': 3.972.46
@@ -1019,6 +2006,20 @@ snapshots:
       '@smithy/types': 4.14.4
       tslib: 2.8.1
 
+  '@aws-sdk/credential-provider-node@3.972.56':
+    dependencies:
+      '@aws-sdk/credential-provider-env': 3.972.47
+      '@aws-sdk/credential-provider-http': 3.972.49
+      '@aws-sdk/credential-provider-ini': 3.972.54
+      '@aws-sdk/credential-provider-process': 3.972.47
+      '@aws-sdk/credential-provider-sso': 3.972.53
+      '@aws-sdk/credential-provider-web-identity': 3.972.53
+      '@aws-sdk/types': 3.973.13
+      '@smithy/core': 3.24.7
+      '@smithy/credential-provider-imds': 4.3.9
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
   '@aws-sdk/credential-provider-process@3.972.46':
     dependencies:
       '@aws-sdk/core': 3.974.20
@@ -1027,6 +2028,14 @@ snapshots:
       '@smithy/types': 4.14.4
       tslib: 2.8.1
 
+  '@aws-sdk/credential-provider-process@3.972.47':
+    dependencies:
+      '@aws-sdk/core': 3.974.21
+      '@aws-sdk/types': 3.973.13
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
   '@aws-sdk/credential-provider-sso@3.972.52':
     dependencies:
       '@aws-sdk/core': 3.974.20
@@ -1037,6 +2046,16 @@ snapshots:
       '@smithy/types': 4.14.4
       tslib: 2.8.1
 
+  '@aws-sdk/credential-provider-sso@3.972.53':
+    dependencies:
+      '@aws-sdk/core': 3.974.21
+      '@aws-sdk/nested-clients': 3.997.21
+      '@aws-sdk/token-providers': 3.1069.0
+      '@aws-sdk/types': 3.973.13
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
   '@aws-sdk/credential-provider-web-identity@3.972.52':
     dependencies:
       '@aws-sdk/core': 3.974.20
@@ -1046,6 +2065,15 @@ snapshots:
       '@smithy/types': 4.14.4
       tslib: 2.8.1
 
+  '@aws-sdk/credential-provider-web-identity@3.972.53':
+    dependencies:
+      '@aws-sdk/core': 3.974.21
+      '@aws-sdk/nested-clients': 3.997.21
+      '@aws-sdk/types': 3.973.13
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
   '@aws-sdk/eventstream-handler-node@3.972.21':
     dependencies:
       '@aws-sdk/types': 3.973.12
@@ -1053,6 +2081,16 @@ snapshots:
       '@smithy/types': 4.14.4
       tslib: 2.8.1
 
+  '@aws-sdk/lib-storage@3.1070.0(@aws-sdk/client-s3@3.1070.0)':
+    dependencies:
+      '@aws-sdk/client-s3': 3.1070.0
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      buffer: 5.6.0
+      events: 3.3.0
+      stream-browserify: 3.0.0
+      tslib: 2.8.1
+
   '@aws-sdk/middleware-eventstream@3.972.17':
     dependencies:
       '@aws-sdk/types': 3.973.12
@@ -1060,6 +2098,20 @@ snapshots:
       '@smithy/types': 4.14.4
       tslib: 2.8.1
 
+  '@aws-sdk/middleware-flexible-checksums@3.974.31':
+    dependencies:
+      '@aws-sdk/checksums': 3.1000.6
+      tslib: 2.8.1
+
+  '@aws-sdk/middleware-sdk-s3@3.972.52':
+    dependencies:
+      '@aws-sdk/core': 3.974.21
+      '@aws-sdk/signature-v4-multi-region': 3.996.35
+      '@aws-sdk/types': 3.973.13
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
   '@aws-sdk/middleware-websocket@3.972.28':
     dependencies:
       '@aws-sdk/core': 3.974.20
@@ -1083,6 +2135,19 @@ snapshots:
       '@smithy/types': 4.14.4
       tslib: 2.8.1
 
+  '@aws-sdk/nested-clients@3.997.21':
+    dependencies:
+      '@aws-crypto/sha256-browser': 5.2.0
+      '@aws-crypto/sha256-js': 5.2.0
+      '@aws-sdk/core': 3.974.21
+      '@aws-sdk/signature-v4-multi-region': 3.996.35
+      '@aws-sdk/types': 3.973.13
+      '@smithy/core': 3.24.7
+      '@smithy/fetch-http-handler': 5.4.7
+      '@smithy/node-http-handler': 4.7.8
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
   '@aws-sdk/signature-v4-multi-region@3.996.34':
     dependencies:
       '@aws-sdk/types': 3.973.12
@@ -1090,6 +2155,13 @@ snapshots:
       '@smithy/types': 4.14.4
       tslib: 2.8.1
 
+  '@aws-sdk/signature-v4-multi-region@3.996.35':
+    dependencies:
+      '@aws-sdk/types': 3.973.13
+      '@smithy/signature-v4': 5.4.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
   '@aws-sdk/token-providers@3.1048.0':
     dependencies:
       '@aws-sdk/core': 3.974.20
@@ -1108,11 +2180,25 @@ snapshots:
       '@smithy/types': 4.14.4
       tslib: 2.8.1
 
+  '@aws-sdk/token-providers@3.1069.0':
+    dependencies:
+      '@aws-sdk/core': 3.974.21
+      '@aws-sdk/nested-clients': 3.997.21
+      '@aws-sdk/types': 3.973.13
+      '@smithy/core': 3.24.7
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
   '@aws-sdk/types@3.973.12':
     dependencies:
       '@smithy/types': 4.14.4
       tslib: 2.8.1
 
+  '@aws-sdk/types@3.973.13':
+    dependencies:
+      '@smithy/types': 4.14.4
+      tslib: 2.8.1
+
   '@aws-sdk/util-locate-window@3.965.7':
     dependencies:
       tslib: 2.8.1
@@ -1123,10 +2209,60 @@ snapshots:
       fast-xml-parser: 5.7.3
       tslib: 2.8.1
 
+  '@aws-sdk/xml-builder@3.972.30':
+    dependencies:
+      '@smithy/types': 4.14.4
+      fast-xml-parser: 5.7.3
+      tslib: 2.8.1
+
   '@aws/lambda-invoke-store@0.2.4': {}
 
   '@babel/runtime@7.29.7': {}
 
+  '@daytona/api-client@0.187.0':
+    dependencies:
+      axios: 1.18.0
+    transitivePeerDependencies:
+      - debug
+      - supports-color
+
+  '@daytona/toolbox-api-client@0.187.0':
+    dependencies:
+      axios: 1.18.0
+    transitivePeerDependencies:
+      - debug
+      - supports-color
+
+  '@daytonaio/sdk@0.187.0(ws@8.21.0)':
+    dependencies:
+      '@aws-sdk/client-s3': 3.1070.0
+      '@aws-sdk/lib-storage': 3.1070.0(@aws-sdk/client-s3@3.1070.0)
+      '@daytona/api-client': 0.187.0
+      '@daytona/toolbox-api-client': 0.187.0
+      '@iarna/toml': 2.2.5
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/exporter-trace-otlp-http': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/instrumentation-http': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-exporter-base': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-node': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-base': 2.8.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions': 1.41.1
+      axios: 1.18.0
+      busboy: 1.6.0
+      dotenv: 17.4.2
+      expand-tilde: 2.0.2
+      fast-glob: 3.3.3
+      form-data: 4.0.6
+      isomorphic-ws: 5.0.0(ws@8.21.0)
+      pathe: 2.0.3
+      shell-quote: 1.8.4
+      tar: 7.5.16
+    transitivePeerDependencies:
+      - debug
+      - supports-color
+      - ws
+
   '@earendil-works/pi-agent-core@0.79.4(ws@8.21.0)(zod@4.4.3)':
     dependencies:
       '@earendil-works/pi-ai': 0.79.4(ws@8.21.0)(zod@4.4.3)
@@ -1279,8 +2415,90 @@ snapshots:
       - supports-color
       - utf-8-validate
 
-  '@mariozechner/clipboard-darwin-arm64@0.3.9':
-    optional: true
+  '@grpc/grpc-js@1.14.4':
+    dependencies:
+      '@grpc/proto-loader': 0.8.1
+      '@js-sdsl/ordered-map': 4.4.2
+
+  '@grpc/proto-loader@0.8.1':
+    dependencies:
+      lodash.camelcase: 4.3.0
+      long: 5.3.2
+      protobufjs: 7.6.4
+      yargs: 17.7.2
+
+  '@iarna/toml@2.2.5': {}
+
+  '@img/sharp-darwin-arm64@0.34.5':
+    optionalDependencies:
+      '@img/sharp-libvips-darwin-arm64': 1.2.4
+    optional: true
+
+  '@img/sharp-darwin-x64@0.34.5':
+    optionalDependencies:
+      '@img/sharp-libvips-darwin-x64': 1.2.4
+    optional: true
+
+  '@img/sharp-libvips-darwin-arm64@1.2.4':
+    optional: true
+
+  '@img/sharp-libvips-darwin-x64@1.2.4':
+    optional: true
+
+  '@img/sharp-libvips-linux-arm64@1.2.4':
+    optional: true
+
+  '@img/sharp-libvips-linux-arm@1.2.4':
+    optional: true
+
+  '@img/sharp-libvips-linux-x64@1.2.4':
+    optional: true
+
+  '@img/sharp-libvips-linuxmusl-arm64@1.2.4':
+    optional: true
+
+  '@img/sharp-libvips-linuxmusl-x64@1.2.4':
+    optional: true
+
+  '@img/sharp-linux-arm64@0.34.5':
+    optionalDependencies:
+      '@img/sharp-libvips-linux-arm64': 1.2.4
+    optional: true
+
+  '@img/sharp-linux-arm@0.34.5':
+    optionalDependencies:
+      '@img/sharp-libvips-linux-arm': 1.2.4
+    optional: true
+
+  '@img/sharp-linux-x64@0.34.5':
+    optionalDependencies:
+      '@img/sharp-libvips-linux-x64': 1.2.4
+    optional: true
+
+  '@img/sharp-linuxmusl-arm64@0.34.5':
+    optionalDependencies:
+      '@img/sharp-libvips-linuxmusl-arm64': 1.2.4
+    optional: true
+
+  '@img/sharp-linuxmusl-x64@0.34.5':
+    optionalDependencies:
+      '@img/sharp-libvips-linuxmusl-x64': 1.2.4
+    optional: true
+
+  '@img/sharp-win32-arm64@0.34.5':
+    optional: true
+
+  '@img/sharp-win32-x64@0.34.5':
+    optional: true
+
+  '@isaacs/fs-minipass@4.0.1':
+    dependencies:
+      minipass: 7.1.3
+
+  '@js-sdsl/ordered-map@4.4.2': {}
+
+  '@mariozechner/clipboard-darwin-arm64@0.3.9':
+    optional: true
 
   '@mariozechner/clipboard-darwin-universal@0.3.9':
     optional: true
@@ -1334,16 +2552,42 @@ snapshots:
 
   '@nodable/entities@2.2.0': {}
 
+  '@nodelib/fs.scandir@2.1.5':
+    dependencies:
+      '@nodelib/fs.stat': 2.0.5
+      run-parallel: 1.2.0
+
+  '@nodelib/fs.stat@2.0.5': {}
+
+  '@nodelib/fs.walk@1.2.8':
+    dependencies:
+      '@nodelib/fs.scandir': 2.1.5
+      fastq: 1.20.1
+
+  '@opentelemetry/api-logs@0.217.0':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+
   '@opentelemetry/api-logs@0.54.0':
     dependencies:
       '@opentelemetry/api': 1.9.0
 
   '@opentelemetry/api@1.9.0': {}
 
+  '@opentelemetry/configuration@0.217.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      yaml: 2.9.0
+
   '@opentelemetry/context-async-hooks@1.28.0(@opentelemetry/api@1.9.0)':
     dependencies:
       '@opentelemetry/api': 1.9.0
 
+  '@opentelemetry/context-async-hooks@2.7.1(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+
   '@opentelemetry/core@1.27.0(@opentelemetry/api@1.9.0)':
     dependencies:
       '@opentelemetry/api': 1.9.0
@@ -1354,6 +2598,114 @@ snapshots:
       '@opentelemetry/api': 1.9.0
       '@opentelemetry/semantic-conventions': 1.27.0
 
+  '@opentelemetry/core@2.7.1(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/semantic-conventions': 1.41.1
+
+  '@opentelemetry/core@2.8.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/semantic-conventions': 1.41.1
+
+  '@opentelemetry/exporter-logs-otlp-grpc@0.217.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@grpc/grpc-js': 1.14.4
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-exporter-base': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-grpc-exporter-base': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-transformer': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-logs': 0.217.0(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/exporter-logs-otlp-http@0.217.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api-logs': 0.217.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-exporter-base': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-transformer': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-logs': 0.217.0(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/exporter-logs-otlp-proto@0.217.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api-logs': 0.217.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-exporter-base': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-transformer': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-logs': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-base': 2.7.1(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/exporter-metrics-otlp-grpc@0.217.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@grpc/grpc-js': 1.14.4
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/exporter-metrics-otlp-http': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-exporter-base': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-grpc-exporter-base': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-transformer': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-metrics': 2.7.1(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/exporter-metrics-otlp-http@0.217.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-exporter-base': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-transformer': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-metrics': 2.7.1(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/exporter-metrics-otlp-proto@0.217.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/exporter-metrics-otlp-http': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-exporter-base': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-transformer': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-metrics': 2.7.1(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/exporter-prometheus@0.217.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-metrics': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions': 1.41.1
+
+  '@opentelemetry/exporter-trace-otlp-grpc@0.217.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@grpc/grpc-js': 1.14.4
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-exporter-base': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-grpc-exporter-base': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-transformer': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-base': 2.7.1(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/exporter-trace-otlp-http@0.217.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-exporter-base': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-transformer': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-base': 2.7.1(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-exporter-base': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-transformer': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-base': 2.7.1(@opentelemetry/api@1.9.0)
+
   '@opentelemetry/exporter-trace-otlp-proto@0.54.0(@opentelemetry/api@1.9.0)':
     dependencies:
       '@opentelemetry/api': 1.9.0
@@ -1363,12 +2715,64 @@ snapshots:
       '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0)
       '@opentelemetry/sdk-trace-base': 1.27.0(@opentelemetry/api@1.9.0)
 
+  '@opentelemetry/exporter-zipkin@2.7.1(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-base': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions': 1.41.1
+
+  '@opentelemetry/instrumentation-http@0.217.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/instrumentation': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions': 1.41.1
+      forwarded-parse: 2.1.2
+    transitivePeerDependencies:
+      - supports-color
+
+  '@opentelemetry/instrumentation@0.217.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api-logs': 0.217.0
+      import-in-the-middle: 3.0.2
+      require-in-the-middle: 8.0.1
+    transitivePeerDependencies:
+      - supports-color
+
+  '@opentelemetry/otlp-exporter-base@0.217.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-transformer': 0.217.0(@opentelemetry/api@1.9.0)
+
   '@opentelemetry/otlp-exporter-base@0.54.0(@opentelemetry/api@1.9.0)':
     dependencies:
       '@opentelemetry/api': 1.9.0
       '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0)
       '@opentelemetry/otlp-transformer': 0.54.0(@opentelemetry/api@1.9.0)
 
+  '@opentelemetry/otlp-grpc-exporter-base@0.217.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@grpc/grpc-js': 1.14.4
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-exporter-base': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-transformer': 0.217.0(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/otlp-transformer@0.217.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api-logs': 0.217.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-logs': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-metrics': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-base': 2.7.1(@opentelemetry/api@1.9.0)
+      protobufjs: 8.0.1
+
   '@opentelemetry/otlp-transformer@0.54.0(@opentelemetry/api@1.9.0)':
     dependencies:
       '@opentelemetry/api': 1.9.0
@@ -1385,11 +2789,21 @@ snapshots:
       '@opentelemetry/api': 1.9.0
       '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0)
 
+  '@opentelemetry/propagator-b3@2.7.1(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+
   '@opentelemetry/propagator-jaeger@1.28.0(@opentelemetry/api@1.9.0)':
     dependencies:
       '@opentelemetry/api': 1.9.0
       '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0)
 
+  '@opentelemetry/propagator-jaeger@2.7.1(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+
   '@opentelemetry/resources@1.27.0(@opentelemetry/api@1.9.0)':
     dependencies:
       '@opentelemetry/api': 1.9.0
@@ -1402,6 +2816,26 @@ snapshots:
       '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0)
       '@opentelemetry/semantic-conventions': 1.27.0
 
+  '@opentelemetry/resources@2.7.1(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions': 1.41.1
+
+  '@opentelemetry/resources@2.8.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.8.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions': 1.41.1
+
+  '@opentelemetry/sdk-logs@0.217.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api-logs': 0.217.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions': 1.41.1
+
   '@opentelemetry/sdk-logs@0.54.0(@opentelemetry/api@1.9.0)':
     dependencies:
       '@opentelemetry/api': 1.9.0
@@ -1415,6 +2849,43 @@ snapshots:
       '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0)
       '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0)
 
+  '@opentelemetry/sdk-metrics@2.7.1(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.0)
+
+  '@opentelemetry/sdk-node@0.217.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api-logs': 0.217.0
+      '@opentelemetry/configuration': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/context-async-hooks': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/exporter-logs-otlp-grpc': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/exporter-logs-otlp-http': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/exporter-logs-otlp-proto': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/exporter-metrics-otlp-grpc': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/exporter-metrics-otlp-http': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/exporter-metrics-otlp-proto': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/exporter-prometheus': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/exporter-trace-otlp-grpc': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/exporter-trace-otlp-http': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/exporter-trace-otlp-proto': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/exporter-zipkin': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/instrumentation': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/otlp-exporter-base': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/propagator-b3': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/propagator-jaeger': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-logs': 0.217.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-metrics': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-base': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-node': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions': 1.41.1
+    transitivePeerDependencies:
+      - supports-color
+
   '@opentelemetry/sdk-trace-base@1.27.0(@opentelemetry/api@1.9.0)':
     dependencies:
       '@opentelemetry/api': 1.9.0
@@ -1429,6 +2900,20 @@ snapshots:
       '@opentelemetry/resources': 1.28.0(@opentelemetry/api@1.9.0)
       '@opentelemetry/semantic-conventions': 1.27.0
 
+  '@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions': 1.41.1
+
+  '@opentelemetry/sdk-trace-base@2.8.0(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/core': 2.8.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 2.8.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/semantic-conventions': 1.41.1
+
   '@opentelemetry/sdk-trace-node@1.28.0(@opentelemetry/api@1.9.0)':
     dependencies:
       '@opentelemetry/api': 1.9.0
@@ -1439,10 +2924,19 @@ snapshots:
       '@opentelemetry/sdk-trace-base': 1.28.0(@opentelemetry/api@1.9.0)
       semver: 7.8.0
 
+  '@opentelemetry/sdk-trace-node@2.7.1(@opentelemetry/api@1.9.0)':
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/context-async-hooks': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-base': 2.7.1(@opentelemetry/api@1.9.0)
+
   '@opentelemetry/semantic-conventions@1.27.0': {}
 
   '@opentelemetry/semantic-conventions@1.28.0': {}
 
+  '@opentelemetry/semantic-conventions@1.41.1': {}
+
   '@protobufjs/aspromise@1.1.2': {}
 
   '@protobufjs/base64@1.1.2': {}
@@ -1457,12 +2951,42 @@ snapshots:
 
   '@protobufjs/float@1.0.2': {}
 
+  '@protobufjs/inquire@1.1.2': {}
+
   '@protobufjs/path@1.1.2': {}
 
   '@protobufjs/pool@1.1.0': {}
 
   '@protobufjs/utf8@1.1.1': {}
 
+  '@sandbox-agent/cli-darwin-arm64@0.4.2':
+    optional: true
+
+  '@sandbox-agent/cli-darwin-x64@0.4.2':
+    optional: true
+
+  '@sandbox-agent/cli-linux-arm64@0.4.2':
+    optional: true
+
+  '@sandbox-agent/cli-linux-x64@0.4.2':
+    optional: true
+
+  '@sandbox-agent/cli-shared@0.4.2': {}
+
+  '@sandbox-agent/cli-win32-x64@0.4.2':
+    optional: true
+
+  '@sandbox-agent/cli@0.4.2':
+    dependencies:
+      '@sandbox-agent/cli-shared': 0.4.2
+    optionalDependencies:
+      '@sandbox-agent/cli-darwin-arm64': 0.4.2
+      '@sandbox-agent/cli-darwin-x64': 0.4.2
+      '@sandbox-agent/cli-linux-arm64': 0.4.2
+      '@sandbox-agent/cli-linux-x64': 0.4.2
+      '@sandbox-agent/cli-win32-x64': 0.4.2
+    optional: true
+
   '@silvia-odwyer/photon-node@0.3.4': {}
 
   '@smithy/core@3.24.7':
@@ -1525,10 +3049,52 @@ snapshots:
 
   '@types/retry@0.12.0': {}
 
+  '@zed-industries/claude-agent-acp@0.23.1':
+    dependencies:
+      '@agentclientprotocol/sdk': 0.17.0(zod@4.4.3)
+      '@anthropic-ai/claude-agent-sdk': 0.2.83(zod@4.4.3)
+      zod: 4.4.3
+
+  acorn-import-attributes@1.9.5(acorn@8.17.0):
+    dependencies:
+      acorn: 8.17.0
+
+  acorn@8.17.0: {}
+
+  acp-http-client@0.4.2(zod@4.4.3):
+    dependencies:
+      '@agentclientprotocol/sdk': 0.16.1(zod@4.4.3)
+    transitivePeerDependencies:
+      - zod
+
+  agent-base@6.0.2:
+    dependencies:
+      debug: 4.4.3
+    transitivePeerDependencies:
+      - supports-color
+
   agent-base@7.1.4: {}
 
+  ansi-regex@5.0.1: {}
+
+  ansi-styles@4.3.0:
+    dependencies:
+      color-convert: 2.0.1
+
   anynum@1.0.0: {}
 
+  asynckit@0.4.0: {}
+
+  axios@1.18.0:
+    dependencies:
+      follow-redirects: 1.16.0
+      form-data: 4.0.6
+      https-proxy-agent: 5.0.1
+      proxy-from-env: 2.1.0
+    transitivePeerDependencies:
+      - debug
+      - supports-color
+
   balanced-match@4.0.4: {}
 
   base64-js@1.5.1: {}
@@ -1541,10 +3107,48 @@ snapshots:
     dependencies:
       balanced-match: 4.0.4
 
+  braces@3.0.3:
+    dependencies:
+      fill-range: 7.1.1
+
   buffer-equal-constant-time@1.0.1: {}
 
+  buffer@5.6.0:
+    dependencies:
+      base64-js: 1.5.1
+      ieee754: 1.2.1
+
+  busboy@1.6.0:
+    dependencies:
+      streamsearch: 1.1.0
+
+  call-bind-apply-helpers@1.0.2:
+    dependencies:
+      es-errors: 1.3.0
+      function-bind: 1.1.2
+
   chalk@5.6.2: {}
 
+  chownr@3.0.0: {}
+
+  cjs-module-lexer@2.2.0: {}
+
+  cliui@8.0.1:
+    dependencies:
+      string-width: 4.2.3
+      strip-ansi: 6.0.1
+      wrap-ansi: 7.0.0
+
+  color-convert@2.0.1:
+    dependencies:
+      color-name: 1.1.4
+
+  color-name@1.1.4: {}
+
+  combined-stream@1.0.8:
+    dependencies:
+      delayed-stream: 1.0.0
+
   cross-spawn@7.0.6:
     dependencies:
       path-key: 3.1.1
@@ -1557,12 +3161,39 @@ snapshots:
     dependencies:
       ms: 2.1.3
 
+  delayed-stream@1.0.0: {}
+
   diff@8.0.4: {}
 
+  dotenv@17.4.2: {}
+
+  dunder-proto@1.0.1:
+    dependencies:
+      call-bind-apply-helpers: 1.0.2
+      es-errors: 1.3.0
+      gopd: 1.2.0
+
   ecdsa-sig-formatter@1.0.11:
     dependencies:
       safe-buffer: 5.2.1
 
+  emoji-regex@8.0.0: {}
+
+  es-define-property@1.0.1: {}
+
+  es-errors@1.3.0: {}
+
+  es-object-atoms@1.1.2:
+    dependencies:
+      es-errors: 1.3.0
+
+  es-set-tostringtag@2.1.0:
+    dependencies:
+      es-errors: 1.3.0
+      get-intrinsic: 1.3.0
+      has-tostringtag: 1.0.2
+      hasown: 2.0.4
+
   esbuild@0.23.1:
     optionalDependencies:
       '@esbuild/aix-ppc64': 0.23.1
@@ -1590,8 +3221,24 @@ snapshots:
       '@esbuild/win32-ia32': 0.23.1
       '@esbuild/win32-x64': 0.23.1
 
+  escalade@3.2.0: {}
+
+  events@3.3.0: {}
+
+  expand-tilde@2.0.2:
+    dependencies:
+      homedir-polyfill: 1.0.3
+
   extend@3.0.2: {}
 
+  fast-glob@3.3.3:
+    dependencies:
+      '@nodelib/fs.stat': 2.0.5
+      '@nodelib/fs.walk': 1.2.8
+      glob-parent: 5.1.2
+      merge2: 1.4.1
+      micromatch: 4.0.8
+
   fast-xml-builder@1.2.0:
     dependencies:
       path-expression-matcher: 1.5.0
@@ -1604,18 +3251,40 @@ snapshots:
       path-expression-matcher: 1.5.0
       strnum: 2.4.0
 
+  fastq@1.20.1:
+    dependencies:
+      reusify: 1.1.0
+
   fetch-blob@3.2.0:
     dependencies:
       node-domexception: 1.0.0
       web-streams-polyfill: 3.3.3
 
+  fill-range@7.1.1:
+    dependencies:
+      to-regex-range: 5.0.1
+
+  follow-redirects@1.16.0: {}
+
+  form-data@4.0.6:
+    dependencies:
+      asynckit: 0.4.0
+      combined-stream: 1.0.8
+      es-set-tostringtag: 2.1.0
+      hasown: 2.0.4
+      mime-types: 2.1.35
+
   formdata-polyfill@4.0.10:
     dependencies:
       fetch-blob: 3.2.0
 
+  forwarded-parse@2.1.2: {}
+
   fsevents@2.3.3:
     optional: true
 
+  function-bind@1.1.2: {}
+
   gaxios@7.1.5:
     dependencies:
       extend: 3.0.2
@@ -1632,12 +3301,36 @@ snapshots:
     transitivePeerDependencies:
       - supports-color
 
+  get-caller-file@2.0.5: {}
+
   get-east-asian-width@1.6.0: {}
 
+  get-intrinsic@1.3.0:
+    dependencies:
+      call-bind-apply-helpers: 1.0.2
+      es-define-property: 1.0.1
+      es-errors: 1.3.0
+      es-object-atoms: 1.1.2
+      function-bind: 1.1.2
+      get-proto: 1.0.1
+      gopd: 1.2.0
+      has-symbols: 1.1.0
+      hasown: 2.0.4
+      math-intrinsics: 1.1.0
+
+  get-proto@1.0.1:
+    dependencies:
+      dunder-proto: 1.0.1
+      es-object-atoms: 1.1.2
+
   get-tsconfig@4.14.0:
     dependencies:
       resolve-pkg-maps: 1.0.0
 
+  glob-parent@5.1.2:
+    dependencies:
+      is-glob: 4.0.3
+
   glob@13.0.6:
     dependencies:
       minimatch: 10.2.5
@@ -1657,10 +3350,26 @@ snapshots:
 
   google-logging-utils@1.1.3: {}
 
+  gopd@1.2.0: {}
+
   graceful-fs@4.2.11: {}
 
+  has-symbols@1.1.0: {}
+
+  has-tostringtag@1.0.2:
+    dependencies:
+      has-symbols: 1.1.0
+
+  hasown@2.0.4:
+    dependencies:
+      function-bind: 1.1.2
+
   highlight.js@10.7.3: {}
 
+  homedir-polyfill@1.0.3:
+    dependencies:
+      parse-passwd: 1.0.0
+
   hosted-git-info@9.0.3:
     dependencies:
       lru-cache: 11.5.1
@@ -1672,6 +3381,13 @@ snapshots:
     transitivePeerDependencies:
       - supports-color
 
+  https-proxy-agent@5.0.1:
+    dependencies:
+      agent-base: 6.0.2
+      debug: 4.4.3
+    transitivePeerDependencies:
+      - supports-color
+
   https-proxy-agent@7.0.6:
     dependencies:
       agent-base: 7.1.4
@@ -1679,10 +3395,35 @@ snapshots:
     transitivePeerDependencies:
       - supports-color
 
+  ieee754@1.2.1: {}
+
   ignore@7.0.5: {}
 
+  import-in-the-middle@3.0.2:
+    dependencies:
+      acorn: 8.17.0
+      acorn-import-attributes: 1.9.5(acorn@8.17.0)
+      cjs-module-lexer: 2.2.0
+      module-details-from-path: 1.0.4
+
+  inherits@2.0.4: {}
+
+  is-extglob@2.1.1: {}
+
+  is-fullwidth-code-point@3.0.0: {}
+
+  is-glob@4.0.3:
+    dependencies:
+      is-extglob: 2.1.1
+
+  is-number@7.0.0: {}
+
   isexe@2.0.0: {}
 
+  isomorphic-ws@5.0.0(ws@8.21.0):
+    dependencies:
+      ws: 8.21.0
+
   jiti@2.7.0: {}
 
   json-bigint@1.0.0:
@@ -1705,18 +3446,41 @@ snapshots:
       jwa: 2.0.1
       safe-buffer: 5.2.1
 
+  lodash.camelcase@4.3.0: {}
+
   long@5.3.2: {}
 
   lru-cache@11.5.1: {}
 
   marked@15.0.12: {}
 
+  math-intrinsics@1.1.0: {}
+
+  merge2@1.4.1: {}
+
+  micromatch@4.0.8:
+    dependencies:
+      braces: 3.0.3
+      picomatch: 2.3.2
+
+  mime-db@1.52.0: {}
+
+  mime-types@2.1.35:
+    dependencies:
+      mime-db: 1.52.0
+
   minimatch@10.2.5:
     dependencies:
       brace-expansion: 5.0.6
 
   minipass@7.1.3: {}
 
+  minizlib@3.1.0:
+    dependencies:
+      minipass: 7.1.3
+
+  module-details-from-path@1.0.4: {}
+
   ms@2.1.3: {}
 
   node-domexception@1.0.0: {}
@@ -1737,6 +3501,8 @@ snapshots:
       '@types/retry': 0.12.0
       retry: 0.13.1
 
+  parse-passwd@1.0.0: {}
+
   partial-json@0.1.7: {}
 
   path-expression-matcher@1.5.0: {}
@@ -1748,6 +3514,15 @@ snapshots:
       lru-cache: 11.5.1
       minipass: 7.1.3
 
+  pathe@2.0.3: {}
+
+  pi-acp@0.0.29:
+    dependencies:
+      '@agentclientprotocol/sdk': 0.26.0(zod@3.25.76)
+      zod: 3.25.76
+
+  picomatch@2.3.2: {}
+
   proper-lockfile@4.1.2:
     dependencies:
       graceful-fs: 4.2.11
@@ -1768,14 +3543,64 @@ snapshots:
       '@types/node': 22.10.2
       long: 5.3.2
 
+  protobufjs@8.0.1:
+    dependencies:
+      '@protobufjs/aspromise': 1.1.2
+      '@protobufjs/base64': 1.1.2
+      '@protobufjs/codegen': 2.0.5
+      '@protobufjs/eventemitter': 1.1.1
+      '@protobufjs/fetch': 1.1.1
+      '@protobufjs/float': 1.0.2
+      '@protobufjs/inquire': 1.1.2
+      '@protobufjs/path': 1.1.2
+      '@protobufjs/pool': 1.1.0
+      '@protobufjs/utf8': 1.1.1
+      '@types/node': 22.10.2
+      long: 5.3.2
+
+  proxy-from-env@2.1.0: {}
+
+  queue-microtask@1.2.3: {}
+
+  readable-stream@3.6.2:
+    dependencies:
+      inherits: 2.0.4
+      string_decoder: 1.3.0
+      util-deprecate: 1.0.2
+
+  require-directory@2.1.1: {}
+
+  require-in-the-middle@8.0.1:
+    dependencies:
+      debug: 4.4.3
+      module-details-from-path: 1.0.4
+    transitivePeerDependencies:
+      - supports-color
+
   resolve-pkg-maps@1.0.0: {}
 
   retry@0.12.0: {}
 
   retry@0.13.1: {}
 
+  reusify@1.1.0: {}
+
+  run-parallel@1.2.0:
+    dependencies:
+      queue-microtask: 1.2.3
+
   safe-buffer@5.2.1: {}
 
+  sandbox-agent@0.4.2(@daytonaio/sdk@0.187.0(ws@8.21.0))(zod@4.4.3):
+    dependencies:
+      '@sandbox-agent/cli-shared': 0.4.2
+      acp-http-client: 0.4.2(zod@4.4.3)
+    optionalDependencies:
+      '@daytonaio/sdk': 0.187.0(ws@8.21.0)
+      '@sandbox-agent/cli': 0.4.2
+    transitivePeerDependencies:
+      - zod
+
   semver@7.8.0: {}
 
   shebang-command@2.0.0:
@@ -1784,12 +3609,47 @@ snapshots:
 
   shebang-regex@3.0.0: {}
 
+  shell-quote@1.8.4: {}
+
   signal-exit@3.0.7: {}
 
+  stream-browserify@3.0.0:
+    dependencies:
+      inherits: 2.0.4
+      readable-stream: 3.6.2
+
+  streamsearch@1.1.0: {}
+
+  string-width@4.2.3:
+    dependencies:
+      emoji-regex: 8.0.0
+      is-fullwidth-code-point: 3.0.0
+      strip-ansi: 6.0.1
+
+  string_decoder@1.3.0:
+    dependencies:
+      safe-buffer: 5.2.1
+
+  strip-ansi@6.0.1:
+    dependencies:
+      ansi-regex: 5.0.1
+
   strnum@2.4.0:
     dependencies:
       anynum: 1.0.0
 
+  tar@7.5.16:
+    dependencies:
+      '@isaacs/fs-minipass': 4.0.1
+      chownr: 3.0.0
+      minipass: 7.1.3
+      minizlib: 3.1.0
+      yallist: 5.0.0
+
+  to-regex-range@5.0.1:
+    dependencies:
+      is-number: 7.0.0
+
   ts-algebra@2.0.0: {}
 
   tslib@2.8.1: {}
@@ -1807,20 +3667,46 @@ snapshots:
 
   undici@8.3.0: {}
 
+  util-deprecate@1.0.2: {}
+
   web-streams-polyfill@3.3.3: {}
 
   which@2.0.2:
     dependencies:
       isexe: 2.0.0
 
+  wrap-ansi@7.0.0:
+    dependencies:
+      ansi-styles: 4.3.0
+      string-width: 4.2.3
+      strip-ansi: 6.0.1
+
   ws@8.21.0: {}
 
   xml-naming@0.1.0: {}
 
+  y18n@5.0.8: {}
+
+  yallist@5.0.0: {}
+
   yaml@2.9.0: {}
 
+  yargs-parser@21.1.1: {}
+
+  yargs@17.7.2:
+    dependencies:
+      cliui: 8.0.1
+      escalade: 3.2.0
+      get-caller-file: 2.0.5
+      require-directory: 2.1.1
+      string-width: 4.2.3
+      y18n: 5.0.8
+      yargs-parser: 21.1.1
+
   zod-to-json-schema@3.25.2(zod@4.4.3):
     dependencies:
       zod: 4.4.3
 
+  zod@3.25.76: {}
+
   zod@4.4.3: {}
diff --git a/services/agent/scripts/build-extension.mjs b/services/agent/scripts/build-extension.mjs
new file mode 100644
index 0000000000..229d805040
--- /dev/null
+++ b/services/agent/scripts/build-extension.mjs
@@ -0,0 +1,30 @@
+/**
+ * Bundle the Agenta Pi extension into one self-contained file so its OpenTelemetry deps
+ * resolve wherever Pi loads it (host, docker sidecar, Daytona snapshot). Pi only accepts
+ * `.ts`/`.js` extension files, so we emit `.js` (ESM) with a default export.
+ *
+ * Run: pnpm run build:extension  ->  dist/extensions/agenta.js
+ */
+import { build } from "esbuild";
+import { dirname, join } from "node:path";
+import { fileURLToPath } from "node:url";
+
+const root = join(dirname(fileURLToPath(import.meta.url)), ".."); // parent of this script's directory
+
+await build({
+  entryPoints: [join(root, "src/piExtension.ts")],
+  outfile: join(root, "dist/extensions/agenta.js"),
+  bundle: true,
+  platform: "node",
+  format: "esm",
+  target: "node20",
+  // Pi provides the ExtensionAPI at load time; never bundle the harness SDK.
+  external: ["@earendil-works/pi-coding-agent"],
+  banner: {
+    // protobufjs and some deps call CommonJS `require` under ESM; shim it via createRequire.
+    js: "import{createRequire as __cr}from'node:module';const require=__cr(import.meta.url);",
+  },
+  logLevel: "info",
+});
+
+process.stderr.write("[build-extension] wrote dist/extensions/agenta.js\n");
diff --git a/services/agent/src/agenta-otel.ts b/services/agent/src/agenta-otel.ts
index 3d838329a1..35bdeffd6a 100644
--- a/services/agent/src/agenta-otel.ts
+++ b/services/agent/src/agenta-otel.ts
@@ -402,6 +402,8 @@ export interface AgentaOtel {
   config: RunConfig;
   /** Flush this run's trace to Agenta. Await before the process/response ends. */
   flush: () => Promise<void>;
+  /** Run totals (tokens + cost) summed across turns, for roll-up onto the parent. */
+  usage: () => { input: number; output: number; total: number; cost: number };
 }
 
 /**
@@ -434,6 +436,22 @@ export function createAgentaOtel(
   let llmSpan: Span | undefined;
   let lastContextMessages: any[] | undefined;
   const toolSpans = new Map<string, Span>();
+  // Run totals, summed across every assistant turn. Stamped on the agent span and
+  // returned so the caller can roll them up onto the workflow span in its own process
+  // (the agent and workflow spans are exported in separate OTLP batches, so Agenta's
+  // per-batch cumulative roll-up cannot bridge them on its own).
+  const runUsage = { input: 0, output: 0, total: 0, cost: 0 };
+
+  function accumulateUsage(msg: any): void {
+    const u = msg?.usage;
+    if (!u) return;
+    const input = u.input ?? 0;
+    const output = u.output ?? 0;
+    runUsage.input += input;
+    runUsage.output += output;
+    runUsage.total += u.totalTokens ?? input + output;
+    if (u.cost?.total != null) runUsage.cost += u.cost.total;
+  }
 
   const register = (pi: ExtensionAPI): void => {
     pi.on("before_agent_start", async (event: any) => {
@@ -494,6 +512,7 @@ export function createAgentaOtel(
       const msg = event?.message;
       if (!msg || msg.role !== "assistant" || !llmSpan) return;
       applyAssistant(llmSpan, msg, config.captureContent);
+      accumulateUsage(msg);
       llmSpan.end();
       llmSpan = undefined;
     });
@@ -524,6 +543,7 @@ export function createAgentaOtel(
       // close it from the turn's assistant message.
       if (llmSpan && event?.message) {
         applyAssistant(llmSpan, event.message, config.captureContent);
+        accumulateUsage(event.message);
         llmSpan.end();
         llmSpan = undefined;
       }
@@ -536,6 +556,16 @@ export function createAgentaOtel(
     pi.on("agent_end", async (event: any) => {
       if (!agentSpan) return;
       setOutput(agentSpan, lastAssistantText(event?.messages), config.captureContent);
+      // Stamp the run total on the agent span so it shows the agent's tokens/cost even
+      // though Agenta cannot roll the per-turn LLM spans up across batches.
+      if (runUsage.total > 0) {
+        agentSpan.setAttribute("gen_ai.usage.input_tokens", runUsage.input);
+        agentSpan.setAttribute("gen_ai.usage.output_tokens", runUsage.output);
+        agentSpan.setAttribute("gen_ai.usage.prompt_tokens", runUsage.input);
+        agentSpan.setAttribute("gen_ai.usage.completion_tokens", runUsage.output);
+        agentSpan.setAttribute("gen_ai.usage.total_tokens", runUsage.total);
+        if (runUsage.cost > 0) agentSpan.setAttribute("gen_ai.usage.cost", runUsage.cost);
+      }
       agentSpan.end();
       agentSpan = undefined;
       agentCtx = undefined;
@@ -547,5 +577,277 @@ export function createAgentaOtel(
     register,
     config,
     flush: () => flushTrace(config.traceId),
+    usage: () => ({ ...runUsage }),
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Rivet / ACP tracer (one per run; state is closure-scoped)
+// ---------------------------------------------------------------------------
+//
+// The Pi extension above hooks Pi's in-process `pi.on(...)` events. Under rivet the
+// harness runs as a separate process and we never see those events; instead the rivet
+// SDK surfaces the run as ACP `session/update` notifications (agent_message_chunk,
+// tool_call, tool_call_update, usage_update). This tracer builds the SAME span tree
+// from that event stream, so tracing is uniform across every harness rivet drives
+// (Pi, Claude Code, ...) and always nests under the caller's `/invoke` span.
+//
+// Span tree (per prompt turn):
+//   invoke_agent          (AGENT)
+//     turn 0              (CHAIN)
+//       chat <model>      (LLM)   — model interaction; usage where the harness reports it
+//       execute_tool <n>  (TOOL)  — one per ACP tool_call
+
+/** Extract plain text from an ACP ContentBlock; anything non-textual yields "". */
+function acpBlockText(block: any): string {
+  if (typeof block === "string") return block; // bare strings pass through unchanged
+  // Only `{ type: "text", text: string }` blocks carry text; null/other shapes do not.
+  const isTextBlock = !!block && block.type === "text" && typeof block.text === "string";
+  return isTextBlock ? block.text : "";
+}
+
+/** Concatenated text of an ACP tool_call `content` value (ToolCallContent[] or string). */
+function acpToolContentText(content: any): string {
+  if (typeof content === "string") return content;
+  if (!Array.isArray(content)) return "";
+  let joined = "";
+  for (const item of content) {
+    // Entries may wrap the block as `{ content: block }` or be the block itself.
+    const piece = acpBlockText(item?.content ?? item);
+    if (piece) joined += piece;
+  }
+  return joined;
+}
+
+/**
+ * Strip the pi-acp startup banner that some setups emit as the first agent message
+ * chunk (a "pi vX.Y.Z" / "## Context" / file list / "New version available" prelude,
+ * surfaced ahead of the real answer). Removes only a leading run of banner lines, and
+ * only when that run holds a real marker: "---" rules and blank lines are filler, so an
+ * answer that merely opens with a Markdown rule / front matter is returned untouched.
+ */
+function stripStartupBanner(text: string): string {
+  const lines = text.split("\n");
+  const isMarker = (line: string) =>
+    /^pi v\d+\.\d+\.\d+/.test(line) ||
+    /^## Context\b/.test(line) ||
+    /^-\s+\/.*AGENTS\.md\s*$/.test(line) ||
+    /^New version available:/.test(line) ||
+    /^Run: `npm/.test(line);
+  const isFiller = (line: string) => line.trim() === "---" || line.trim() === "";
+  let i = 0;
+  let sawBanner = false;
+  for (; i < lines.length; i++) {
+    if (isMarker(lines[i])) sawBanner = true;
+    else if (!isFiller(lines[i])) break;
+  }
+  return sawBanner ? lines.slice(i).join("\n").trim() : text;
+}
+
+/** Break "provider/id" at the first slash; a bare id ("gpt-5.5") has no provider. */
+function splitModel(model?: string): { provider?: string; id?: string } {
+  if (!model) return {};
+  const cut = model.indexOf("/");
+  if (cut >= 0) return { provider: model.substring(0, cut), id: model.substring(cut + 1) };
+  return { id: model };
+}
+
+export interface RivetOtelInit extends Partial<RunConfig> {
+  captureContent?: boolean; // treated as true unless explicitly false
+  /** Harness id ("pi" / "claude"); becomes gen_ai.agent.name ("agent" when omitted). */
+  harness?: string;
+  /** Resolved model id ("openai-codex/gpt-5.5"); split at the first "/" for the LLM span. */
+  model?: string;
+  /**
+   * Emit the span tree from the ACP event stream. Default true. Set false when the
+   * harness instruments itself (e.g. Pi via the agenta extension propagates the trace
+   * context and emits its own real turn/chat/tool spans) — then this only accumulates
+   * the reply text and builds no spans, so the two do not double up.
+   */
+  emitSpans?: boolean;
+}
+
+export interface RivetOtel {
+  /** Start invoke_agent (AGENT) + turn/chat spans under the caller's traceparent (none if !emitSpans). */
+  start(input: { prompt?: string; messages?: any[]; sessionId?: string }): void;
+  /** Feed one ACP `session/update` payload (the `update` object). */
+  handleUpdate(update: any): void;
+  /** End all open spans. Returns the assistant text, trimmed and startup-banner-stripped. */
+  finish(): string;
+  /** Flush this run's trace to Agenta (invoke_agent has a remote parent). */
+  flush(): Promise<void>;
+  /** Trace id of the run (the caller's trace when a traceparent was passed); set by `start`. */
+  traceId(): string | undefined;
+  /** Raw assistant text accumulated so far (not trimmed or banner-stripped, unlike `finish`). */
+  output(): string;
+}
+
+/**
+ * Build an ACP-event-driven tracer scoped to a single rivet run. Call `start` once,
+ * `handleUpdate` for every ACP session update, then `finish` + `await flush`.
+ */
+export function createRivetOtel(init: RivetOtelInit): RivetOtel {
+  ensureProvider();
+
+  const capture = init.captureContent !== false; // content capture is opt-out
+  const emitSpans = init.emitSpans !== false; // span emission is opt-out
+  const endpoint = init.endpoint ?? defaultTarget().endpoint;
+  const authorization = init.authorization ?? defaultTarget().authorization;
+  const { provider, id: modelId } = splitModel(init.model);
+  const tracer = trace.getTracer("agenta-rivet-otel", "0.1.0");
+
+  let agentSpan: Span | undefined;
+  let agentCtx: Context | undefined;
+  let turnSpan: Span | undefined;
+  let turnCtx: Context | undefined;
+  let llmSpan: Span | undefined;
+  let runTraceId: string | undefined;
+  let accumulated = ""; // assistant reply text built from agent_message_chunk updates
+  let usage: { cost?: number; total?: number } | undefined; // latest usage_update values (overwritten, not summed)
+  const toolSpans = new Map<string, { span: Span; name: string }>(); // open tool spans keyed by ACP toolCallId
+
+  function start(input: { prompt?: string; messages?: any[]; sessionId?: string }): void {
+    // Span-less mode (harness self-instruments): only track the trace id so the run can
+    // report it; the harness emits the spans under the propagated parent.
+    if (!emitSpans) {
+      const m = /^00-([0-9a-f]{32})-/.exec(init.traceparent ?? "");
+      runTraceId = m ? m[1] : undefined;
+      return;
+    }
+    const parent = parentContext(init.traceparent);
+    agentSpan = tracer.startSpan("invoke_agent", undefined, parent);
+    agentSpan.setAttribute("openinference.span.kind", "AGENT");
+    agentSpan.setAttribute("gen_ai.operation.name", "invoke_agent");
+    agentSpan.setAttribute("gen_ai.agent.name", init.harness ?? "agent");
+    const sessionId = input.sessionId ?? init.sessionId; // per-call id wins over the init-time one
+    if (sessionId) {
+      agentSpan.setAttribute("session.id", sessionId);
+      agentSpan.setAttribute("gen_ai.conversation.id", sessionId);
+    }
+    setInputs(agentSpan, { prompt: input.prompt ?? "" }, capture);
+
+    runTraceId = agentSpan.spanContext().traceId;
+    traceTargets.set(runTraceId, { endpoint, authorization }); // presumably read back by flushTrace — confirm
+    agentCtx = trace.setSpan(parent ?? context.active(), agentSpan);
+
+    // This tracer only ever opens one turn ("turn 0") with one chat span beneath it.
+    turnSpan = tracer.startSpan("turn 0", undefined, agentCtx);
+    turnSpan.setAttribute("openinference.span.kind", "CHAIN");
+    turnSpan.setAttribute("pi.turn.index", 0);
+    turnCtx = trace.setSpan(agentCtx, turnSpan);
+
+    llmSpan = tracer.startSpan(modelId ? `chat ${modelId}` : "chat", undefined, turnCtx);
+    llmSpan.setAttribute("openinference.span.kind", "LLM");
+    llmSpan.setAttribute("gen_ai.operation.name", "chat");
+    if (provider) llmSpan.setAttribute("gen_ai.system", provider);
+    if (modelId) llmSpan.setAttribute("gen_ai.request.model", modelId);
+    const inputMessages =
+      input.messages && input.messages.length
+        ? input.messages
+        : [{ role: "user", content: input.prompt ?? "" }]; // no history given: synthesize one user message
+    emitMessages(llmSpan, "llm.input_messages", inputMessages, capture);
+  }
+
+  function handleUpdate(update: any): void {
+    const kind = update?.sessionUpdate;
+    if (!kind) return;
+
+    if (kind === "agent_message_chunk") {
+      const t = acpBlockText(update.content);
+      if (!t) return;
+      // Pi streams pure deltas; Claude streams deltas plus a cumulative snapshot.
+      // Replace when a chunk is a superset of what we have, append otherwise.
+      if (t.startsWith(accumulated)) accumulated = t; // NOTE(review): also matches a delta that repeats the text so far — confirm
+      else accumulated += t;
+      return;
+    }
+
+    if (!emitSpans) return; // output accumulated above; spans come from the harness
+
+    if (kind === "tool_call") {
+      const id = update.toolCallId;
+      if (!id || !turnCtx) return;
+      const name = update.title || update.kind || "tool";
+      const span = tracer.startSpan(`execute_tool ${name}`, undefined, turnCtx);
+      span.setAttribute("openinference.span.kind", "TOOL");
+      span.setAttribute("gen_ai.operation.name", "execute_tool");
+      span.setAttribute("gen_ai.tool.name", String(name));
+      span.setAttribute("gen_ai.tool.call.id", String(id));
+      if (update.rawInput != null)
+        setInputs(span, update.rawInput as Record<string, unknown>, capture);
+      toolSpans.set(id, { span, name: String(name) });
+      // A tool_call can arrive already completed (status set up front).
+      maybeCloseTool(id, update);
+      return;
+    }
+
+    if (kind === "tool_call_update") {
+      maybeCloseTool(update.toolCallId, update);
+      return;
+    }
+
+    if (kind === "usage_update") {
+      const cost = update.cost?.amount;
+      const total = update.used;
+      usage = {
+        cost: typeof cost === "number" ? cost : usage?.cost, // keep the previous value when absent
+        total: typeof total === "number" ? total : usage?.total,
+      };
+    }
+  }
+
+  /** Close a tool span when the update marks it completed or failed. */
+  function maybeCloseTool(id: string | undefined, update: any): void {
+    if (!id) return;
+    const entry = toolSpans.get(id);
+    if (!entry) return;
+    const status = update?.status;
+    if (status !== "completed" && status !== "failed") return;
+    const out = acpToolContentText(update.content) || acpToolContentText(update.rawOutput);
+    setOutput(entry.span, out, capture);
+    if (status === "failed") entry.span.setStatus({ code: SpanStatusCode.ERROR });
+    entry.span.end();
+    toolSpans.delete(id);
+  }
+
+  function finish(): string {
+    const text = stripStartupBanner(accumulated.trim());
+    if (!emitSpans) return text;
+    if (llmSpan) {
+      emitMessages(
+        llmSpan,
+        "llm.output_messages",
+        [{ role: "assistant", content: text }],
+        capture,
+      );
+      if (usage?.total != null) {
+        llmSpan.setAttribute("gen_ai.usage.total_tokens", usage.total);
+      }
+      if (usage?.cost != null) llmSpan.setAttribute("gen_ai.usage.cost", usage.cost);
+      llmSpan.end();
+      llmSpan = undefined;
+    }
+    for (const { span } of toolSpans.values()) span.end(); // never reported completed/failed: closed without output or status
+    toolSpans.clear();
+    if (turnSpan) {
+      turnSpan.end();
+      turnSpan = undefined;
+    }
+    if (agentSpan) {
+      setOutput(agentSpan, text, capture);
+      agentSpan.end();
+      agentSpan = undefined;
+    }
+    agentCtx = undefined;
+    turnCtx = undefined;
+    return text;
+  }
+
+  return {
+    start,
+    handleUpdate,
+    finish,
+    flush: () => flushTrace(runTraceId),
+    traceId: () => runTraceId,
+    output: () => accumulated, // raw text: not trimmed or banner-stripped
+  };
+}
diff --git a/services/agent/src/cli.ts b/services/agent/src/cli.ts
index ed8b99c3ae..3eacd78abc 100644
--- a/services/agent/src/cli.ts
+++ b/services/agent/src/cli.ts
@@ -7,6 +7,14 @@
  * long-lived RPC adapter can replace it later behind the same Python-side port.
  */
 import { runPi, type AgentRunRequest, type AgentRunResult } from "./runPi.ts";
+import { runRivet } from "./runRivet.ts";
+
+// `rivet` drives the harness over ACP via a rivet daemon (WP-8); default = legacy Pi.
+const BACKEND = (process.env.AGENT_BACKEND ?? "pi").toLowerCase();
+
+function runAgent(request: AgentRunRequest): Promise<AgentRunResult> {
+  return (BACKEND === "rivet" ? runRivet : runPi)(request); // any other value -> legacy Pi
+}
 
 async function readStdin(): Promise<string> {
   const chunks: Buffer[] = [];
@@ -32,7 +40,7 @@ async function main(): Promise<void> {
   }
 
   try {
-    const result = await runPi(request);
+    const result = await runAgent(request);
     emit(result);
     process.exit(result.ok ? 0 : 1);
   } catch (err) {
diff --git a/services/agent/src/piExtension.ts b/services/agent/src/piExtension.ts
new file mode 100644
index 0000000000..94418a137f
--- /dev/null
+++ b/services/agent/src/piExtension.ts
@@ -0,0 +1,171 @@
+/**
+ * Agenta Pi extension (WP-8): tracing + tools, installed into Pi's agent dir and loaded
+ * by Pi when it runs under rivet (`pi --mode rpc` via pi-acp).
+ *
+ * This is how we keep WP-1/WP-2/WP-7 behavior on the rivet path: instead of a synthetic,
+ * coarse tracer in the runner, we propagate the caller's trace context INTO Pi and let
+ * Pi emit its real span tree (turn / chat / tool, with token usage) under that parent —
+ * and we deliver tools the Pi-native way (`registerTool`), each routing back to Agenta's
+ * /tools/call, rather than over MCP. Pi is highly customizable; this leans on that.
+ *
+ * Everything is read from the environment (injected at the daemon's birth), so nothing
+ * run-specific is written to the agent-visible filesystem:
+ *   AGENTA_TRACEPARENT            W3C traceparent of the caller's /invoke span
+ *   AGENTA_OTLP_ENDPOINT          OTLP traces URL (e.g. https://host/api/otlp/v1/traces)
+ *   AGENTA_OTLP_AUTHORIZATION     Authorization header for the OTLP export
+ *   AGENTA_CAPTURE_CONTENT        "false" to drop prompt/completion/tool I/O from spans
+ *   AGENTA_TOOL_SPECS             JSON [{ name, description, inputSchema, callRef }]
+ *   AGENTA_TOOL_CALLBACK_ENDPOINT full /tools/call URL
+ *   AGENTA_TOOL_CALLBACK_AUTH     Authorization header for the callback
+ *
+ * Bundled self-contained (esbuild) so its OpenTelemetry deps resolve wherever Pi loads
+ * it (local, the docker sidecar, a Daytona snapshot). Default export is the Pi
+ * ExtensionFactory.
+ */
+import { writeFileSync } from "node:fs";
+
+import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
+
+import { createAgentaOtel } from "./agenta-otel.ts";
+
+interface ToolSpec {
+  name: string; // Pi tool name; also used as its label
+  description?: string; // falls back to `name` when absent
+  inputSchema?: Record<string, unknown> | null; // JSON Schema; null/absent -> EMPTY_SCHEMA
+  callRef: string; // sent as `function.name` in the /tools/call request body
+}
+
+const TOOL_CALL_TIMEOUT_MS = Math.max(0, Number(process.env.AGENTA_AGENT_TOOL_CALL_TIMEOUT_MS)) || 30000; // unset/empty/NaN/<=0 -> 30s
+const EMPTY_SCHEMA = { type: "object", properties: {}, additionalProperties: true };
+
+function log(message: string): void {
+  process.stderr.write("[agenta-pi-ext] " + message + "\n"); // stderr, never stdout
+}
+
+/** One /tools/call round-trip. Returns the result text; throws on failure (Pi turns a
+ *  thrown execute into a tool-error result, so the loop continues). */
+async function callAgentaTool(
+  endpoint: string,
+  authorization: string | undefined,
+  callRef: string,
+  toolCallId: string,
+  args: unknown,
+  signal?: AbortSignal,
+): Promise<string> {
+  const headers: Record<string, string> = { "content-type": "application/json" };
+  if (authorization) headers["authorization"] = authorization;
+
+  const timeoutSignal = AbortSignal.timeout(TOOL_CALL_TIMEOUT_MS);
+  const anyOf = (AbortSignal as any).any;
+  const combined =
+    signal && typeof anyOf === "function" ? anyOf([signal, timeoutSignal]) : timeoutSignal;
+  const payload = {
+    data: { id: toolCallId, type: "function", function: { name: callRef, arguments: args ?? {} } },
+  };
+
+  let response: Response;
+  let bodyText: string;
+  try {
+    response = await fetch(endpoint, {
+      method: "POST",
+      headers,
+      body: JSON.stringify(payload),
+      signal: combined,
+    });
+    // Read the body inside the same try: the signal can also fire mid-stream, and that
+    // failure should carry the same "tool call … failed" context as a connect error.
+    bodyText = await response.text();
+  } catch (err) {
+    throw new Error(`tool call ${callRef} failed: ${err instanceof Error ? err.message : String(err)}`);
+  }
+
+  if (!response.ok) {
+    throw new Error(`tool call ${callRef} returned HTTP ${response.status}: ${bodyText.slice(0, 500)}`);
+  }
+  try {
+    // Prefer `call.data.content` from a JSON body; otherwise hand back the raw body text.
+    const content = JSON.parse(bodyText)?.call?.data?.content;
+    if (typeof content === "string") return content;
+    if (content != null) return JSON.stringify(content);
+    return bodyText;
+  } catch {
+    return bodyText;
+  }
+}
+
+/** Register the resolved tools (from env) as Pi tools that call back to Agenta.
+ *  Malformed AGENTA_TOOL_SPECS (bad JSON, non-array, entries lacking name/callRef) is
+ *  logged and skipped rather than thrown, so a bad env never breaks extension load. */
+function registerTools(pi: ExtensionAPI): void {
+  const raw = process.env.AGENTA_TOOL_SPECS;
+  const endpoint = process.env.AGENTA_TOOL_CALLBACK_ENDPOINT;
+  if (!raw || !endpoint) return;
+
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(raw);
+  } catch (err) {
+    log(`bad AGENTA_TOOL_SPECS: ${(err as Error).message}`);
+    return;
+  }
+  if (!Array.isArray(parsed)) return log("bad AGENTA_TOOL_SPECS: expected a JSON array");
+  const isSpec = (s: any): s is ToolSpec =>
+    !!s && typeof s.name === "string" && !!s.name && typeof s.callRef === "string";
+  const specs = parsed.filter(isSpec);
+  if (specs.length < parsed.length) log(`skipped ${parsed.length - specs.length} malformed spec(s)`);
+  const authorization = process.env.AGENTA_TOOL_CALLBACK_AUTH;
+
+  for (const spec of specs) {
+    pi.registerTool({
+      name: spec.name,
+      label: spec.name,
+      description: spec.description ?? spec.name,
+      // Pi accepts plain JSON Schema here (non-TypeBox validation path).
+      parameters: (spec.inputSchema as any) ?? EMPTY_SCHEMA,
+      async execute(toolCallId: string, params: unknown, signal?: AbortSignal) {
+        const text = await callAgentaTool(endpoint, authorization, spec.callRef, toolCallId, params, signal);
+        return { content: [{ type: "text", text }], details: { callRef: spec.callRef } };
+      },
+    } as any);
+  }
+  log(`registered ${specs.length} tool(s) -> ${endpoint}`);
+}
+
+/** The Pi ExtensionFactory: tools + (env-driven) tracing + usage writeback. */
+const factory = (pi: ExtensionAPI): void => {
+  // Fully inert unless Agenta wired this run (so it is safe to install globally in a
+  // shared Pi agent dir — a normal `pi` session with no Agenta env does nothing).
+  const hasTracing = !!(process.env.AGENTA_TRACEPARENT || process.env.AGENTA_OTLP_ENDPOINT);
+  const hasTools = !!(process.env.AGENTA_TOOL_SPECS && process.env.AGENTA_TOOL_CALLBACK_ENDPOINT);
+  const usageOut = process.env.AGENTA_USAGE_OUT;
+  if (!hasTracing && !hasTools && !usageOut) return;
+
+  if (hasTools) registerTools(pi);
+  // The otel state is needed for the span export (tracing) AND for the usage totals the
+  // runner reads back on Daytona, where the in-sandbox process can't reach Agenta's OTLP.
+  // So set it up whenever either applies; only flush (export) when tracing is on.
+  if (!hasTracing && !usageOut) return;
+
+  const otel = createAgentaOtel({
+    traceparent: process.env.AGENTA_TRACEPARENT,
+    endpoint: process.env.AGENTA_OTLP_ENDPOINT,
+    authorization: process.env.AGENTA_OTLP_AUTHORIZATION,
+    captureContent: process.env.AGENTA_CAPTURE_CONTENT !== "false",
+  });
+  otel.register(pi); // lifecycle handlers (spans + usage accumulation)
+
+  pi.on("agent_end", async () => {
+    // Usage first: a cheap local write that must not be lost to a slow or failing export.
+    if (usageOut) {
+      try {
+        writeFileSync(usageOut, JSON.stringify(otel.usage()), "utf-8");
+      } catch (err) {
+        log(`usage writeback skipped: ${(err as Error).message}`);
+      }
+    }
+    // invoke_agent has a remote parent → flush by id; a failed export is logged, not thrown.
+    if (hasTracing) await otel.flush().catch((e) => log(`trace flush failed: ${(e as Error).message}`));
+  });
+};
+
+export default factory;
diff --git a/services/agent/src/runPi.ts b/services/agent/src/runPi.ts
index 4056d0dce7..74a7ab98ac 100644
--- a/services/agent/src/runPi.ts
+++ b/services/agent/src/runPi.ts
@@ -84,6 +84,15 @@ export interface ToolCallbackContext {
 }
 
 export interface AgentRunRequest {
+  /** Harness id for the rivet backend ("pi" / "claude"). Ignored by the Pi backend. */
+  harness?: string;
+  /** Sandbox for the rivet backend ("local" / "daytona"). Ignored by the Pi backend. */
+  sandbox?: string;
+  /** Continue a prior run by replaying its history. The rivet backend resumes by id. */
+  sessionId?: string;
+  /** Provider API keys as env vars ({OPENAI_API_KEY,...}), resolved from the vault.
+   *  Injected into the harness env; empty means the harness uses its own login (OAuth). */
+  secrets?: Record<string, string>;
   /** AGENTS.md text injected as the agent's instructions (in memory). */
   agentsMd?: string;
   /** Model id ("gpt-5.5") or "provider/id" ("openai-codex/gpt-5.5"). */
@@ -109,6 +118,8 @@ export interface AgentRunResult {
   model?: string;
   /** Trace id of the run (the caller's trace when a traceparent was passed). */
   traceId?: string;
+  /** Run token/cost totals, for roll-up onto the caller's workflow span. */
+  usage?: { input: number; output: number; total: number; cost: number };
   error?: string;
 }
 
diff --git a/services/agent/src/runRivet.ts b/services/agent/src/runRivet.ts
new file mode 100644
index 0000000000..45b5657834
--- /dev/null
+++ b/services/agent/src/runRivet.ts
@@ -0,0 +1,698 @@
+/**
+ * WP-8 rivet harness driver.
+ *
+ * Drives a coding harness (Pi, Claude Code, ...) over the Agent Client Protocol (ACP)
+ * through a rivet `sandbox-agent` daemon, instead of the bespoke Pi SDK calls in
+ * runPi.ts. It serves the same /run contract (AgentRunRequest -> AgentRunResult), so
+ * the Python side stays thin and the choice of harness/sandbox is config, not new code.
+ *
+ * Per invoke (cold), mirroring the shipped code-evaluator DaytonaRunner pattern:
+ *
+ *   SandboxAgent.start({ sandbox: local({ env }) | daytona({ create }) })
+ *     -> createSession({ agent: <harness>, cwd, model })
+ *       -> write AGENTS.md into cwd
+ *       -> session.prompt([{ type: "text", text }])
+ *         -> accumulate ACP `agent_message_chunk` text + build the trace
+ *           -> destroySandbox()
+ *
+ * Two orthogonal axes swap independently: the sandbox (where the daemon runs) and the
+ * harness (which engine). The ACP boundary is daemon-to-harness; the service-to-rivet
+ * hop stays harness-agnostic behind the Harness port.
+ *
+ * Tracing is built here from the ACP event stream (see agenta-otel.ts createRivetOtel),
+ * so it is uniform across every harness and always nests under the caller's /invoke
+ * span. stdout is reserved for the JSON result (see cli.ts); logs go to stderr.
+ */
+import { randomBytes } from "node:crypto";
+import {
+  chmodSync,
+  copyFileSync,
+  existsSync,
+  mkdirSync,
+  mkdtempSync,
+  readdirSync,
+  readFileSync,
+  rmSync,
+  writeFileSync,
+} from "node:fs";
+import { createRequire } from "node:module";
+import { tmpdir } from "node:os";
+import { dirname, join } from "node:path";
+import { fileURLToPath } from "node:url";
+
+import { SandboxAgent, InMemorySessionPersistDriver } from "sandbox-agent";
+import { local } from "sandbox-agent/local";
+import { daytona } from "sandbox-agent/daytona";
+
+import { createRivetOtel } from "./agenta-otel.ts";
+import { buildToolMcpServers, type ResolvedToolSpec, type ToolCallbackContext } from "./toolBridge.ts";
+import type { AgentRunRequest, AgentRunResult, ChatMessage } from "./runPi.ts";
+
+const require = createRequire(import.meta.url);
+// services/agent/src/runRivet.ts -> services/agent
+const PKG_ROOT = dirname(dirname(fileURLToPath(import.meta.url)));
+const ADAPTER_BIN_DIR = join(PKG_ROOT, "node_modules", ".bin");
+
+/** Map node platform/arch to the @sandbox-agent CLI binary package. */
+const CLI_PACKAGES: Record<string, string> = {
+  "darwin-arm64": "@sandbox-agent/cli-darwin-arm64",
+  "darwin-x64": "@sandbox-agent/cli-darwin-x64",
+  "linux-x64": "@sandbox-agent/cli-linux-x64",
+  "linux-arm64": "@sandbox-agent/cli-linux-arm64",
+  "win32-x64": "@sandbox-agent/cli-win32-x64",
+};
+
+function log(message: string): void {
+  process.stderr.write("[rivet-wrapper] " + message + "\n");
+}
+
+/**
+ * Resolve the sandbox-agent daemon binary. Prefers SANDBOX_AGENT_BIN, then the
+ * platform CLI package shipped with `sandbox-agent` (resolved from the SDK's own
+ * location, since pnpm nests it under `sandbox-agent`). Ensures it is executable
+ * (pnpm may skip the package's chmod postinstall). Returns undefined when not found;
+ * the local provider then runs its own resolution and surfaces a clear error.
+ */
+function resolveDaemonBinary(): string | undefined {
+  const override = process.env.SANDBOX_AGENT_BIN;
+  if (override && existsSync(override)) return ensureExecutable(override);
+
+  const cliPackage = CLI_PACKAGES[`${process.platform}-${process.arch}`];
+  if (!cliPackage) return undefined;
+  const exeName = process.platform === "win32" ? "sandbox-agent.exe" : "sandbox-agent";
+  try {
+    // Look the CLI package up through a require anchored at the SDK's main entry: in
+    // the pnpm layout the platform package sits next to the SDK, inside the SDK's own
+    // node_modules, so that is the context it is resolvable from.
+    const fromSdk = createRequire(require.resolve("sandbox-agent"));
+    const manifest = fromSdk.resolve(`${cliPackage}/package.json`);
+    const shipped = join(dirname(manifest), "bin", exeName);
+    if (existsSync(shipped)) return ensureExecutable(shipped);
+  } catch {
+    // not resolvable that way: try the store scan below
+  }
+  // Last resort: walk the pnpm virtual store for this platform's CLI package.
+  try {
+    const pnpmStore = join(PKG_ROOT, "node_modules", ".pnpm");
+    const prefix = `@sandbox-agent+cli-${process.platform}`;
+    for (const dirName of readdirSync(pnpmStore).filter((d) => d.startsWith(prefix))) {
+      const found = join(pnpmStore, dirName, "node_modules", cliPackage, "bin", exeName);
+      if (existsSync(found)) return ensureExecutable(found);
+    }
+  } catch {
+    // no pnpm store on disk
+  }
+  return undefined;
+}
+
+function ensureExecutable(path: string): string {
+  // Best-effort chmod to rwxr-xr-x; the path is handed back either way. A failure
+  // (e.g. read-only fs of a baked snapshot that is already +x) is deliberately ignored.
+  try {
+    chmodSync(path, 0o755);
+  } catch {}
+  return path;
+}
+
+// The bundled Agenta Pi extension (tracing + tools). Built by `pnpm run build:extension`
+// and baked into the image; installed into Pi's agent dir so Pi loads it on every run.
+const EXTENSION_BUNDLE =
+  process.env.AGENTA_RIVET_EXTENSION_BUNDLE ?? join(PKG_ROOT, "dist", "extensions", "agenta.js");
+
+/**
+ * Env the Agenta Pi extension reads. Propagating the trace context here is what makes Pi
+ * emit its real spans under the caller's `/invoke` span; the tool spec + callback make Pi
+ * register the resolved tools natively (no MCP). Empty keys are omitted so the extension
+ * stays inert when nothing applies.
+ */
+function buildPiExtensionEnv(
+  request: AgentRunRequest,
+  tracing: boolean,
+): Record<string, string> {
+  const vars: Record<string, string> = {};
+  // With tracing off (Daytona: the harness process can't reach Agenta's OTLP) no trace
+  // env is emitted; the runner traces from the event stream and the extension is left
+  // with tools + the usage writeback only.
+  const ctx = tracing ? request.trace : undefined;
+  if (ctx?.traceparent) vars.AGENTA_TRACEPARENT = ctx.traceparent;
+  if (ctx?.endpoint) vars.AGENTA_OTLP_ENDPOINT = ctx.endpoint;
+  if (ctx?.authorization) vars.AGENTA_OTLP_AUTHORIZATION = ctx.authorization;
+  if (ctx && ctx.captureContent === false) vars.AGENTA_CAPTURE_CONTENT = "false";
+
+  // Tools are only wired when there are specs AND an endpoint to call back to.
+  const toolSpecs = (request.customTools as ResolvedToolSpec[]) ?? [];
+  const callback = request.toolCallback;
+  if (toolSpecs.length === 0 || !callback?.endpoint) return vars;
+
+  vars.AGENTA_TOOL_SPECS = JSON.stringify(toolSpecs);
+  vars.AGENTA_TOOL_CALLBACK_ENDPOINT = callback.endpoint;
+  if (callback.authorization) vars.AGENTA_TOOL_CALLBACK_AUTH = callback.authorization;
+  return vars;
+}
+
+/** Copy the extension bundle to `<agentDir>/extensions/agenta.js`; problems are only logged. */
+function installPiExtensionLocal(agentDir: string): void {
+  if (existsSync(EXTENSION_BUNDLE)) {
+    try {
+      const target = join(agentDir, "extensions");
+      mkdirSync(target, { recursive: true });
+      copyFileSync(EXTENSION_BUNDLE, join(target, "agenta.js"));
+    } catch (err) {
+      log(`pi extension install skipped: ${(err as Error).message}`);
+    }
+    return;
+  }
+  log(`pi extension bundle missing at ${EXTENSION_BUNDLE} (run build:extension)`);
+}
+
+/** Upload the extension bundle into the sandbox's Pi extensions dir; problems are only logged. */
+async function uploadPiExtensionToSandbox(sandbox: any, agentDir: string): Promise<void> {
+  if (!existsSync(EXTENSION_BUNDLE)) return log(`pi extension bundle missing at ${EXTENSION_BUNDLE}`);
+  try {
+    const dir = `${agentDir}/extensions`;
+    await sandbox.mkdirFs({ path: dir });
+    await sandbox.writeFsFile({ path: `${dir}/agenta.js` }, readFileSync(EXTENSION_BUNDLE, "utf-8"));
+  } catch (err) {
+    log(`pi extension upload skipped: ${(err as Error).message}`);
+  }
+}
+
+/**
+ * The environment the daemon is born with. The local provider merges this into the
+ * `sandbox-agent server` subprocess, which passes it to the ACP adapter and then to
+ * the harness. This is also where per-invoke trace/secret injection would go for a
+ * warm-daemon model; under one-daemon-per-invoke the in-process tracer handles spans,
+ * so this only needs to make the adapters and harness resolvable + authed.
+ */
+function buildDaemonEnv(harness: string): Record<string, string> {
+  const env: Record<string, string> = {};
+
+  // Adapters (pi-acp, claude-agent-acp) and the pi CLI live in our node_modules/.bin; the
+  // claude CLI is on the inherited PATH. Prepend ours, joined with the platform separator.
+  const dirs = [ADAPTER_BIN_DIR, process.env.AGENTA_RIVET_ADAPTER_PATH, process.env.PATH];
+  env.PATH = dirs.filter(Boolean).join(process.platform === "win32" ? ";" : ":");
+
+  // Pi: point pi-acp at our pi bin and the agent dir that carries auth.json.
+  env.PI_ACP_PI_COMMAND =
+    process.env.AGENTA_RIVET_PI_COMMAND ?? join(ADAPTER_BIN_DIR, "pi");
+  const piAgentDir = process.env.PI_CODING_AGENT_DIR;
+  if (piAgentDir) env.PI_CODING_AGENT_DIR = piAgentDir;
+
+  // Keep HOME so harness logins (~/.pi/agent, ~/.claude) resolve.
+  if (process.env.HOME) env.HOME = process.env.HOME;
+
+  // Harness LLM auth passed as launch env, never written into the agent filesystem.
+  for (const key of [
+    "OPENAI_API_KEY",
+    "ANTHROPIC_API_KEY",
+    "ANTHROPIC_AUTH_TOKEN",
+    "CLAUDE_CODE_OAUTH_TOKEN",
+    "CLAUDE_CONFIG_DIR",
+    "GEMINI_API_KEY",
+  ]) {
+    const value = process.env[key];
+    if (value) env[key] = value;
+  }
+
+  return env;
+}
+
+/** The latest user turn: explicit prompt, else last user message content. */
+function resolvePrompt(request: AgentRunRequest): string {
+  const explicit = request.prompt;
+  if (explicit && explicit.trim()) return explicit;
+  // Search a reversed copy so the first hit is the most recent non-empty user turn.
+  const newestFirst = [...(request.messages ?? [])].reverse();
+  const lastUser = newestFirst.find((m) => m.role === "user" && m.content);
+  return lastUser ? lastUser.content : "";
+}
+
+/** Prior turns (everything before the latest user message) for trace + history. */
+function priorMessages(request: AgentRunRequest): ChatMessage[] {
+  const turns = request.messages ?? [];
+  const current = resolvePrompt(request);
+  // A trailing user turn is taken to be the prompt being sent, so it is left out.
+  const endsWithUser = turns.length > 0 && turns[turns.length - 1].role === "user";
+  if (endsWithUser) return turns.slice(0, turns.length - 1);
+  // Otherwise the prompt came in explicitly: leave out user turns with that exact text.
+  // NOTE(review): this also removes earlier user turns that merely repeat it — confirm.
+  return turns.filter((turn) => turn.role !== "user" || turn.content !== current);
+}
+
+/**
+ * The text sent over ACP for this turn. Each invoke is a cold sandbox, so prior turns
+ * are replayed as transcript context ahead of the latest user message — this is the
+ * "persisted message history replayed" model, with the client/playground holding the
+ * history. Capped by AGENTA_AGENT_HISTORY_MAX_CHARS so replay tokens stay bounded.
+ */
+function buildTurnText(request: AgentRunRequest): string {
+  const latest = resolvePrompt(request);
+  const history = priorMessages(request).filter((m) => m.content);
+  if (history.length === 0) return latest;
+
+  // An unset, non-numeric, or < 1 cap falls back to the default: NaN never compared as
+  // "too long" and slice(-0) returned the whole transcript, so both left replay unbounded.
+  const cap = Number(process.env.AGENTA_AGENT_HISTORY_MAX_CHARS);
+  const maxChars = cap >= 1 ? Math.floor(cap) : 24000;
+  const full = history.map((m) => `${m.role}: ${m.content}`).join("\n");
+  const transcript = full.length > maxChars ? full.slice(-maxChars) : full;
+  return `Conversation so far:\n${transcript}\n\nContinue the conversation. The user now says:\n${latest}`;
+}
+
+/**
+ * Pick the harness-specific model id for a requested name. Harnesses expose their own
+ * ids (Pi: "openai-codex/gpt-5.5"; Claude: its own). Match exact, then by the id after
+ * the provider prefix, so "gpt-5.5" resolves to "openai-codex/gpt-5.5".
+ */
+function pickModel(allowed: string[], wanted?: string): string | undefined {
+  if (!wanted) return undefined;
+  // Drop everything up to the first "/" (the provider prefix); ids without one pass through.
+  const bare = (id: string) => id.substring(id.indexOf("/") + 1);
+  const exact = allowed.find((id) => id === wanted);
+  if (exact !== undefined) return exact;
+  const bySuffix = allowed.find((id) => bare(id) === wanted);
+  if (bySuffix !== undefined) return bySuffix;
+  return allowed.find((id) => bare(id) === bare(wanted));
+}
+
+/** Enumerate the harness's selectable model ids from the session config options. */
+async function allowedModels(session: any): Promise<string[]> {
+  // Any failure while querying the session reads as "no selectable models".
+  try {
+    const isModelOption = (opt: any) => opt.category === "model" || opt.id === "model";
+    const configOptions: any[] = (await session.getConfigOptions()) ?? [];
+    const entries: any[] = configOptions.find(isModelOption)?.options ?? [];
+    // Keep only entries that carry a non-empty id.
+    return entries.flatMap((entry: any) => (entry.id ? [entry.id] : []));
+  } catch {
+    return [];
+  }
+}
+
+/** Extract the ids listed after "Allowed values:" at the end of a rejection's message. */
+function allowedFromError(err: unknown): string[] {
+  const text = String((err as Error)?.message ?? err);
+  const listed = text.match(/Allowed values:\s*(.+?)\s*$/);
+  if (listed === null) return [];
+  const ids: string[] = [];
+  for (const part of listed[1].split(",")) if (part.trim()) ids.push(part.trim());
+  return ids;
+}
+
+/**
+ * Apply the requested model to a session, normalizing to the harness's own id. Tries the
+ * value as given first (already-qualified ids pass); on rejection it reads the allowed
+ * ids from the error (always listed there) or the session config and retries a match.
+ * Returns the id set, or undefined when no match exists (the harness keeps its default
+ * rather than failing the run).
+ */
+async function applyModel(session: any, wanted?: string): Promise<string | undefined> {
+  if (!wanted) return undefined;
+  try {
+    await session.setModel(wanted);
+    return wanted;
+  } catch (rejection) {
+    let candidates = allowedFromError(rejection);
+    if (candidates.length === 0) candidates = await allowedModels(session);
+    const normalized = pickModel(candidates, wanted);
+    if (normalized && normalized !== wanted) {
+      try {
+        await session.setModel(normalized);
+        return normalized;
+      } catch {
+        // the normalized id was refused too: report below and keep the harness default
+      }
+    }
+    log(`model '${wanted}' not settable (${(rejection as Error).message}); using harness default`);
+    return undefined;
+  }
+}
+
+/**
+ * In-sandbox env for the Daytona daemon: where Pi reads its login, any provider keys,
+ * and the Agenta extension env (traceparent + OTLP + tool spec) so the remote Pi traces
+ * and runs tools exactly like local. No local-only paths (PATH/PI_ACP_PI_COMMAND) here.
+ */
+function daytonaEnvVars(
+  piExtEnv: Record<string, string>,
+  secrets: Record<string, string>,
+): Record<string, string> {
+  // Later sources win: the extension env overrides the agent dir entry, and the vault's
+  // provider keys (what the in-sandbox harness authenticates with) override both.
+  const vars: Record<string, string> = Object.assign(
+    { PI_CODING_AGENT_DIR: DAYTONA_PI_DIR },
+    piExtEnv,
+    secrets,
+  );
+  // The image has no `pi`; when we install one, pi-acp is pointed at that install.
+  const installedPi = `${DAYTONA_PI_INSTALL_DIR}/node_modules/.bin/pi`;
+  if (DAYTONA_PI_INSTALL) vars.PI_ACP_PI_COMMAND = installedPi;
+  return vars;
+}
+
+/**
+ * Build the rivet sandbox provider for the requested axis.
+ *
+ * Daytona needs an image with both the rivet daemon and the harness CLI. Rivet's `-full`
+ * image ships the daemon and ACP adapters but NOT the `pi` CLI, so a pre-baked snapshot
+ * (`AGENTA_RIVET_DAYTONA_SNAPSHOT`, built by poc/build_rivet_snapshot.py) adds `pi` and
+ * avoids a ~150s per-invoke `npm install pi`; `AGENTA_RIVET_DAYTONA_IMAGE` sets a plain
+ * image instead. NOTE(review): `agenta-rivet-pi` was documented as the default snapshot,
+ * but none is applied here — confirm. The code-evaluator DAYTONA_SNAPSHOT is NOT reused
+ * (no daemon). Provider keys come from the vault env; OAuth login is uploaded only without one.
+ */
+function buildSandboxProvider(
+  sandboxId: string,
+  env: Record<string, string>,
+  binaryPath: string | undefined,
+  piExtEnv: Record<string, string>,
+  secrets: Record<string, string>,
+) {
+  if (sandboxId === "daytona") {
+    const snapshot = process.env.AGENTA_RIVET_DAYTONA_SNAPSHOT;
+    const image = process.env.AGENTA_RIVET_DAYTONA_IMAGE;
+    const target = process.env.DAYTONA_TARGET;
+    return daytona({
+      ...(image ? { image } : {}),
+      create: {
+        // The rivet provider always sets a default `image`, which Daytona turns into a
+        // build entry that conflicts with `snapshot`. Spreading image:undefined last
+        // suppresses that so the snapshot is used as-is.
+        ...(snapshot ? { snapshot, image: undefined } : {}),
+        ...(target ? { target } : {}),
+        envVars: daytonaEnvVars(piExtEnv, secrets),
+        ephemeral: true,
+      } as any,
+    });
+  }
+  // local: spawn `sandbox-agent server` on this host with the daemon env merged in.
+  const logMode = (process.env.AGENTA_RIVET_DAEMON_LOG ?? "silent") as any;
+  return local({ env, binaryPath, log: logMode });
+}
+
+/** In-sandbox Pi agent dir on the rivet `-full` image (daemon runs as user `sandbox`). */
+const DAYTONA_PI_DIR = process.env.AGENTA_RIVET_DAYTONA_PI_DIR ?? "/home/sandbox/.pi/agent";
+// The rivet `-full` image ships the pi-acp adapter but NOT the `pi` CLI, so by default we
+// install it into the sandbox at session time and point pi-acp at it. A snapshot that
+// pre-installs `pi` should set AGENTA_RIVET_DAYTONA_INSTALL_PI=false (faster, no per-run
+// npm install). Version mirrors the wrapper's pinned Pi.
+const DAYTONA_PI_INSTALL_DIR = "/home/sandbox/.agenta-pi";
+const DAYTONA_PI_INSTALL = process.env.AGENTA_RIVET_DAYTONA_INSTALL_PI !== "false";
+const DAYTONA_PI_VERSION = process.env.AGENTA_RIVET_PI_VERSION ?? "0.79.4";
+
+/** Install the `pi` CLI into a Daytona sandbox (the rivet image lacks it). Best-effort. */
+async function installPiInSandbox(sandbox: any): Promise<void> {
+  // The package is pinned to DAYTONA_PI_VERSION rather than left floating.
+  const spec = `@earendil-works/pi-coding-agent@${DAYTONA_PI_VERSION}`;
+  try {
+    // npm runs with the install dir as cwd, so create it first.
+    await sandbox.mkdirFs({ path: DAYTONA_PI_INSTALL_DIR });
+    const outcome = await sandbox.runProcess({
+      command: "npm",
+      args: ["install", "--no-fund", "--no-audit", spec],
+      cwd: DAYTONA_PI_INSTALL_DIR,
+      timeoutMs: 180_000,
+    });
+    // Success needs no report; anything else logs the tail of stderr for diagnosis.
+    if (outcome?.exitCode === 0) return;
+    const stderrTail = String(outcome?.stderr).slice(-400);
+    log(`pi install in sandbox exit=${outcome?.exitCode}: ${stderrTail}`);
+  } catch (err) {
+    // Best-effort: an install failure is logged and the caller carries on.
+    log(`pi install in sandbox skipped: ${(err as Error).message}`);
+  }
+}
+
+/**
+ * Upload the local Pi login into a Daytona sandbox so the remote Pi authenticates with
+ * the dev's ChatGPT/Codex OAuth (it auto-refreshes from the token in auth.json). Must
+ * `mkdirFs` the parent first (a fresh sandbox lacks it) and pass a string body — a
+ * missing dir or a stream body is what produced the earlier "Stream Error". Best-effort:
+ * with no local login the remote run falls back to any provider key in the sandbox env.
+ */
+async function uploadPiAuthToSandbox(sandbox: any): Promise<void> {
+  const home = process.env.HOME ?? "";
+  const localDir = process.env.PI_CODING_AGENT_DIR || join(home, ".pi/agent");
+  const authFile = join(localDir, "auth.json");
+  const settingsFile = join(localDir, "settings.json");
+  if (!existsSync(authFile)) return;
+  try {
+    await sandbox.mkdirFs({ path: DAYTONA_PI_DIR });
+    const auth = readFileSync(authFile, "utf-8");
+    await sandbox.writeFsFile({ path: `${DAYTONA_PI_DIR}/auth.json` }, auth);
+    // settings.json is optional and only travels alongside a login.
+    if (!existsSync(settingsFile)) return;
+    const settings = readFileSync(settingsFile, "utf-8");
+    await sandbox.writeFsFile({ path: `${DAYTONA_PI_DIR}/settings.json` }, settings);
+  } catch (err) {
+    log(`pi auth upload skipped: ${(err as Error).message}`);
+  }
+}
+
+/**
+ * A `fetch` that persists cookies per host. Daytona's preview proxy authenticates with a
+ * `daytona-sandbox-auth-*` cookie set on the first response; Node's fetch keeps no cookie
+ * jar, so without this the proxy rejects later ACP requests with "Authentication
+ * required" / 502. The rivet SDK accepts a custom fetch, so we hand it this one.
+ */
+function createCookieFetch(): typeof fetch {
+  const jar = new Map<string, Map<string, string>>(); // host -> (name -> "name=value")
+  return async (input: any, init?: any) => {
+    const isRequest = typeof input !== "string" && !(input instanceof URL);
+    const host = new URL(isRequest ? input.url : input).host; // string | URL | Request
+    const cookies = jar.get(host);
+    const headers = new Headers(init?.headers ?? (isRequest ? input.headers : undefined));
+    if (cookies && cookies.size > 0) {
+      const existing = headers.get("cookie");
+      const merged = [...cookies.values()];
+      if (existing) merged.unshift(existing);
+      headers.set("cookie", merged.join("; "));
+    }
+    const response = await fetch(input, { ...init, headers });
+    const setCookies =
+      typeof (response.headers as any).getSetCookie === "function"
+        ? (response.headers as any).getSetCookie()
+        : (response.headers.get("set-cookie") ? [response.headers.get("set-cookie")] : []);
+    if (setCookies.length) {
+      const store = jar.get(host) ?? new Map<string, string>();
+      for (const sc of setCookies) {
+        const pair = String(sc).split(";")[0];
+        const name = pair.split("=")[0];
+        if (name) store.set(name, pair);
+      }
+      jar.set(host, store);
+    }
+    return response;
+  };
+}
+
+/** Read the run-total usage Pi wrote on agent_end (local fs or the sandbox FS API). */
+async function readRunUsage(
+  sandbox: any,
+  path: string | undefined,
+  isDaytona: boolean,
+): Promise<AgentRunResult["usage"]> {
+  if (!path) return undefined;
+  // Any failure (missing file, unreadable content, invalid JSON) reads as "no usage".
+  try {
+    const fetchRemote = async (): Promise<string> => {
+      const data = await sandbox.readFsFile({ path });
+      return typeof data === "string" ? data : new TextDecoder().decode(data);
+    };
+    if (!isDaytona && !existsSync(path)) return undefined;
+    const text = isDaytona ? await fetchRemote() : readFileSync(path, "utf-8");
+    const usage = JSON.parse(text);
+    if (usage && usage.total > 0) return usage;
+    return undefined;
+  } catch {
+    return undefined;
+  }
+}
+
+/**
+ * Turn a harness/SDK error into one clear line for the caller (the playground shows it
+ * verbatim), instead of dumping a full ACP/JS stack. Recognizes the common harness auth
+ * failures so the user sees what to fix.
+ */
+function conciseError(err: unknown, harness: string): string {
+  const raw = err instanceof Error ? err.message : String(err);
+  const msg = raw.split("\n")[0].trim();
+  const keyHint = `the project's ${harness === "claude" ? "Anthropic" : "OpenAI"} key`;
+  if (/credit balance is too low/i.test(raw)) {
+    return `${harness}: the model provider account has insufficient credit (check ${keyHint}).`;
+  }
+  // `401` must stand alone as a status code: a bare /401/ also matched ports and ids.
+  if (/authentication required|invalid api key|\b401\b|unauthorized/i.test(raw)) {
+    return `${harness}: model authentication failed — add ${keyHint} to the project vault, or log in (OAuth).`;
+  }
+  return msg || "agent run failed";
+}
+
+export async function runRivet(request: AgentRunRequest): Promise<AgentRunResult> {
+  const harness = request.harness || process.env.AGENTA_AGENT_HARNESS || "pi";
+  const sandboxId = request.sandbox || process.env.AGENTA_AGENT_SANDBOX || "local";
+
+  const prompt = resolvePrompt(request);
+  if (!prompt) return { ok: false, error: "No user message to send (prompt/messages empty)." };
+  // What we actually send over ACP: the latest turn, with prior turns replayed as
+  // context when this is a continued conversation.
+  const turnText = buildTurnText(request);
+
+  const isPi = harness === "pi";
+  const isDaytona = sandboxId === "daytona";
+
+  // Provider API keys resolved from the vault (OPENAI_API_KEY/ANTHROPIC_API_KEY/...).
+  // Present => the harness authenticates with the key; absent => it uses its own login
+  // (OAuth: local Codex / a mounted-or-uploaded auth.json).
+  const secrets = request.secrets ?? {};
+  const harnessKeyVar = harness === "claude" ? "ANTHROPIC_API_KEY" : "OPENAI_API_KEY";
+  const hasApiKey = !!secrets[harnessKeyVar];
+
+  const env = buildDaemonEnv(harness);
+  Object.assign(env, secrets); // local daemon inherits the provider keys
+  // Pi self-instruments locally: propagate the trace context + tools into Pi via the
+  // Agenta extension (real spans + native tools). On Daytona the in-sandbox process
+  // can't reach Agenta's OTLP, so the extension skips tracing (tools + usage only) and
+  // the runner traces from the ACP event stream instead — hence emitSpans on Daytona.
+  const piExtEnv = isPi ? buildPiExtensionEnv(request, !isDaytona) : {};
+  Object.assign(env, piExtEnv); // local daemon inherits it; daytona gets it via envVars
+  // undefined is fine: the local provider runs its own resolution and errors clearly.
+  const binaryPath = resolveDaemonBinary();
+
+  // For local Pi, install the extension into the agent dir Pi loads from.
+  const localPiAgentDir = process.env.PI_CODING_AGENT_DIR;
+  if (isPi && !isDaytona && localPiAgentDir) installPiExtensionLocal(localPiAgentDir);
+
+  // Session cwd holds AGENTS.md. Local: a host temp dir. Daytona: an in-sandbox path
+  // (the host path would not exist on the remote sandbox).
+  const cwd = isDaytona
+    ? `/home/sandbox/agenta-${randomBytes(6).toString("hex")}`
+    : mkdtempSync(join(tmpdir(), "agenta-rivet-"));
+  const agentsMd = request.agentsMd?.trim();
+
+  // Pi writes its run totals here on agent_end; we read them back and return them so the
+  // caller can roll them onto the workflow span (separate OTLP batch, see piExtension).
+  const usageOutPath = isPi ? `${cwd}/.agenta-usage.json` : undefined;
+  if (usageOutPath) env.AGENTA_USAGE_OUT = usageOutPath;
+  if (usageOutPath) piExtEnv.AGENTA_USAGE_OUT = usageOutPath;
+
+  log(`harness=${harness} sandbox=${sandboxId} cwd=${cwd}`);
+
+  // Persist events in-process so a follow-up turn can resume by session id.
+  const persist = new InMemorySessionPersistDriver();
+  const sandbox = await SandboxAgent.start({
+    sandbox: buildSandboxProvider(sandboxId, env, binaryPath, piExtEnv, secrets),
+    persist,
+    // Daytona's preview proxy authenticates with a per-sandbox cookie; carry it across
+    // requests so ACP calls after the first don't 401. Harmless for local.
+    ...(isDaytona ? { fetch: createCookieFetch() } : {}),
+  }).catch((err: unknown) => {
+    // Startup failed, so the try/finally below never runs: drop the host temp dir here.
+    if (!isDaytona) rmSync(cwd, { recursive: true, force: true });
+    throw err;
+  });
+
+  // Pi traces itself via the extension under the propagated traceparent; for other
+  // harnesses we build the span tree here from the ACP event stream. Created below, once
+  // the model is resolved, so the chat span carries the harness's actual model rather
+  // than the requested one. Declared here so the catch can flush a partial trace.
+  let otel: ReturnType<typeof createRivetOtel> | undefined;
+
+  try {
+    // On Daytona, push the harness login, the extension, and AGENTS.md into the remote
+    // sandbox via the filesystem API (nothing secret is baked into the image). Locally
+    // these use the host filesystem and the harness's own login (PI_CODING_AGENT_DIR).
+    if (isDaytona) {
+      if (isPi) {
+        // With a provider API key the harness authenticates via env; only fall back to
+        // uploading the Codex/OAuth login when no key is available.
+        if (!hasApiKey) await uploadPiAuthToSandbox(sandbox);
+        await uploadPiExtensionToSandbox(sandbox, DAYTONA_PI_DIR);
+        if (DAYTONA_PI_INSTALL) await installPiInSandbox(sandbox);
+      }
+      await sandbox.mkdirFs({ path: cwd }).catch(() => {});
+      if (agentsMd) await sandbox.writeFsFile({ path: `${cwd}/AGENTS.md` }, agentsMd);
+    } else if (agentsMd) {
+      writeFileSync(join(cwd, "AGENTS.md"), agentsMd, "utf-8");
+    }
+
+    // Pi gets tools via the extension (above); other harnesses via MCP.
+    const mcpServers = isPi
+      ? []
+      : buildToolMcpServers(
+          (request.customTools as ResolvedToolSpec[]) ?? [],
+          request.toolCallback as ToolCallbackContext | undefined,
+        );
+
+    const session = await sandbox.createSession({
+      agent: harness,
+      cwd,
+      sessionInit: { cwd, mcpServers },
+    });
+
+    // Resolve the model first: when the harness rejects the requested id and keeps its
+    // own default (e.g. Claude ignores "gpt-5.5"), `model` is undefined and the chat span
+    // is labelled "chat" instead of falsely claiming the requested model.
+    const model = await applyModel(session, request.model);
+
+    const run = createRivetOtel({
+      harness,
+      model,
+      traceparent: request.trace?.traceparent,
+      baggage: request.trace?.baggage,
+      endpoint: request.trace?.endpoint,
+      authorization: request.trace?.authorization,
+      captureContent: request.trace?.captureContent,
+      emitSpans: !isPi || isDaytona,
+    });
+    otel = run;
+
+    run.start({
+      prompt,
+      sessionId: session.id,
+      messages: [...priorMessages(request), { role: "user", content: prompt }],
+    });
+
+    session.onEvent((event: any) => {
+      const payload = event?.payload;
+      const update = payload?.params?.update ?? payload?.update;
+      if (update) run.handleUpdate(update);
+    });
+
+    // Auto-approve permission requests so a permission-gating harness (e.g. Claude
+    // Code) does not block on tool use. Tools are backend-resolved and trusted; the
+    // run is headless so there is no human to prompt. Set AGENTA_RIVET_DENY_PERMISSIONS
+    // to reject instead.
+    const denyPermissions = process.env.AGENTA_RIVET_DENY_PERMISSIONS === "true";
+    session.onPermissionRequest((req: any) => {
+      const replies: string[] = req?.availableReplies ?? [];
+      const reply = denyPermissions
+        ? "reject"
+        : replies.find((r) => r === "always") ?? replies.find((r) => r === "once") ?? "once";
+      if (req?.id) session.respondPermission(req.id, reply as any).catch(() => {});
+    });
+
+    const result = await session.prompt([{ type: "text", text: turnText }]);
+    log(`prompt stopReason=${(result as any)?.stopReason}`);
+
+    const output = run.finish();
+    await run.flush();
+
+    return {
+      ok: true,
+      output,
+      sessionId: session.id,
+      model: model ?? request.model,
+      traceId: run.traceId(),
+      usage: await readRunUsage(sandbox, usageOutPath, isDaytona),
+    };
+  } catch (err) {
+    otel?.finish();
+    await otel?.flush().catch(() => {});
+    return { ok: false, error: conciseError(err, harness) };
+  } finally {
+    await sandbox.destroySandbox().catch(() => {});
+    await sandbox.dispose().catch(() => {});
+    if (!isDaytona) rmSync(cwd, { recursive: true, force: true });
+  }
+}
diff --git a/services/agent/src/server.ts b/services/agent/src/server.ts
index 968ff6be4e..2eee90d1fc 100644
--- a/services/agent/src/server.ts
+++ b/services/agent/src/server.ts
@@ -12,10 +12,23 @@
  */
 import { createServer, type IncomingMessage, type ServerResponse } from "node:http";
 
-import { runPi, type AgentRunRequest } from "./runPi.ts";
+import { runPi, type AgentRunRequest, type AgentRunResult } from "./runPi.ts";
+import { runRivet } from "./runRivet.ts";
 
 const PORT = Number(process.env.PORT ?? 8765);
 
+// Select the harness driver. `rivet` drives the harness over ACP via a rivet daemon
+// (WP-8); `pi` is the legacy in-process Pi path (WP-2). `auto` (default) routes by the
+// request: a rivet envelope carries `harness`/`sandbox`, so one sidecar serves both and
+// nothing regresses.
+const BACKEND = (process.env.AGENT_BACKEND ?? "auto").toLowerCase();
+
+function runAgent(request: AgentRunRequest): Promise<AgentRunResult> {
+  const wantsRivet = Boolean(request.harness || request.sandbox); // rivet envelope?
+  const useRivet = BACKEND === "rivet" || (BACKEND !== "pi" && wantsRivet);
+  return useRivet ? runRivet(request) : runPi(request);
+}
+
 function send(res: ServerResponse, status: number, body: unknown): void {
   const payload = JSON.stringify(body);
   res.writeHead(status, {
@@ -48,7 +61,7 @@ const server = createServer(async (req, res) => {
         return send(res, 400, { ok: false, error: `Invalid JSON: ${String(err)}` });
       }
 
-      const result = await runPi(request);
+      const result = await runAgent(request);
       return send(res, result.ok ? 200 : 500, result);
     }
 
@@ -59,6 +72,20 @@ const server = createServer(async (req, res) => {
   }
 });
 
+// The rivet SDK can reject a background promise (e.g. an adapter install or the Daytona
+// preview SSE failing) outside any awaited path. Node's default turns that into an
+// uncaught exception that kills the whole process — taking every in-flight request with
+// it (the caller sees "Server disconnected"). Log and keep serving instead; the failing
+// run still returns its own error to its caller. Both handlers tolerate non-Error values.
+process.on("unhandledRejection", (reason: unknown) => {
+  const detail = reason instanceof Error ? (reason.stack ?? reason.message) : String(reason);
+  process.stderr.write(`[pi-wrapper] unhandledRejection: ${detail}\n`);
+});
+process.on("uncaughtException", (err: unknown) => {
+  const detail = err instanceof Error ? (err.stack ?? err.message) : String(err);
+  process.stderr.write(`[pi-wrapper] uncaughtException: ${detail}\n`);
+});
+
 server.listen(PORT, () => {
   process.stderr.write(`[pi-wrapper] http server listening on :${PORT}\n`);
 });
diff --git a/services/agent/src/toolBridge.ts b/services/agent/src/toolBridge.ts
new file mode 100644
index 0000000000..6cf27b10cb
--- /dev/null
+++ b/services/agent/src/toolBridge.ts
@@ -0,0 +1,76 @@
+/**
+ * WP-8 tool delivery over rivet/ACP.
+ *
+ * The Pi backend (runPi.ts) injected resolved runnable tools (WP-7) as in-process Pi
+ * customTools. Over ACP the harness only accepts tools through MCP, so the same
+ * resolved specs are exposed as an MCP server whose tool bodies POST back to Agenta's
+ * /tools/call (the provider key and connection auth stay server-side, exactly as in
+ * the Pi path). `buildToolMcpServers` returns the ACP `mcpServers` entry to attach to
+ * the session.
+ *
+ * Delivery: a stdio MCP bridge (toolBridgeServer.ts) launched by the daemon. The specs
+ * and callback are passed to it as env, so nothing tool-specific is written to the
+ * agent-visible filesystem.
+ */
+import { existsSync } from "node:fs";
+import { dirname, join } from "node:path";
+import { fileURLToPath } from "node:url";
+
+import type { ResolvedToolSpec, ToolCallbackContext } from "./runPi.ts";
+
+export type { ResolvedToolSpec, ToolCallbackContext } from "./runPi.ts";
+
+const HERE = dirname(fileURLToPath(import.meta.url));
+// services/agent/src/toolBridge.ts -> services/agent/node_modules/.bin/tsx
+const TSX_BIN = join(HERE, "..", "node_modules", ".bin", "tsx");
+const SERVER = join(HERE, "toolBridgeServer.ts");
+
+/** Pick the bridge launch command: env override first, then the local tsx bin, then npx. */
+function bridgeLauncher(): { command: string; args: string[] } {
+  // An explicit override and the local tsx binary both take the server script directly.
+  const direct = process.env.AGENTA_TOOL_BRIDGE_COMMAND || (existsSync(TSX_BIN) ? TSX_BIN : "");
+  if (direct) return { command: direct, args: [SERVER] };
+  // Last resort: npx resolves tsx from PATH wherever the daemon runs.
+  return { command: "npx", args: ["-y", "tsx", SERVER] };
+}
+
+/** ACP McpServerStdio entry: env is a list of {name, value}. */
+interface EnvVariable {
+  name: string;
+  value: string;
+}
+
+export interface McpServerStdio {
+  name: string;
+  command: string;
+  args: string[];
+  env: EnvVariable[];
+}
+
+/**
+ * Describe the stdio tool bridge as the ACP `mcpServers` list for a session, which is
+ * how the harness sees the resolved tools. Returns [] when there are no specs, or (with
+ * a stderr notice) when there is no callback endpoint to route tool calls to.
+ */
+export function buildToolMcpServers(
+  specs: ResolvedToolSpec[],
+  callback: ToolCallbackContext | undefined,
+): McpServerStdio[] {
+  if (!specs || specs.length === 0) return [];
+
+  const endpoint = callback?.endpoint;
+  if (!endpoint) {
+    const notice = `[tool-bridge] skipping ${specs.length} tool(s): missing toolCallback endpoint\n`;
+    process.stderr.write(notice);
+    return [];
+  }
+
+  // The bridge reads its whole configuration from env; auth is added only when set.
+  const authorization = callback?.authorization;
+  const env: EnvVariable[] = [
+    { name: "AGENTA_TOOL_SPECS", value: JSON.stringify(specs) },
+    { name: "AGENTA_TOOL_CALLBACK_ENDPOINT", value: endpoint },
+    ...(authorization ? [{ name: "AGENTA_TOOL_CALLBACK_AUTH", value: authorization }] : []),
+  ];
+  return [{ name: "agenta-tools", ...bridgeLauncher(), env }];
+}
diff --git a/services/agent/src/toolBridgeServer.ts b/services/agent/src/toolBridgeServer.ts
new file mode 100644
index 0000000000..7a8dd44971
--- /dev/null
+++ b/services/agent/src/toolBridgeServer.ts
@@ -0,0 +1,170 @@
+/**
+ * WP-8 tool MCP bridge (stdio server).
+ *
+ * The harness only accepts tools over MCP when driven via ACP. This is a minimal,
+ * dependency-free MCP stdio server that exposes the backend-resolved runnable tools
+ * (WP-7) and routes each tool call back through Agenta's /tools/call — so the Composio
+ * key and connection auth stay server-side, exactly as in the in-process Pi path.
+ *
+ * Launched by the rivet daemon as a session MCP server (see toolBridge.ts). It reads
+ * everything from env so nothing tool-specific is written to the agent filesystem:
+ *   AGENTA_TOOL_SPECS            JSON array of { name, description, inputSchema, callRef }
+ *   AGENTA_TOOL_CALLBACK_ENDPOINT  full /tools/call URL
+ *   AGENTA_TOOL_CALLBACK_AUTH      Authorization header value (optional)
+ *
+ * Protocol: JSON-RPC 2.0 over stdio, newline-delimited (the MCP stdio framing). Handles
+ * initialize, tools/list, tools/call; ignores notifications. stdout carries protocol
+ * messages only; logs go to stderr.
+ */
+interface ToolSpec {
+  name: string;
+  description?: string;
+  inputSchema?: Record<string, unknown> | null;
+  callRef: string;
+}
+
+const SPECS: ToolSpec[] = JSON.parse(process.env.AGENTA_TOOL_SPECS ?? "[]");
+const ENDPOINT = process.env.AGENTA_TOOL_CALLBACK_ENDPOINT ?? "";
+const AUTH = process.env.AGENTA_TOOL_CALLBACK_AUTH;
+const SPEC_BY_NAME = new Map(SPECS.map((s) => [s.name, s]));
+const TOOL_CALL_TIMEOUT_MS = Number(process.env.AGENTA_AGENT_TOOL_CALL_TIMEOUT_MS ?? 30000);
+const DEFAULT_PROTOCOL = "2025-06-18";
+
+const EMPTY_SCHEMA = { type: "object", properties: {}, additionalProperties: true };
+
+function log(message: string): void {
+  process.stderr.write(`[tool-bridge] ${message}\n`);
+}
+
+function send(message: unknown): void {
+  process.stdout.write(`${JSON.stringify(message)}\n`);
+}
+
+/** One /tools/call round-trip. Returns the result text; throws on failure. */
+async function callAgentaTool(callRef: string, args: unknown): Promise<string> {
+  const headers: Record<string, string> = { "content-type": "application/json" };
+  if (AUTH) headers["authorization"] = AUTH;
+
+  // Read the body inside the try: a timeout abort mid-stream gets the same tool-scoped error.
+  let response: Response;
+  let bodyText: string;
+  try {
+    response = await fetch(ENDPOINT, {
+      method: "POST",
+      headers,
+      body: JSON.stringify({
+        data: {
+          id: `tool-${Date.now()}`,
+          type: "function",
+          function: { name: callRef, arguments: args ?? {} },
+        },
+      }),
+      signal: AbortSignal.timeout(TOOL_CALL_TIMEOUT_MS),
+    });
+    bodyText = await response.text();
+  } catch (err) {
+    throw new Error(`tool call ${callRef} failed: ${err instanceof Error ? err.message : String(err)}`);
+  }
+  if (!response.ok) {
+    throw new Error(`tool call ${callRef} returned HTTP ${response.status}: ${bodyText.slice(0, 500)}`);
+  }
+  // Reply: { call: { data: { content }, status } }; content (serialized result) goes to the model as-is.
+  try {
+    const parsed = JSON.parse(bodyText);
+    const content = parsed?.call?.data?.content;
+    if (typeof content === "string") return content;
+    if (content != null) return JSON.stringify(content);
+    return bodyText;
+  } catch {
+    return bodyText;
+  }
+}
+/** Answer one JSON-RPC message: the response object, or undefined for a notification. */
+async function handle(message: any): Promise<unknown | undefined> {
+  const { id, method, params } = message ?? {};
+
+  // Notifications (no id) need no response.
+  if (id === undefined || id === null) {
+    return undefined;
+  }
+  // Echo the client's requested protocol version; fall back to ours when it sent none.
+  if (method === "initialize") {
+    return {
+      jsonrpc: "2.0",
+      id,
+      result: {
+        protocolVersion: params?.protocolVersion ?? DEFAULT_PROTOCOL,
+        capabilities: { tools: {} },
+        serverInfo: { name: "agenta-tools", version: "0.1.0" },
+      },
+    };
+  }
+  // Advertise every resolved spec; a spec without a schema accepts any object.
+  if (method === "tools/list") {
+    return {
+      jsonrpc: "2.0",
+      id,
+      result: {
+        tools: SPECS.map((s) => ({
+          name: s.name,
+          description: s.description ?? s.name,
+          inputSchema: (s.inputSchema as Record<string, unknown>) ?? EMPTY_SCHEMA,
+        })),
+      },
+    };
+  }
+  // Unknown tool names are a protocol error (-32602); failed calls are tool results.
+  if (method === "tools/call") {
+    const name = params?.name;
+    const spec = SPEC_BY_NAME.get(name);
+    if (!spec) {
+      return { jsonrpc: "2.0", id, error: { code: -32602, message: `unknown tool: ${name}` } };
+    }
+    try {
+      const text = await callAgentaTool(spec.callRef, params?.arguments);
+      return { jsonrpc: "2.0", id, result: { content: [{ type: "text", text }] } };
+    } catch (err) {
+      // Surface as an MCP tool error (isError) so the model can recover, not a crash.
+      return {
+        jsonrpc: "2.0",
+        id,
+        result: {
+          content: [{ type: "text", text: err instanceof Error ? err.message : String(err) }],
+          isError: true,
+        },
+      };
+    }
+  }
+  // Any other request that carries an id gets JSON-RPC "method not found" (-32601).
+  return { jsonrpc: "2.0", id, error: { code: -32601, message: `method not found: ${method}` } };
+}
+
+function main(): void {
+  log(`serving ${SPECS.length} tool(s) -> ${ENDPOINT || "(no endpoint)"}`);
+  // Newline-delimited JSON-RPC: `tail` holds the unterminated remainder between chunks.
+  let tail = "";
+  process.stdin.setEncoding("utf8");
+  process.stdin.on("data", (chunk: string) => {
+    const pieces = (tail + chunk).split("\n");
+    tail = pieces.pop() ?? "";
+    for (const piece of pieces) {
+      const line = piece.trim();
+      if (!line) continue;
+      let request: any;
+      try {
+        request = JSON.parse(line);
+      } catch {
+        log(`skipping non-JSON line: ${line.slice(0, 120)}`);
+        continue;
+      }
+      handle(request)
+        .then((reply) => {
+          if (reply) send(reply);
+        })
+        .catch((err) => log(`handler error: ${err?.message ?? err}`));
+    }
+  });
+  process.stdin.on("end", () => process.exit(0));
+}
+
+main();
diff --git a/services/oss/src/agent.py b/services/oss/src/agent.py
index 42f9b1832c..90f98ae948 100644
--- a/services/oss/src/agent.py
+++ b/services/oss/src/agent.py
@@ -17,6 +17,7 @@
 from typing import Any, Dict, List, Optional, Tuple
 
 import httpx
+from opentelemetry import trace as otel_trace
 
 import agenta as ag
 from agenta.sdk.engines.tracing.propagation import inject
@@ -27,6 +28,7 @@
 from oss.src.agent_pi.pi_harness import PiHarness
 from oss.src.agent_pi.pi_http_harness import PiHttpHarness
 from oss.src.agent_pi.ports import Harness, HarnessRequest, ToolCallback, TraceContext
+from oss.src.agent_pi.rivet_harness import RivetHarness
 from oss.src.agent_pi.schemas import AGENT_SCHEMAS
 
 log = get_module_logger(__name__)
@@ -41,13 +43,38 @@
 _TOOLS_RESOLVE_TIMEOUT = float(os.getenv("AGENTA_AGENT_TOOLS_TIMEOUT", "30"))
 
 
-def _build_harness() -> Harness:
+def _build_harness(
+    harness: Optional[str] = None,
+    sandbox: Optional[str] = None,
+) -> Harness:
     """Pick the harness adapter for the current deployment.
 
-    - ``AGENTA_AGENT_PI_URL`` set (docker): call the Pi sidecar over HTTP.
+    Runtime axis (``AGENTA_AGENT_RUNTIME``):
+    - ``rivet``: drive the harness over ACP via a rivet daemon (WP-8). The harness
+      (pi/claude) and sandbox (local/daytona) are independent config axes, taken from
+      the request config when set (so they are editable in the playground), else the
+      ``AGENTA_AGENT_HARNESS`` / ``AGENTA_AGENT_SANDBOX`` env defaults.
+    - default (``pi``): the legacy in-process Pi path (WP-2), kept so nothing regresses.
+
+    Transport axis (both runtimes):
+    - ``AGENTA_AGENT_PI_URL`` set (docker): call the TS wrapper sidecar over HTTP.
     - otherwise (local): spawn the TS wrapper as a subprocess.
     """
     pi_url = os.getenv("AGENTA_AGENT_PI_URL")
+    runtime = os.getenv("AGENTA_AGENT_RUNTIME", "pi").lower()
+
+    if runtime == "rivet":
+        harness = (harness or os.getenv("AGENTA_AGENT_HARNESS", "pi")).lower()
+        sandbox = (sandbox or os.getenv("AGENTA_AGENT_SANDBOX", "local")).lower()
+        if pi_url:
+            return RivetHarness(harness=harness, sandbox=sandbox, base_url=pi_url)
+        return RivetHarness(
+            harness=harness,
+            sandbox=sandbox,
+            runtime=LocalRuntime(),
+            wrapper_dir=str(wrapper_dir()),
+        )
+
     if pi_url:
         return PiHttpHarness(pi_url)
     return PiHarness(LocalRuntime(), wrapper_dir=str(wrapper_dir()))
@@ -110,6 +137,62 @@ def _latest_user_message(messages: Optional[List[Any]]) -> str:
     return ""
 
 
+# Map a vault standard-provider kind to the env var the harness (Pi/Claude/litellm)
+# reads. Only providers an agent harness can use are listed.
+_PROVIDER_ENV_VARS = {
+    "openai": "OPENAI_API_KEY",
+    "anthropic": "ANTHROPIC_API_KEY",
+    "gemini": "GEMINI_API_KEY",
+    "mistral": "MISTRAL_API_KEY",
+    "mistralai": "MISTRAL_API_KEY",
+    "groq": "GROQ_API_KEY",
+    "together_ai": "TOGETHERAI_API_KEY",
+    "openrouter": "OPENROUTER_API_KEY",
+}
+
+
+async def _resolve_harness_secrets() -> Dict[str, str]:
+    """Resolve provider API keys from the project vault into harness env vars.
+
+    The harness authenticates with the project's own provider keys: we fetch the vault
+    ``provider_key`` secrets from the backend directly (same backend + caller
+    credential the tool resolver uses) and inject each as its standard env var. The
+    SDK's per-request secret context does not propagate to this custom route, so we
+    resolve here. Best-effort: fetch failures and malformed entries are skipped, not
+    raised. Empty => the harness falls back to its own login / OAuth (see ``runRivet``).
+    """
+    api_base = _agenta_api_base()
+    if not api_base:
+        return {}
+    headers = {"Content-Type": "application/json"}
+    authorization = _request_authorization()
+    if authorization:
+        headers["Authorization"] = authorization
+
+    try:
+        async with httpx.AsyncClient(timeout=_TOOLS_RESOLVE_TIMEOUT) as client:
+            response = await client.get(f"{api_base}/secrets/", headers=headers)
+        if response.status_code >= 400:
+            log.warning("agent: vault secrets fetch HTTP %s", response.status_code)
+            return {}
+        secrets = response.json() or []
+    except Exception:  # pylint: disable=broad-except
+        log.warning("agent: vault secrets fetch failed", exc_info=True)
+        return {}
+
+    env: Dict[str, str] = {}
+    for secret in secrets if isinstance(secrets, list) else []:
+        data = secret.get("data") if isinstance(secret, dict) else None
+        if not isinstance(data, dict) or secret.get("kind") != "provider_key":
+            continue
+        provider = data.get("provider")
+        key = provider.get("key") if isinstance(provider, dict) else None
+        env_var = _PROVIDER_ENV_VARS.get(str(data.get("kind", "")).lower())
+        if env_var and key and isinstance(key, str):
+            env.setdefault(env_var, key)
+    return env
+
+
 def _trace_context() -> Optional[TraceContext]:
     """Capture the active workflow span's trace context for the harness.
 
@@ -324,7 +407,12 @@ async def _agent(
 
     builtins, custom_tools, tool_callback = await _resolve_tools(tools_config)
 
-    harness = _build_harness()
+    # Harness (pi/claude) and sandbox (local/daytona) are editable config (see
+    # schemas.py), so a playground run can switch engine or environment; unset falls
+    # back to the env defaults inside _build_harness.
+    harness_id = params.get("harness")
+    sandbox_id = params.get("sandbox")
+    harness = _build_harness(harness=harness_id, sandbox=sandbox_id)
 
     await harness.setup()
     try:
@@ -338,14 +426,43 @@ async def _agent(
                 custom_tools=custom_tools,
                 tool_callback=tool_callback,
                 trace=_trace_context(),
+                secrets=await _resolve_harness_secrets(),
             )
         )
     finally:
         await harness.shutdown()
 
+    _record_usage(result.usage)
+
     return {"role": "assistant", "content": result.output}
 
 
+def _record_usage(usage: Optional[Dict[str, Any]]) -> None:
+    """Stamp the agent's token/cost totals onto the active ``/invoke`` workflow span.
+
+    The harness emits its own span tree (turns, LLM, tools) in a separate OTLP batch, so
+    Agenta's per-batch cumulative roll-up cannot bridge the totals onto the workflow
+    span. Setting ``gen_ai.usage.*`` here records them directly on that span (the root of
+    its batch), so the trace shows tokens and cost. Best-effort: bad input is ignored.
+    """
+    if not isinstance(usage, dict) or not usage.get("total"):
+        return
+    try:
+        span = otel_trace.get_current_span()
+        input_tokens = int(usage.get("input") or 0)
+        output_tokens = int(usage.get("output") or 0)
+        span.set_attribute("gen_ai.usage.input_tokens", input_tokens)
+        span.set_attribute("gen_ai.usage.output_tokens", output_tokens)
+        span.set_attribute("gen_ai.usage.prompt_tokens", input_tokens)
+        span.set_attribute("gen_ai.usage.completion_tokens", output_tokens)
+        span.set_attribute("gen_ai.usage.total_tokens", int(usage.get("total") or 0))
+        cost = usage.get("cost")
+        if cost:
+            span.set_attribute("gen_ai.usage.cost", float(cost))
+    except Exception:  # pylint: disable=broad-except
+        log.warning("agent: failed to record usage on workflow span", exc_info=True)
+
+
 def create_agent_app():
     app = ag.create_app()
     # No builtin URI yet: registering the agent as a first-class workflow type
diff --git a/services/oss/src/agent_pi/ports.py b/services/oss/src/agent_pi/ports.py
index 4b436db6c6..dc768a29cd 100644
--- a/services/oss/src/agent_pi/ports.py
+++ b/services/oss/src/agent_pi/ports.py
@@ -114,6 +114,12 @@ class HarnessRequest:
     model: Optional[str] = None
     prompt: Optional[str] = None
     messages: List[Any] = field(default_factory=list)
+    # Continue a prior run by id (rivet path resumes/replays its history). None = new.
+    session_id: Optional[str] = None
+    # Provider API keys resolved from the project vault, as harness env vars
+    # ({"OPENAI_API_KEY": "...", ...}). Injected into the harness environment (local
+    # daemon + Daytona env_vars). Empty => the harness uses its own login (OAuth).
+    secrets: Dict[str, str] = field(default_factory=dict)
     tools: List[str] = field(default_factory=list)
     # Resolved runnable tool specs, already in the camelCase wire shape the TS
     # wrapper turns into Pi customTools: {name, description, inputSchema, callRef}.
@@ -129,6 +135,10 @@ class HarnessResult:
     output: str
     session_id: Optional[str] = None
     model: Optional[str] = None
+    # Run token/cost totals ({input, output, total, cost}). The harness span tree is
+    # exported in a separate OTLP batch from the workflow span, so the service rolls
+    # these onto the workflow span itself (see agent.py). None when unavailable.
+    usage: Optional[Dict[str, Any]] = None
 
 
 class Harness(ABC):
diff --git a/services/oss/src/agent_pi/rivet_harness.py b/services/oss/src/agent_pi/rivet_harness.py
new file mode 100644
index 0000000000..cd84939ab1
--- /dev/null
+++ b/services/oss/src/agent_pi/rivet_harness.py
@@ -0,0 +1,143 @@
+"""Rivet harness adapter (WP-8): drives the agent over ACP via a rivet daemon.
+
+Same ``Harness`` port as the Pi adapters, but the transport behind it runs the chosen
+harness (Pi, Claude Code, ...) over the Agent Client Protocol through a rivet
+``sandbox-agent`` daemon, rather than the bespoke Pi SDK calls. The ``/invoke`` contract
+is unchanged; harness and sandbox become config values carried on the wire to the TS
+runner (``runRivet.ts``, selected by ``AGENT_BACKEND=rivet``).
+
+Two transports, mirroring the Pi adapters:
+
+- HTTP (docker): POST the envelope to the wrapper running as a sidecar. Selected when a
+  base URL is provided (``AGENTA_AGENT_PI_URL``); the sidecar runs in rivet mode.
+- subprocess (local): spawn the TS CLI with ``AGENT_BACKEND=rivet`` and hand it the
+  envelope over stdio.
+
+The envelope adds ``harness``, ``sandbox``, and ``sessionId`` to the Pi-shaped fields;
+everything else (agentsMd, model, prompt, messages, tools, customTools, toolCallback,
+trace) is identical, so the Python side stays thin.
+"""
+
+import json
+import os
+from typing import List, Optional, Sequence
+
+import httpx
+
+from agenta.sdk.utils.logging import get_module_logger
+
+from .ports import Harness, HarnessRequest, HarnessResult, Runtime
+
+log = get_module_logger(__name__)
+
+_DEFAULT_TIMEOUT = float(os.getenv("AGENTA_AGENT_TIMEOUT", "180"))
+_DEFAULT_COMMAND = ["pnpm", "exec", "tsx", "src/cli.ts"]
+
+
+def _rivet_payload(request: HarnessRequest, harness: str, sandbox: str) -> dict:
+    """Assemble the JSON envelope for the TS wrapper: Pi fields + harness/sandbox/session."""
+    callback = request.tool_callback
+    trace = request.trace
+    return {
+        "harness": harness,
+        "sandbox": sandbox,
+        "sessionId": request.session_id,
+        "secrets": request.secrets or {},
+        "agentsMd": request.agents_md,
+        "model": request.model,
+        "prompt": request.prompt,
+        "messages": request.messages,
+        "tools": request.tools,
+        "customTools": request.custom_tools,
+        "toolCallback": callback.to_wire() if callback else None,
+        "trace": trace.to_wire() if trace else None,
+    }
+
+
+def _to_result(data: dict) -> HarnessResult:
+    if data.get("ok"):  # success envelope -> structured result
+        return HarnessResult(
+            output=data.get("output", ""),
+            session_id=data.get("sessionId"),
+            model=data.get("model"),
+            usage=data.get("usage"),
+        )
+    raise RuntimeError(f"Rivet run failed: {data.get('error')}")
+
+
+class RivetHarness(Harness):
+    """Harness adapter that runs an agent through rivet over ACP.
+
+    Transport is chosen at construction: ``base_url`` selects the HTTP sidecar (and
+    wins when both are given); otherwise ``runtime`` plus ``wrapper_dir`` run the TS
+    CLI as a subprocess. ``harness`` and ``sandbox`` are sent verbatim in the envelope.
+    """
+
+    def __init__(
+        self,
+        *,
+        harness: str,
+        sandbox: str,
+        base_url: Optional[str] = None,
+        runtime: Optional[Runtime] = None,
+        wrapper_dir: Optional[str] = None,
+        command: Optional[Sequence[str]] = None,
+        timeout: float = _DEFAULT_TIMEOUT,
+    ) -> None:
+        if not base_url and not runtime:
+            raise ValueError(
+                "RivetHarness needs either base_url (HTTP) or runtime (subprocess)"
+            )
+        self._harness = harness
+        self._sandbox = sandbox
+        self._base_url = base_url.rstrip("/") if base_url else None
+        self._runtime = runtime
+        self._wrapper_dir = wrapper_dir
+        self._command: List[str] = list(command or _DEFAULT_COMMAND)
+        self._timeout = timeout
+
+    async def setup(self) -> None:
+        if self._runtime:  # nothing to start when only the HTTP transport is configured
+            await self._runtime.start()
+
+    async def shutdown(self) -> None:
+        if self._runtime:  # mirrors setup(): only a configured runtime is shut down
+            await self._runtime.shutdown()
+
+    async def invoke(self, request: HarnessRequest) -> HarnessResult:
+        payload = _rivet_payload(request, self._harness, self._sandbox)
+        if self._base_url:  # HTTP takes precedence over the subprocess transport
+            return await self._invoke_http(payload)
+        return await self._invoke_subprocess(payload)
+
+    async def _invoke_http(self, payload: dict) -> HarnessResult:
+        async with httpx.AsyncClient(timeout=self._timeout) as client:
+            response = await client.post(f"{self._base_url}/run", json=payload)
+        if response.status_code >= 500:  # below 500 the body is parsed as the envelope
+            raise RuntimeError(
+                f"Rivet wrapper HTTP {response.status_code}: {response.text[:1000]}"
+            )
+        return _to_result(response.json())
+
+    async def _invoke_subprocess(self, payload: dict) -> HarnessResult:
+        assert self._runtime is not None
+        result = await self._runtime.exec(
+            self._command,
+            json.dumps(payload).encode("utf-8"),
+            cwd=self._wrapper_dir,
+            env={**os.environ, "AGENT_BACKEND": "rivet"},  # pin the CLI to rivet
+            timeout=self._timeout,
+        )
+        if not result.stdout.strip():  # nothing to parse: report exit code + stderr
+            raise RuntimeError(
+                "Rivet wrapper returned no output. "
+                f"exit={result.code} stderr={result.stderr[-2000:]}"
+            )
+        try:
+            data = json.loads(result.stdout)
+        except json.JSONDecodeError as exc:
+            raise RuntimeError(
+                "Rivet wrapper returned invalid JSON. "
+                f"stdout={result.stdout[:500]} stderr={result.stderr[-1000:]}"
+            ) from exc
+        return _to_result(data)
diff --git a/services/oss/src/agent_pi/schemas.py b/services/oss/src/agent_pi/schemas.py
index cef2440679..7dc6af2580 100644
--- a/services/oss/src/agent_pi/schemas.py
+++ b/services/oss/src/agent_pi/schemas.py
@@ -59,6 +59,23 @@
                 "llm_config": {"model": _DEFAULT_MODEL, "tools": []},
             },
         },
+        # The two orthogonal runtime axes, editable in the playground so a run can
+        # switch engine (pi/claude) or where it runs (local/daytona) without redeploy.
+        # Read in agent.py and threaded to the rivet harness; fall back to env defaults.
+        "harness": {
+            "type": "string",
+            "title": "Harness",
+            "enum": ["pi", "claude"],
+            "default": "pi",
+            "description": "Coding agent engine to drive over ACP (pi or claude).",
+        },
+        "sandbox": {
+            "type": "string",
+            "title": "Sandbox",
+            "enum": ["local", "daytona"],
+            "default": "local",
+            "description": "Where the agent runs: local daemon or a Daytona sandbox.",
+        },
     },
 }
 

From 2aa4c0293d6c6e8ba4aa88032d7a0d544a4342bf Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Wed, 17 Jun 2026 13:06:20 +0200
Subject: [PATCH 04/10] feat(agent): session-shaped harness/runtime port (rivet
 + legacy Pi)

Evolve the agent service ports toward the rivet sandbox-agent session shape, so the
rivet (ACP) and legacy in-process Pi backends share one clean, capability-aware port.

- ports.py: Environment + Harness seams, a first-class AgentSession (create/prompt/
  destroy), HarnessCapabilities, ContentBlock, Message, AgentEvent, structured AgentResult.
- harness.py: SubprocessHarness + HttpHarness share one wire contract (wire.py),
  replacing the pi_harness/pi_http_harness/rivet_harness trio. The engine is an env value.
- TS: shared protocol.ts; runPi/runRivet return the enriched result; runRivet probes
  getAgent() capabilities and routes tools by mcpTools, not the harness name; usage flows
  on the rivet path (split from PromptResponse.usage); one shared toolClient.ts replaces
  the triplicated /tools/call client.
- agent.py uses the session API; _select_backend upgrades pi/local to rivet when the
  selected harness/sandbox needs it. permission_policy added to /inspect.

Verified live: pi, rivet+pi+local, rivet+claude+local, rivet+pi+daytona; playground run
succeeds with usage; invoke_agent nests under the /invoke span. Design notes under
docs/design/agent-workflows/harness-port-redesign/.
---
 docs/design/agent-workflows/README.md         |   4 +
 .../harness-port-redesign/README.md           |  67 +++
 .../harness-port-redesign/implementation.md   | 147 ++++++
 .../harness-port-redesign/plan.md             |  98 ++++
 .../harness-port-redesign/proposal.md         | 169 +++++++
 .../harness-port-redesign/research.md         | 196 ++++++++
 .../harness-port-redesign/status.md           |  62 +++
 services/agent/src/agenta-otel.ts             |  29 +-
 services/agent/src/cli.ts                     |  11 +-
 services/agent/src/piExtension.ts             |  67 +--
 services/agent/src/protocol.ts                | 164 +++++++
 services/agent/src/runPi.ts                   | 305 ++++---------
 services/agent/src/runRivet.ts                | 150 +++++--
 services/agent/src/server.ts                  |  18 +-
 services/agent/src/toolBridge.ts              |   4 +-
 services/agent/src/toolBridgeServer.ts        |  63 +--
 services/agent/src/toolClient.ts              |  87 ++++
 services/oss/src/agent.py                     | 135 +++---
 services/oss/src/agent_pi/__init__.py         |  52 ++-
 .../{local_runtime.py => environment.py}      |  20 +-
 services/oss/src/agent_pi/harness.py          | 145 ++++++
 services/oss/src/agent_pi/pi_harness.py       |  88 ----
 services/oss/src/agent_pi/pi_http_harness.py  |  68 ---
 services/oss/src/agent_pi/ports.py            | 425 ++++++++++++++----
 services/oss/src/agent_pi/rivet_harness.py    | 143 ------
 services/oss/src/agent_pi/schemas.py          |  10 +
 services/oss/src/agent_pi/wire.py             |  73 +++
 27 files changed, 1966 insertions(+), 834 deletions(-)
 create mode 100644 docs/design/agent-workflows/harness-port-redesign/README.md
 create mode 100644 docs/design/agent-workflows/harness-port-redesign/implementation.md
 create mode 100644 docs/design/agent-workflows/harness-port-redesign/plan.md
 create mode 100644 docs/design/agent-workflows/harness-port-redesign/proposal.md
 create mode 100644 docs/design/agent-workflows/harness-port-redesign/research.md
 create mode 100644 docs/design/agent-workflows/harness-port-redesign/status.md
 create mode 100644 services/agent/src/protocol.ts
 create mode 100644 services/agent/src/toolClient.ts
 rename services/oss/src/agent_pi/{local_runtime.py => environment.py} (72%)
 create mode 100644 services/oss/src/agent_pi/harness.py
 delete mode 100644 services/oss/src/agent_pi/pi_harness.py
 delete mode 100644 services/oss/src/agent_pi/pi_http_harness.py
 delete mode 100644 services/oss/src/agent_pi/rivet_harness.py
 create mode 100644 services/oss/src/agent_pi/wire.py

diff --git a/docs/design/agent-workflows/README.md b/docs/design/agent-workflows/README.md
index d8ae4537ce..dc3451cfda 100644
--- a/docs/design/agent-workflows/README.md
+++ b/docs/design/agent-workflows/README.md
@@ -122,5 +122,9 @@ running agent.
 
 ## Related work
 
+- [`harness-port-redesign/`](harness-port-redesign/README.md) — research and proposal for
+  evolving the `Harness` and `Runtime` ports to learn from the rivet `sandbox-agent` SDK
+  (sessions, structured event streaming, capabilities, attachments, lifecycle). Follows on
+  from WP-8.
 - [`../prompt-runtime-unification/`](../prompt-runtime-unification/README.md) — the
   prompt-side runtime that "future agent-style services" were already anticipated against.
diff --git a/docs/design/agent-workflows/harness-port-redesign/README.md b/docs/design/agent-workflows/harness-port-redesign/README.md
new file mode 100644
index 0000000000..3402652d15
--- /dev/null
+++ b/docs/design/agent-workflows/harness-port-redesign/README.md
@@ -0,0 +1,67 @@
+# Harness + Runtime port redesign
+
+Status: research and proposal, scope approved (full A to E arc, cold per invoke). Built,
+with documented deferrals; see [`implementation.md`](implementation.md) for the as-built notes. Read this, then [`research.md`](research.md) (the side by side), then
+[`proposal.md`](proposal.md) (the recommended port shape), then [`plan.md`](plan.md) (the
+phased build), with [`status.md`](status.md) holding decisions and open questions.
+
+## Why this exists
+
+WP-8 adopted [`rivet-dev/sandbox-agent`](https://github.com/rivet-dev/sandbox-agent)
+unmodified and kept our `Harness` and `Runtime` ports unchanged on purpose (see
+[`../wp-8-rivet-acp-runtime/`](../wp-8-rivet-acp-runtime/README.md)). That shipped, but
+it also exposed how thin our ports are next to rivet's SDK. Our `Harness.invoke()`
+takes a request and returns one string. Rivet's SDK models sessions, a live structured
+event stream, per harness capabilities, multimodal input, permissions, and an explicit
+lifecycle.
+
+This folder compares the two interfaces and proposes how to evolve our ports so they
+borrow rivet's vocabulary without giving up the neutral seam (rivet stays one adapter
+behind the port, so the legacy Pi path and a future non-rivet harness still fit).
+
+## The one screen summary
+
+Rivet splits the surface into three planes. The split is the main lesson.
+
+| Plane | Rivet owns it via | Belongs in our port? |
+| --- | --- | --- |
+| Runtime / sandbox (where the daemon runs, lifecycle) | `SandboxAgent` + providers (`local`, `daytona`, `e2b`, `docker`, ...) | Yes, as the environment seam |
+| Agent session (prompt, config, events, permissions) | `Session` (`prompt`, `onEvent`, `setModel`, ...) | Yes, this is the heart of the port |
+| System (filesystem, process, desktop) | `SandboxAgent.readFsFile` / `runProcess` / `clickDesktop` ... | No. Provisioning only, never exposed to the config author |
+
+Our current `Harness` port collapses the first two planes into a single blocking
+`invoke()` and ignores most of what the session plane offers.
+
+## Verdicts on the proposed scope
+
+The starting hypothesis was: sessions, skills, tools, hooks, and attachments belong in
+the port; system (filesystem) does not; streaming and session destroy are worth adopting.
+Mostly right. The corrections:
+
+- **Sessions** — adopt. Make a session a first class object with create, continue,
+  destroy, and a pluggable persistence driver, the way rivet does. Today a session is
+  just a `session_id` string and the history is replayed as prompt text.
+- **Skills** — adopt, but as config artifacts laid into the workspace, not a new verb.
+  Rivet exposes `setSkillsConfig(directory, ...)`; the harness reads them from disk.
+- **Tools** — adopt and generalize. WP-7 already passes tools as `custom_tools` plus a
+  callback. Make delivery capability gated (MCP vs native) instead of `if harness == pi`.
+- **Hooks** — **correction.** Rivet has no hook API. Hooks are a harness level concept
+  (Pi and Claude read them from their own config dirs). Model them as part of the agent
+  config bundle laid into the workspace, not as a port method rivet would host.
+- **Attachments** — adopt. Rivet prompts take ACP content blocks (text, image, audio,
+  resource, resource_link). Our prompt is a bare string, so images and files cannot pass.
+- **System (filesystem etc.)** — correct, keep it out of the `Harness` port. It is part
+  of the runtime/sandbox provider surface and we already use `writeFsFile`/`mkdirFs` only
+  to provision (upload AGENTS.md, auth, the extension) on Daytona.
+- **Communication / streaming** — adopt. Replace the one shot string return with a
+  structured event stream plus a final result, so tracing, multi message output, and
+  client streaming all read from one source.
+- **Destroy / lifecycle** — adopt. Rivet has `destroySession`, `destroySandbox`,
+  `pauseSandbox`, `killSandbox`, `dispose`. Our `Runtime.pause` is a no-op stub.
+
+## What this does not propose
+
+A rewrite. The recommendation is a phased evolution (see [`proposal.md`](proposal.md))
+that keeps `/invoke` and `/inspect` working at every step and leaves rivet behind the
+port. The folder jail, multi tenant isolation, and the warm shared daemon stay deferred
+to [`../wp-8-rivet-acp-runtime/isolation-and-fork.md`](../wp-8-rivet-acp-runtime/isolation-and-fork.md).
diff --git a/docs/design/agent-workflows/harness-port-redesign/implementation.md b/docs/design/agent-workflows/harness-port-redesign/implementation.md
new file mode 100644
index 0000000000..93a698bd40
--- /dev/null
+++ b/docs/design/agent-workflows/harness-port-redesign/implementation.md
@@ -0,0 +1,147 @@
+# Implementation notes
+
+How the approved A to E arc lands in code, with the cold + replay constraint. This is
+the as-built reference for the rewrite (kept in sync with the code).
+
+## Module layout
+
+### Python (`services/oss/src/agent_pi/`)
+
+| File | Holds |
+| --- | --- |
+| `ports.py` | The neutral types and the two seams. Types: `HarnessCapabilities`, `ContentBlock`, `Message`, `AgentEvent`, `TraceContext`, `ToolCallback`, `SessionConfig`, `AgentRequest`, `AgentResult`. Seams: `Environment` (where it runs) and `Harness` (the agent). Plus the concrete `AgentSession` sugar. |
+| `wire.py` | One place that serializes an `AgentRequest` to the camelCase `/run` JSON and parses an `AgentResult` back. Shared by every transport so the wire shape lives once. |
+| `environment.py` | `LocalEnvironment` (subprocess on this host). Replaces `local_runtime.py`. |
+| `harness.py` | The two transports: `SubprocessHarness` (spawn the TS CLI) and `HttpHarness` (POST to the sidecar). Both share `wire.py`. Replaces `pi_harness.py`, `pi_http_harness.py`, `rivet_harness.py`. |
+| `config.py` | Unchanged: load the file-backed `AgentConfig`. |
+| `schemas.py` | The `/inspect` schemas. Gains the permission-policy parameter. |
+
+The backend engine (legacy in-process Pi vs rivet ACP) is no longer a Python class. It
+is selected by one env value (`AGENT_BACKEND`) that the transport passes to the TS runner; on
+the sidecar it is auto-routed by request shape. So Python has two transports, not three backend adapters.
+
+### TypeScript (`services/agent/src/`)
+
+| File | Holds |
+| --- | --- |
+| `protocol.ts` | Shared wire types: `AgentRunRequest`, `AgentRunResult`, `AgentEvent`, `ContentBlock`, `HarnessCapabilities`. Both runners import from here (no more `runRivet` importing types out of `runPi`). |
+| `runPi.ts` | Legacy backend: drive the Pi SDK in-process. Returns the enriched result. |
+| `runRivet.ts` | Rivet backend: drive a harness over ACP. Probes `getAgent(harness).capabilities` and branches on capability flags, not on the harness name. Returns the enriched result, including usage for both Pi and Claude. |
+| `agenta-otel.ts` | The Pi-extension tracer and the ACP-event tracer. Also accumulates the structured event log. |
+| `piExtension.ts`, `toolBridge*.ts` | Unchanged tool/trace delivery. |
+| `cli.ts`, `server.ts` | Route to the backend by `AGENT_BACKEND` (auto by request shape on the sidecar). |
+
+## The seams
+
+```python
+class Harness(ABC):
+    async def setup(self) -> None: ...
+    async def shutdown(self) -> None: ...
+    async def invoke(self, request: AgentRequest, *, on_event=None) -> AgentResult: ...
+    async def destroy_session(self, session_id: str | None) -> None: ...   # cold: no-op
+    def create_session(self, config: SessionConfig) -> AgentSession: ...
+
+class AgentSession:                 # sugar over invoke; the first-class session abstraction
+    async def prompt(self, messages, *, on_event=None) -> AgentResult: ...
+    async def destroy(self) -> None: ...
+```
+
+`invoke` is the single transport call (one cold run). `AgentSession` is the rivet-shaped
+abstraction on top: `create_session(config)` then `session.prompt(messages)`. Under cold +
+replay the session holds no warm daemon; continuation replays the caller-supplied history
+into a fresh run, exactly as WP-8 does today. Server-side persisted history is the
+deferred Phase C bit (see Deferred below).
+
+## Capabilities: probed in TS, reported in the result
+
+A separate capability probe would cost a whole daemon spin-up under the cold model. So the
+rivet runner probes `getAgent(harness).capabilities` while its daemon is already up, drives
+tool delivery and tracing off the flags (`mcpTools`, `usage`, `streamingDeltas`, ...), and
+returns the capabilities in the result. Python keeps a small static table only for input
+shaping (for example, do not send image blocks to a harness without `images`). This is
+what removes the `if harness == "pi"` branching: the decision moves to where the live
+answer is, the TS runner.
+
+## Wire contract (`/run`)
+
+Request (camelCase), superset of today: `harness`, `sandbox`, `sessionId`, `agentsMd`,
+`model`, `messages` (each `content` is a string or a `ContentBlock[]`), `prompt`,
+`secrets`, `tools`, `customTools`, `toolCallback`, `permissionPolicy` (`auto` | `deny`),
+`trace`.
+
+Result: `ok`, `output` (final text), `messages` (structured assistant messages), `events`
+(the `AgentEvent` log for the turn), `usage` (`{input, output, total, cost}`, now for the
+rivet path too), `stopReason`, `capabilities`, `sessionId`, `model`, `traceId`, `error`.
+
+## What each phase delivers here
+
+- **A** capabilities + structured result: `HarnessCapabilities`, the enriched `AgentResult`
+  (messages, usage, stopReason, capabilities), and capability-driven branching in `runRivet`.
+- **B** event stream through the port: `AgentEvent` log on the result, plus an optional
+  `on_event` callback on `invoke`/`prompt`. The HTTP edge (`/invoke`) stays request and
+  response; live SSE to the playground is deferred (ties to WP-4).
+- **C** first-class sessions: `AgentSession` create / prompt / destroy. Continuation stays
+  cold + replay with caller-held history. A server-side `SessionStore` is deferred.
+- **D** content blocks, permissions, skills, hooks: `ContentBlock` on the turn (text now,
+  image-ready), `permissionPolicy`, and skills/hooks carried as workspace artifacts.
+- **E** retire the exec port: `Runtime` becomes `Environment`; `exec` survives only as the
+  subprocess transport's mechanism.
+
+## Verification
+
+Local CLI runs against real models (2026-06-17), driving `services/agent/src/cli.ts`:
+
+| Combo | Result | Usage source | Live capabilities |
+| --- | --- | --- | --- |
+| `pi` (legacy in-process) | reply ok | Pi extension (`otel.usage()`) | mcpTools=false |
+| `rivet` + `pi` + `local` | reply ok | extension usage file | probed: mcpTools=false, images=true |
+| `rivet` + `claude` + `local` | reply ok | ACP `usage_update` | probed: mcpTools=true, permissions=true |
+
+The capability probe returns the harness's real flags (Pi and Claude differ), and tool
+delivery routes off `mcpTools`. The structured result carries output, messages, events,
+usage (token split + cost), stopReason, capabilities, sessionId, model, traceId. Python
+compiles and passes `ruff`; TypeScript passes `tsc --strict --noEmit`.
+
+### Review
+
+A high-effort recall review (8 finder angles, 36 candidates, single-vote verify) found 10
+issues, all fixed and re-verified. The notable ones:
+
+- The `usage_update` handling read non-existent `input`/`output` fields, so the Claude/Codex
+  token split was always 0. Fixed: read the split from `PromptResponse.usage` in `runRivet`.
+  Verified Claude now reports input/output (3327/6).
+- `Message.to_wire()` crashed on list (content-block) content. Fixed: `Message.from_raw`
+  coerces blocks into `ContentBlock`; `to_wire` tolerates dicts. Verified a content-block
+  turn returns cleanly.
+- `priorMessages` dropped every prior user turn equal to the prompt, not just the latest. Fixed: only the latest matching turn is dropped.
+- The legacy Pi engine silently swallowed a `claude`/`daytona` selection. Fixed:
+  `_select_backend` upgrades to rivet when the harness/sandbox needs it.
+- The `/tools/call` client was triplicated across `runPi`, `piExtension`, and
+  `toolBridgeServer`. Fixed: one shared `toolClient.ts`.
+- Dead code removed: the `RunCall` alias and a stale type re-export block.
+
+### Live verification (dev stack, 2026-06-17)
+
+Run on the dev box with the agent-pi sidecar and services container reloaded onto this
+branch (both bind-mount the repo):
+
+- **Daytona**: `rivet+pi+daytona` through the live sidecar returned a correct answer in
+  ~14s with usage read back from the in-sandbox extension file.
+- **Full playground run**: the agent app in the `pi-agents` project answered "Hello! The
+  capital of Germany is Berlin." with status Success, 6.54s, 1.2K tokens. The new
+  Harness/Sandbox config selectors render from `schemas.py`.
+- **Trace nesting**: the trace shows `invoke_agent` nested directly under the `_agent`
+  workflow root span (same trace, usage propagated). The agent's run joins the `/invoke`
+  trace as required.
+
+Remaining manual check: a Composio tool end to end through the playground (the tool
+routing is verified by capability; the WP-7 resolution path is unchanged).
+
+## Deferred (documented, not built in this pass)
+
+- Server-side persisted session history (the `SessionStore` / DB). Today the playground
+  holds history and replays it; the session abstraction is in place for when we add the store.
+- Live SSE streaming to the playground client (the event stream is delivered through the
+  port as a log + callback; the HTTP edge stays request and response).
+- Image content blocks end to end (the type is plumbed; the playground does not send images yet).
+- `session/fork`, the folder jail, and the warm shared daemon (all out of scope per WP-8).
diff --git a/docs/design/agent-workflows/harness-port-redesign/plan.md b/docs/design/agent-workflows/harness-port-redesign/plan.md
new file mode 100644
index 0000000000..f7cbc39a99
--- /dev/null
+++ b/docs/design/agent-workflows/harness-port-redesign/plan.md
@@ -0,0 +1,98 @@
+# Build plan
+
+Scope set by the user (2026-06-17): full A to E arc, cold per invoke (no warm daemon).
+See [`status.md`](status.md) for the decisions and [`proposal.md`](proposal.md) for the
+target shape. Each phase ships independently and keeps `/invoke` and `/inspect` working.
+
+Reading key for the file column: `agent.py` is under `services/oss/src/`; `ports.py`,
+`rivet_harness.py`, `schemas.py`, `pi_harness.py`, `pi_http_harness.py`, `local_runtime.py`
+are under `services/oss/src/agent_pi/`; `runRivet.ts`, `runPi.ts`, `server.ts`, `cli.ts`,
+`toolBridge*.ts`, `agenta-otel.ts` are under `services/agent/src/`.
+
+## Phase A. Capabilities and a structured result
+
+Goal: kill the `if harness == "pi"` branches and stop flattening the run to one string.
+
+| Task | Files |
+| --- | --- |
+| Add a `HarnessCapabilities` dataclass (the rivet `AgentCapabilities` flags we use: `mcp_tools`, `images`, `file_attachments`, `plan_mode`, `reasoning`, `permissions`, `usage`, `streaming_deltas`, `session_lifecycle`) | `ports.py` |
+| Probe capabilities once per harness via the rivet SDK `getAgent(id)`; cache; pass to the result | `runRivet.ts` |
+| Replace harness-name branches (tools native vs MCP, tracing `emitSpans`) with capability checks | `runRivet.ts` |
+| Widen `HarnessResult` / `AgentRunResult` to carry `messages`, `usage`, `tool_calls`, `stop_reason`, `capabilities` (data already accumulates in the event handler) | `ports.py`, `runPi.ts`, `rivet_harness.py` |
+| Keep `output` as the derived final string so `/invoke` is unchanged | `agent.py` |
+
+Done when: a Pi run and a Claude run both return a structured result; no code path reads
+`harness == "pi"`; the `/invoke` response body is byte-identical for a simple turn.
+
+## Phase B. Event streaming through the port
+
+Goal: forward the rivet `session/update` stream through the port instead of consuming it
+privately for tracing.
+
+| Task | Files |
+| --- | --- |
+| Define an `AgentEvent` type (variants: `message`, `thought`, `tool_call`, `plan`, `usage`, `done`) mapped from ACP `session/update` | `ports.py`, `runPi.ts` |
+| Add an event sink to `invoke` (callback or async generator); tracing reads from it rather than its own `session.onEvent` | `ports.py`, `rivet_harness.py`, `runRivet.ts`, `agenta-otel.ts` |
+| Transport: stream events over the `/run` hop (NDJSON or SSE) for the HTTP sidecar; keep a final JSON result frame | `server.ts`, `cli.ts`, `rivet_harness.py` |
+| Optional: expose a streaming surface from `agent.py` (feeds WP-4 multi message output); `/invoke` still returns the final message | `agent.py` |
+
+Done when: tracing is built from the forwarded event stream (no private subscription in
+`runRivet.ts`); a caller can observe `message`/`tool_call`/`usage` events live; `/invoke`
+still returns one final message.
+
+## Phase C. First class sessions (cold, replay backed)
+
+Goal: a real `AgentSession` object backed by persisted history. Continue a conversation by
+replaying persisted events into a fresh cold sandbox, not by the caller passing transcript
+text and not by a warm ACP `session/load`.
+
+| Task | Files |
+| --- | --- |
+| Add `create_session(config) -> AgentSession`, `resume_session(id)`, `AgentSession.prompt(...)`, `AgentSession.destroy()` to the port | `ports.py` |
+| Define a `SessionStore` analogue of rivet's `SessionPersistDriver` (`get_session`, `list_events`, `insert_event`); persist the `AgentEvent` stream from Phase B | new module under `services/oss/src/agent_pi/` |
+| Implement continuation as replay: on `resume`, load persisted events, rebuild turn context, run in a fresh cold sandbox (replaces `buildTurnText` transcript replay) | `rivet_harness.py`, `runRivet.ts` |
+| Wire the store: backend DB on the platform, file standalone (default assumption, open Q3) | `agent.py`, new module |
+| Optional: model `session/fork` for "try N variations of a turn" (defer unless a caller exists, open Q5) | `ports.py`, `runRivet.ts` |
+
+Done when: a second turn against a `session_id` reconstructs context from the store (not
+from caller-supplied `messages`); destroying a session drops its history; cold lifecycle
+is unchanged (no warm daemon).
+
+## Phase D. Content blocks, permissions, skills, hooks
+
+Goal: richer input and the remaining config surface.
+
+| Task | Files |
+| --- | --- |
+| Turn `prompt` into ACP content blocks (`text`, `image`, `audio`, `resource`, `resource_link`); gate images/files on `images`/`file_attachments` capability | `ports.py`, `runRivet.ts`, `runPi.ts` |
+| Surface attachments in the workflow input schema so the playground can send them | `schemas.py` |
+| Add a `permission_policy` to the session config (auto-allow, deny, delegate-to-callback); replace the hardcoded auto-approve | `ports.py`, `runRivet.ts` |
+| Optional: surface permission requests as events for human in the loop | `ports.py`, `runRivet.ts`, `agent.py` |
+| Add `skills` to the session config, resolved before the run and laid into `cwd` (or via rivet `setSkillsConfig`) | `ports.py`, `rivet_harness.py`, `runRivet.ts` |
+| Add `hooks` as config artifacts laid into the workspace / agent dir (not a port verb; same shape as the Pi extension install) | `ports.py`, `runRivet.ts` |
+
+Done when: an image attachment reaches a capable harness; a deny policy blocks a tool; a
+skill file and a hook artifact are present in the run and exercised.
+
+## Phase E. Retire the `Runtime` exec port
+
+Goal: fold "where it runs" fully into the environment seam backed by rivet providers.
+
+| Task | Files |
+| --- | --- |
+| Rename/replace `Runtime` with an `Environment` seam (`start`, `dispose`, `destroy`, `pause`, provisioning `put_file`); back lifecycle with `destroySandbox`/`dispose`/`pauseSandbox` | `ports.py`, `rivet_harness.py`, `local_runtime.py` |
+| Move provisioning (AGENTS.md, auth, extension upload) behind `Environment.put_file` | `runRivet.ts`, `rivet_harness.py` |
+| Keep `exec` only while the legacy in-process Pi subprocess transport needs it; otherwise remove | `ports.py`, `pi_harness.py` |
+| Update `_build_harness` to construct the environment from provider config, not an exec runtime | `agent.py` |
+
+Done when: the rivet path no longer depends on `Runtime.exec`; lifecycle calls map to
+rivet provider lifecycle; the legacy Pi path still runs or is explicitly retired.
+
+## Cross cutting
+
+- **Legacy adapters.** `PiHarness` and `PiHttpHarness` must satisfy the widened port at
+  each phase, or be adapted behind a shim. Decide per phase whether to keep them.
+- **Tracing.** The `createRivetOtel` event-stream tracer is the reference consumer of the
+  Phase B stream; keep its output stable so existing traces do not regress.
+- **No regressions to `/invoke` / `/inspect`.** Verify after every phase with a live
+  playground run (the WP-8 verification path).
diff --git a/docs/design/agent-workflows/harness-port-redesign/proposal.md b/docs/design/agent-workflows/harness-port-redesign/proposal.md
new file mode 100644
index 0000000000..8adde2e18f
--- /dev/null
+++ b/docs/design/agent-workflows/harness-port-redesign/proposal.md
@@ -0,0 +1,169 @@
+# Proposal: evolve the ports toward a session shaped seam
+
+## Principle
+
+Borrow rivet's vocabulary, keep the neutral seam. Rivet stays one adapter behind the
+port so the legacy in process Pi path and any future non rivet harness still fit. We are
+not adopting the rivet SDK as our public interface. We are reshaping our port so the rich
+session that rivet already gives us stops getting flattened to a string at the boundary.
+
+Three moves carry most of the value:
+
+1. Split the port into an **Environment** seam (where it runs, its lifecycle) and an
+   **AgentSession** seam (the conversation), matching rivet's plane A and plane B.
+2. Make the turn call **event shaped**: stream structured events, return a structured
+   result. Stop returning one string.
+3. Make a **session a first class object** with create, continue, destroy, backed by a
+   persistence driver, so "continue" uses ACP `session/load` instead of replaying
+   transcript text.
+
+Everything else (capabilities, content blocks, permissions, skills, lifecycle) hangs off
+those three.
+
+## Target shape (conceptual, Python)
+
+Not final signatures. The intent, so the phased plan has a destination.
+
+```python
+# Plane A: where the agent runs and its lifecycle. Rivet providers live below this.
+class Environment(ABC):
+    async def start(self) -> None: ...
+    async def dispose(self) -> None: ...
+    async def destroy(self) -> None: ...        # tear the sandbox down
+    async def pause(self) -> None: ...          # optional, provider dependent
+    # provisioning only, never exposed to the agent author:
+    async def put_file(self, path: str, body: bytes) -> None: ...
+
+# Capabilities the runtime probed from the harness (rivet AgentCapabilities).
+@dataclass
+class HarnessCapabilities:
+    mcp_tools: bool = False
+    images: bool = False
+    file_attachments: bool = False
+    plan_mode: bool = False
+    reasoning: bool = False
+    permissions: bool = False
+    usage: bool = False
+    session_lifecycle: bool = False
+    streaming_deltas: bool = False
+    # ... the rest of the 18 flags
+
+# Plane B: the agent conversation.
+class AgentSession(ABC):
+    id: str
+    capabilities: HarnessCapabilities
+
+    async def prompt(self, blocks: list[ContentBlock]) -> AsyncIterator[AgentEvent]: ...
+    async def destroy(self) -> None: ...
+    # config the harness honors (each is capability gated):
+    async def set_model(self, model: str) -> None: ...
+    async def set_mode(self, mode: str) -> None: ...
+    async def on_permission(self, request: PermissionRequest) -> PermissionReply: ...
+
+class Harness(ABC):
+    async def get_capabilities(self) -> HarnessCapabilities: ...
+    async def create_session(self, config: SessionConfig) -> AgentSession: ...
+    async def resume_session(self, session_id: str) -> AgentSession: ...
+```
+
+`SessionConfig` is the agent config bundle: `agents_md`, `model`, `skills`, `tools`
+(definition plus body plus delivery), `mcp`, `hooks` (as artifacts), `harness`,
+`permission_policy`. `ContentBlock` mirrors ACP: `text | image | audio | resource |
+resource_link`. `AgentEvent` mirrors the `session/update` variants:
+`message`, `thought`, `tool_call`, `plan`, `usage`, `done`.
+
+## Field by field: where today's fields go
+
+| Today (`HarnessRequest`) | Tomorrow |
+| --- | --- |
+| `agents_md` | `SessionConfig.agents_md` (still written as `AGENTS.md`) |
+| `model` | `SessionConfig.model`, applied via `set_model` (capability gated) |
+| `prompt` | a `text` content block in `prompt(blocks)` |
+| `messages` | prior turns become `session/load` replay, not transcript text |
+| `session_id` | `resume_session(id)` returning an `AgentSession` |
+| `tools` / `custom_tools` / `tool_callback` | `SessionConfig.tools`, delivered by capability (MCP vs native) |
+| `trace` | unchanged; still injected at the environment's birth |
+| (new) attachments / images | `image` / `resource` content blocks |
+| (new) per harness behavior | `HarnessCapabilities` instead of `if harness == "pi"` |
+
+`HarnessResult.output` becomes the terminal `done` event plus the accumulated `message`
+events. The single string is still trivially derivable for `/invoke`'s current response.
+
+## How each piece maps to rivet
+
+- **Sessions** → `createSession` / `resumeSession` / `resumeOrCreateSession` /
+  `destroySession`, plus a `SessionPersistDriver`. Adopt the persist driver interface
+  shape so the platform backs it with Postgres and a standalone run backs it with a file,
+  exactly as rivet already splits in memory vs Postgres.
+- **Streaming** → `session.onEvent`. `runRivet.ts` already subscribes for tracing
+  (`otel.handleUpdate`). The change is to forward those events through the port instead of
+  consuming them privately and returning a string.
+- **Capabilities** → `getAgent().capabilities`. Probe once per harness, cache, branch on
+  flags.
+- **Attachments** → ACP content blocks on `prompt`. Gate on `fileAttachments` / `images`.
+- **Skills** → `setSkillsConfig(directory, ...)` or laid into `cwd` as files. Part of
+  `SessionConfig`, resolved before the run like AGENTS.md.
+- **Tools** → keep WP-7's definition plus body plus callback. Deliver over MCP when
+  `mcpTools` is set, native when the harness wants native (today's Pi extension path).
+- **Hooks** → **not a rivet call.** Lay them into the workspace or agent dir as artifacts,
+  the way we already install the Pi tracing extension. Model `hooks` as files in
+  `SessionConfig`, not a port verb.
+- **Permissions** → `onPermissionRequest` / `respondPermission`. Replace the hardcoded
+  auto approve with a `permission_policy` on `SessionConfig` (auto allow, deny, or
+  delegate to a callback), and later surface requests as events for true human in the
+  loop.
+- **Lifecycle / destroy** → `Environment.destroy` / `dispose` and `AgentSession.destroy`,
+  mapping to `destroySandbox` / `dispose` / `destroySession`. Retire the `Runtime.pause`
+  no-op or back it with `pauseSandbox` where the provider supports it.
+
+## What stays out of the port
+
+The system plane: filesystem, process, desktop. We use `writeFsFile` / `mkdirFs` only to
+provision a Daytona sandbox (upload AGENTS.md, auth, the extension). Keep that inside the
+`Environment` adapter as provisioning. Never surface it to the agent config author. The
+agent author sees AGENTS.md, skills, tools, model, harness, attachments. Not a filesystem.
+
+## Phased path (each phase ships and keeps `/invoke` working)
+
+The phases are ordered by value over risk. Stop wherever the payoff flattens.
+
+- **Phase A. Capabilities and structured result.** Probe `getAgent().capabilities`,
+  thread a `HarnessCapabilities` object through, and replace the `harness == "pi"`
+  branches in `runRivet.ts` with capability checks. Widen `HarnessResult` to carry
+  `messages`, `usage`, `tool_calls`, `stop_reason` (the data is already in the event
+  stream). Low risk, immediately removes brittle harness name checks.
+
+- **Phase B. Event streaming through the port.** Add an event channel to `invoke`
+  (callback or async generator) carrying the `session/update` variants. Tracing reads from
+  it instead of a private subscription. `/invoke` still returns the final message, so the
+  HTTP contract is unchanged; client side streaming (WP-4) becomes a small add on.
+
+- **Phase C. First class sessions.** Introduce `create_session` / `resume_session` /
+  `destroy` and a `SessionPersistDriver` analogue. Continue a conversation with ACP
+  `session/load` instead of `buildTurnText` transcript replay. This needs the warm daemon
+  decision (see open questions) because cold per invoke sandboxes cannot hold a session
+  across turns without replay.
+
+- **Phase D. Content blocks, permissions, skills, hooks.** Turn `prompt` into content
+  blocks (attachments, images). Add `permission_policy`. Move skills and hooks into
+  `SessionConfig` as resolved artifacts.
+
+- **Phase E. Retire the `Runtime` exec port.** Fold "where it runs" fully into the
+  `Environment` seam backed by rivet providers. Keep `exec` only as long as the legacy
+  subprocess Pi transport needs it.
+
+## Risks and caveats
+
+- **Cold per invoke lifecycle fights first class sessions.** Phase C is the moment to
+  decide warm vs cold (the WP-8 status calls this out). First class sessions and ACP
+  `session/load` want a daemon that survives between turns, which reopens the per session
+  env and folder jail questions in
+  [`../wp-8-rivet-acp-runtime/isolation-and-fork.md`](../wp-8-rivet-acp-runtime/isolation-and-fork.md).
+- **Harness capability gaps are real.** Pi 0.79.4 has no MCP, so `mcpTools` is false and
+  Pi tools still go native. The capability model makes that explicit instead of surprising.
+- **Usage is harness dependent.** Pi emits no `usage_update` over ACP; Claude does. The
+  structured result must tolerate missing usage (the WP-8 tracing deviation already notes
+  this).
+- **Neutral seam vs rivet coupling.** Mirroring rivet's names risks the port drifting into
+  a rivet wrapper. Keep the port types ours (content blocks, events, capabilities as our
+  dataclasses) and translate in the adapter, so a non rivet harness can still implement it.
diff --git a/docs/design/agent-workflows/harness-port-redesign/research.md b/docs/design/agent-workflows/harness-port-redesign/research.md
new file mode 100644
index 0000000000..e8913189a6
--- /dev/null
+++ b/docs/design/agent-workflows/harness-port-redesign/research.md
@@ -0,0 +1,196 @@
+# Research: our ports vs the rivet SDK
+
+Source verified June 2026 against the installed `sandbox-agent@0.4.2` SDK
+(`services/agent/node_modules/.pnpm/sandbox-agent@0.4.2.../dist/index.d.ts`), the
+`acp-http-client@0.4.2` client, the `@agentclientprotocol/sdk` schema, and our own code
+(`services/oss/src/agent_pi/ports.py`, `services/agent/src/runRivet.ts`,
+`services/oss/src/agent.py`). Method and type names below are copied from those files.
+
+## 1. Our current ports
+
+### `Harness` (`services/oss/src/agent_pi/ports.py`)
+
+```python
+class Harness(ABC):
+    async def setup(self) -> None: ...
+    async def invoke(self, request: HarnessRequest) -> HarnessResult: ...
+    async def shutdown(self) -> None: ...
+```
+
+`HarnessRequest`: `agents_md`, `model`, `prompt`, `messages`, `session_id`, `tools`,
+`custom_tools`, `tool_callback`, `trace`.
+`HarnessResult`: `output` (one string), `session_id`, `model`.
+
+Properties of this port:
+
+- **One shot and blocking.** One turn in, one string out. No incremental events.
+- **Session is a string.** `session_id` is threaded through; "continue" means replaying
+  prior turns as transcript text inside the prompt (`buildTurnText` in `runRivet.ts`),
+  not loading an ACP session.
+- **No capability model.** The service branches on `harness == "pi"` to decide tool
+  delivery and tracing (see `runRivet.ts`).
+- **Text only.** `prompt` is a string; `messages` are `{role, content: str}`.
+- **No permissions, modes, thought level, plan, usage, tool call surfacing.**
+
+### `Runtime` (`services/oss/src/agent_pi/ports.py`)
+
+```python
+class Runtime(ABC):
+    async def start(self) -> None: ...
+    async def shutdown(self) -> None: ...
+    async def pause(self) -> None: ...            # no-op default
+    async def connect_volume(self, ...) -> None:  # no-op default
+    async def exec(self, command, input_bytes, *, cwd, env, timeout) -> ExecResult: ...
+```
+
+This is a generic "run a subprocess and feed it stdin" port. It predates rivet. The rivet
+path only uses `exec` for the local subprocess transport; the real "where it runs" choice
+(local vs daytona) now lives in `runRivet.ts` as the rivet provider. So this port is now
+half vestigial.
+
+### The wire contract (`AgentRunRequest` / `AgentRunResult` in `runPi.ts`)
+
+Mirrors `HarnessRequest`/`HarnessResult` plus `harness`, `sandbox`, `traceId`. Also one
+shot. `/run` returns the final result; no streaming endpoint exists.
+
+## 2. The rivet SDK surface
+
+Rivet splits cleanly into three planes.
+
+### Plane A. Runtime / sandbox: `SandboxAgent`
+
+The control plane and the environment.
+
+- Construct and connect: `SandboxAgent.start({ sandbox, persist, replayMaxEvents, replayMaxChars, token, signal })`, `SandboxAgent.connect({ baseUrl })`.
+- **Lifecycle:** `dispose()`, `destroySandbox()`, `pauseSandbox()`, `killSandbox()`.
+- **Session registry:** `createSession`, `resumeSession`, `resumeOrCreateSession`,
+  `destroySession`, `listSessions`, `getSession`, `getEvents`.
+- **Capability discovery:** `listAgents`, `getAgent` (returns `AgentInfo` with
+  `capabilities`, `configOptions`, `installed`, `credentialsAvailable`), `installAgent`.
+- **Config plane (per directory):** `getSkillsConfig`/`setSkillsConfig`/`deleteSkillsConfig`
+  and `getMcpConfig`/`setMcpConfig`/`deleteMcpConfig`.
+
+The sandbox is chosen by a provider passed to `start`: `local`, `daytona`, `e2b`,
+`docker`, `vercel`, `cloudflare`, `modal`, `computesdk`, `sprites`. This is the real
+environment seam, and it is richer than our `Runtime.exec`.
+
+### Plane B. Agent session: `Session`
+
+The agent conversation. This is the heart of what we should adopt.
+
+```ts
+class Session {
+  prompt(prompt: ContentBlock[]): Promise<PromptResponse>;
+  setModel(model): ...; setMode(modeId): ...; setThoughtLevel(level): ...;
+  setConfigOption(id, value): ...;
+  getConfigOptions(): ...; getModes(): ...;
+  onEvent(listener): () => void;
+  onPermissionRequest(listener): () => void;
+  respondPermission(permissionId, reply): ...;   // reply: "once" | "always" | "reject"
+  rawSend(method, params): ...;                   // escape hatch
+}
+```
+
+- **Multimodal input.** `prompt` takes ACP content blocks. The block `type` is one of
+  `text`, `image`, `audio`, `resource`, `resource_link`. Attachments and images ride here.
+- **Live structured events.** `onEvent` delivers ACP `session/update` notifications.
+  The variants (verified in the ACP schema):
+
+  | `sessionUpdate` | Meaning |
+  | --- | --- |
+  | `agent_message_chunk` | assistant text delta or snapshot |
+  | `agent_thought_chunk` | reasoning / thinking |
+  | `user_message_chunk` | echoed user content |
+  | `tool_call` / `tool_call_update` | a tool started / progressed / finished |
+  | `plan` | the agent's plan (plan mode) |
+  | `available_commands_update` | slash commands available |
+  | `config_option_update` / `current_mode_update` | config or mode changed mid run |
+  | `usage_update` | token usage |
+  | `session_info_update` | session metadata |
+
+- **Permissions / human in the loop.** `onPermissionRequest` + `respondPermission`.
+  Today `runRivet.ts` auto approves these; the policy is hardcoded, not expressed in the
+  port.
+
+### Plane C. System: filesystem, process, desktop
+
+`SandboxAgent` also exposes the sandbox internals: `readFsFile`, `writeFsFile`, `mkdirFs`,
+`moveFs`, `uploadFsBatch`; `runProcess`, `createProcess`, `followProcessLogs`,
+`connectProcessTerminal`; and a full desktop API (mouse, keyboard, screenshot, recording,
+WebRTC stream). These are **not** part of the agent config contract. We use a few of them
+(`writeFsFile`, `mkdirFs`) only to provision a Daytona sandbox in `runRivet.ts`. They
+belong to the runtime/sandbox adapter, never to the agent author.
+
+### Persistence and replay
+
+`SandboxAgent.start({ persist })` takes a `SessionPersistDriver`:
+
+```ts
+interface SessionPersistDriver {
+  getSession(id): Promise<SessionRecord | undefined>;
+  listSessions(req?): Promise<ListPage<SessionRecord>>;
+  updateSession(session): Promise<void>;
+  listEvents(req): Promise<ListPage<SessionEvent>>;
+  insertEvent(sessionId, event): Promise<void>;
+}
+```
+
+`InMemorySessionPersistDriver` ships; Postgres is wired in the daemon. A `SessionEvent`
+carries `eventIndex`, `sender` ("client" | "agent"), and the ACP `payload`. Replay is
+bounded by `replayMaxEvents` / `replayMaxChars`. `runRivet.ts` already constructs an
+`InMemorySessionPersistDriver`, but because each invoke is a cold sandbox, it never spans
+turns. The continue path falls back to transcript text instead.
+
+### Capability model: `AgentCapabilities`
+
+`getAgent(id)` returns capabilities the runtime probed from the harness:
+
+```
+commandExecution, errorEvents, fileAttachments, fileChanges, images, itemStarted,
+mcpTools, permissions, planMode, questions, reasoning, sessionLifecycle, sharedProcess,
+status, streamingDeltas, textMessages, toolCalls, toolResults
+```
+
+This is the clean answer to the `if harness == "pi"` branching we do today. The service
+should ask "does this harness support `mcpTools` / `images` / `planMode`" (plus a `usage`
+flag of our own; rivet's list above has none) and degrade, rather than hardcode harness names.
+
+### Session lifecycle in ACP (what the protocol allows)
+
+The ACP schema defines `session/new`, `session/load` (replay), `session/prompt`,
+`session/cancel`, plus `ForkSessionRequest`/`ForkSessionResponse` and
+`ResumeSessionRequest`/`ResumeSessionResponse`. **Fork is a first class ACP operation.**
+That connects to [`../wp-8-rivet-acp-runtime/isolation-and-fork.md`](../wp-8-rivet-acp-runtime/isolation-and-fork.md):
+a forked session is a cheap branch point for "try N variations of a turn", separate from
+the filesystem jail discussed there.
+
+### Hooks: not in the SDK
+
+A grep for `hook` across `sandbox-agent/dist` and `acp-http-client` returns nothing.
+Rivet has no hook concept. Hooks exist inside the harnesses (Pi loads extensions and
+settings from `~/.pi/agent`; Claude reads its own hook config). So "set up hooks" is not a
+rivet control plane call. It is an agent config artifact: files and settings laid into the
+workspace or agent dir before the run. Our Pi tracing extension is exactly this shape
+already (`installPiExtensionLocal` / `uploadPiExtensionToSandbox` in `runRivet.ts`).
+
+## 3. Side by side
+
+| Concern | Our `Harness` port today | Rivet SDK |
+| --- | --- | --- |
+| Turn call | `invoke(req) -> str` (blocking) | `session.prompt(blocks)` + `onEvent` stream + `PromptResponse` |
+| Output | single string | structured events: text, thought, tool calls, plan, usage |
+| Session | `session_id` string, transcript replay | `Session` object; create / load / resume / fork / destroy |
+| Persistence | none (history held by caller) | `SessionPersistDriver` (in memory or Postgres), bounded replay |
+| Input modality | text only | content blocks (text, image, audio, resource, resource_link) |
+| Model / mode | `model` field | `setModel`, `setMode`, `setThoughtLevel`, `getConfigOptions` |
+| Capabilities | `if harness == "pi"` | `getAgent().capabilities` (18 flags) |
+| Tools | `custom_tools` + `tool_callback` | per directory MCP config + capability `mcpTools` |
+| Skills | not in port | per directory `setSkillsConfig` (artifacts on disk) |
+| Hooks | not in port | not in rivet either; harness config artifacts |
+| Permissions | hardcoded auto approve in `runRivet.ts` | `onPermissionRequest` / `respondPermission` policy |
+| Environment | `Runtime.exec(cmd, stdin)` | sandbox providers (local, daytona, e2b, docker, ...) |
+| Lifecycle | `Runtime.pause` no-op stub | `destroySession`, `destroySandbox`, `pauseSandbox`, `killSandbox`, `dispose` |
+| System (fs/proc/desktop) | absent (correct) | present on `SandboxAgent`, used only for provisioning |
+
+The gap is not that our port is wrong. It is that it stops at "send a turn, get text",
+while rivet models the whole session as a first class, observable, resumable object.
diff --git a/docs/design/agent-workflows/harness-port-redesign/status.md b/docs/design/agent-workflows/harness-port-redesign/status.md
new file mode 100644
index 0000000000..3301844dae
--- /dev/null
+++ b/docs/design/agent-workflows/harness-port-redesign/status.md
@@ -0,0 +1,62 @@
+# Status
+
+Source of truth for this design effort. Keep it current.
+
+## Current state
+
+Research and proposal drafted (2026-06-17). Nothing implemented. The comparison is in
+[`research.md`](research.md); the recommended shape and phased path are in
+[`proposal.md`](proposal.md). This builds on the shipped WP-8 runtime
+([`../wp-8-rivet-acp-runtime/status.md`](../wp-8-rivet-acp-runtime/status.md)), which
+adopted rivet unmodified and kept the ports unchanged on purpose.
+
+## Recommendation in one line
+
+Evolve the ports in phases (A: capabilities + structured result, B: event streaming,
+C: first class sessions, D: content blocks + permissions + skills + hooks, E: retire the
+`Runtime.exec` port), keeping rivet behind the seam and `/invoke` working at every step.
+
+## Decisions taken
+
+| Decision | Rationale |
+| --- | --- |
+| Keep a neutral port; rivet stays one adapter behind it | Legacy Pi path and future non rivet harnesses still fit; avoids the port becoming a rivet wrapper |
+| Split the port into Environment (plane A) and AgentSession (plane B) | Matches rivet's own split; our single `invoke` collapses both today |
+| System plane (fs/process/desktop) stays out of the harness port | It is provisioning, used only by the Environment adapter; never exposed to the agent author |
+| Hooks are config artifacts, not a port verb | Rivet has no hook API; hooks live inside the harnesses, read from disk |
+| Adopt a capability model over `if harness == "pi"` | Rivet already probes `getAgent().capabilities`; removes brittle name checks |
+| Structured result + event stream replace the single string | The data already flows through `runRivet.ts` for tracing; the port flattens it |
+
+## User decisions (2026-06-17)
+
+1. **Ambition: full A to E arc.** Plan all five phases, including first class sessions and
+   retiring the `Runtime.exec` port. See [`plan.md`](plan.md).
+2. **Session model: stay cold and replay.** Keep WP-8's one daemon per invoke. Do not
+   stand up a warm daemon. This avoids the per session env channel and the folder jail.
+
+### Reconciling "first class sessions" with "stay cold"
+
+A warm daemon is the usual way to get ACP `session/load`. We are not doing that. So Phase
+C gives a first class `AgentSession` object in the **port** backed by a persisted history,
+and the adapter implements "continue" by **replaying persisted events into a fresh cold
+sandbox** each turn (the WP-8 model, but the history lives in a persistence driver instead
+of being passed in by the caller). The session abstraction is real and stable; the
+continuation mechanism stays replay. ACP `session/load` is reserved for a future warm
+daemon and is explicitly out of scope.
+
+## Open questions (still need the user)
+
+3. **Persistence ownership.** Where does the event history live: the backend DB on the
+   platform, a file when standalone, or rivet's own Postgres? Default assumption in
+   [`plan.md`](plan.md): backend DB on the platform, a file when standalone, mirroring how
+   WP-8 framed the history store.
+4. **Streaming at the HTTP edge.** Phase B streams events through the port but keeps
+   `/invoke` request/response. A streaming endpoint (ties into WP-4 multi message output)
+   is planned as a Phase B option, not a hard requirement. Confirm if wanted now.
+5. **Fork.** ACP exposes `session/fork`. The plan treats it as an optional Phase C addition
+   for "try N variations of a turn". Defer unless there is a caller.
+
+## Next step
+
+Build plan is in [`plan.md`](plan.md). Phase A is the entry point. Open questions 3 to 5
+do not block Phase A or B; settle them before Phase C.
diff --git a/services/agent/src/agenta-otel.ts b/services/agent/src/agenta-otel.ts
index 35bdeffd6a..65045a2dd9 100644
--- a/services/agent/src/agenta-otel.ts
+++ b/services/agent/src/agenta-otel.ts
@@ -52,6 +52,8 @@ import type {
 import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
 import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
 
+import type { AgentEvent, AgentUsage } from "./protocol.ts";
+
 // ---------------------------------------------------------------------------
 // Shared, process-wide tracing infrastructure
 // ---------------------------------------------------------------------------
@@ -250,6 +252,8 @@ export interface RunConfig {
   authorization?: string;
   /** W3C traceparent from the caller; nests invoke_agent under that span. */
   traceparent?: string;
+  /** W3C baggage from the caller (carried for future use). */
+  baggage?: string;
   /** Drop prompt/completion/tool I/O from spans when false. */
   captureContent: boolean;
   /** Pi session id, set after createAgentSession so spans carry session.id. */
@@ -680,6 +684,10 @@ export interface RivetOtel {
   traceId(): string | undefined;
   /** Accumulated assistant output text so far. */
   output(): string;
+  /** The structured event log built from the ACP stream (tool calls, usage, final message). */
+  events(): AgentEvent[];
+  /** Run token/cost totals from the stream, when the harness reported `usage_update`. */
+  usage(): AgentUsage | undefined;
 }
 
 /**
@@ -703,7 +711,8 @@ export function createRivetOtel(init: RivetOtelInit): RivetOtel {
   let llmSpan: Span | undefined;
   let runTraceId: string | undefined;
   let accumulated = "";
-  let usage: { cost?: number; total?: number } | undefined;
+  let usage: AgentUsage | undefined;
+  const events: AgentEvent[] = [];
   const toolSpans = new Map<string, { span: Span; name: string }>();
 
   function start(input: { prompt?: string; messages?: any[]; sessionId?: string }): void {
@@ -775,6 +784,7 @@ export function createRivetOtel(init: RivetOtelInit): RivetOtel {
       if (update.rawInput != null)
         setInputs(span, update.rawInput as Record<string, unknown>, capture);
       toolSpans.set(id, { span, name: String(name) });
+      events.push({ type: "tool_call", id: String(id), name: String(name), input: update.rawInput });
       // A tool_call can arrive already completed (status set up front).
       maybeCloseTool(id, update);
       return;
@@ -786,12 +796,18 @@ export function createRivetOtel(init: RivetOtelInit): RivetOtel {
     }
 
     if (kind === "usage_update") {
+      // ACP usage_update carries only `used` (context tokens) and `cost.amount`. The
+      // per-call input/output split is NOT on the stream; it rides on the PromptResponse,
+      // which runRivet.ts reads. Keep total + cost here and leave the split to the caller.
       const cost = update.cost?.amount;
       const total = update.used;
       usage = {
-        cost: typeof cost === "number" ? cost : usage?.cost,
-        total: typeof total === "number" ? total : usage?.total,
+        input: usage?.input ?? 0,
+        output: usage?.output ?? 0,
+        total: typeof total === "number" ? total : usage?.total ?? 0,
+        cost: typeof cost === "number" ? cost : usage?.cost ?? 0,
       };
+      events.push({ type: "usage", ...usage });
     }
   }
 
@@ -807,10 +823,15 @@ export function createRivetOtel(init: RivetOtelInit): RivetOtel {
     if (status === "failed") entry.span.setStatus({ code: SpanStatusCode.ERROR });
     entry.span.end();
     toolSpans.delete(id);
+    events.push({ type: "tool_result", id, output: out, isError: status === "failed" });
   }
 
   function finish(): string {
     const text = stripStartupBanner(accumulated.trim());
+    // The event log is independent of span emission, so build its tail either way: the
+    // final assistant message, then the terminal done marker.
+    if (text) events.push({ type: "message", text });
+    events.push({ type: "done" });
     if (!emitSpans) return text;
     if (llmSpan) {
       emitMessages(
@@ -849,5 +870,7 @@ export function createRivetOtel(init: RivetOtelInit): RivetOtel {
     flush: () => flushTrace(runTraceId),
     traceId: () => runTraceId,
     output: () => accumulated,
+    events: () => events,
+    usage: () => usage,
   };
 }
diff --git a/services/agent/src/cli.ts b/services/agent/src/cli.ts
index 3eacd78abc..8de7d8a97c 100644
--- a/services/agent/src/cli.ts
+++ b/services/agent/src/cli.ts
@@ -6,14 +6,15 @@
  * to stderr. This is the one-shot "json adapter" the design doc describes; a
  * long-lived RPC adapter can replace it later behind the same Python-side port.
  */
-import { runPi, type AgentRunRequest, type AgentRunResult } from "./runPi.ts";
+import type { AgentRunRequest, AgentRunResult } from "./protocol.ts";
+import { runPi } from "./runPi.ts";
 import { runRivet } from "./runRivet.ts";
 
-// `rivet` drives the harness over ACP via a rivet daemon (WP-8); default = legacy Pi.
-const BACKEND = (process.env.AGENT_BACKEND ?? "pi").toLowerCase();
-
+// Engine: `rivet` drives a harness over ACP via a rivet daemon; `pi` (default) is the
+// legacy in-process Pi path. The request's `backend` wins, then the AGENT_BACKEND env.
 function runAgent(request: AgentRunRequest): Promise<AgentRunResult> {
-  return BACKEND === "rivet" ? runRivet(request) : runPi(request);
+  const backend = (request.backend ?? process.env.AGENT_BACKEND ?? "pi").toLowerCase();
+  return backend === "rivet" ? runRivet(request) : runPi(request);
 }
 
 async function readStdin(): Promise<string> {
diff --git a/services/agent/src/piExtension.ts b/services/agent/src/piExtension.ts
index 94418a137f..9af88d8ee9 100644
--- a/services/agent/src/piExtension.ts
+++ b/services/agent/src/piExtension.ts
@@ -27,79 +27,20 @@ import { writeFileSync } from "node:fs";
 import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
 
 import { createAgentaOtel } from "./agenta-otel.ts";
-
-interface ToolSpec {
-  name: string;
-  description?: string;
-  inputSchema?: Record<string, unknown> | null;
-  callRef: string;
-}
-
-const TOOL_CALL_TIMEOUT_MS = Number(process.env.AGENTA_AGENT_TOOL_CALL_TIMEOUT_MS ?? 30000);
-const EMPTY_SCHEMA = { type: "object", properties: {}, additionalProperties: true };
+import type { ResolvedToolSpec } from "./protocol.ts";
+import { EMPTY_OBJECT_SCHEMA, callAgentaTool } from "./toolClient.ts";
 
 function log(message: string): void {
   process.stderr.write(`[agenta-pi-ext] ${message}\n`);
 }
 
-/** One /tools/call round-trip. Returns the result text; throws on failure (Pi turns a
- *  thrown execute into a tool-error result, so the loop continues). */
-async function callAgentaTool(
-  endpoint: string,
-  authorization: string | undefined,
-  callRef: string,
-  toolCallId: string,
-  args: unknown,
-  signal?: AbortSignal,
-): Promise<string> {
-  const headers: Record<string, string> = { "content-type": "application/json" };
-  if (authorization) headers["authorization"] = authorization;
-
-  const timeoutSignal = AbortSignal.timeout(TOOL_CALL_TIMEOUT_MS);
-  const anyOf = (AbortSignal as any).any;
-  const combined =
-    signal && typeof anyOf === "function" ? anyOf([signal, timeoutSignal]) : timeoutSignal;
-
-  let response: Response;
-  try {
-    response = await fetch(endpoint, {
-      method: "POST",
-      headers,
-      body: JSON.stringify({
-        data: {
-          id: toolCallId,
-          type: "function",
-          function: { name: callRef, arguments: args ?? {} },
-        },
-      }),
-      signal: combined,
-    });
-  } catch (err) {
-    throw new Error(`tool call ${callRef} failed: ${err instanceof Error ? err.message : String(err)}`);
-  }
-
-  const bodyText = await response.text();
-  if (!response.ok) {
-    throw new Error(`tool call ${callRef} returned HTTP ${response.status}: ${bodyText.slice(0, 500)}`);
-  }
-  try {
-    const parsed = JSON.parse(bodyText);
-    const content = parsed?.call?.data?.content;
-    if (typeof content === "string") return content;
-    if (content != null) return JSON.stringify(content);
-    return bodyText;
-  } catch {
-    return bodyText;
-  }
-}
-
 /** Register the resolved tools (from env) as Pi tools that call back to Agenta. */
 function registerTools(pi: ExtensionAPI): void {
   const raw = process.env.AGENTA_TOOL_SPECS;
   const endpoint = process.env.AGENTA_TOOL_CALLBACK_ENDPOINT;
   if (!raw || !endpoint) return;
 
-  let specs: ToolSpec[] = [];
+  let specs: ResolvedToolSpec[] = [];
   try {
     specs = JSON.parse(raw);
   } catch (err) {
@@ -114,7 +55,7 @@ function registerTools(pi: ExtensionAPI): void {
       label: spec.name,
       description: spec.description ?? spec.name,
       // Pi accepts plain JSON Schema here (non-TypeBox validation path).
-      parameters: (spec.inputSchema as any) ?? EMPTY_SCHEMA,
+      parameters: (spec.inputSchema as any) ?? EMPTY_OBJECT_SCHEMA,
       async execute(toolCallId: string, params: unknown, signal?: AbortSignal) {
         const text = await callAgentaTool(
           endpoint,
diff --git a/services/agent/src/protocol.ts b/services/agent/src/protocol.ts
new file mode 100644
index 0000000000..4880e6f093
--- /dev/null
+++ b/services/agent/src/protocol.ts
@@ -0,0 +1,164 @@
+/**
+ * The `/run` wire contract, shared by both backends.
+ *
+ * The Python side mirrors these names in `services/oss/src/agent_pi/wire.py`. Keeping the
+ * request/result/event/capability types here (rather than in one runner that the other
+ * imports from) is what lets `runPi.ts` and `runRivet.ts` stay peers.
+ */
+
+/** One piece of a message. `text` is all the playground sends today; the rest is plumbed. */
+export interface ContentBlock {
+  type: "text" | "image" | "resource" | string;
+  text?: string;
+  data?: string;
+  mimeType?: string;
+  uri?: string;
+}
+
+export interface ChatMessage {
+  role: string;
+  /** A plain string, or ACP-style content blocks (text/image/resource). */
+  content: string | ContentBlock[];
+}
+
+/**
+ * Trace context threaded in from the Agenta service so the agent run joins the caller's
+ * /invoke trace instead of starting its own. All fields optional; with none set the run is
+ * traced standalone (or not at all) using env config.
+ */
+export interface TraceContext {
+  traceparent?: string;
+  baggage?: string;
+  endpoint?: string;
+  authorization?: string;
+  captureContent?: boolean;
+}
+
+/**
+ * A runnable tool the backend already resolved from the agent config: name + description +
+ * JSON-Schema params for the model, plus the `callRef` slug the execution bridge sends back
+ * to Agenta's /tools/call. The Composio key and connection auth stay server-side.
+ */
+export interface ResolvedToolSpec {
+  name: string;
+  description?: string;
+  inputSchema?: Record<string, unknown> | null;
+  callRef: string;
+}
+
+/** Where and how to route a tool call back through Agenta. */
+export interface ToolCallbackContext {
+  endpoint: string;
+  authorization?: string;
+}
+
+/**
+ * What a harness can do, probed from the runtime (rivet `AgentCapabilities`). The runner
+ * branches on these flags instead of the harness name, and returns them in the result.
+ */
+export interface HarnessCapabilities {
+  textMessages?: boolean;
+  images?: boolean;
+  fileAttachments?: boolean;
+  mcpTools?: boolean;
+  toolCalls?: boolean;
+  reasoning?: boolean;
+  planMode?: boolean;
+  permissions?: boolean;
+  usage?: boolean;
+  streamingDeltas?: boolean;
+  sessionLifecycle?: boolean;
+}
+
+/** One structured run event. Mirrors the ACP `session/update` variants we surface. */
+export type AgentEvent =
+  | { type: "message"; text: string }
+  | { type: "thought"; text: string }
+  | { type: "tool_call"; id?: string; name?: string; input?: unknown }
+  | { type: "tool_result"; id?: string; output?: string; isError?: boolean }
+  | { type: "usage"; input?: number; output?: number; total?: number; cost?: number }
+  | { type: "error"; message: string }
+  | { type: "done"; stopReason?: string };
+
+/** Run token/cost totals, rolled up onto the caller's workflow span. */
+export interface AgentUsage {
+  input: number;
+  output: number;
+  total: number;
+  cost: number;
+}
+
+export interface AgentRunRequest {
+  /** Engine: "rivet" (ACP) or "pi" (legacy in-process). Routed on by cli.ts/server.ts. */
+  backend?: string;
+  /** Harness id for the rivet backend ("pi" / "claude"). */
+  harness?: string;
+  /** Sandbox for the rivet backend ("local" / "daytona"). */
+  sandbox?: string;
+  /** Continue a prior run by replaying its history. */
+  sessionId?: string;
+  /** Provider API keys as env vars ({OPENAI_API_KEY,...}), resolved from the vault. */
+  secrets?: Record<string, string>;
+  /** AGENTS.md text injected as the agent's instructions. */
+  agentsMd?: string;
+  /** Model id ("gpt-5.5") or "provider/id" ("openai-codex/gpt-5.5"). */
+  model?: string;
+  /** Explicit latest turn. Falls back to the last user message in `messages`. */
+  prompt?: string;
+  /** The conversation so far; the runner picks the latest turn and replays the rest. */
+  messages?: ChatMessage[];
+  /** Built-in tools to enable. */
+  tools?: string[];
+  /** Resolved runnable tools (WP-7). */
+  customTools?: ResolvedToolSpec[];
+  /** Where customTools route their calls back to. Required when customTools is set. */
+  toolCallback?: ToolCallbackContext;
+  /** How a permission-gating harness handles tool-use prompts: "auto" (default) | "deny". */
+  permissionPolicy?: string;
+  /** Tracing: thread the Agenta trace context across the boundary. */
+  trace?: TraceContext;
+}
+
+export interface AgentRunResult {
+  ok: boolean;
+  /** Final assistant text (what the playground renders). */
+  output?: string;
+  /** Structured assistant messages for the turn. */
+  messages?: ChatMessage[];
+  /** Structured event log for the turn. */
+  events?: AgentEvent[];
+  /** Run token/cost totals, for roll-up onto the caller's workflow span. */
+  usage?: AgentUsage;
+  /** Why the turn ended (harness-reported when available). */
+  stopReason?: string;
+  /** What the harness was probed to support this run. */
+  capabilities?: HarnessCapabilities;
+  sessionId?: string;
+  model?: string;
+  /** Trace id of the run (the caller's trace when a traceparent was passed). */
+  traceId?: string;
+  error?: string;
+}
+
+/** Flatten a message's content (string or content blocks) to its text. */
+export function messageText(content: string | ContentBlock[] | undefined): string {
+  if (!content) return "";
+  if (typeof content === "string") return content;
+  return content
+    .filter((block) => block?.type === "text" && typeof block.text === "string")
+    .map((block) => block.text)
+    .join("");
+}
+
+/** The latest user turn: explicit prompt, else last user message content. */
+export function resolvePromptText(request: AgentRunRequest): string {
+  if (request.prompt && request.prompt.trim()) return request.prompt;
+  const messages = request.messages ?? [];
+  for (let i = messages.length - 1; i >= 0; i--) {
+    if (messages[i].role === "user") {
+      const text = messageText(messages[i].content);
+      if (text) return text;
+    }
+  }
+  return "";
+}
diff --git a/services/agent/src/runPi.ts b/services/agent/src/runPi.ts
index 74a7ab98ac..1ae732f555 100644
--- a/services/agent/src/runPi.ts
+++ b/services/agent/src/runPi.ts
@@ -1,20 +1,20 @@
 /**
- * WP-2 Pi harness driver.
+ * Legacy backend: drive the Pi SDK in-process for one cold run.
  *
- * This is the concrete "harness" behind the service's Harness port. It drives the
- * Pi SDK (`createAgentSession`) for a single run: it injects the agent's AGENTS.md
- * in memory, resolves the model, sends one user turn, and returns the final
- * assistant text. It also turns the backend-resolved runnable tools (WP-7) into Pi
- * customTools that route back through Agenta's /tools/call. No streaming and no
- * session persistence yet; those are later work packages.
+ * This is the non-rivet engine. It drives Pi's `createAgentSession` directly: injects
+ * AGENTS.md in memory, resolves the model, sends one user turn, and returns the structured
+ * result (final text, messages, events, usage, capabilities). It also turns the
+ * backend-resolved runnable tools (WP-7) into Pi customTools that route back through
+ * Agenta's /tools/call. The rivet backend (`runRivet.ts`) is the ACP path; both serve the
+ * same `/run` contract (see `protocol.ts`).
  *
- * Auth: uses `AuthStorage.create()`, which reads ~/.pi/agent/auth.json (the local
- * Pi login). Set OPENAI_API_KEY / ANTHROPIC_API_KEY in the environment as an
- * alternative. Nothing invocation-specific is written to a persistent disk: the
- * session is in-memory and the working dir is a throwaway temp dir.
+ * Auth: provider keys arrive as `request.secrets` (applied to the env) or fall back to the
+ * local Pi login (`AuthStorage.create()` reads ~/.pi/agent/auth.json). Nothing
+ * invocation-specific is written to a persistent disk: the session is in-memory and the
+ * working dir is a throwaway temp dir.
  *
- * Important: stdout is reserved for the JSON result (see cli.ts). Everything here
- * logs to stderr so it never pollutes the result channel.
+ * Important: stdout is reserved for the JSON result (see cli.ts). Everything here logs to
+ * stderr so it never pollutes the result channel.
  */
 import { mkdtempSync, rmSync } from "node:fs";
 import { tmpdir } from "node:os";
@@ -31,127 +31,55 @@ import {
 } from "@earendil-works/pi-coding-agent";
 
 import { createAgentaOtel } from "./agenta-otel.ts";
-
-export interface ChatMessage {
-  role: string;
-  content: string;
-}
-
-/**
- * Trace context threaded in from the Agenta service so the agent run joins the
- * caller's /invoke trace instead of starting its own. All fields are optional;
- * with none set the run is traced standalone (or not at all) using env config.
- */
-export interface TraceContext {
-  /** W3C traceparent of the caller's workflow span. Nests invoke_agent under it. */
-  traceparent?: string;
-  /** W3C baggage from the caller (carried for future use). */
-  baggage?: string;
-  /** OTLP traces endpoint (e.g. https://host/api/otlp/v1/traces). */
-  endpoint?: string;
-  /** Full Authorization header for the OTLP export (e.g. "ApiKey ..." / "Secret ..."). */
-  authorization?: string;
-  /** Drop prompt/completion/tool I/O from spans when false. Default true. */
-  captureContent?: boolean;
-}
-
-/**
- * A runnable tool the backend already resolved from the agent config: name +
- * description + JSON-Schema params for the model, plus the `callRef` slug the
- * execution bridge sends back to Agenta's /tools/call. The Composio key and the
- * connection auth stay server-side; this sandbox never sees them.
- */
-export interface ResolvedToolSpec {
-  /** Function name shown to the model (e.g. "gmail__SEND_EMAIL"). */
-  name: string;
-  /** Description shown to the model. Resolved live from the provider catalog. */
-  description?: string;
-  /** JSON Schema for the tool arguments. Pi accepts plain JSON Schema here. */
-  inputSchema?: Record<string, unknown> | null;
-  /** "tools.{provider}.{integration}.{action}.{connection}" — the /tools/call slug. */
-  callRef: string;
-}
-
-/**
- * Where and how to route a tool call back through Agenta. The backend builds the
- * full /tools/call URL and threads the same credential the OTLP export rides on.
- */
-export interface ToolCallbackContext {
-  /** Full /tools/call URL. */
-  endpoint: string;
-  /** Authorization header value for the callback (project-scoped). */
-  authorization?: string;
-}
-
-export interface AgentRunRequest {
-  /** Harness id for the rivet backend ("pi" / "claude"). Ignored by the Pi backend. */
-  harness?: string;
-  /** Sandbox for the rivet backend ("local" / "daytona"). Ignored by the Pi backend. */
-  sandbox?: string;
-  /** Continue a prior run by replaying its history. The rivet backend resumes by id. */
-  sessionId?: string;
-  /** Provider API keys as env vars ({OPENAI_API_KEY,...}), resolved from the vault.
-   *  Injected into the harness env; empty means the harness uses its own login (OAuth). */
-  secrets?: Record<string, string>;
-  /** AGENTS.md text injected as the agent's instructions (in memory). */
-  agentsMd?: string;
-  /** Model id ("gpt-5.5") or "provider/id" ("openai-codex/gpt-5.5"). */
-  model?: string;
-  /** The user turn to send. Falls back to the last user message. */
-  prompt?: string;
-  /** Optional prior message history. MVP sends the latest user turn only. */
-  messages?: ChatMessage[];
-  /** Built-in tools to enable. MVP default: none. */
-  tools?: string[];
-  /** Resolved runnable tools (WP-7), turned into Pi customTools below. */
-  customTools?: ResolvedToolSpec[];
-  /** Where customTools route their calls back to. Required when customTools is set. */
-  toolCallback?: ToolCallbackContext;
-  /** Tracing: thread the Agenta trace context across the boundary. */
-  trace?: TraceContext;
-}
-
-export interface AgentRunResult {
-  ok: boolean;
-  output?: string;
-  sessionId?: string;
-  model?: string;
-  /** Trace id of the run (the caller's trace when a traceparent was passed). */
-  traceId?: string;
-  /** Run token/cost totals, for roll-up onto the caller's workflow span. */
-  usage?: { input: number; output: number; total: number; cost: number };
-  error?: string;
-}
+import {
+  type AgentEvent,
+  type AgentRunRequest,
+  type AgentRunResult,
+  type ChatMessage,
+  type HarnessCapabilities,
+  type ResolvedToolSpec,
+  type ToolCallbackContext,
+  resolvePromptText,
+} from "./protocol.ts";
+import { EMPTY_OBJECT_SCHEMA, callAgentaTool } from "./toolClient.ts";
+
+/** What the in-process Pi engine supports. Static (no daemon to probe, unlike rivet). */
+const PI_CAPABILITIES: HarnessCapabilities = {
+  textMessages: true,
+  toolCalls: true,
+  reasoning: true,
+  usage: true,
+  streamingDeltas: true,
+  images: false,
+  fileAttachments: false,
+  mcpTools: false,
+  planMode: false,
+  permissions: false,
+  sessionLifecycle: false,
+};
 
 function log(message: string): void {
   process.stderr.write(`[pi-wrapper] ${message}\n`);
 }
 
+/** Export every non-empty vault-resolved provider key to process.env so Pi's model auth
+ *  can read it. Empty values are skipped; this function only sets keys, never unsets. */
+function applySecrets(secrets: Record<string, string> | undefined): void {
+  const present = Object.entries(secrets ?? {}).filter(([, secret]) => Boolean(secret));
+  for (const [name, secret] of present) process.env[name] = secret;
+}
+
 /** Pick the requested model, else gpt-5.5, else a sensible non-mini default. */
 function pickModel(available: any[], wanted?: string): any {
   return (
     (wanted &&
-      available.find(
-        (m) => m.id === wanted || `${m.provider}/${m.id}` === wanted,
-      )) ||
+      available.find((m) => m.id === wanted || `${m.provider}/${m.id}` === wanted)) ||
     available.find((m) => m.id === "gpt-5.5") ||
     available.find((m) => !/spark|mini/i.test(m.id)) ||
     available[0]
   );
 }
 
-/** The latest user turn: explicit prompt, else last user message content. */
-function resolvePrompt(request: AgentRunRequest): string {
-  if (request.prompt && request.prompt.trim()) return request.prompt;
-  const messages = request.messages ?? [];
-  for (let i = messages.length - 1; i >= 0; i--) {
-    if (messages[i].role === "user" && messages[i].content) {
-      return messages[i].content;
-    }
-  }
-  return "";
-}
-
 /** Concatenate the text blocks of the last assistant message. */
 function extractAssistantText(messages: any[]): string {
   for (let i = messages.length - 1; i >= 0; i--) {
@@ -170,23 +98,21 @@ function extractAssistantText(messages: any[]): string {
   return "";
 }
 
-/** Per-tool budget for the /tools/call round-trip. Surfaced as a tool error on timeout. */
-const TOOL_CALL_TIMEOUT_MS = Number(
-  process.env.AGENTA_AGENT_TOOL_CALL_TIMEOUT_MS ?? 30000,
-);
-
-/** Permissive default when a resolved tool has no input schema. */
-const EMPTY_OBJECT_SCHEMA = {
-  type: "object",
-  properties: {},
-  additionalProperties: true,
-};
+/**
+ * Walk the transcript newest-first and return, as a string, the stopReason of the first
+ * assistant message that has one. Yields undefined when no such message exists.
+ */
+function lastStopReason(messages: any[]): string | undefined {
+  const newestFirst = [...messages].reverse();
+  const hit = newestFirst.find((entry) => entry?.role === "assistant" && entry.stopReason);
+  return hit ? String(hit.stopReason) : undefined;
+}
 
 /**
- * Turn resolved tool specs into Pi customTools. Each tool's `execute` does one
- * POST back through Agenta's /tools/call, so Pi runs the loop while the Composio
- * key and connection auth stay server-side. A failed call throws, which Pi turns
- * into a tool-error result (the loop continues) rather than a run failure.
+ * Turn resolved tool specs into Pi customTools. Each tool's `execute` does one POST back
+ * through Agenta's /tools/call, so Pi runs the loop while the Composio key and connection
+ * auth stay server-side. A failed call throws, which Pi turns into a tool-error result
+ * (the loop continues) rather than a run failure.
  */
 export function buildCustomTools(
   specs: ResolvedToolSpec[],
@@ -202,12 +128,13 @@ export function buildCustomTools(
     name: spec.name,
     label: spec.name,
     description: spec.description ?? spec.name,
-    // Pi accepts a plain JSON Schema for `parameters` (its validator has a
-    // non-TypeBox path); the schema is resolved live from the provider catalog.
+    // Pi accepts a plain JSON Schema for `parameters` (its validator has a non-TypeBox
+    // path); the schema is resolved live from the provider catalog.
     parameters: (spec.inputSchema as any) ?? EMPTY_OBJECT_SCHEMA,
     async execute(toolCallId: string, params: unknown, signal?: AbortSignal) {
       const text = await callAgentaTool(
-        callback,
+        callback.endpoint,
+        callback.authorization,
         spec.callRef,
         toolCallId,
         params,
@@ -221,72 +148,13 @@ export function buildCustomTools(
   }));
 }
 
-/** One /tools/call round-trip. Returns the result string; throws on failure. */
-async function callAgentaTool(
-  callback: ToolCallbackContext,
-  callRef: string,
-  toolCallId: string,
-  params: unknown,
-  signal?: AbortSignal,
-): Promise<string> {
-  const headers: Record<string, string> = { "content-type": "application/json" };
-  if (callback.authorization) headers["authorization"] = callback.authorization;
-
-  // Combine Pi's abort signal (if any) with a per-tool timeout.
-  const timeoutSignal = AbortSignal.timeout(TOOL_CALL_TIMEOUT_MS);
-  const anyOf = (AbortSignal as any).any;
-  const combined =
-    signal && typeof anyOf === "function"
-      ? anyOf([signal, timeoutSignal])
-      : timeoutSignal;
-
-  let response: Response;
-  try {
-    response = await fetch(callback.endpoint, {
-      method: "POST",
-      headers,
-      body: JSON.stringify({
-        data: {
-          id: toolCallId,
-          type: "function",
-          // Arguments as an object (not a JSON string) to avoid double-encoding.
-          function: { name: callRef, arguments: params ?? {} },
-        },
-      }),
-      signal: combined,
-    });
-  } catch (err) {
-    throw new Error(
-      `tool call ${callRef} failed: ${err instanceof Error ? err.message : String(err)}`,
-    );
-  }
-
-  const bodyText = await response.text();
-  if (!response.ok) {
-    throw new Error(
-      `tool call ${callRef} returned HTTP ${response.status}: ${bodyText.slice(0, 500)}`,
-    );
-  }
-
-  // ToolCallResponse -> { call: { data: { content }, status } }. `content` is the
-  // execution result serialized as a JSON string; hand it to the model verbatim.
-  try {
-    const parsed = JSON.parse(bodyText);
-    const content = parsed?.call?.data?.content;
-    if (typeof content === "string") return content;
-    if (content != null) return JSON.stringify(content);
-    return bodyText;
-  } catch {
-    return bodyText;
-  }
-}
-
 export async function runPi(request: AgentRunRequest): Promise<AgentRunResult> {
-  const prompt = resolvePrompt(request);
+  const prompt = resolvePromptText(request);
   if (!prompt) {
     return { ok: false, error: "No user message to send (prompt/messages empty)." };
   }
 
+  applySecrets(request.secrets);
   const cwd = mkdtempSync(join(tmpdir(), "agenta-agent-"));
 
   try {
@@ -304,9 +172,9 @@ export async function runPi(request: AgentRunRequest): Promise<AgentRunResult> {
     const model = pickModel(available, request.model);
     log(`model: ${model.provider}/${model.id}`);
 
-    // Tracing: turn this run into OTel spans. When the caller passed a
-    // traceparent, invoke_agent nests under their /invoke span so the whole
-    // agent run is part of the same trace (just like completion/chat).
+    // Tracing: turn this run into OTel spans. When the caller passed a traceparent,
+    // invoke_agent nests under their /invoke span so the whole agent run is part of the
+    // same trace (just like completion/chat).
     const otel = createAgentaOtel({
       traceparent: request.trace?.traceparent,
       baggage: request.trace?.baggage,
@@ -331,12 +199,9 @@ export async function runPi(request: AgentRunRequest): Promise<AgentRunResult> {
     });
     await loader.reload();
 
-    // Build runnable tools from the resolved specs. Pi's allowlist gates custom
-    // tools too, so their names must be in `tools` for the model to see them.
-    const customTools = buildCustomTools(
-      request.customTools ?? [],
-      request.toolCallback,
-    );
+    // Build runnable tools from the resolved specs. Pi's allowlist gates custom tools too,
+    // so their names must be in `tools` for the model to see them.
+    const customTools = buildCustomTools(request.customTools ?? [], request.toolCallback);
     const toolAllowlist = [
       ...(request.tools ?? []),
       ...customTools.map((tool) => tool.name),
@@ -377,16 +242,36 @@ export async function runPi(request: AgentRunRequest): Promise<AgentRunResult> {
 
     const output = streamed.trim() || extractAssistantText(session.messages);
     const sessionId = session.sessionId;
+    const stopReason = lastStopReason(session.messages);
+    const usage = otel.usage();
     session.dispose();
 
-    // Ship this run's trace before the result is returned (and before the CLI
-    // process exits): invoke_agent has a remote parent, so the per-trace flush
-    // is what exports it.
+    // Ship this run's trace before the result is returned (and before the CLI process
+    // exits): invoke_agent has a remote parent, so the per-trace flush is what exports it.
     await otel.flush();
 
+    // The structured stream is thinner here than on the rivet path: Pi's in-process tool
+    // events feed the trace spans, while the result-level event log carries the final
+    // message, usage, and stop reason (enough for the platform without double-plumbing).
+    const events: AgentEvent[] = [];
+    if (output) events.push({ type: "message", text: output });
+    if (usage.total > 0) {
+      events.push({ type: "usage", ...usage });
+    }
+    events.push({ type: "done", stopReason });
+
+    const messages: ChatMessage[] = output
+      ? [{ role: "assistant", content: output }]
+      : [];
+
     return {
       ok: true,
       output,
+      messages,
+      events,
+      usage,
+      stopReason,
+      capabilities: PI_CAPABILITIES,
       sessionId,
       model: `${model.provider}/${model.id}`,
       traceId: otel.config.traceId,
diff --git a/services/agent/src/runRivet.ts b/services/agent/src/runRivet.ts
index 45b5657834..88e4c020a4 100644
--- a/services/agent/src/runRivet.ts
+++ b/services/agent/src/runRivet.ts
@@ -45,8 +45,17 @@ import { local } from "sandbox-agent/local";
 import { daytona } from "sandbox-agent/daytona";
 
 import { createRivetOtel } from "./agenta-otel.ts";
-import { buildToolMcpServers, type ResolvedToolSpec, type ToolCallbackContext } from "./toolBridge.ts";
-import type { AgentRunRequest, AgentRunResult, ChatMessage } from "./runPi.ts";
+import { buildToolMcpServers } from "./toolBridge.ts";
+import {
+  type AgentRunRequest,
+  type AgentRunResult,
+  type ChatMessage,
+  type HarnessCapabilities,
+  type ResolvedToolSpec,
+  type ToolCallbackContext,
+  messageText,
+  resolvePromptText,
+} from "./protocol.ts";
 
 const require = createRequire(import.meta.url);
 // services/agent/src/runRivet.ts -> services/agent
@@ -217,15 +226,8 @@ function buildDaemonEnv(harness: string): Record<string, string> {
   return env;
 }
 
-/** The latest user turn: explicit prompt, else last user message content. */
-function resolvePrompt(request: AgentRunRequest): string {
-  if (request.prompt && request.prompt.trim()) return request.prompt;
-  const messages = request.messages ?? [];
-  for (let i = messages.length - 1; i >= 0; i--) {
-    if (messages[i].role === "user" && messages[i].content) return messages[i].content;
-  }
-  return "";
-}
+/** The latest user turn (shared protocol helper; flattens content blocks to text). */
+const resolvePrompt = resolvePromptText;
 
 /** Prior turns (everything before the latest user message) for trace + history. */
 function priorMessages(request: AgentRunRequest): ChatMessage[] {
@@ -235,8 +237,17 @@ function priorMessages(request: AgentRunRequest): ChatMessage[] {
   if (messages.length && messages[messages.length - 1].role === "user") {
     return messages.slice(0, -1);
   }
-  // No trailing user message (prompt came in explicitly): keep turns that aren't it.
-  return messages.filter((m) => !(m.role === "user" && m.content === latest));
+  // No trailing user message (prompt came in explicitly): drop only the LAST user turn
+  // whose text matches the prompt being sent, not every matching turn (repeated short
+  // turns like "yes"/"continue" would otherwise vanish from the replayed history).
+  let lastMatch = -1;
+  for (let i = messages.length - 1; i >= 0; i--) {
+    if (messages[i].role === "user" && messageText(messages[i].content) === latest) {
+      lastMatch = i;
+      break;
+    }
+  }
+  return lastMatch === -1 ? messages : messages.filter((_, i) => i !== lastMatch);
 }
 
 /**
@@ -247,11 +258,11 @@ function priorMessages(request: AgentRunRequest): ChatMessage[] {
  */
 function buildTurnText(request: AgentRunRequest): string {
   const latest = resolvePrompt(request);
-  const history = priorMessages(request).filter((m) => m.content);
+  const history = priorMessages(request).filter((m) => messageText(m.content));
   if (history.length === 0) return latest;
 
   const maxChars = Number(process.env.AGENTA_AGENT_HISTORY_MAX_CHARS ?? 24000);
-  let transcript = history.map((m) => `${m.role}: ${m.content}`).join("\n");
+  let transcript = history.map((m) => `${m.role}: ${messageText(m.content)}`).join("\n");
   if (transcript.length > maxChars) transcript = transcript.slice(-maxChars);
   return (
     `Conversation so far:\n${transcript}\n\n` +
@@ -528,6 +539,59 @@ function conciseError(err: unknown, harness: string): string {
   return msg || "agent run failed";
 }
 
+/**
+ * Map a rivet `AgentInfo` to our capability flags. Falls back to a per-harness static
+ * guess when the probe is unavailable, so tool delivery and tracing still pick a sane
+ * path. Rivet has no `usage` capability flag (usage rides on `usage_update` events), so
+ * `usage` is always true here: Pi reports it through its extension, others over ACP.
+ */
+function mapCapabilities(harness: string, info: any): HarnessCapabilities {
+  const c = info?.capabilities;
+  if (c) {
+    return {
+      textMessages: c.textMessages ?? true,
+      images: !!c.images,
+      fileAttachments: !!c.fileAttachments,
+      mcpTools: !!c.mcpTools,
+      toolCalls: !!c.toolCalls,
+      reasoning: !!c.reasoning,
+      planMode: !!c.planMode,
+      permissions: !!c.permissions,
+      streamingDeltas: !!c.streamingDeltas,
+      sessionLifecycle: !!c.sessionLifecycle,
+      usage: true,
+    };
+  }
+  // Static fallback by harness id: pi-acp does not forward MCP, Claude/Codex do.
+  const isPiHarness = harness === "pi";
+  return {
+    textMessages: true,
+    images: false,
+    fileAttachments: false,
+    mcpTools: !isPiHarness,
+    toolCalls: true,
+    reasoning: true,
+    planMode: !isPiHarness,
+    permissions: !isPiHarness,
+    streamingDeltas: true,
+    sessionLifecycle: true,
+    usage: true,
+  };
+}
+
+/**
+ * Probe the harness's capabilities from the daemon. Best-effort: a failed probe is logged
+ * to stderr and answered with the static per-harness fallback instead of failing the run.
+ */
+async function probeCapabilities(sandbox: any, harness: string): Promise<HarnessCapabilities> {
+  try {
+    return mapCapabilities(harness, await sandbox.getAgent(harness, { config: true }));
+  } catch (err) {
+    log(`capability probe for '${harness}' failed: ${err instanceof Error ? err.message : err}`);
+    return mapCapabilities(harness, undefined);
+  }
+}
+
 export async function runRivet(request: AgentRunRequest): Promise<AgentRunResult> {
   const harness = request.harness || process.env.AGENTA_AGENT_HARNESS || "pi";
   const sandboxId = request.sandbox || process.env.AGENTA_AGENT_SANDBOX || "local";
@@ -616,13 +680,19 @@ export async function runRivet(request: AgentRunRequest): Promise<AgentRunResult
       writeFileSync(join(cwd, "AGENTS.md"), agentsMd, "utf-8");
     }
 
-    // Pi gets tools via the extension (above); other harnesses via MCP.
-    const mcpServers = isPi
-      ? []
-      : buildToolMcpServers(
-          (request.customTools as ResolvedToolSpec[]) ?? [],
-          request.toolCallback as ToolCallbackContext | undefined,
-        );
+    // Probe what this harness supports and branch on capabilities, not on the harness
+    // name. Tool delivery: Pi loads our extension (native tools, set up above); any other
+    // harness takes tools over MCP only when it advertises `mcpTools` (pi-acp does not
+    // forward MCP, Claude/Codex do).
+    const capabilities = await probeCapabilities(sandbox, harness);
+    const toolSpecs = (request.customTools as ResolvedToolSpec[]) ?? [];
+    const mcpServers =
+      !isPi && capabilities.mcpTools
+        ? buildToolMcpServers(toolSpecs, request.toolCallback as ToolCallbackContext | undefined)
+        : [];
+    if (!isPi && toolSpecs.length > 0 && !capabilities.mcpTools) {
+      log(`harness '${harness}' lacks MCP tool support; ${toolSpecs.length} tool(s) not delivered`);
+    }
 
     const session = await sandbox.createSession({
       agent: harness,
@@ -660,10 +730,12 @@ export async function runRivet(request: AgentRunRequest): Promise<AgentRunResult
     });
 
     // Auto-approve permission requests so a permission-gating harness (e.g. Claude
-    // Code) does not block on tool use. Tools are backend-resolved and trusted; the
-    // run is headless so there is no human to prompt. Set AGENTA_RIVET_DENY_PERMISSIONS
-    // to reject instead.
-    const denyPermissions = process.env.AGENTA_RIVET_DENY_PERMISSIONS === "true";
+    // Code) does not block on tool use. Tools are backend-resolved and trusted; the run
+    // is headless so there is no human to prompt. The per-run `permissionPolicy` (or the
+    // AGENTA_RIVET_DENY_PERMISSIONS env) flips this to reject.
+    const denyPermissions =
+      request.permissionPolicy === "deny" ||
+      process.env.AGENTA_RIVET_DENY_PERMISSIONS === "true";
     session.onPermissionRequest((req: any) => {
       const replies: string[] = req?.availableReplies ?? [];
       const reply = denyPermissions
@@ -673,18 +745,40 @@ export async function runRivet(request: AgentRunRequest): Promise<AgentRunResult
     });
 
     const result = await session.prompt([{ type: "text", text: turnText }]);
-    log(`prompt stopReason=${(result as any)?.stopReason}`);
+    const stopReason = (result as any)?.stopReason;
+    log(`prompt stopReason=${stopReason}`);
 
     const output = run.finish();
     await run.flush();
 
+    // Usage: Pi writes its totals to a file via the extension. Other harnesses report the
+    // input/output token split on the PromptResponse and the cost on ACP `usage_update`,
+    // so combine the two (the stream alone carries no per-call token split).
+    let usage = await readRunUsage(sandbox, usageOutPath, isDaytona);
+    if (!usage) {
+      const promptUsage = (result as any)?.usage;
+      const streamUsage = run.usage();
+      const inputTokens = promptUsage?.inputTokens ?? streamUsage?.input ?? 0;
+      const outputTokens = promptUsage?.outputTokens ?? streamUsage?.output ?? 0;
+      const total = inputTokens + outputTokens || streamUsage?.total || 0;
+      const cost = streamUsage?.cost ?? 0;
+      usage =
+        total > 0 || cost > 0
+          ? { input: inputTokens, output: outputTokens, total, cost }
+          : undefined;
+    }
+
     return {
       ok: true,
       output,
+      messages: output ? [{ role: "assistant", content: output }] : [],
+      events: run.events(),
+      usage,
+      stopReason,
+      capabilities,
       sessionId: session.id,
       model: model ?? request.model,
       traceId: run.traceId(),
-      usage: await readRunUsage(sandbox, usageOutPath, isDaytona),
     };
   } catch (err) {
     otel?.finish();
diff --git a/services/agent/src/server.ts b/services/agent/src/server.ts
index 2eee90d1fc..6096198699 100644
--- a/services/agent/src/server.ts
+++ b/services/agent/src/server.ts
@@ -12,20 +12,22 @@
  */
 import { createServer, type IncomingMessage, type ServerResponse } from "node:http";
 
-import { runPi, type AgentRunRequest, type AgentRunResult } from "./runPi.ts";
+import type { AgentRunRequest, AgentRunResult } from "./protocol.ts";
+import { runPi } from "./runPi.ts";
 import { runRivet } from "./runRivet.ts";
 
 const PORT = Number(process.env.PORT ?? 8765);
 
-// Select the harness driver. `rivet` drives the harness over ACP via a rivet daemon
-// (WP-8); `pi` is the legacy in-process Pi path (WP-2). `auto` (default) routes by the
-// request: a rivet envelope carries `harness`/`sandbox`, so one sidecar serves both and
-// nothing regresses.
-const BACKEND = (process.env.AGENT_BACKEND ?? "auto").toLowerCase();
+// Select the engine. `rivet` drives a harness over ACP via a rivet daemon; `pi` is the
+// legacy in-process Pi path. The request's explicit `backend` (set by the Python
+// transport) wins; the AGENT_BACKEND env is the sidecar default; `auto` falls back to the
+// request shape (a rivet request carries `harness`/`sandbox`).
+const DEFAULT_BACKEND = (process.env.AGENT_BACKEND ?? "auto").toLowerCase();
 
 function runAgent(request: AgentRunRequest): Promise<AgentRunResult> {
-  if (BACKEND === "rivet") return runRivet(request);
-  if (BACKEND === "pi") return runPi(request);
+  const backend = (request.backend ?? DEFAULT_BACKEND).toLowerCase();
+  if (backend === "rivet") return runRivet(request);
+  if (backend === "pi") return runPi(request);
   return request.harness || request.sandbox ? runRivet(request) : runPi(request);
 }
 
diff --git a/services/agent/src/toolBridge.ts b/services/agent/src/toolBridge.ts
index 6cf27b10cb..56db4dfb52 100644
--- a/services/agent/src/toolBridge.ts
+++ b/services/agent/src/toolBridge.ts
@@ -16,9 +16,9 @@ import { existsSync } from "node:fs";
 import { dirname, join } from "node:path";
 import { fileURLToPath } from "node:url";
 
-import type { ResolvedToolSpec, ToolCallbackContext } from "./runPi.ts";
+import type { ResolvedToolSpec, ToolCallbackContext } from "./protocol.ts";
 
-export type { ResolvedToolSpec, ToolCallbackContext } from "./runPi.ts";
+export type { ResolvedToolSpec, ToolCallbackContext } from "./protocol.ts";
 
 const HERE = dirname(fileURLToPath(import.meta.url));
 // services/agent/src/toolBridge.ts -> services/agent/node_modules/.bin/tsx
diff --git a/services/agent/src/toolBridgeServer.ts b/services/agent/src/toolBridgeServer.ts
index 7a8dd44971..45a666f3de 100644
--- a/services/agent/src/toolBridgeServer.ts
+++ b/services/agent/src/toolBridgeServer.ts
@@ -16,22 +16,15 @@
  * initialize, tools/list, tools/call; ignores notifications. stdout carries protocol
  * messages only; logs go to stderr.
  */
-interface ToolSpec {
-  name: string;
-  description?: string;
-  inputSchema?: Record<string, unknown> | null;
-  callRef: string;
-}
+import type { ResolvedToolSpec } from "./protocol.ts";
+import { EMPTY_OBJECT_SCHEMA, callAgentaTool } from "./toolClient.ts";
 
-const SPECS: ToolSpec[] = JSON.parse(process.env.AGENTA_TOOL_SPECS ?? "[]");
+const SPECS: ResolvedToolSpec[] = JSON.parse(process.env.AGENTA_TOOL_SPECS ?? "[]");
 const ENDPOINT = process.env.AGENTA_TOOL_CALLBACK_ENDPOINT ?? "";
 const AUTH = process.env.AGENTA_TOOL_CALLBACK_AUTH;
 const SPEC_BY_NAME = new Map(SPECS.map((s) => [s.name, s]));
-const TOOL_CALL_TIMEOUT_MS = Number(process.env.AGENTA_AGENT_TOOL_CALL_TIMEOUT_MS ?? 30000);
 const DEFAULT_PROTOCOL = "2025-06-18";
 
-const EMPTY_SCHEMA = { type: "object", properties: {}, additionalProperties: true };
-
 function log(message: string): void {
   process.stderr.write(`[tool-bridge] ${message}\n`);
 }
@@ -40,46 +33,6 @@ function send(message: unknown): void {
   process.stdout.write(`${JSON.stringify(message)}\n`);
 }
 
-/** One /tools/call round-trip. Returns the result text; throws on failure. */
-async function callAgentaTool(callRef: string, args: unknown): Promise<string> {
-  const headers: Record<string, string> = { "content-type": "application/json" };
-  if (AUTH) headers["authorization"] = AUTH;
-
-  let response: Response;
-  try {
-    response = await fetch(ENDPOINT, {
-      method: "POST",
-      headers,
-      body: JSON.stringify({
-        data: {
-          id: `tool-${Date.now()}`,
-          type: "function",
-          function: { name: callRef, arguments: args ?? {} },
-        },
-      }),
-      signal: AbortSignal.timeout(TOOL_CALL_TIMEOUT_MS),
-    });
-  } catch (err) {
-    throw new Error(`tool call ${callRef} failed: ${err instanceof Error ? err.message : String(err)}`);
-  }
-
-  const bodyText = await response.text();
-  if (!response.ok) {
-    throw new Error(`tool call ${callRef} returned HTTP ${response.status}: ${bodyText.slice(0, 500)}`);
-  }
-  // ToolCallResponse -> { call: { data: { content }, status } }; content is the result
-  // serialized as a string, handed to the model verbatim.
-  try {
-    const parsed = JSON.parse(bodyText);
-    const content = parsed?.call?.data?.content;
-    if (typeof content === "string") return content;
-    if (content != null) return JSON.stringify(content);
-    return bodyText;
-  } catch {
-    return bodyText;
-  }
-}
-
 async function handle(message: any): Promise<unknown | undefined> {
   const { id, method, params } = message ?? {};
 
@@ -108,7 +61,7 @@ async function handle(message: any): Promise<unknown | undefined> {
         tools: SPECS.map((s) => ({
           name: s.name,
           description: s.description ?? s.name,
-          inputSchema: (s.inputSchema as Record<string, unknown>) ?? EMPTY_SCHEMA,
+          inputSchema: (s.inputSchema as Record<string, unknown>) ?? EMPTY_OBJECT_SCHEMA,
         })),
       },
     };
@@ -121,7 +74,13 @@ async function handle(message: any): Promise<unknown | undefined> {
       return { jsonrpc: "2.0", id, error: { code: -32602, message: `unknown tool: ${name}` } };
     }
     try {
-      const text = await callAgentaTool(spec.callRef, params?.arguments);
+      const text = await callAgentaTool(
+        ENDPOINT,
+        AUTH,
+        spec.callRef,
+        `tool-${Date.now()}`,
+        params?.arguments,
+      );
       return { jsonrpc: "2.0", id, result: { content: [{ type: "text", text }] } };
     } catch (err) {
       // Surface as an MCP tool error (isError) so the model can recover, not a crash.
diff --git a/services/agent/src/toolClient.ts b/services/agent/src/toolClient.ts
new file mode 100644
index 0000000000..330e63c611
--- /dev/null
+++ b/services/agent/src/toolClient.ts
@@ -0,0 +1,87 @@
+/**
+ * Shared Agenta /tools/call client.
+ *
+ * One implementation of the tool round-trip used by every delivery path:
+ *  - runPi.ts buildCustomTools (in-process Pi customTools)
+ *  - piExtension.ts registerTools (Pi under rivet/ACP, via the bundled extension)
+ *  - toolBridgeServer.ts (the MCP stdio bridge for non-Pi harnesses)
+ *
+ * Each call POSTs the OpenAI-style envelope to Agenta's /tools/call, so the Composio key
+ * and connection auth stay server-side. Keeping the request envelope and response parse in
+ * one place means a change to the /tools/call contract is a one-line edit, not three.
+ */
+export type { ResolvedToolSpec, ToolCallbackContext } from "./protocol.ts";
+
+/** Per-tool /tools/call budget (ms), surfaced as a tool error on timeout; blank/invalid env -> 30000. */
+const envTimeoutMs = Number(process.env.AGENTA_AGENT_TOOL_CALL_TIMEOUT_MS);
+export const TOOL_CALL_TIMEOUT_MS =
+  Number.isInteger(envTimeoutMs) && envTimeoutMs > 0 ? envTimeoutMs : 30000;
+
+/** Permissive default when a resolved tool has no input schema. */
+export const EMPTY_OBJECT_SCHEMA = {
+  type: "object",
+  properties: {},
+  additionalProperties: true,
+};
+
+/**
+ * One /tools/call round-trip. Returns the result text; throws on failure. Callers turn a
+ * throw into a tool-error result so the model loop continues rather than crashing the run.
+ * An optional caller `signal` is combined with the per-tool timeout.
+ */
+export async function callAgentaTool(
+  endpoint: string,
+  authorization: string | undefined,
+  callRef: string,
+  toolCallId: string,
+  args: unknown,
+  signal?: AbortSignal,
+): Promise<string> {
+  const headers: Record<string, string> = { "content-type": "application/json" };
+  if (authorization) headers["authorization"] = authorization;
+
+  const timeoutSignal = AbortSignal.timeout(TOOL_CALL_TIMEOUT_MS);
+  const anyOf = (AbortSignal as any).any;
+  const combined =
+    signal && typeof anyOf === "function" ? anyOf([signal, timeoutSignal]) : timeoutSignal;
+
+  let response: Response;
+  try {
+    response = await fetch(endpoint, {
+      method: "POST",
+      headers,
+      body: JSON.stringify({
+        data: {
+          id: toolCallId,
+          type: "function",
+          // Arguments as an object (not a JSON string) to avoid double-encoding.
+          function: { name: callRef, arguments: args ?? {} },
+        },
+      }),
+      signal: combined,
+    });
+  } catch (err) {
+    throw new Error(
+      `tool call ${callRef} failed: ${err instanceof Error ? err.message : String(err)}`,
+    );
+  }
+
+  const bodyText = await response.text();
+  if (!response.ok) {
+    throw new Error(
+      `tool call ${callRef} returned HTTP ${response.status}: ${bodyText.slice(0, 500)}`,
+    );
+  }
+
+  // ToolCallResponse -> { call: { data: { content }, status } }. `content` is the
+  // execution result serialized as a JSON string; hand it to the model verbatim.
+  try {
+    const parsed = JSON.parse(bodyText);
+    const content = parsed?.call?.data?.content;
+    if (typeof content === "string") return content;
+    if (content != null) return JSON.stringify(content);
+    return bodyText;
+  } catch {
+    return bodyText;
+  }
+}
diff --git a/services/oss/src/agent.py b/services/oss/src/agent.py
index 90f98ae948..e76ae8d6d4 100644
--- a/services/oss/src/agent.py
+++ b/services/oss/src/agent.py
@@ -24,11 +24,15 @@
 from agenta.sdk.utils.logging import get_module_logger
 
 from oss.src.agent_pi.config import load_config, wrapper_dir
-from oss.src.agent_pi.local_runtime import LocalRuntime
-from oss.src.agent_pi.pi_harness import PiHarness
-from oss.src.agent_pi.pi_http_harness import PiHttpHarness
-from oss.src.agent_pi.ports import Harness, HarnessRequest, ToolCallback, TraceContext
-from oss.src.agent_pi.rivet_harness import RivetHarness
+from oss.src.agent_pi.environment import LocalEnvironment
+from oss.src.agent_pi.harness import HttpHarness, SubprocessHarness
+from oss.src.agent_pi.ports import (
+    Harness,
+    Message,
+    SessionConfig,
+    ToolCallback,
+    TraceContext,
+)
 from oss.src.agent_pi.schemas import AGENT_SCHEMAS
 
 log = get_module_logger(__name__)
@@ -43,41 +47,35 @@
 _TOOLS_RESOLVE_TIMEOUT = float(os.getenv("AGENTA_AGENT_TOOLS_TIMEOUT", "30"))
 
 
-def _build_harness(
-    harness: Optional[str] = None,
-    sandbox: Optional[str] = None,
-) -> Harness:
-    """Pick the harness adapter for the current deployment.
-
-    Runtime axis (``AGENTA_AGENT_RUNTIME``):
-    - ``rivet``: drive the harness over ACP via a rivet daemon (WP-8). The harness
-      (pi/claude) and sandbox (local/daytona) are independent config axes, taken from
-      the request config when set (so they are editable in the playground), else the
-      ``AGENTA_AGENT_HARNESS`` / ``AGENTA_AGENT_SANDBOX`` env defaults.
-    - default (``pi``): the legacy in-process Pi path (WP-2), kept so nothing regresses.
+def _select_backend(harness_id: str, sandbox_id: str) -> str:
+    """Choose the engine (``rivet`` or ``pi``) for a run.
 
-    Transport axis (both runtimes):
-    - ``AGENTA_AGENT_PI_URL`` set (docker): call the TS wrapper sidecar over HTTP.
-    - otherwise (local): spawn the TS wrapper as a subprocess.
+    ``rivet`` drives a harness over ACP via a rivet daemon; ``pi`` is the legacy
+    in-process Pi path. The legacy path only runs the ``pi`` harness locally, so any other
+    harness or sandbox forces ``rivet`` rather than silently dropping the selection.
+    ``AGENTA_AGENT_RUNTIME=rivet`` forces rivet for everything.
     """
-    pi_url = os.getenv("AGENTA_AGENT_PI_URL")
     runtime = os.getenv("AGENTA_AGENT_RUNTIME", "pi").lower()
+    if runtime == "rivet" or harness_id != "pi" or sandbox_id != "local":
+        return "rivet"
+    return "pi"
 
-    if runtime == "rivet":
-        harness = (harness or os.getenv("AGENTA_AGENT_HARNESS", "pi")).lower()
-        sandbox = (sandbox or os.getenv("AGENTA_AGENT_SANDBOX", "local")).lower()
-        if pi_url:
-            return RivetHarness(harness=harness, sandbox=sandbox, base_url=pi_url)
-        return RivetHarness(
-            harness=harness,
-            sandbox=sandbox,
-            runtime=LocalRuntime(),
-            wrapper_dir=str(wrapper_dir()),
-        )
 
+def _build_harness(backend: str) -> Harness:
+    """Pick the transport to the TypeScript runner for the current deployment.
+
+    The ``backend`` (engine) is chosen by :func:`_select_backend`. The transport is
+    env-driven: ``AGENTA_AGENT_PI_URL`` set (docker) -> call the sidecar over HTTP; unset
+    (local) -> spawn the runner as a subprocess.
+    """
+    pi_url = os.getenv("AGENTA_AGENT_PI_URL")
     if pi_url:
-        return PiHttpHarness(pi_url)
-    return PiHarness(LocalRuntime(), wrapper_dir=str(wrapper_dir()))
+        return HttpHarness(pi_url, backend=backend)
+    return SubprocessHarness(
+        LocalEnvironment(),
+        wrapper_dir=str(wrapper_dir()),
+        backend=backend,
+    )
 
 
 def _system_text(messages: Optional[List[Any]]) -> str:
@@ -127,14 +125,18 @@ def _resolve_run_config(
     return model, agents_md, raw_tools
 
 
-def _latest_user_message(messages: Optional[List[Any]]) -> str:
-    for message in reversed(messages or []):
-        if not isinstance(message, dict):
-            continue
-        if message.get("role") == "user" and message.get("content"):
-            content = message["content"]
-            return content if isinstance(content, str) else str(content)
-    return ""
+def _to_messages(raw: Optional[List[Any]]) -> List[Message]:
+    """Turn the playground's loosely-typed message list into :class:`Message` objects.
+
+    Each entry is passed to ``Message.from_raw``; entries for which it returns ``None``
+    are dropped and the remaining ones keep their order. A missing or empty ``raw``
+    yields an empty list. The whole conversation is forwarded instead of a single
+    pre-extracted prompt, because the runner is expected to pick the latest user turn
+    and replay the rest as context.
+    """
+    # Coerce lazily, then filter, so every entry is parsed exactly once.
+    coerced = (Message.from_raw(entry) for entry in raw or [])
+    return [message for message in coerced if message is not None]
 
 
 # Map a vault standard-provider kind to the env var the harness (Pi/Claude/litellm)
@@ -402,33 +404,40 @@ async def _agent(
     elif not isinstance(tools_config, list):
         tools_config = []
 
-    msgs = messages or (inputs or {}).get("messages") or []
-    prompt = _latest_user_message(msgs)
+    msgs = _to_messages(messages or (inputs or {}).get("messages") or [])
 
     builtins, custom_tools, tool_callback = await _resolve_tools(tools_config)
 
-    # Harness (pi/claude) and sandbox (local/daytona) are editable config (see
-    # schemas.py), so a playground run can switch engine or environment; unset falls
-    # back to the env defaults inside _build_harness.
-    harness_id = params.get("harness")
-    sandbox_id = params.get("sandbox")
-    harness = _build_harness(harness=harness_id, sandbox=sandbox_id)
+    # Harness (pi/claude), sandbox (local/daytona), and permission policy are editable
+    # config (see schemas.py), so a playground run can switch engine or environment;
+    # unset falls back to the env defaults. They ride on the per-run SessionConfig.
+    harness_id = (
+        params.get("harness") or os.getenv("AGENTA_AGENT_HARNESS", "pi")
+    ).lower()
+    sandbox_id = (
+        params.get("sandbox") or os.getenv("AGENTA_AGENT_SANDBOX", "local")
+    ).lower()
+    session_config = SessionConfig(
+        instructions=agents_md,
+        model=model,
+        harness=harness_id,
+        sandbox=sandbox_id,
+        secrets=await _resolve_harness_secrets(),
+        builtin_tools=builtins,
+        custom_tools=custom_tools,
+        tool_callback=tool_callback,
+        permission_policy=(params.get("permission_policy") or "auto").lower(),
+        trace=_trace_context(),
+    )
 
+    # The engine follows the selected harness/sandbox: a claude harness or a daytona
+    # sandbox needs rivet, so the legacy pi path never silently swallows the selection.
+    harness = _build_harness(_select_backend(harness_id, sandbox_id))
     await harness.setup()
     try:
-        result = await harness.invoke(
-            HarnessRequest(
-                agents_md=agents_md,
-                model=model,
-                prompt=prompt,
-                messages=msgs,
-                tools=builtins,
-                custom_tools=custom_tools,
-                tool_callback=tool_callback,
-                trace=_trace_context(),
-                secrets=await _resolve_harness_secrets(),
-            )
-        )
+        session = harness.create_session(session_config)
+        result = await session.prompt(msgs)
+        await session.destroy()
     finally:
         await harness.shutdown()
 
diff --git a/services/oss/src/agent_pi/__init__.py b/services/oss/src/agent_pi/__init__.py
index 91ee583c51..11321c7cd8 100644
--- a/services/oss/src/agent_pi/__init__.py
+++ b/services/oss/src/agent_pi/__init__.py
@@ -1,11 +1,47 @@
-"""Agent runtime: ports and adapters for the WP-2 agent service.
+"""Agent runtime: ports and adapters for the agent service.
 
-The Python service is "our agent implementation". It owns two ports the design doc
-calls out:
+The Python service is "our agent implementation". It owns two seams (see
+``docs/design/agent-workflows/harness-port-redesign/``):
 
-- ``Harness``: the seam between our service and the agent engine. ``PiHarness`` is the
-  Pi implementation; it drives the TypeScript Pi wrapper in ``services/agent``.
-- ``Runtime``: the seam for the run environment (start, shutdown, pause, connect
-  volume). ``LocalRuntime`` runs the harness as a local subprocess. A Daytona adapter
-  lands later behind the same port.
+- ``Harness``: the agent engine. ``SubprocessHarness`` and ``HttpHarness`` (in
+  ``harness.py``) are the two transports to the TypeScript runner; the engine (legacy
+  in-process Pi vs rivet over ACP) is an env value, not a class. ``create_session``
+  returns an :class:`AgentSession`, the rivet-shaped session abstraction.
+- ``Environment``: where the harness process runs. ``LocalEnvironment`` runs it as a local
+  subprocess; a sandbox environment is selected inside the rivet runner.
 """
+
+from .environment import LocalEnvironment
+from .harness import HttpHarness, SubprocessHarness
+from .ports import (
+    AgentEvent,
+    AgentRequest,
+    AgentResult,
+    AgentSession,
+    ContentBlock,
+    Environment,
+    HarnessCapabilities,
+    Harness,
+    Message,
+    SessionConfig,
+    ToolCallback,
+    TraceContext,
+)
+
+__all__ = [
+    "AgentEvent",
+    "AgentRequest",
+    "AgentResult",
+    "AgentSession",
+    "ContentBlock",
+    "Environment",
+    "Harness",
+    "HarnessCapabilities",
+    "HttpHarness",
+    "LocalEnvironment",
+    "Message",
+    "SessionConfig",
+    "SubprocessHarness",
+    "ToolCallback",
+    "TraceContext",
+]
diff --git a/services/oss/src/agent_pi/local_runtime.py b/services/oss/src/agent_pi/environment.py
similarity index 72%
rename from services/oss/src/agent_pi/local_runtime.py
rename to services/oss/src/agent_pi/environment.py
index d50d97edd8..d66889c710 100644
--- a/services/oss/src/agent_pi/local_runtime.py
+++ b/services/oss/src/agent_pi/environment.py
@@ -1,26 +1,24 @@
-"""Local runtime adapter: runs the harness as a subprocess on this host.
+"""Local environment: run the harness as a subprocess on this host.
 
-This is the parity baseline for the design doc. The Node process is the run
-environment. A Daytona adapter (WP-3) implements the same port by running the command
-inside a sandbox instead.
+This is the parity baseline. The Node process is the run environment. A sandbox
+environment (Daytona) is selected on the rivet path inside the TypeScript runner, so it
+does not need a separate Python ``Environment`` here.
 """
 
+from __future__ import annotations
+
 import asyncio
 from typing import Dict, Optional, Sequence
 
 from agenta.sdk.utils.logging import get_module_logger
 
-from .ports import ExecResult, Runtime
+from .ports import Environment, ExecResult
 
 log = get_module_logger(__name__)
 
 
-class LocalRuntime(Runtime):
-    async def start(self) -> None:
-        return None
-
-    async def shutdown(self) -> None:
-        return None
+class LocalEnvironment(Environment):
+    """Run a command as a subprocess on this host, feeding it the request on stdin."""
 
     async def exec(
         self,
diff --git a/services/oss/src/agent_pi/harness.py b/services/oss/src/agent_pi/harness.py
new file mode 100644
index 0000000000..45c79b0b98
--- /dev/null
+++ b/services/oss/src/agent_pi/harness.py
@@ -0,0 +1,145 @@
+"""The two harness transports: subprocess and HTTP.
+
+Both speak the same ``/run`` wire contract (see ``wire.py``) and differ only in how they
+reach the TypeScript runner:
+
+- ``SubprocessHarness`` spawns the TS CLI through an :class:`Environment`, handing it the
+  request on stdin. It sets ``AGENT_BACKEND`` to pick the engine (``rivet`` for the ACP
+  path, ``pi`` for the legacy in-process Pi path).
+- ``HttpHarness`` POSTs to the wrapper running as a sidecar. The sidecar auto-routes to the
+  engine by request shape (a rivet request carries ``harness``/``sandbox``), so the
+  transport itself stays engine-agnostic.
+
+The engine is therefore config, not a Python class. This is what collapsed the old
+``PiHarness`` / ``PiHttpHarness`` / ``RivetHarness`` trio into two transports.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from typing import List, Optional, Sequence
+
+import httpx
+
+from agenta.sdk.utils.logging import get_module_logger
+
+from .ports import AgentRequest, AgentResult, Environment, EventSink, Harness
+from .wire import request_to_wire, result_from_wire
+
+log = get_module_logger(__name__)
+
+_DEFAULT_TIMEOUT = float(os.getenv("AGENTA_AGENT_TIMEOUT", "180"))
+_DEFAULT_COMMAND = ["pnpm", "exec", "tsx", "src/cli.ts"]
+
+
+def _emit_events(result: AgentResult, on_event: Optional[EventSink]) -> None:
+    """Forward each event recorded on ``result`` to ``on_event``, in order.
+
+    Does nothing when no sink is supplied. A sink that raises is logged as a warning
+    and skipped, so one bad callback neither stops the remaining events nor fails the
+    run. The transports here receive the finished run in one piece, so this is a batch
+    replay rather than live streaming.
+    """
+    if on_event:
+        for event in result.events:
+            try:
+                on_event(event)
+            except Exception:  # pylint: disable=broad-except
+                log.warning("agent: on_event sink raised", exc_info=True)
+
+
+class SubprocessHarness(Harness):
+    """Drive the TS runner as a subprocess on this host, request on stdin.
+
+    ``backend`` selects the engine via ``AGENT_BACKEND`` (``rivet`` or ``pi``).
+    """
+
+    def __init__(
+        self,
+        environment: Environment,
+        *,
+        wrapper_dir: str,
+        backend: str = "rivet",
+        command: Optional[Sequence[str]] = None,
+        timeout: float = _DEFAULT_TIMEOUT,
+    ) -> None:
+        self._environment = environment
+        self._wrapper_dir = wrapper_dir
+        self._backend = backend
+        self._command: List[str] = list(command or _DEFAULT_COMMAND)
+        self._timeout = timeout
+
+    async def setup(self) -> None:
+        await self._environment.start()
+
+    async def shutdown(self) -> None:
+        await self._environment.dispose()
+
+    async def invoke(
+        self,
+        request: AgentRequest,
+        *,
+        on_event: Optional[EventSink] = None,
+    ) -> AgentResult:
+        wire = request_to_wire(request)
+        wire["backend"] = self._backend
+        payload = json.dumps(wire).encode("utf-8")
+        exec_result = await self._environment.exec(
+            self._command,
+            payload,
+            cwd=self._wrapper_dir,
+            env={**os.environ, "AGENT_BACKEND": self._backend},
+            timeout=self._timeout,
+        )
+
+        if not exec_result.stdout.strip():
+            raise RuntimeError(
+                "Agent runner returned no output. "
+                f"exit={exec_result.code} stderr={exec_result.stderr[-2000:]}"
+            )
+        try:
+            data = json.loads(exec_result.stdout)
+        except json.JSONDecodeError as exc:
+            raise RuntimeError(
+                "Agent runner returned invalid JSON. "
+                f"stdout={exec_result.stdout[:500]} stderr={exec_result.stderr[-1000:]}"
+            ) from exc
+
+        result = result_from_wire(data)
+        _emit_events(result, on_event)
+        return result
+
+
+class HttpHarness(Harness):
+    """Drive the TS runner over HTTP (the sidecar). The sidecar picks the engine."""
+
+    def __init__(
+        self,
+        base_url: str,
+        *,
+        backend: str = "rivet",
+        timeout: float = _DEFAULT_TIMEOUT,
+    ) -> None:
+        self._base_url = base_url.rstrip("/")
+        self._backend = backend
+        self._timeout = timeout
+
+    async def invoke(
+        self, request: AgentRequest, *, on_event: Optional[EventSink] = None
+    ) -> AgentResult:
+        """POST the run to the sidecar; raise RuntimeError on 5xx or a non-JSON reply."""
+        payload = request_to_wire(request)
+        payload["backend"] = self._backend
+        async with httpx.AsyncClient(timeout=self._timeout) as client:
+            response = await client.post(f"{self._base_url}/run", json=payload)
+        detail = f"HTTP {response.status_code}: {response.text[:1000]}"
+        if response.status_code >= 500:
+            raise RuntimeError(f"Agent runner {detail}")
+        try:  # a non-JSON body (e.g. a proxy error page) gets status/body context too
+            data = response.json()
+        except ValueError as exc:
+            raise RuntimeError(f"Agent runner returned invalid JSON. {detail}") from exc
+        result = result_from_wire(data)
+        _emit_events(result, on_event)
+        return result
diff --git a/services/oss/src/agent_pi/pi_harness.py b/services/oss/src/agent_pi/pi_harness.py
deleted file mode 100644
index 266e9cb9a0..0000000000
--- a/services/oss/src/agent_pi/pi_harness.py
+++ /dev/null
@@ -1,88 +0,0 @@
-"""Pi harness adapter: drives the TypeScript Pi wrapper in ``services/agent``.
-
-The transport is a one-shot JSON-over-stdio call: we send the run request as JSON on
-the wrapper's stdin and read its JSON result from stdout. This is the "json adapter"
-the design doc describes. A long-lived RPC adapter (``pi --mode rpc``) can replace it
-later behind this same Harness port without touching the service.
-"""
-
-import json
-import os
-from typing import List, Optional, Sequence
-
-from agenta.sdk.utils.logging import get_module_logger
-
-from .ports import Harness, HarnessRequest, HarnessResult, Runtime
-
-log = get_module_logger(__name__)
-
-_DEFAULT_TIMEOUT = float(os.getenv("AGENTA_AGENT_TIMEOUT", "180"))
-_DEFAULT_COMMAND = ["pnpm", "exec", "tsx", "src/cli.ts"]
-
-
-class PiHarness(Harness):
-    def __init__(
-        self,
-        runtime: Runtime,
-        *,
-        wrapper_dir: str,
-        command: Optional[Sequence[str]] = None,
-        timeout: float = _DEFAULT_TIMEOUT,
-    ) -> None:
-        self._runtime = runtime
-        self._wrapper_dir = wrapper_dir
-        self._command: List[str] = list(command or _DEFAULT_COMMAND)
-        self._timeout = timeout
-
-    async def setup(self) -> None:
-        await self._runtime.start()
-
-    async def shutdown(self) -> None:
-        await self._runtime.shutdown()
-
-    async def invoke(self, request: HarnessRequest) -> HarnessResult:
-        payload = json.dumps(
-            {
-                "agentsMd": request.agents_md,
-                "model": request.model,
-                "prompt": request.prompt,
-                "messages": request.messages,
-                "tools": request.tools,
-                "customTools": request.custom_tools,
-                "toolCallback": request.tool_callback.to_wire()
-                if request.tool_callback
-                else None,
-                "trace": request.trace.to_wire() if request.trace else None,
-            }
-        ).encode("utf-8")
-
-        result = await self._runtime.exec(
-            self._command,
-            payload,
-            cwd=self._wrapper_dir,
-            env={**os.environ},
-            timeout=self._timeout,
-        )
-
-        if not result.stdout.strip():
-            raise RuntimeError(
-                "Pi wrapper returned no output. "
-                f"exit={result.code} stderr={result.stderr[-2000:]}"
-            )
-
-        try:
-            data = json.loads(result.stdout)
-        except json.JSONDecodeError as exc:
-            raise RuntimeError(
-                "Pi wrapper returned invalid JSON. "
-                f"stdout={result.stdout[:500]} stderr={result.stderr[-1000:]}"
-            ) from exc
-
-        if not data.get("ok"):
-            raise RuntimeError(f"Pi run failed: {data.get('error')}")
-
-        return HarnessResult(
-            output=data.get("output", ""),
-            session_id=data.get("sessionId"),
-            model=data.get("model"),
-        )
diff --git a/services/oss/src/agent_pi/pi_http_harness.py b/services/oss/src/agent_pi/pi_http_harness.py
deleted file mode 100644
index 0435319011..0000000000
--- a/services/oss/src/agent_pi/pi_http_harness.py
+++ /dev/null
@@ -1,68 +0,0 @@
-"""Pi harness adapter over HTTP.
-
-Same Harness port as ``PiHarness`` (the local subprocess one), but talks to the Pi
-wrapper running as a separate HTTP service (a sidecar container). The transport is a
-JSON ``POST /run``. This is what the dockerized agent uses, since the Python service
-container has no Node; the Pi wrapper runs in its own container.
-"""
-
-import os
-
-import httpx
-
-from agenta.sdk.utils.logging import get_module_logger
-
-from .ports import Harness, HarnessRequest, HarnessResult
-
-log = get_module_logger(__name__)
-
-_DEFAULT_TIMEOUT = float(os.getenv("AGENTA_AGENT_TIMEOUT", "180"))
-
-
-class PiHttpHarness(Harness):
-    def __init__(
-        self,
-        base_url: str,
-        *,
-        timeout: float = _DEFAULT_TIMEOUT,
-    ) -> None:
-        self._base_url = base_url.rstrip("/")
-        self._timeout = timeout
-
-    async def setup(self) -> None:
-        return None
-
-    async def shutdown(self) -> None:
-        return None
-
-    async def invoke(self, request: HarnessRequest) -> HarnessResult:
-        payload = {
-            "agentsMd": request.agents_md,
-            "model": request.model,
-            "prompt": request.prompt,
-            "messages": request.messages,
-            "tools": request.tools,
-            "customTools": request.custom_tools,
-            "toolCallback": request.tool_callback.to_wire()
-            if request.tool_callback
-            else None,
-            "trace": request.trace.to_wire() if request.trace else None,
-        }
-
-        async with httpx.AsyncClient(timeout=self._timeout) as client:
-            response = await client.post(f"{self._base_url}/run", json=payload)
-
-        if response.status_code >= 500:
-            raise RuntimeError(
-                f"Pi wrapper HTTP {response.status_code}: {response.text[:1000]}"
-            )
-
-        data = response.json()
-        if not data.get("ok"):
-            raise RuntimeError(f"Pi run failed: {data.get('error')}")
-
-        return HarnessResult(
-            output=data.get("output", ""),
-            session_id=data.get("sessionId"),
-            model=data.get("model"),
-        )
diff --git a/services/oss/src/agent_pi/ports.py b/services/oss/src/agent_pi/ports.py
index dc768a29cd..bbf7f59e4c 100644
--- a/services/oss/src/agent_pi/ports.py
+++ b/services/oss/src/agent_pi/ports.py
@@ -1,70 +1,213 @@
-"""Ports for the agent service: the Harness seam and the Runtime (environment) seam.
-
-These interfaces keep the service harness-agnostic and environment-agnostic. The MVP
-ships one adapter for each (Pi over a local subprocess), but the boundaries are where
-Codex/Claude Code (other harnesses) and Daytona (other environments) slot in later.
+"""Ports for the agent service: the Environment seam and the Harness seam.
+
+These interfaces keep the service environment-agnostic and engine-agnostic. The shapes
+are borrowed from the rivet ``sandbox-agent`` SDK (see
+``docs/design/agent-workflows/harness-port-redesign/``) but stay ours, so rivet is one
+adapter behind the seam and a non-rivet engine (the legacy in-process Pi path) fits the
+same port.
+
+Two seams:
+
+- ``Environment`` — where the harness process runs. ``LocalEnvironment`` runs it as a
+  subprocess on this host; a sandbox environment runs it elsewhere. This is the "runtime"
+  axis renamed; ``exec`` survives only as the subprocess transport's mechanism.
+- ``Harness`` — the agent engine. One ``invoke`` is one cold run. ``create_session``
+  returns an :class:`AgentSession`, the rivet-shaped abstraction on top: under cold +
+  replay it holds no warm daemon, so continuing a conversation replays the caller-held
+  history into a fresh run.
+
+The engine choice (legacy in-process Pi vs rivet over ACP) is not a Python class. It is an
+env value the transport hands the TypeScript runner, so Python has two transports
+(subprocess, HTTP), not three backend adapters.
 """
 
+from __future__ import annotations
+
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional, Sequence
+from typing import Any, Callable, Dict, List, Optional, Sequence, Union
+
+
+# ---------------------------------------------------------------------------
+# Capabilities
+# ---------------------------------------------------------------------------
 
 
 @dataclass
-class ExecResult:
-    """Result of running a command through a Runtime."""
+class HarnessCapabilities:
+    """What a harness can do, probed by the runtime (rivet ``AgentCapabilities``).
 
-    code: int
-    stdout: str
-    stderr: str
+    The runner reports these in the result; the service uses them for observability and
+    for input shaping (for example, do not send image blocks to a harness without
+    ``images``). The branching that used to key off the harness name (``if pi``) now keys
+    off these flags in the TypeScript runner, where the live answer is.
+    """
+
+    text_messages: bool = True
+    images: bool = False
+    file_attachments: bool = False
+    mcp_tools: bool = False
+    tool_calls: bool = False
+    reasoning: bool = False
+    plan_mode: bool = False
+    permissions: bool = False
+    usage: bool = False
+    streaming_deltas: bool = False
+    session_lifecycle: bool = False
+
+    @classmethod
+    def from_wire(
+        cls, data: Optional[Dict[str, Any]]
+    ) -> Optional["HarnessCapabilities"]:
+        """Parse the camelCase capability object the runner returns. ``None`` passes through."""
+        if not isinstance(data, dict):
+            return None
+        return cls(
+            text_messages=bool(data.get("textMessages", True)),
+            images=bool(data.get("images", False)),
+            file_attachments=bool(data.get("fileAttachments", False)),
+            mcp_tools=bool(data.get("mcpTools", False)),
+            tool_calls=bool(data.get("toolCalls", False)),
+            reasoning=bool(data.get("reasoning", False)),
+            plan_mode=bool(data.get("planMode", False)),
+            permissions=bool(data.get("permissions", False)),
+            usage=bool(data.get("usage", False)),
+            streaming_deltas=bool(data.get("streamingDeltas", False)),
+            session_lifecycle=bool(data.get("sessionLifecycle", False)),
+        )
+
+
+# ---------------------------------------------------------------------------
+# Turn input: content blocks and messages
+# ---------------------------------------------------------------------------
 
 
-class Runtime(ABC):
-    """Port for the run environment: where and how the harness process runs.
+@dataclass
+class ContentBlock:
+    """One piece of a message, mirroring the ACP content-block kinds.
 
-    The local adapter runs it as a subprocess on this host. A sandbox adapter (WP-3)
-    runs it inside Daytona. ``pause`` and ``connect_volume`` are lifecycle hooks the
-    design doc calls out; the local adapter no-ops them.
+    ``text`` is the only kind the playground sends today; ``image`` and ``resource`` are
+    plumbed so an image-capable harness can take them once the playground does. A bare
+    string content is normalized to a single ``text`` block on the wire.
     """
 
-    @abstractmethod
-    async def start(self) -> None:
-        """Bring the environment up (no-op for a local process)."""
+    type: str  # "text" | "image" | "resource"
+    text: Optional[str] = None
+    # image / resource payloads (base64 data or a uri), used when type != "text".
+    data: Optional[str] = None
+    mime_type: Optional[str] = None
+    uri: Optional[str] = None
 
-    @abstractmethod
-    async def shutdown(self) -> None:
-        """Tear the environment down (no-op for a local process)."""
+    def to_wire(self) -> Dict[str, Any]:
+        block: Dict[str, Any] = {"type": self.type}
+        if self.text is not None:
+            block["text"] = self.text
+        if self.data is not None:
+            block["data"] = self.data
+        if self.mime_type is not None:
+            block["mimeType"] = self.mime_type
+        if self.uri is not None:
+            block["uri"] = self.uri
+        return block
+
+    @classmethod
+    def from_raw(cls, raw: Any) -> "ContentBlock":
+        """Coerce a loose block (str/dict) to a ContentBlock; absent/null type -> text."""
+        if isinstance(raw, ContentBlock):
+            return raw
+        if isinstance(raw, str):
+            return cls(type="text", text=raw)
+        if isinstance(raw, dict):
+            return cls(
+                type=str(raw.get("type") or "text"),
+                text=raw.get("text"),
+                data=raw.get("data"),
+                mime_type=raw.get("mimeType") or raw.get("mime_type"),
+                uri=raw.get("uri"),
+            )
+        return cls(type="text", text=str(raw))
+
+
+# A message's content is either a plain string or a list of content blocks.
+MessageContent = Union[str, List[ContentBlock]]
 
-    async def pause(self) -> None:
-        """Pause the environment. Optional; no-op by default."""
-        return None
 
-    async def connect_volume(self, *args: Any, **kwargs: Any) -> None:
-        """Attach a volume to the environment. Optional; no-op by default."""
-        return None
+@dataclass
+class Message:
+    """A chat message in the conversation. ``content`` is text or content blocks."""
 
-    @abstractmethod
-    async def exec(
-        self,
-        command: Sequence[str],
-        input_bytes: bytes,
-        *,
-        cwd: Optional[str] = None,
-        env: Optional[Dict[str, str]] = None,
-        timeout: Optional[float] = None,
-    ) -> ExecResult:
-        """Run ``command`` in the environment, feeding ``input_bytes`` to stdin."""
+    role: str
+    content: MessageContent = ""
+
+    def to_wire(self) -> Dict[str, Any]:
+        if isinstance(self.content, str):
+            content: Any = self.content
+        else:
+            # Tolerate both ContentBlock objects and the raw dicts a caller may pass.
+            content = [
+                block.to_wire() if isinstance(block, ContentBlock) else block
+                for block in self.content
+            ]
+        return {"role": self.role, "content": content}
+
+    @classmethod
+    def from_raw(cls, raw: Any) -> Optional["Message"]:
+        """Coerce a loose dict (the playground's message shape) into a Message.
+
+        List content (ACP-style content blocks) is normalized into ``ContentBlock``
+        objects so the typed-content invariant holds downstream.
+        """
+        if isinstance(raw, Message):
+            return raw
+        if not isinstance(raw, dict) or "role" not in raw:
+            return None
+        content = raw.get("content", "")
+        if isinstance(content, list):
+            content = [ContentBlock.from_raw(block) for block in content]
+        return cls(role=str(raw["role"]), content=content)
+
+
+# ---------------------------------------------------------------------------
+# Run events: the structured stream
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class AgentEvent:
+    """One structured event from a run, mapped from an ACP ``session/update``.
+
+    ``type`` is one of ``message``, ``thought``, ``tool_call``, ``tool_result``,
+    ``usage``, ``error``, ``done``. ``data`` is the raw event, ``type`` included. The
+    runner returns these as a per-turn log; ``on_event`` can also receive them live.
+    """
+
+    type: str
+    data: Dict[str, Any] = field(default_factory=dict)
+
+    @classmethod
+    def from_wire(cls, raw: Any) -> Optional["AgentEvent"]:
+        if not isinstance(raw, dict) or not raw.get("type"):
+            return None
+        return cls(type=str(raw["type"]), data=raw)
+
+
+# A live event sink. Synchronous: the transports invoke it as events arrive.
+EventSink = Callable[[AgentEvent], None]
+
+
+# ---------------------------------------------------------------------------
+# Trace context and tool callback (cross-boundary plumbing, unchanged shapes)
+# ---------------------------------------------------------------------------
 
 
 @dataclass
 class TraceContext:
     """Agenta trace context threaded into the harness run.
 
-    Lets the harness nest its spans under the caller's workflow span (same
-    ``trace_id``) and ship them to the same Agenta backend with the same auth, so
-    the agent's whole run becomes part of the ``/invoke`` trace the way
-    completion/chat nest their LLM spans. All fields optional; with none set the
-    harness traces standalone (or not at all).
+    Lets the harness nest its spans under the caller's workflow span (same ``trace_id``)
+    and ship them to the same Agenta backend with the same auth, so the agent's whole run
+    becomes part of the ``/invoke`` trace the way completion/chat nest their LLM spans.
+    All fields optional; with none set the harness traces standalone (or not at all).
     """
 
     traceparent: Optional[str] = None
@@ -74,7 +217,6 @@ class TraceContext:
     capture_content: bool = True
 
     def to_wire(self) -> Dict[str, Any]:
-        """Serialize to the camelCase shape the TS wrapper expects on the wire."""
         return {
             "traceparent": self.traceparent,
             "baggage": self.baggage,
@@ -88,70 +230,189 @@ def to_wire(self) -> Dict[str, Any]:
 class ToolCallback:
     """How the harness routes a tool call back through Agenta's ``/tools/call``.
 
-    The backend resolves runnable tool references into specs and hands the harness
-    this callback. The TS wrapper turns each spec into a Pi ``customTool`` whose
-    ``execute`` POSTs the OpenAI-style envelope to ``endpoint`` with
-    ``authorization``. The provider key and connection auth never enter the sandbox;
-    they stay behind ``/tools/call``. Same mechanism that threads the OTLP credential.
+    The backend resolves runnable tool references into specs and hands the harness this
+    callback. The provider key and connection auth never enter the sandbox; they stay
+    behind ``/tools/call``. Same mechanism that threads the OTLP credential.
     """
 
     endpoint: str  # full ``/tools/call`` URL
     authorization: Optional[str] = None  # full Authorization header value
 
     def to_wire(self) -> Dict[str, Any]:
-        """Serialize to the camelCase shape the TS wrapper expects on the wire."""
-        return {
-            "endpoint": self.endpoint,
-            "authorization": self.authorization,
-        }
+        return {"endpoint": self.endpoint, "authorization": self.authorization}
+
+
+# ---------------------------------------------------------------------------
+# Session config, request, result
+# ---------------------------------------------------------------------------
+
+# Permission policy for harness tool use in a headless run. ``auto`` approves (tools are
+# backend-resolved and trusted, no human to prompt); ``deny`` rejects.
+PermissionPolicy = str  # "auto" | "deny"
 
 
 @dataclass
-class HarnessRequest:
-    """One agent run: instructions, model, the user turn, and optional history."""
+class SessionConfig:
+    """The agent config bundle for a session: everything but the turn itself.
 
-    agents_md: Optional[str] = None
+    Mirrors the rivet session config. ``instructions`` becomes ``AGENTS.md``;
+    ``harness``/``sandbox`` are the two orthogonal swap axes; ``secrets`` are provider keys
+    injected as harness env, never written to the agent filesystem. Skills and hooks are
+    carried as workspace artifacts (not modeled as verbs); they are not built in this pass.
+    """
+
+    instructions: Optional[str] = None  # AGENTS.md text
     model: Optional[str] = None
-    prompt: Optional[str] = None
-    messages: List[Any] = field(default_factory=list)
-    # Continue a prior run by id (rivet path resumes/replays its history). None = new.
+    harness: str = "pi"
+    sandbox: str = "local"
     session_id: Optional[str] = None
-    # Provider API keys resolved from the project vault, as harness env vars
-    # ({"OPENAI_API_KEY": "...", ...}). Injected into the harness environment (local
-    # daemon + Daytona env_vars). Empty => the harness uses its own login (OAuth).
     secrets: Dict[str, str] = field(default_factory=dict)
-    tools: List[str] = field(default_factory=list)
-    # Resolved runnable tool specs, already in the camelCase wire shape the TS
-    # wrapper turns into Pi customTools: {name, description, inputSchema, callRef}.
+    builtin_tools: List[str] = field(default_factory=list)
     custom_tools: List[Dict[str, Any]] = field(default_factory=list)
     tool_callback: Optional[ToolCallback] = None
+    permission_policy: PermissionPolicy = "auto"
     trace: Optional[TraceContext] = None
 
 
 @dataclass
-class HarnessResult:
-    """The agent's reply plus run metadata."""
+class AgentRequest:
+    """One transport call: the session config plus the conversation so far.
+
+    The runner picks the latest user turn and replays the prior turns as context (the
+    cold + replay model). ``messages`` is the full conversation the caller holds.
+    """
+
+    config: SessionConfig
+    messages: List[Message] = field(default_factory=list)
+
+
+@dataclass
+class AgentResult:
+    """The agent's reply plus structured run metadata.
 
-    output: str
+    ``output`` is the final assistant text (the playground renders this). ``messages`` and
+    ``events`` are the structured forms. ``usage`` rolls token/cost onto the workflow span
+    (the harness span tree ships in a separate OTLP batch, so the service stamps the totals
+    itself). ``capabilities`` is what the harness was probed to support this run.
+    """
+
+    output: str = ""
+    messages: List[Message] = field(default_factory=list)
+    events: List[AgentEvent] = field(default_factory=list)
+    usage: Optional[Dict[str, Any]] = None
+    stop_reason: Optional[str] = None
+    capabilities: Optional[HarnessCapabilities] = None
     session_id: Optional[str] = None
     model: Optional[str] = None
-    # Run token/cost totals ({input, output, total, cost}). The harness span tree is
-    # exported in a separate OTLP batch from the workflow span, so the service rolls
-    # these onto the workflow span itself (see agent.py). None when unavailable.
-    usage: Optional[Dict[str, Any]] = None
+    trace_id: Optional[str] = None
 
 
-class Harness(ABC):
-    """Port between our service and the agent engine. Pi is one implementation."""
+# ---------------------------------------------------------------------------
+# Environment seam (where the harness process runs)
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class ExecResult:
+    """Result of running a command through an Environment."""
+
+    code: int
+    stdout: str
+    stderr: str
+
+
+class Environment(ABC):
+    """Where and how the harness process runs.
+
+    ``LocalEnvironment`` runs it as a subprocess on this host. ``exec`` is the subprocess
+    transport's mechanism; the HTTP transport does not use it. ``start``/``dispose`` are
+    lifecycle hooks (no-ops for a local process).
+    """
+
+    async def start(self) -> None:
+        """Bring the environment up (no-op for a local process)."""
+        return None
+
+    async def dispose(self) -> None:
+        """Tear the environment down (no-op for a local process)."""
+        return None
 
     @abstractmethod
+    async def exec(
+        self,
+        command: Sequence[str],
+        input_bytes: bytes,
+        *,
+        cwd: Optional[str] = None,
+        env: Optional[Dict[str, str]] = None,
+        timeout: Optional[float] = None,
+    ) -> ExecResult:
+        """Run ``command`` in the environment, feeding ``input_bytes`` to stdin."""
+
+
+# ---------------------------------------------------------------------------
+# Harness seam (the agent engine) and the session abstraction
+# ---------------------------------------------------------------------------
+
+
+class Harness(ABC):
+    """The agent engine behind one transport. Rivet and the legacy Pi path are adapters."""
+
     async def setup(self) -> None:
-        """Prepare the harness for a run."""
+        """Prepare the harness for a run (no-op by default)."""
+        return None
 
-    @abstractmethod
-    async def invoke(self, request: HarnessRequest) -> HarnessResult:
-        """Run one turn and return the agent's reply."""
+    async def shutdown(self) -> None:
+        """Release harness resources (no-op by default)."""
+        return None
 
     @abstractmethod
-    async def shutdown(self) -> None:
-        """Release any harness resources."""
+    async def invoke(
+        self,
+        request: AgentRequest,
+        *,
+        on_event: Optional[EventSink] = None,
+    ) -> AgentResult:
+        """Run one cold turn and return the structured result."""
+
+    async def destroy_session(self, session_id: Optional[str]) -> None:
+        """Drop a session's resources. A no-op under cold + replay (nothing is kept warm)."""
+        return None
+
+    def create_session(self, config: SessionConfig) -> "AgentSession":
+        """Open a session for this config. The session is the rivet-shaped abstraction."""
+        return AgentSession(self, config)
+
+
+class AgentSession:
+    """A first-class session over a :class:`Harness`.
+
+    ``create_session(config)`` then ``session.prompt(messages)``. Under cold + replay the
+    session keeps no warm daemon: each ``prompt`` is a fresh ``invoke`` that replays the
+    supplied history. The abstraction is real (and where a future server-side history
+    store slots in); the cold lifecycle is an adapter detail.
+    """
+
+    def __init__(self, harness: Harness, config: SessionConfig) -> None:
+        self._harness = harness
+        self._config = config
+
+    @property
+    def id(self) -> Optional[str]:
+        return self._config.session_id
+
+    async def prompt(
+        self,
+        messages: Sequence[Message],
+        *,
+        on_event: Optional[EventSink] = None,
+    ) -> AgentResult:
+        request = AgentRequest(config=self._config, messages=list(messages))
+        result = await self._harness.invoke(request, on_event=on_event)
+        # Carry the engine's session id forward so a follow-up prompt resumes it.
+        if result.session_id:
+            self._config.session_id = result.session_id
+        return result
+
+    async def destroy(self) -> None:
+        await self._harness.destroy_session(self._config.session_id)
diff --git a/services/oss/src/agent_pi/rivet_harness.py b/services/oss/src/agent_pi/rivet_harness.py
deleted file mode 100644
index cd84939ab1..0000000000
--- a/services/oss/src/agent_pi/rivet_harness.py
+++ /dev/null
@@ -1,143 +0,0 @@
-"""Rivet harness adapter (WP-8): drives the agent over ACP via a rivet daemon.
-
-Same ``Harness`` port as the Pi adapters, but the transport behind it runs the chosen
-harness (Pi, Claude Code, ...) over the Agent Client Protocol through a rivet
-``sandbox-agent`` daemon, rather than the bespoke Pi SDK calls. The ``/invoke`` contract
-is unchanged; harness and sandbox become config values carried on the wire to the TS
-runner (``runRivet.ts``, selected by ``AGENT_BACKEND=rivet``).
-
-Two transports, mirroring the Pi adapters:
-
-- HTTP (docker): POST the envelope to the wrapper running as a sidecar. Selected when a
-  base URL is provided (``AGENTA_AGENT_PI_URL``); the sidecar runs in rivet mode.
-- subprocess (local): spawn the TS CLI with ``AGENT_BACKEND=rivet`` and hand it the
-  envelope over stdio.
-
-The envelope adds ``harness``, ``sandbox``, and ``sessionId`` to the Pi-shaped fields;
-everything else (agentsMd, model, prompt, messages, tools, customTools, toolCallback,
-trace) is identical, so the Python side stays thin.
-"""
-
-import json
-import os
-from typing import List, Optional, Sequence
-
-import httpx
-
-from agenta.sdk.utils.logging import get_module_logger
-
-from .ports import Harness, HarnessRequest, HarnessResult, Runtime
-
-log = get_module_logger(__name__)
-
-_DEFAULT_TIMEOUT = float(os.getenv("AGENTA_AGENT_TIMEOUT", "180"))
-_DEFAULT_COMMAND = ["pnpm", "exec", "tsx", "src/cli.ts"]
-
-
-def _rivet_payload(request: HarnessRequest, harness: str, sandbox: str) -> dict:
-    """Build the wire envelope: the Pi-shaped fields plus harness/sandbox/sessionId."""
-    return {
-        "harness": harness,
-        "sandbox": sandbox,
-        "sessionId": request.session_id,
-        "secrets": request.secrets or {},
-        "agentsMd": request.agents_md,
-        "model": request.model,
-        "prompt": request.prompt,
-        "messages": request.messages,
-        "tools": request.tools,
-        "customTools": request.custom_tools,
-        "toolCallback": request.tool_callback.to_wire()
-        if request.tool_callback
-        else None,
-        "trace": request.trace.to_wire() if request.trace else None,
-    }
-
-
-def _to_result(data: dict) -> HarnessResult:
-    if not data.get("ok"):
-        raise RuntimeError(f"Rivet run failed: {data.get('error')}")
-    return HarnessResult(
-        output=data.get("output", ""),
-        session_id=data.get("sessionId"),
-        model=data.get("model"),
-        usage=data.get("usage"),
-    )
-
-
-class RivetHarness(Harness):
-    """Drive the harness over ACP via rivet, over HTTP or a local subprocess.
-
-    Pass ``base_url`` for the HTTP sidecar transport; otherwise a ``runtime`` plus
-    ``wrapper_dir`` runs the TS CLI as a subprocess. ``harness`` (pi/claude) and
-    ``sandbox`` (local/daytona) are the two orthogonal swap axes.
-    """
-
-    def __init__(
-        self,
-        *,
-        harness: str,
-        sandbox: str,
-        base_url: Optional[str] = None,
-        runtime: Optional[Runtime] = None,
-        wrapper_dir: Optional[str] = None,
-        command: Optional[Sequence[str]] = None,
-        timeout: float = _DEFAULT_TIMEOUT,
-    ) -> None:
-        if not base_url and not runtime:
-            raise ValueError(
-                "RivetHarness needs either base_url (HTTP) or runtime (subprocess)"
-            )
-        self._harness = harness
-        self._sandbox = sandbox
-        self._base_url = base_url.rstrip("/") if base_url else None
-        self._runtime = runtime
-        self._wrapper_dir = wrapper_dir
-        self._command: List[str] = list(command or _DEFAULT_COMMAND)
-        self._timeout = timeout
-
-    async def setup(self) -> None:
-        if self._runtime:
-            await self._runtime.start()
-
-    async def shutdown(self) -> None:
-        if self._runtime:
-            await self._runtime.shutdown()
-
-    async def invoke(self, request: HarnessRequest) -> HarnessResult:
-        payload = _rivet_payload(request, self._harness, self._sandbox)
-        if self._base_url:
-            return await self._invoke_http(payload)
-        return await self._invoke_subprocess(payload)
-
-    async def _invoke_http(self, payload: dict) -> HarnessResult:
-        async with httpx.AsyncClient(timeout=self._timeout) as client:
-            response = await client.post(f"{self._base_url}/run", json=payload)
-        if response.status_code >= 500:
-            raise RuntimeError(
-                f"Rivet wrapper HTTP {response.status_code}: {response.text[:1000]}"
-            )
-        return _to_result(response.json())
-
-    async def _invoke_subprocess(self, payload: dict) -> HarnessResult:
-        assert self._runtime is not None
-        result = await self._runtime.exec(
-            self._command,
-            json.dumps(payload).encode("utf-8"),
-            cwd=self._wrapper_dir,
-            env={**os.environ, "AGENT_BACKEND": "rivet"},
-            timeout=self._timeout,
-        )
-        if not result.stdout.strip():
-            raise RuntimeError(
-                "Rivet wrapper returned no output. "
-                f"exit={result.code} stderr={result.stderr[-2000:]}"
-            )
-        try:
-            data = json.loads(result.stdout)
-        except json.JSONDecodeError as exc:
-            raise RuntimeError(
-                "Rivet wrapper returned invalid JSON. "
-                f"stdout={result.stdout[:500]} stderr={result.stderr[-1000:]}"
-            ) from exc
-        return _to_result(data)
diff --git a/services/oss/src/agent_pi/schemas.py b/services/oss/src/agent_pi/schemas.py
index 7dc6af2580..c6aa3cea68 100644
--- a/services/oss/src/agent_pi/schemas.py
+++ b/services/oss/src/agent_pi/schemas.py
@@ -76,6 +76,16 @@
             "default": "local",
             "description": "Where the agent runs: local daemon or a Daytona sandbox.",
         },
+        "permission_policy": {
+            "type": "string",
+            "title": "Permission policy",
+            "enum": ["auto", "deny"],
+            "default": "auto",
+            "description": (
+                "How a permission-gating harness (e.g. Claude Code) handles tool-use "
+                "prompts in this headless run: auto-approve or deny."
+            ),
+        },
     },
 }
 
diff --git a/services/oss/src/agent_pi/wire.py b/services/oss/src/agent_pi/wire.py
new file mode 100644
index 0000000000..3fddee5324
--- /dev/null
+++ b/services/oss/src/agent_pi/wire.py
@@ -0,0 +1,73 @@
+"""The ``/run`` wire contract, in one place.
+
+Every transport (subprocess, HTTP) sends the same camelCase JSON to the TypeScript runner
+and parses the same result back, so the wire shape lives here rather than being rebuilt in
+each adapter. The TypeScript side mirrors these names in ``services/agent/src/protocol.ts``.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List
+
+from .ports import (
+    AgentEvent,
+    AgentRequest,
+    AgentResult,
+    HarnessCapabilities,
+    Message,
+)
+
+
+def request_to_wire(request: AgentRequest) -> Dict[str, Any]:
+    """Serialize an :class:`AgentRequest` to the ``/run`` request JSON."""
+    config = request.config
+    return {
+        "harness": config.harness,
+        "sandbox": config.sandbox,
+        "sessionId": config.session_id,
+        "agentsMd": config.instructions,
+        "model": config.model,
+        "messages": [message.to_wire() for message in request.messages],
+        "secrets": config.secrets or {},
+        "tools": config.builtin_tools,
+        "customTools": config.custom_tools,
+        "toolCallback": config.tool_callback.to_wire()
+        if config.tool_callback
+        else None,
+        "permissionPolicy": config.permission_policy,
+        "trace": config.trace.to_wire() if config.trace else None,
+    }
+
+
+def result_from_wire(data: Dict[str, Any]) -> AgentResult:
+    """Parse a ``/run`` result JSON into an :class:`AgentResult`.
+
+    Raises ``RuntimeError`` when the runner reported a failure, so the invoke surfaces a
+    clear message rather than handing the caller an empty reply.
+    """
+    if not data.get("ok"):
+        raise RuntimeError(f"Agent run failed: {data.get('error')}")
+
+    messages: List[Message] = []
+    for raw in data.get("messages") or []:
+        message = Message.from_raw(raw)
+        if message is not None:
+            messages.append(message)
+
+    events: List[AgentEvent] = []
+    for raw in data.get("events") or []:
+        event = AgentEvent.from_wire(raw)
+        if event is not None:
+            events.append(event)
+
+    return AgentResult(
+        output=data.get("output", "") or "",
+        messages=messages,
+        events=events,
+        usage=data.get("usage"),
+        stop_reason=data.get("stopReason"),
+        capabilities=HarnessCapabilities.from_wire(data.get("capabilities")),
+        session_id=data.get("sessionId"),
+        model=data.get("model"),
+        trace_id=data.get("traceId"),
+    )

From 586029de40a6cc3246d0c4c503424c186c9f1c53 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Wed, 17 Jun 2026 13:52:29 +0200
Subject: [PATCH 05/10] refactor(agent): split into an agent app and an
 engine-agnostic harness runtime
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address the god-module and the misleading package name:

- services/oss/src/harness/ (was agent_pi/): the engine-agnostic runtime — ports.py,
  transports.py (was harness.py), environment.py, wire.py. Named for the seam, not Pi;
  harness choice (pi/claude) lives inside the runtime, so there is no agent_claude.
- services/oss/src/agent/ (was the 470-line agent.py god-module): the Agenta workflow app
  — app.py (thin handler + backend wiring), inputs.py (request parsing), tools.py,
  secrets.py, tracing.py, client.py (shared backend access), schemas.py, config.py.

No behavior change. Verified live: a playground run answers 'REFACTOR-OK Lisbon' with usage.
---
 .../harness-port-redesign/implementation.md   |  34 +-
 .../harness-port-redesign/status.md           |  15 +-
 services/agent/src/protocol.ts                |   2 +-
 services/oss/src/agent.py                     | 485 ------------------
 services/oss/src/agent/__init__.py            |  12 +
 services/oss/src/agent/app.py                 | 125 +++++
 services/oss/src/agent/client.py              |  63 +++
 .../oss/src/{agent_pi => agent}/config.py     |   2 +-
 services/oss/src/agent/inputs.py              |  67 +++
 .../oss/src/{agent_pi => agent}/schemas.py    |   0
 services/oss/src/agent/secrets.py             |  72 +++
 services/oss/src/agent/tools.py               | 124 +++++
 services/oss/src/agent/tracing.py             |  85 +++
 .../oss/src/{agent_pi => harness}/__init__.py |  17 +-
 .../src/{agent_pi => harness}/environment.py  |   0
 .../oss/src/{agent_pi => harness}/ports.py    |   0
 .../harness.py => harness/transports.py}      |   0
 .../oss/src/{agent_pi => harness}/wire.py     |   0
 18 files changed, 596 insertions(+), 507 deletions(-)
 delete mode 100644 services/oss/src/agent.py
 create mode 100644 services/oss/src/agent/__init__.py
 create mode 100644 services/oss/src/agent/app.py
 create mode 100644 services/oss/src/agent/client.py
 rename services/oss/src/{agent_pi => agent}/config.py (97%)
 create mode 100644 services/oss/src/agent/inputs.py
 rename services/oss/src/{agent_pi => agent}/schemas.py (100%)
 create mode 100644 services/oss/src/agent/secrets.py
 create mode 100644 services/oss/src/agent/tools.py
 create mode 100644 services/oss/src/agent/tracing.py
 rename services/oss/src/{agent_pi => harness}/__init__.py (59%)
 rename services/oss/src/{agent_pi => harness}/environment.py (100%)
 rename services/oss/src/{agent_pi => harness}/ports.py (100%)
 rename services/oss/src/{agent_pi/harness.py => harness/transports.py} (100%)
 rename services/oss/src/{agent_pi => harness}/wire.py (100%)

diff --git a/docs/design/agent-workflows/harness-port-redesign/implementation.md b/docs/design/agent-workflows/harness-port-redesign/implementation.md
index 93a698bd40..bcf774fed8 100644
--- a/docs/design/agent-workflows/harness-port-redesign/implementation.md
+++ b/docs/design/agent-workflows/harness-port-redesign/implementation.md
@@ -5,20 +5,38 @@ the as-built reference for the rewrite (kept in sync with the code).
 
 ## Module layout
 
-### Python (`services/oss/src/agent_pi/`)
+### Python — two packages
+
+The engine-agnostic runtime and the Agenta workflow integration are separate packages, so
+nothing in the runtime is Agenta-specific and the god-module is gone.
+
+`services/oss/src/harness/` — the engine-agnostic runtime:
 
 | File | Holds |
 | --- | --- |
-| `ports.py` | The neutral types and the two seams. Types: `HarnessCapabilities`, `ContentBlock`, `Message`, `AgentEvent`, `TraceContext`, `ToolCallback`, `SessionConfig`, `AgentRequest`, `AgentResult`. Seams: `Environment` (where it runs) and `Harness` (the agent). Plus the concrete `AgentSession` sugar. |
-| `wire.py` | One place that serializes an `AgentRequest` to the camelCase `/run` JSON and parses an `AgentResult` back. Shared by every transport so the wire shape lives once. |
+| `ports.py` | The neutral types and the two seams. Types: `HarnessCapabilities`, `ContentBlock`, `Message`, `AgentEvent`, `TraceContext`, `ToolCallback`, `SessionConfig`, `AgentRequest`, `AgentResult`. Seams: `Environment` (where it runs) and `Harness` (the agent), plus the concrete `AgentSession`. |
+| `transports.py` | The two transports: `SubprocessHarness` (spawn the TS CLI) and `HttpHarness` (POST to the sidecar). Both share `wire.py`. Replaces `pi_harness.py`, `pi_http_harness.py`, `rivet_harness.py`. |
 | `environment.py` | `LocalEnvironment` (subprocess on this host). Replaces `local_runtime.py`. |
-| `harness.py` | The two transports: `SubprocessHarness` (spawn the TS CLI) and `HttpHarness` (POST to the sidecar). Both share `wire.py`. Replaces `pi_harness.py`, `pi_http_harness.py`, `rivet_harness.py`. |
-| `config.py` | Unchanged: load the file-backed `AgentConfig`. |
+| `wire.py` | Serializes an `AgentRequest` to the camelCase `/run` JSON and parses an `AgentResult` back. The wire shape lives once. |
+
+`services/oss/src/agent/` — the Agenta workflow app (was the single `agent.py` god-module):
+
+| File | Holds |
+| --- | --- |
+| `app.py` | The `/invoke` handler plus `select_backend` / `build_harness`. Thin: it orchestrates the modules below. |
+| `inputs.py` | Request parsing: `resolve_run_config`, `to_messages`, `_system_text`. |
+| `tools.py` | Tool resolution through `/tools/resolve` (and slug parsing). |
+| `secrets.py` | Provider keys from the project vault. |
+| `tracing.py` | `trace_context` and `record_usage` (the OTel glue). |
+| `client.py` | Shared Agenta-backend access (base URL + caller credential). |
 | `schemas.py` | The `/inspect` schemas. Gains the permission-policy parameter. |
+| `config.py` | The file-backed `AgentConfig` and the TS runner path. |
 
-The backend engine (legacy in-process Pi vs rivet ACP) is no longer a Python class. It
-is one env value (`AGENT_BACKEND`) the transport passes to the TS runner, or the sidecar
-auto-routes by request shape. So Python has two transports, not three backend adapters.
+The backend engine (legacy in-process Pi vs rivet ACP) is no longer a Python class. It is
+one env value (`AGENT_BACKEND`) the transport passes to the TS runner, so Python has two
+transports, not three backend adapters. The harness folder is named for the seam, not for
+Pi: harness choice (pi/claude) lives inside the runtime, which is why there is no
+`agent_claude` package.
 
 ### TypeScript (`services/agent/src/`)
 
diff --git a/docs/design/agent-workflows/harness-port-redesign/status.md b/docs/design/agent-workflows/harness-port-redesign/status.md
index 3301844dae..479fc42a88 100644
--- a/docs/design/agent-workflows/harness-port-redesign/status.md
+++ b/docs/design/agent-workflows/harness-port-redesign/status.md
@@ -4,11 +4,18 @@ Source of truth for this design effort. Keep it current.
 
 ## Current state
 
-Research and proposal drafted (2026-06-17). Nothing implemented. The comparison is in
+IMPLEMENTED, reviewed, and verified live (2026-06-17). Draft PR
+[#4721](https://github.com/Agenta-AI/agenta/pull/4721), stacked on the WP-8 PR (#4718).
+The as-built reference is [`implementation.md`](implementation.md); the comparison is in
 [`research.md`](research.md); the recommended shape and phased path are in
-[`proposal.md`](proposal.md). This builds on the shipped WP-8 runtime
-([`../wp-8-rivet-acp-runtime/status.md`](../wp-8-rivet-acp-runtime/status.md)), which
-adopted rivet unmodified and kept the ports unchanged on purpose.
+[`proposal.md`](proposal.md) and [`plan.md`](plan.md). Builds on the shipped WP-8 runtime
+([`../wp-8-rivet-acp-runtime/status.md`](../wp-8-rivet-acp-runtime/status.md)).
+
+The new port (`Environment` + `Harness` + `AgentSession`, capabilities, content blocks,
+structured events/result) ships with both backends (rivet ACP, legacy in-process Pi) on
+two transports sharing one wire contract. Verified live: pi, rivet+pi+local,
+rivet+claude+local, rivet+pi+daytona; a playground run nests `invoke_agent` under the
+`/invoke` span with usage. A high-effort review found and fixed 10 issues.
 
 ## Recommendation in one line
 
diff --git a/services/agent/src/protocol.ts b/services/agent/src/protocol.ts
index 4880e6f093..5e94bd9332 100644
--- a/services/agent/src/protocol.ts
+++ b/services/agent/src/protocol.ts
@@ -1,7 +1,7 @@
 /**
  * The `/run` wire contract, shared by both backends.
  *
- * The Python side mirrors these names in `services/oss/src/agent_pi/wire.py`. Keeping the
+ * The Python side mirrors these names in `services/oss/src/harness/wire.py`. Keeping the
  * request/result/event/capability types here (rather than in one runner that the other
  * imports from) is what lets `runPi.ts` and `runRivet.ts` stay peers.
  */
diff --git a/services/oss/src/agent.py b/services/oss/src/agent.py
deleted file mode 100644
index e76ae8d6d4..0000000000
--- a/services/oss/src/agent.py
+++ /dev/null
@@ -1,485 +0,0 @@
-"""Agent workflow service (WP-2 + WP-7).
-
-Mirrors the chat/completion services: an Agenta app exposing ``/invoke`` and
-``/inspect`` through ``ag.create_app`` + ``ag.workflow`` + ``ag.route``, so the
-backend and playground treat an agent like the other workflow types. The handler
-builds the user turn from the request and runs it through the Harness port, whose Pi
-adapter drives the TypeScript wrapper in ``services/agent``.
-
-Config is a ``prompt-template`` (system message as AGENTS.md, model, and tools): the
-playground renders the same prompt control as chat/completion, including the tool
-picker. Runnable tools (WP-7) are resolved in the backend (``/tools/resolve``) and
-executed back through ``/tools/call`` while Pi drives the loop. Streaming,
-multi-message output, and the Daytona sandbox are later work packages.
-"""
-
-import os
-from typing import Any, Dict, List, Optional, Tuple
-
-import httpx
-from opentelemetry import trace as otel_trace
-
-import agenta as ag
-from agenta.sdk.engines.tracing.propagation import inject
-from agenta.sdk.utils.logging import get_module_logger
-
-from oss.src.agent_pi.config import load_config, wrapper_dir
-from oss.src.agent_pi.environment import LocalEnvironment
-from oss.src.agent_pi.harness import HttpHarness, SubprocessHarness
-from oss.src.agent_pi.ports import (
-    Harness,
-    Message,
-    SessionConfig,
-    ToolCallback,
-    TraceContext,
-)
-from oss.src.agent_pi.schemas import AGENT_SCHEMAS
-
-log = get_module_logger(__name__)
-
-_CAPTURE_CONTENT = os.getenv("AGENTA_AGENT_CAPTURE_CONTENT", "true").lower() not in (
-    "0",
-    "false",
-    "no",
-)
-
-# Budget for the backend tool-resolution round-trip (catalog + connection check).
-_TOOLS_RESOLVE_TIMEOUT = float(os.getenv("AGENTA_AGENT_TOOLS_TIMEOUT", "30"))
-
-
-def _select_backend(harness_id: str, sandbox_id: str) -> str:
-    """Choose the engine (``rivet`` or ``pi``) for a run.
-
-    ``rivet`` drives a harness over ACP via a rivet daemon; ``pi`` is the legacy
-    in-process Pi path. The legacy path only runs the ``pi`` harness locally, so any other
-    harness or sandbox forces ``rivet`` rather than silently dropping the selection.
-    ``AGENTA_AGENT_RUNTIME=rivet`` forces rivet for everything.
-    """
-    runtime = os.getenv("AGENTA_AGENT_RUNTIME", "pi").lower()
-    if runtime == "rivet" or harness_id != "pi" or sandbox_id != "local":
-        return "rivet"
-    return "pi"
-
-
-def _build_harness(backend: str) -> Harness:
-    """Pick the transport to the TypeScript runner for the current deployment.
-
-    The ``backend`` (engine) is chosen by :func:`_select_backend`. The transport is
-    env-driven: ``AGENTA_AGENT_PI_URL`` set (docker) -> call the sidecar over HTTP; unset
-    (local) -> spawn the runner as a subprocess.
-    """
-    pi_url = os.getenv("AGENTA_AGENT_PI_URL")
-    if pi_url:
-        return HttpHarness(pi_url, backend=backend)
-    return SubprocessHarness(
-        LocalEnvironment(),
-        wrapper_dir=str(wrapper_dir()),
-        backend=backend,
-    )
-
-
-def _system_text(messages: Optional[List[Any]]) -> str:
-    """Join the system-message content of a prompt-template into AGENTS.md text."""
-    parts: List[str] = []
-    for message in messages or []:
-        if not isinstance(message, dict) or message.get("role") != "system":
-            continue
-        content = message.get("content")
-        if isinstance(content, str):
-            parts.append(content)
-        elif isinstance(content, list):
-            parts.extend(
-                block.get("text", "")
-                for block in content
-                if isinstance(block, dict) and block.get("type") == "text"
-            )
-    return "\n\n".join(part for part in parts if part)
-
-
-def _resolve_run_config(
-    params: Dict[str, Any],
-    config: Any,
-) -> Tuple[str, str, Any]:
-    """Pull model, instructions, and raw tools from the request parameters.
-
-    Accepts both shapes: the playground's ``prompt`` (a ``prompt-template`` whose
-    system message is the AGENTS.md and whose ``llm_config`` carries model + picker
-    tools) and the flat ``{model, agents_md, tools}`` an API caller may send. Falls
-    back to the service file config for any unset field.
-    """
-    prompt_cfg = params.get("prompt")
-    if isinstance(prompt_cfg, dict):
-        llm_config = prompt_cfg.get("llm_config") or {}
-        model = llm_config.get("model") or config.model
-        agents_md = _system_text(prompt_cfg.get("messages")) or config.agents_md
-        raw_tools = llm_config.get("tools")
-        if raw_tools is None:
-            raw_tools = prompt_cfg.get("tools")
-    else:
-        model = params.get("model") or config.model
-        agents_md = params.get("agents_md") or config.agents_md
-        raw_tools = params.get("tools")
-
-    if raw_tools is None:
-        raw_tools = config.tools
-    return model, agents_md, raw_tools
-
-
-def _to_messages(raw: Optional[List[Any]]) -> List[Message]:
-    """Coerce the playground's loose message dicts into :class:`Message` objects.
-
-    The runner picks the latest user turn and replays the rest as context, so we hand it
-    the whole conversation rather than pre-extracting a single prompt.
-    """
-    messages: List[Message] = []
-    for item in raw or []:
-        message = Message.from_raw(item)
-        if message is not None:
-            messages.append(message)
-    return messages
-
-
-# Map a vault standard-provider kind to the env var the harness (Pi/Claude/litellm)
-# reads. Only providers an agent harness can use are listed.
-_PROVIDER_ENV_VARS = {
-    "openai": "OPENAI_API_KEY",
-    "anthropic": "ANTHROPIC_API_KEY",
-    "gemini": "GEMINI_API_KEY",
-    "mistral": "MISTRAL_API_KEY",
-    "mistralai": "MISTRAL_API_KEY",
-    "groq": "GROQ_API_KEY",
-    "together_ai": "TOGETHERAI_API_KEY",
-    "openrouter": "OPENROUTER_API_KEY",
-}
-
-
-async def _resolve_harness_secrets() -> Dict[str, str]:
-    """Resolve provider API keys from the project vault into harness env vars.
-
-    The agent authenticates the harness with the same provider keys the project
-    configured for LLM access. We fetch the project's vault ``provider_key`` secrets
-    from the backend directly (same backend + caller credential the tool resolver uses)
-    and inject each as its standard env var, so the harness uses whichever its model
-    needs. The SDK's per-request secret context does not propagate to this custom route,
-    so we resolve here rather than reading it. Empty when the vault has none (the harness
-    then falls back to its own login / OAuth — see ``runRivet``). Best-effort.
-    """
-    api_base = _agenta_api_base()
-    if not api_base:
-        return {}
-    headers = {"Content-Type": "application/json"}
-    authorization = _request_authorization()
-    if authorization:
-        headers["Authorization"] = authorization
-
-    try:
-        async with httpx.AsyncClient(timeout=_TOOLS_RESOLVE_TIMEOUT) as client:
-            response = await client.get(f"{api_base}/secrets/", headers=headers)
-        if response.status_code >= 400:
-            log.warning("agent: vault secrets fetch HTTP %s", response.status_code)
-            return {}
-        secrets = response.json() or []
-    except Exception:  # pylint: disable=broad-except
-        log.warning("agent: vault secrets fetch failed", exc_info=True)
-        return {}
-
-    env: Dict[str, str] = {}
-    for secret in secrets:
-        if not isinstance(secret, dict) or secret.get("kind") != "provider_key":
-            continue
-        data = secret.get("data") or {}
-        env_var = _PROVIDER_ENV_VARS.get(str(data.get("kind", "")).lower())
-        key = (data.get("provider") or {}).get("key")
-        if env_var and key:
-            env.setdefault(env_var, key)
-    return env
-
-
-def _trace_context() -> Optional[TraceContext]:
-    """Capture the active workflow span's trace context for the harness.
-
-    This runs inside the instrumented handler, so the current OTel span is the
-    ``/invoke`` workflow span. Threading its ``traceparent`` into the Pi run makes
-    the agent's spans children of that span, in the same trace, so the agent's
-    whole run shows up under the response's ``trace_id`` the way completion/chat
-    nest their LLM spans. Best-effort: any failure returns ``None`` and the run is
-    simply traced standalone (or not at all) using the wrapper's env config.
-    """
-    try:
-        headers = inject({})
-
-        traceparent = headers.get("traceparent")
-        if not traceparent:
-            return None
-
-        endpoint = None
-        try:
-            endpoint = ag.tracing.otlp_url
-        except Exception:  # pylint: disable=broad-except
-            endpoint = None
-
-        return TraceContext(
-            traceparent=traceparent,
-            baggage=headers.get("baggage"),
-            endpoint=endpoint,
-            authorization=headers.get("Authorization"),
-            capture_content=_CAPTURE_CONTENT,
-        )
-    except Exception:  # pylint: disable=broad-except
-        log.warning("agent: failed to capture trace context", exc_info=True)
-        return None
-
-
-def _agenta_api_base() -> Optional[str]:
-    """Resolve the Agenta backend base URL (``.../api``) for tool calls.
-
-    Prefers an explicit override, then derives it from the OTLP endpoint the SDK is
-    configured with (``{host}/api/otlp/v1/traces``), then falls back to env. Returns
-    ``None`` when nothing is configured; callers only need this when tools are set.
-    """
-    override = os.getenv("AGENTA_AGENT_TOOLS_API_URL")
-    if override:
-        return override.rstrip("/")
-
-    try:
-        otlp_url = ag.tracing.otlp_url
-    except Exception:  # pylint: disable=broad-except
-        otlp_url = None
-    if otlp_url and "/otlp/" in otlp_url:
-        return otlp_url.split("/otlp/", 1)[0].rstrip("/")
-
-    api_url = os.getenv("AGENTA_API_URL")
-    if api_url:
-        return api_url.rstrip("/")
-
-    return None
-
-
-def _request_authorization() -> Optional[str]:
-    """The project-scoped credential to call ``/tools/resolve`` and ``/tools/call``.
-
-    Reuses the same propagation the OTLP credential rides on (the caller's
-    Authorization), falling back to the service's own API key the way the tracing
-    sidecar does. Scoping to the caller keeps an agent run from invoking tools the
-    user could not (see WP-7 risk: RUN_TOOLS scoping).
-    """
-    try:
-        authorization = inject({}).get("Authorization")
-    except Exception:  # pylint: disable=broad-except
-        authorization = None
-    if authorization:
-        return authorization
-
-    api_key = os.getenv("AGENTA_API_KEY")
-    if api_key:
-        return f"ApiKey {api_key}"
-
-    return None
-
-
-def _parse_gateway_slug(slug: Any) -> Optional[Dict[str, Any]]:
-    """Parse a gateway tool slug into a Composio reference, or ``None``.
-
-    The playground tool picker encodes a Composio action as a function name like
-    ``tools__composio__github__GET_THE_AUTHENTICATED_USER__github-tvn`` (the same
-    5-segment slug ``/tools/call`` parses; ``__`` or ``.`` separated). Anything that
-    is not a 5-segment ``tools.composio.*`` slug returns ``None`` so the caller can
-    skip it.
-    """
-    if not isinstance(slug, str):
-        return None
-    parts = slug.replace("__", ".").split(".")
-    if len(parts) == 5 and parts[0] == "tools" and parts[1] == "composio":
-        return {
-            "type": "composio",
-            "integration": parts[2],
-            "action": parts[3],
-            "connection": parts[4],
-        }
-    return None
-
-
-def _normalize_tool_ref(ref: Any) -> Optional[Dict[str, Any]]:
-    """Coerce a config entry into a discriminated tool reference the resolver parses.
-
-    Handles three shapes: a bare string (or single-key ``{"name": ...}``) is the
-    existing built-in tool name; a dict already carrying ``type`` passes through; and
-    the playground picker's gateway entry (``{"function": {"name":
-    "tools__composio__..."}}``) is parsed into a ``composio`` ref. Unsupported picker
-    entries (provider built-ins, inline custom functions) return ``None`` and are
-    skipped rather than failing the run.
-    """
-    if isinstance(ref, str):
-        return {"type": "builtin", "name": ref}
-    if isinstance(ref, dict):
-        if ref.get("type") in ("builtin", "composio"):
-            return ref
-        function = ref.get("function") if isinstance(ref.get("function"), dict) else {}
-        gateway = _parse_gateway_slug(function.get("name") or ref.get("name"))
-        if gateway:
-            return gateway
-        if "type" not in ref and isinstance(ref.get("name"), str):
-            return {"type": "builtin", "name": ref["name"]}
-        return None
-    return None
-
-
-async def _resolve_tools(
-    tools: List[Any],
-) -> Tuple[List[str], List[Dict[str, Any]], Optional[ToolCallback]]:
-    """Resolve config tool references into builtins + Pi customTool specs.
-
-    Calls the backend resolver (``POST /tools/resolve``), which validates Composio
-    connections up front and enriches each action from the catalog. Returns the
-    built-in tool names, the camelCase customTool specs for the wire, and the
-    ``/tools/call`` callback. Raises on resolution failure so the invoke fails early
-    with a clear message rather than the model hitting a runtime tool error.
-    """
-    refs = [ref for ref in (_normalize_tool_ref(t) for t in tools if t) if ref]
-    if not refs:
-        return [], [], None
-
-    api_base = _agenta_api_base()
-    if not api_base:
-        raise RuntimeError(
-            "Agent has tools configured but the Agenta API base URL is unknown. "
-            "Set AGENTA_AGENT_TOOLS_API_URL or AGENTA_API_URL."
-        )
-
-    authorization = _request_authorization()
-    headers = {"Content-Type": "application/json"}
-    if authorization:
-        headers["Authorization"] = authorization
-
-    async with httpx.AsyncClient(timeout=_TOOLS_RESOLVE_TIMEOUT) as client:
-        response = await client.post(
-            f"{api_base}/tools/resolve",
-            json={"tools": refs},
-            headers=headers,
-        )
-
-    if response.status_code >= 400:
-        raise RuntimeError(
-            f"Tool resolution failed (HTTP {response.status_code}): "
-            f"{response.text[:500]}"
-        )
-
-    data = response.json()
-    builtins = data.get("builtins") or []
-    custom = data.get("custom") or []
-
-    custom_tools = [
-        {
-            "name": spec["name"],
-            "description": spec.get("description"),
-            "inputSchema": spec.get("input_schema"),
-            "callRef": spec["call_ref"],
-        }
-        for spec in custom
-    ]
-
-    callback = ToolCallback(
-        endpoint=f"{api_base}/tools/call",
-        authorization=authorization,
-    )
-
-    return builtins, custom_tools, callback
-
-
-async def _agent(
-    inputs: Optional[Dict[str, Any]] = None,
-    messages: Optional[List[Any]] = None,
-    parameters: Optional[Dict] = None,
-):
-    config = load_config()
-
-    # Config comes from parameters when the playground/caller sets it, falling back
-    # to the service file config. Accepts both the playground prompt-template shape
-    # and a flat {model, agents_md, tools} (see _resolve_run_config).
-    params = parameters or {}
-    model, agents_md, tools_config = _resolve_run_config(params, config)
-
-    if isinstance(tools_config, dict):
-        tools_config = [tools_config]
-    elif not isinstance(tools_config, list):
-        tools_config = []
-
-    msgs = _to_messages(messages or (inputs or {}).get("messages") or [])
-
-    builtins, custom_tools, tool_callback = await _resolve_tools(tools_config)
-
-    # Harness (pi/claude), sandbox (local/daytona), and permission policy are editable
-    # config (see schemas.py), so a playground run can switch engine or environment;
-    # unset falls back to the env defaults. They ride on the per-run SessionConfig.
-    harness_id = (
-        params.get("harness") or os.getenv("AGENTA_AGENT_HARNESS", "pi")
-    ).lower()
-    sandbox_id = (
-        params.get("sandbox") or os.getenv("AGENTA_AGENT_SANDBOX", "local")
-    ).lower()
-    session_config = SessionConfig(
-        instructions=agents_md,
-        model=model,
-        harness=harness_id,
-        sandbox=sandbox_id,
-        secrets=await _resolve_harness_secrets(),
-        builtin_tools=builtins,
-        custom_tools=custom_tools,
-        tool_callback=tool_callback,
-        permission_policy=(params.get("permission_policy") or "auto").lower(),
-        trace=_trace_context(),
-    )
-
-    # The engine follows the selected harness/sandbox: a claude harness or a daytona
-    # sandbox needs rivet, so the legacy pi path never silently swallows the selection.
-    harness = _build_harness(_select_backend(harness_id, sandbox_id))
-    await harness.setup()
-    try:
-        session = harness.create_session(session_config)
-        result = await session.prompt(msgs)
-        await session.destroy()
-    finally:
-        await harness.shutdown()
-
-    _record_usage(result.usage)
-
-    return {"role": "assistant", "content": result.output}
-
-
-def _record_usage(usage: Optional[Dict[str, Any]]) -> None:
-    """Stamp the agent's token/cost totals onto the active ``/invoke`` workflow span.
-
-    The harness emits its own span tree (turns, LLM, tools) in a separate OTLP batch, so
-    Agenta's per-batch cumulative roll-up cannot bridge the totals onto the workflow
-    span. Setting ``gen_ai.usage.*`` here records them directly on that span (the root of
-    its batch), so the trace shows the run's tokens and cost. Best-effort.
-    """
-    if not usage or not usage.get("total"):
-        return
-    try:
-        span = otel_trace.get_current_span()
-        input_tokens = int(usage.get("input") or 0)
-        output_tokens = int(usage.get("output") or 0)
-        span.set_attribute("gen_ai.usage.input_tokens", input_tokens)
-        span.set_attribute("gen_ai.usage.output_tokens", output_tokens)
-        span.set_attribute("gen_ai.usage.prompt_tokens", input_tokens)
-        span.set_attribute("gen_ai.usage.completion_tokens", output_tokens)
-        span.set_attribute("gen_ai.usage.total_tokens", int(usage.get("total") or 0))
-        cost = usage.get("cost")
-        if cost:
-            span.set_attribute("gen_ai.usage.cost", float(cost))
-    except Exception:  # pylint: disable=broad-except
-        log.warning("agent: failed to record usage on workflow span", exc_info=True)
-
-
-def create_agent_app():
-    app = ag.create_app()
-    # No builtin URI yet: registering the agent as a first-class workflow type
-    # (`agenta:builtin:agent:v0`) and its interface is WP-6. Here we register the
-    # handler directly, so it gets an auto URI (`user:custom:...`) and runs locally.
-    routed = ag.workflow(schemas=AGENT_SCHEMAS)(_agent)
-    ag.route("/", app=app, flags={"is_chat": True})(routed)
-    return app
-
-
-agent_v0_app = create_agent_app()
diff --git a/services/oss/src/agent/__init__.py b/services/oss/src/agent/__init__.py
new file mode 100644
index 0000000000..1701c8f1bd
--- /dev/null
+++ b/services/oss/src/agent/__init__.py
@@ -0,0 +1,12 @@
+"""The Agenta agent workflow app and its glue.
+
+The handler and harness wiring are in ``app``; request parsing in ``inputs``; tool
+resolution in ``tools``; provider secrets in ``secrets``; trace/usage glue in ``tracing``;
+the ``/inspect`` schemas in ``schemas``; the file-backed defaults in ``config``. The
+engine-agnostic runtime (the harness/environment seams and adapters) lives in
+``oss.src.harness``.
+"""
+
+from oss.src.agent.app import agent_v0_app, create_agent_app
+
+__all__ = ["agent_v0_app", "create_agent_app"]
diff --git a/services/oss/src/agent/app.py b/services/oss/src/agent/app.py
new file mode 100644
index 0000000000..62e51b855b
--- /dev/null
+++ b/services/oss/src/agent/app.py
@@ -0,0 +1,125 @@
+"""Agent workflow app: the ``/invoke`` handler and how it wires a harness run.
+
+Mirrors the chat/completion services: an Agenta app exposing ``/invoke`` and ``/inspect``
+through ``ag.create_app`` + ``ag.workflow`` + ``ag.route``. The handler parses the request
+(``inputs``), resolves tools (``tools``) and provider secrets (``secrets``), threads the
+trace context (``tracing``), runs one turn through an :class:`AgentSession` on the
+engine-agnostic runtime (``oss.src.harness``), and records the run's usage.
+
+The engine (rivet over ACP vs the legacy in-process Pi path) and the transport (HTTP
+sidecar vs subprocess) are deployment choices; the harness, sandbox, and permission policy
+are editable playground config.
+"""
+
+import os
+from typing import Any, Dict, List, Optional
+
+import agenta as ag
+
+from oss.src.agent.config import load_config, wrapper_dir
+from oss.src.agent.inputs import resolve_run_config, to_messages
+from oss.src.agent.schemas import AGENT_SCHEMAS
+from oss.src.agent.secrets import resolve_harness_secrets
+from oss.src.agent.tools import resolve_tools
+from oss.src.agent.tracing import record_usage, trace_context
+from oss.src.harness import (
+    Harness,
+    HttpHarness,
+    LocalEnvironment,
+    SessionConfig,
+    SubprocessHarness,
+)
+
+
+def select_backend(harness_id: str, sandbox_id: str) -> str:
+    """Choose the engine (``rivet`` or ``pi``) for a run.
+
+    ``rivet`` drives a harness over ACP via a rivet daemon; ``pi`` is the legacy in-process
+    Pi path. The legacy path only runs the ``pi`` harness locally, so any other harness or
+    sandbox forces ``rivet`` rather than silently dropping the selection.
+    ``AGENTA_AGENT_RUNTIME=rivet`` forces rivet for everything.
+    """
+    runtime = os.getenv("AGENTA_AGENT_RUNTIME", "pi").lower()
+    if runtime == "rivet" or harness_id != "pi" or sandbox_id != "local":
+        return "rivet"
+    return "pi"
+
+
+def build_harness(backend: str) -> Harness:
+    """Pick the transport to the TypeScript runner for the current deployment.
+
+    ``AGENTA_AGENT_PI_URL`` set (docker) -> call the sidecar over HTTP; unset (local) ->
+    spawn the runner as a subprocess. ``backend`` (the engine) is chosen by
+    :func:`select_backend`.
+    """
+    pi_url = os.getenv("AGENTA_AGENT_PI_URL")
+    if pi_url:
+        return HttpHarness(pi_url, backend=backend)
+    return SubprocessHarness(
+        LocalEnvironment(),
+        wrapper_dir=str(wrapper_dir()),
+        backend=backend,
+    )
+
+
+async def _agent(
+    inputs: Optional[Dict[str, Any]] = None,
+    messages: Optional[List[Any]] = None,
+    parameters: Optional[Dict] = None,
+):
+    params = parameters or {}
+    model, agents_md, raw_tools = resolve_run_config(params, load_config())
+
+    if isinstance(raw_tools, dict):
+        raw_tools = [raw_tools]
+    elif not isinstance(raw_tools, list):
+        raw_tools = []
+
+    msgs = to_messages(messages or (inputs or {}).get("messages") or [])
+    builtins, custom_tools, tool_callback = await resolve_tools(raw_tools)
+
+    harness_id = (
+        params.get("harness") or os.getenv("AGENTA_AGENT_HARNESS", "pi")
+    ).lower()
+    sandbox_id = (
+        params.get("sandbox") or os.getenv("AGENTA_AGENT_SANDBOX", "local")
+    ).lower()
+    session_config = SessionConfig(
+        instructions=agents_md,
+        model=model,
+        harness=harness_id,
+        sandbox=sandbox_id,
+        secrets=await resolve_harness_secrets(),
+        builtin_tools=builtins,
+        custom_tools=custom_tools,
+        tool_callback=tool_callback,
+        permission_policy=(params.get("permission_policy") or "auto").lower(),
+        trace=trace_context(),
+    )
+
+    # The engine follows the selected harness/sandbox: a claude harness or a daytona
+    # sandbox needs rivet, so the legacy pi path never silently swallows the selection.
+    harness = build_harness(select_backend(harness_id, sandbox_id))
+    await harness.setup()
+    try:
+        session = harness.create_session(session_config)
+        result = await session.prompt(msgs)
+        await session.destroy()
+    finally:
+        await harness.shutdown()
+
+    record_usage(result.usage)
+    return {"role": "assistant", "content": result.output}
+
+
+def create_agent_app():
+    app = ag.create_app()
+    # No builtin URI yet: registering the agent as a first-class workflow type
+    # (`agenta:builtin:agent:v0`) and its interface is WP-6. Here we register the handler
+    # directly, so it gets an auto URI (`user:custom:...`) and runs locally.
+    routed = ag.workflow(schemas=AGENT_SCHEMAS)(_agent)
+    ag.route("/", app=app, flags={"is_chat": True})(routed)
+    return app
+
+
+agent_v0_app = create_agent_app()
diff --git a/services/oss/src/agent/client.py b/services/oss/src/agent/client.py
new file mode 100644
index 0000000000..59ec7969b4
--- /dev/null
+++ b/services/oss/src/agent/client.py
@@ -0,0 +1,63 @@
+"""Access to the Agenta backend from inside a harness run.
+
+Resolving the backend base URL and the caller-scoped credential is shared by the tool
+resolver and the secret resolver, so it lives here. The credential reuses the same
+propagation the OTLP export rides on, so an agent run calls ``/tools/resolve``,
+``/tools/call``, and ``/secrets/`` as the caller, not with broader rights.
+"""
+
+import os
+from typing import Optional
+
+import agenta as ag
+from agenta.sdk.engines.tracing.propagation import inject
+
+# Budget for a backend round-trip (the tool catalog/connection check, the vault fetch).
+TOOLS_TIMEOUT = float(os.getenv("AGENTA_AGENT_TOOLS_TIMEOUT", "30"))
+
+
+def agenta_api_base() -> Optional[str]:
+    """Resolve the Agenta backend base URL (``.../api``).
+
+    Prefers an explicit override, then derives it from the OTLP endpoint the SDK is
+    configured with (``{host}/api/otlp/v1/traces``), then falls back to env. Returns
+    ``None`` when nothing is configured; callers only need this when tools or secrets apply.
+    """
+    override = os.getenv("AGENTA_AGENT_TOOLS_API_URL")
+    if override:
+        return override.rstrip("/")
+
+    try:
+        otlp_url = ag.tracing.otlp_url
+    except Exception:  # pylint: disable=broad-except
+        otlp_url = None
+    if otlp_url and "/otlp/" in otlp_url:
+        return otlp_url.split("/otlp/", 1)[0].rstrip("/")
+
+    api_url = os.getenv("AGENTA_API_URL")
+    if api_url:
+        return api_url.rstrip("/")
+
+    return None
+
+
+def request_authorization() -> Optional[str]:
+    """The project-scoped credential to call the Agenta backend.
+
+    Reuses the same propagation the OTLP credential rides on (the caller's Authorization),
+    falling back to the service's own API key the way the tracing sidecar does. Scoping to
+    the caller keeps an agent run from invoking tools the user could not (WP-7 risk:
+    RUN_TOOLS scoping).
+    """
+    try:
+        authorization = inject({}).get("Authorization")
+    except Exception:  # pylint: disable=broad-except
+        authorization = None
+    if authorization:
+        return authorization
+
+    api_key = os.getenv("AGENTA_API_KEY")
+    if api_key:
+        return f"ApiKey {api_key}"
+
+    return None
diff --git a/services/oss/src/agent_pi/config.py b/services/oss/src/agent/config.py
similarity index 97%
rename from services/oss/src/agent_pi/config.py
rename to services/oss/src/agent/config.py
index 8c2f5bf660..b8efb693dc 100644
--- a/services/oss/src/agent_pi/config.py
+++ b/services/oss/src/agent/config.py
@@ -11,7 +11,7 @@
 from pathlib import Path
 from typing import Any, List, Optional
 
-# services/oss/src/agent_pi/config.py -> parents[3] == services/
+# services/oss/src/agent/config.py -> parents[3] == services/
 _SERVICES_DIR = Path(__file__).resolve().parents[3]
 _DEFAULT_AGENT_DIR = _SERVICES_DIR / "agent"
 
diff --git a/services/oss/src/agent/inputs.py b/services/oss/src/agent/inputs.py
new file mode 100644
index 0000000000..0f4ee23166
--- /dev/null
+++ b/services/oss/src/agent/inputs.py
@@ -0,0 +1,67 @@
+"""Parse the playground/API request into a model, instructions, tools, and messages."""
+
+from typing import Any, Dict, List, Optional, Tuple
+
+from oss.src.agent.config import AgentConfig
+from oss.src.harness.ports import Message
+
+
+def _system_text(messages: Optional[List[Any]]) -> str:
+    """Join the system-message content of a prompt-template into AGENTS.md text."""
+    parts: List[str] = []
+    for message in messages or []:
+        if not isinstance(message, dict) or message.get("role") != "system":
+            continue
+        content = message.get("content")
+        if isinstance(content, str):
+            parts.append(content)
+        elif isinstance(content, list):
+            parts.extend(
+                block.get("text", "")
+                for block in content
+                if isinstance(block, dict) and block.get("type") == "text"
+            )
+    return "\n\n".join(part for part in parts if part)
+
+
+def resolve_run_config(
+    params: Dict[str, Any],
+    config: AgentConfig,
+) -> Tuple[str, str, Any]:
+    """Pull model, instructions, and raw tools from the request parameters.
+
+    Accepts both shapes: the playground's ``prompt`` (a ``prompt-template`` whose system
+    message is the AGENTS.md and whose ``llm_config`` carries model + picker tools) and the
+    flat ``{model, agents_md, tools}`` an API caller may send. Falls back to the service
+    file config for any unset field.
+    """
+    prompt_cfg = params.get("prompt")
+    if isinstance(prompt_cfg, dict):
+        llm_config = prompt_cfg.get("llm_config") or {}
+        model = llm_config.get("model") or config.model
+        agents_md = _system_text(prompt_cfg.get("messages")) or config.agents_md
+        raw_tools = llm_config.get("tools")
+        if raw_tools is None:
+            raw_tools = prompt_cfg.get("tools")
+    else:
+        model = params.get("model") or config.model
+        agents_md = params.get("agents_md") or config.agents_md
+        raw_tools = params.get("tools")
+
+    if raw_tools is None:
+        raw_tools = config.tools
+    return model, agents_md, raw_tools
+
+
+def to_messages(raw: Optional[List[Any]]) -> List[Message]:
+    """Coerce the playground's loose message dicts into :class:`Message` objects.
+
+    The runner picks the latest user turn and replays the rest as context, so we hand it
+    the whole conversation rather than pre-extracting a single prompt.
+    """
+    messages: List[Message] = []
+    for item in raw or []:
+        message = Message.from_raw(item)
+        if message is not None:
+            messages.append(message)
+    return messages
diff --git a/services/oss/src/agent_pi/schemas.py b/services/oss/src/agent/schemas.py
similarity index 100%
rename from services/oss/src/agent_pi/schemas.py
rename to services/oss/src/agent/schemas.py
diff --git a/services/oss/src/agent/secrets.py b/services/oss/src/agent/secrets.py
new file mode 100644
index 0000000000..7bd3096f35
--- /dev/null
+++ b/services/oss/src/agent/secrets.py
@@ -0,0 +1,72 @@
+"""Resolve provider API keys from the project vault into harness env vars.
+
+The agent authenticates the harness with the same provider keys the project configured for
+LLM access. We fetch the project's vault ``provider_key`` secrets from the backend (the
+same backend + caller credential the tool resolver uses) and inject each as its standard
+env var, so the harness uses whichever its model needs. Empty when the vault has none, in
+which case the harness falls back to its own login / OAuth (see ``runRivet``).
+"""
+
+from typing import Dict
+
+import httpx
+
+from agenta.sdk.utils.logging import get_module_logger
+
+from oss.src.agent.client import (
+    TOOLS_TIMEOUT,
+    agenta_api_base,
+    request_authorization,
+)
+
+log = get_module_logger(__name__)
+
+# Map a vault standard-provider kind to the env var the harness (Pi/Claude/litellm) reads.
+# Only providers an agent harness can use are listed.
+_PROVIDER_ENV_VARS = {
+    "openai": "OPENAI_API_KEY",
+    "anthropic": "ANTHROPIC_API_KEY",
+    "gemini": "GEMINI_API_KEY",
+    "mistral": "MISTRAL_API_KEY",
+    "mistralai": "MISTRAL_API_KEY",
+    "groq": "GROQ_API_KEY",
+    "together_ai": "TOGETHERAI_API_KEY",
+    "openrouter": "OPENROUTER_API_KEY",
+}
+
+
+async def resolve_harness_secrets() -> Dict[str, str]:
+    """Fetch the project vault's provider keys as ``{ENV_VAR: key}``. Best-effort.
+
+    The SDK's per-request secret context does not propagate to this custom route, so we
+    fetch the keys from the backend here instead of reading them from that context.
+    """
+    api_base = agenta_api_base()
+    if not api_base:
+        return {}
+    headers = {"Content-Type": "application/json"}
+    authorization = request_authorization()
+    if authorization:
+        headers["Authorization"] = authorization
+
+    try:
+        async with httpx.AsyncClient(timeout=TOOLS_TIMEOUT) as client:
+            response = await client.get(f"{api_base}/secrets/", headers=headers)
+        if response.status_code >= 400:
+            log.warning("agent: vault secrets fetch HTTP %s", response.status_code)
+            return {}
+        secrets = response.json() or []
+    except Exception:  # pylint: disable=broad-except
+        log.warning("agent: vault secrets fetch failed", exc_info=True)
+        return {}
+
+    env: Dict[str, str] = {}
+    for secret in secrets:
+        if not isinstance(secret, dict) or secret.get("kind") != "provider_key":
+            continue
+        data = secret.get("data") or {}
+        env_var = _PROVIDER_ENV_VARS.get(str(data.get("kind", "")).lower())
+        key = (data.get("provider") or {}).get("key")
+        if env_var and key:
+            env.setdefault(env_var, key)
+    return env
diff --git a/services/oss/src/agent/tools.py b/services/oss/src/agent/tools.py
new file mode 100644
index 0000000000..5c84581b27
--- /dev/null
+++ b/services/oss/src/agent/tools.py
@@ -0,0 +1,124 @@
+"""Resolve the agent's configured tools through the Agenta backend.
+
+The playground tool picker emits provider-agnostic references; the backend resolver
+(``POST /tools/resolve``) validates Composio connections up front and enriches each action
+from the catalog. We turn the result into the customTool specs the wire carries and the
+``/tools/call`` callback. The provider key and connection auth stay server-side.
+"""
+
+from typing import Any, Dict, List, Optional, Tuple
+
+import httpx
+
+from oss.src.agent.client import (
+    TOOLS_TIMEOUT,
+    agenta_api_base,
+    request_authorization,
+)
+from oss.src.harness.ports import ToolCallback
+
+
+def _parse_gateway_slug(slug: Any) -> Optional[Dict[str, Any]]:
+    """Parse a gateway tool slug into a Composio reference, or ``None``.
+
+    The playground tool picker encodes a Composio action as a function name like
+    ``tools__composio__github__GET_THE_AUTHENTICATED_USER__github-tvn`` (the same 5-segment
+    slug ``/tools/call`` parses; ``__`` or ``.`` separated). Anything that is not a
+    5-segment ``tools.composio.*`` slug returns ``None`` so the caller can skip it.
+    """
+    if not isinstance(slug, str):
+        return None
+    parts = slug.replace("__", ".").split(".")
+    if len(parts) == 5 and parts[0] == "tools" and parts[1] == "composio":
+        return {
+            "type": "composio",
+            "integration": parts[2],
+            "action": parts[3],
+            "connection": parts[4],
+        }
+    return None
+
+
+def _normalize_tool_ref(ref: Any) -> Optional[Dict[str, Any]]:
+    """Coerce a config entry into a discriminated tool reference the resolver parses.
+
+    Handles three shapes: a bare string (or a dict with a ``name`` and no ``type``) is a
+    built-in tool name; a dict typed ``builtin`` or ``composio`` passes through; and the
+    picker gateway entry (``{"function": {"name": "tools__composio__..."}}``) becomes a
+    ``composio`` ref. Any other entry (provider built-ins, inline custom functions)
+    returns ``None`` and is skipped rather than failing the run.
+    """
+    if isinstance(ref, str):
+        return {"type": "builtin", "name": ref}
+    if isinstance(ref, dict):
+        if ref.get("type") in ("builtin", "composio"):
+            return ref
+        function = ref.get("function") if isinstance(ref.get("function"), dict) else {}
+        gateway = _parse_gateway_slug(function.get("name") or ref.get("name"))
+        if gateway:
+            return gateway
+        if "type" not in ref and isinstance(ref.get("name"), str):
+            return {"type": "builtin", "name": ref["name"]}
+        return None
+    return None
+
+
+async def resolve_tools(
+    tools: List[Any],
+) -> Tuple[List[str], List[Dict[str, Any]], Optional[ToolCallback]]:
+    """Resolve config tool references into built-in names + customTool specs + callback.
+
+    Calls the backend resolver (``POST /tools/resolve``), which validates Composio
+    connections up front and enriches each action from the catalog. Returns the built-in
+    tool names, the camelCase customTool specs for the wire, and the ``/tools/call``
+    callback. Raises on resolution failure so the invoke fails early with a clear message
+    rather than the model hitting a runtime tool error.
+    """
+    refs = [ref for ref in (_normalize_tool_ref(t) for t in tools if t) if ref]
+    if not refs:
+        return [], [], None
+
+    api_base = agenta_api_base()
+    if not api_base:
+        raise RuntimeError(
+            "Agent has tools configured but the Agenta API base URL is unknown. "
+            "Set AGENTA_AGENT_TOOLS_API_URL or AGENTA_API_URL."
+        )
+
+    authorization = request_authorization()
+    headers = {"Content-Type": "application/json"}
+    if authorization:
+        headers["Authorization"] = authorization
+
+    async with httpx.AsyncClient(timeout=TOOLS_TIMEOUT) as client:
+        response = await client.post(
+            f"{api_base}/tools/resolve",
+            json={"tools": refs},
+            headers=headers,
+        )
+
+    if response.status_code >= 400:
+        raise RuntimeError(
+            f"Tool resolution failed (HTTP {response.status_code}): {response.text[:500]}"
+        )
+
+    data = response.json()
+    builtins = data.get("builtins") or []
+    custom = data.get("custom") or []
+
+    custom_tools = [
+        {
+            "name": spec["name"],
+            "description": spec.get("description"),
+            "inputSchema": spec.get("input_schema"),
+            "callRef": spec["call_ref"],
+        }
+        for spec in custom
+    ]
+
+    callback = ToolCallback(
+        endpoint=f"{api_base}/tools/call",
+        authorization=authorization,
+    )
+
+    return builtins, custom_tools, callback
diff --git a/services/oss/src/agent/tracing.py b/services/oss/src/agent/tracing.py
new file mode 100644
index 0000000000..a64c3c5fab
--- /dev/null
+++ b/services/oss/src/agent/tracing.py
@@ -0,0 +1,85 @@
+"""OpenTelemetry glue: thread the workflow trace into the run, record the run's usage.
+
+The handler runs inside the instrumented ``/invoke`` span, so threading its trace context
+into the harness makes the agent's spans children of that span (same trace), and stamping
+the run's token/cost totals onto it shows the run's usage even though the harness exports
+its span tree in a separate OTLP batch.
+"""
+
+import os
+from typing import Any, Dict, Optional
+
+from opentelemetry import trace as otel_trace
+
+import agenta as ag
+from agenta.sdk.engines.tracing.propagation import inject
+from agenta.sdk.utils.logging import get_module_logger
+
+from oss.src.harness.ports import TraceContext
+
+log = get_module_logger(__name__)
+
+_CAPTURE_CONTENT = os.getenv("AGENTA_AGENT_CAPTURE_CONTENT", "true").lower() not in (
+    "0",
+    "false",
+    "no",
+)
+
+
+def trace_context() -> Optional[TraceContext]:
+    """Capture the active workflow span's trace context for the harness.
+
+    Threading the ``/invoke`` span's ``traceparent`` into the run makes the agent's spans
+    children of that span, so the whole run shows up under the response's ``trace_id`` the
+    way completion/chat nest their LLM spans. Best-effort: any failure returns ``None`` and
+    the run is traced standalone (or not at all) using the runner's env config.
+    """
+    try:
+        headers = inject({})
+
+        traceparent = headers.get("traceparent")
+        if not traceparent:
+            return None
+
+        endpoint = None
+        try:
+            endpoint = ag.tracing.otlp_url
+        except Exception:  # pylint: disable=broad-except
+            endpoint = None
+
+        return TraceContext(
+            traceparent=traceparent,
+            baggage=headers.get("baggage"),
+            endpoint=endpoint,
+            authorization=headers.get("Authorization"),
+            capture_content=_CAPTURE_CONTENT,
+        )
+    except Exception:  # pylint: disable=broad-except
+        log.warning("agent: failed to capture trace context", exc_info=True)
+        return None
+
+
+def record_usage(usage: Optional[Dict[str, Any]]) -> None:
+    """Stamp the agent's token/cost totals onto the active ``/invoke`` workflow span.
+
+    The harness emits its own span tree (turns, LLM, tools) in a separate OTLP batch, so
+    Agenta's per-batch cumulative roll-up cannot bridge the totals onto the workflow span.
+    Setting ``gen_ai.usage.*`` here records them directly on that span (the root of its
+    batch), so the trace shows the run's tokens and cost. Best-effort.
+    """
+    if not usage or not usage.get("total"):
+        return
+    try:
+        span = otel_trace.get_current_span()
+        input_tokens = int(usage.get("input") or 0)
+        output_tokens = int(usage.get("output") or 0)
+        span.set_attribute("gen_ai.usage.input_tokens", input_tokens)
+        span.set_attribute("gen_ai.usage.output_tokens", output_tokens)
+        span.set_attribute("gen_ai.usage.prompt_tokens", input_tokens)
+        span.set_attribute("gen_ai.usage.completion_tokens", output_tokens)
+        span.set_attribute("gen_ai.usage.total_tokens", int(usage.get("total") or 0))
+        cost = usage.get("cost")
+        if cost:
+            span.set_attribute("gen_ai.usage.cost", float(cost))
+    except Exception:  # pylint: disable=broad-except
+        log.warning("agent: failed to record usage on workflow span", exc_info=True)
diff --git a/services/oss/src/agent_pi/__init__.py b/services/oss/src/harness/__init__.py
similarity index 59%
rename from services/oss/src/agent_pi/__init__.py
rename to services/oss/src/harness/__init__.py
index 11321c7cd8..066e364c76 100644
--- a/services/oss/src/agent_pi/__init__.py
+++ b/services/oss/src/harness/__init__.py
@@ -1,18 +1,18 @@
-"""Agent runtime: ports and adapters for the agent service.
+"""Engine-agnostic agent runtime: the harness and environment seams, plus their adapters.
 
-The Python service is "our agent implementation". It owns two seams (see
+Nothing here is Agenta-specific. The Agenta workflow integration (the ``/invoke`` handler,
+tool resolution, secrets, tracing) lives in ``oss.src.agent``. Two seams (see
 ``docs/design/agent-workflows/harness-port-redesign/``):
 
-- ``Harness``: the agent engine. ``SubprocessHarness`` and ``HttpHarness`` (in
-  ``harness.py``) are the two transports to the TypeScript runner; the engine (legacy
-  in-process Pi vs rivet over ACP) is an env value, not a class. ``create_session``
-  returns an :class:`AgentSession`, the rivet-shaped session abstraction.
+- ``Harness``: the agent engine. ``SubprocessHarness`` and ``HttpHarness`` (``transports.py``)
+  reach the TypeScript runner over a subprocess or HTTP. The engine that runs behind them
+  (rivet over ACP, or the legacy in-process Pi path) is an env value, not a class.
+  ``create_session`` returns an :class:`AgentSession` (create / prompt / destroy).
 - ``Environment``: where the harness process runs. ``LocalEnvironment`` runs it as a local
   subprocess; a sandbox environment is selected inside the rivet runner.
 """
 
 from .environment import LocalEnvironment
-from .harness import HttpHarness, SubprocessHarness
 from .ports import (
     AgentEvent,
     AgentRequest,
@@ -20,13 +20,14 @@
     AgentSession,
     ContentBlock,
     Environment,
-    HarnessCapabilities,
     Harness,
+    HarnessCapabilities,
     Message,
     SessionConfig,
     ToolCallback,
     TraceContext,
 )
+from .transports import HttpHarness, SubprocessHarness
 
 __all__ = [
     "AgentEvent",
diff --git a/services/oss/src/agent_pi/environment.py b/services/oss/src/harness/environment.py
similarity index 100%
rename from services/oss/src/agent_pi/environment.py
rename to services/oss/src/harness/environment.py
diff --git a/services/oss/src/agent_pi/ports.py b/services/oss/src/harness/ports.py
similarity index 100%
rename from services/oss/src/agent_pi/ports.py
rename to services/oss/src/harness/ports.py
diff --git a/services/oss/src/agent_pi/harness.py b/services/oss/src/harness/transports.py
similarity index 100%
rename from services/oss/src/agent_pi/harness.py
rename to services/oss/src/harness/transports.py
diff --git a/services/oss/src/agent_pi/wire.py b/services/oss/src/harness/wire.py
similarity index 100%
rename from services/oss/src/agent_pi/wire.py
rename to services/oss/src/harness/wire.py

From 67ab85620cce8f23331e06a8154de19fb9fa8bea Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Wed, 17 Jun 2026 14:18:14 +0200
Subject: [PATCH 06/10] refactor(agent): group the TypeScript runner into role
 folders + rewrite README

The TS runner's src/ had grown one work package at a time into a flat folder of ten
files with no signal of role. Group them and rewrite the stale README (it still called
this a 'Pi wrapper' and pointed at the moved agent.py):

  src/cli.ts, server.ts, protocol.ts   entrypoints + the wire contract
  src/engines/{pi,rivet}.ts            the two engines (was runPi.ts / runRivet.ts)
  src/tracing/otel.ts                  the tracers (was agenta-otel.ts)
  src/tools/{client,mcp-bridge,mcp-server}.ts   tool delivery (was toolClient/toolBridge*)
  src/extensions/agenta.ts             the Pi extension (was piExtension.ts)

No behavior change. Updated the fragile __dirname-relative paths in engines/rivet.ts
(PKG_ROOT) and tools/mcp-bridge.ts (the tsx bin + server path) for the new depth, and the
build-extension entry. Verified live: rivet+pi+local through the restarted sidecar answers
'Athens' with usage; tsc --strict and the extension build pass.
---
 .../harness-port-redesign/implementation.md   |  20 ++-
 services/agent/README.md                      | 163 ++++++++++--------
 services/agent/scripts/build-extension.mjs    |   2 +-
 services/agent/src/cli.ts                     |   4 +-
 .../agent/src/{runPi.ts => engines/pi.ts}     |   8 +-
 .../src/{runRivet.ts => engines/rivet.ts}     |  18 +-
 .../{piExtension.ts => extensions/agenta.ts}  |   6 +-
 services/agent/src/protocol.ts                |   2 +-
 services/agent/src/server.ts                  |   4 +-
 .../src/{toolClient.ts => tools/client.ts}    |   8 +-
 .../{toolBridge.ts => tools/mcp-bridge.ts}    |  16 +-
 .../mcp-server.ts}                            |   6 +-
 .../src/{agenta-otel.ts => tracing/otel.ts}   |   4 +-
 13 files changed, 141 insertions(+), 120 deletions(-)
 rename services/agent/src/{runPi.ts => engines/pi.ts} (97%)
 rename services/agent/src/{runRivet.ts => engines/rivet.ts} (98%)
 rename services/agent/src/{piExtension.ts => extensions/agenta.ts} (96%)
 rename services/agent/src/{toolClient.ts => tools/client.ts} (90%)
 rename services/agent/src/{toolBridge.ts => tools/mcp-bridge.ts} (78%)
 rename services/agent/src/{toolBridgeServer.ts => tools/mcp-server.ts} (94%)
 rename services/agent/src/{agenta-otel.ts => tracing/otel.ts} (99%)

diff --git a/docs/design/agent-workflows/harness-port-redesign/implementation.md b/docs/design/agent-workflows/harness-port-redesign/implementation.md
index bcf774fed8..084afb6e3d 100644
--- a/docs/design/agent-workflows/harness-port-redesign/implementation.md
+++ b/docs/design/agent-workflows/harness-port-redesign/implementation.md
@@ -38,16 +38,22 @@ transports, not three backend adapters. The harness folder is named for the seam
 Pi: harness choice (pi/claude) lives inside the runtime, which is why there is no
 `agent_claude` package.
 
-### TypeScript (`services/agent/src/`)
+### TypeScript (`services/agent/src/`) — grouped by role
 
 | File | Holds |
 | --- | --- |
-| `protocol.ts` | Shared wire types: `AgentRunRequest`, `AgentRunResult`, `AgentEvent`, `ContentBlock`, `HarnessCapabilities`. Both runners import from here (no more `runRivet` importing types out of `runPi`). |
-| `runPi.ts` | Legacy backend: drive the Pi SDK in-process. Returns the enriched result. |
-| `runRivet.ts` | Rivet backend: drive a harness over ACP. Probes `getAgent(harness).capabilities` and branches on capability flags, not on the harness name. Returns the enriched result, including usage for both Pi and Claude. |
-| `agenta-otel.ts` | The Pi-extension tracer and the ACP-event tracer. Also accumulates the structured event log. |
-| `piExtension.ts`, `toolBridge*.ts` | Unchanged tool/trace delivery. |
-| `cli.ts`, `server.ts` | Route to the backend by `AGENT_BACKEND` (auto by request shape on the sidecar). |
+| `cli.ts`, `server.ts` | The two entrypoints (stdio subprocess, HTTP sidecar). Route to an engine by the request's `backend`. |
+| `protocol.ts` | Shared wire types: `AgentRunRequest`, `AgentRunResult`, `AgentEvent`, `ContentBlock`, `HarnessCapabilities`. Both engines import from here. |
+| `engines/pi.ts` | Legacy engine: drive the Pi SDK in-process. Returns the enriched result. |
+| `engines/rivet.ts` | Rivet engine: drive a harness over ACP. Probes `getAgent(harness).capabilities` and branches on capability flags, not on the harness name. Returns the enriched result, with usage for both Pi and Claude. |
+| `tracing/otel.ts` | The Pi-extension tracer and the ACP-event tracer; accumulates the structured event log. |
+| `tools/client.ts` | The one `/tools/call` HTTP client. |
+| `tools/mcp-bridge.ts`, `tools/mcp-server.ts` | Tool delivery over MCP for non-Pi harnesses. |
+| `extensions/agenta.ts` | The Pi extension (tracing + tools), bundled to `dist/extensions/agenta.js`. |
+
+The folder grouping (entrypoints + contract at the top, `engines/`, `tracing/`, `tools/`,
+`extensions/`) replaced a flat `src/` of ten files that had grown one work package at a
+time. No behavior change.
 
 ## The seams
 
diff --git a/services/agent/README.md b/services/agent/README.md
index f566acb704..8c8e8f949d 100644
--- a/services/agent/README.md
+++ b/services/agent/README.md
@@ -1,103 +1,118 @@
-# Agent service: Pi wrapper (WP-2 + WP-7)
+# Agent runner (TypeScript)
 
-This is the TypeScript side of the agent workflow service. It is a thin wrapper that
-drives the [Pi](https://pi.dev) agent harness for a single run. The Python service
-(`services/oss/src/agent.py`) calls it; see
-`docs/design/agent-workflows/wp-2-agent-service/`.
+The Node side of the agent workflow service. It runs the actual agent loop and serves one
+contract: a JSON request in, a structured result out. The Python service
+(`services/oss/src/agent/`) decides *what* to run (config, tools, secrets, trace) and calls
+in here; this package *runs* it. It lives in Node because the harnesses (Pi, Claude Code,
+rivet's `sandbox-agent`) are Node libraries with no Python SDK.
 
-## What it does
+## How it is invoked
 
-`src/cli.ts` reads one JSON request on stdin, runs Pi once via the SDK
-(`createAgentSession`), and writes one JSON result on stdout. AGENTS.md is injected in
-memory; the session and working dir are throwaway. stdout is the result channel only,
-logs go to stderr.
+Two entrypoints, same `/run` contract (see `src/protocol.ts`):
 
-Request (stdin):
+- **`src/cli.ts`** — one JSON request on stdin, one result on stdout. The Python
+  `SubprocessHarness` spawns this for local runs. stdout is the result channel only; logs
+  go to stderr.
+- **`src/server.ts`** — the same thing as a long-lived HTTP server on `:8765`
+  (`GET /health`, `POST /run`). This is the **dockerized sidecar** the Python `HttpHarness`
+  calls in-network. The dev image (`docker/Dockerfile.dev`) runs `tsx watch src/server.ts`.
 
-```json
-{
-  "agentsMd": "You are a hello-world agent.",
-  "model": "gpt-5.5",
-  "prompt": "Hi there",
-  "messages": [{"role": "user", "content": "Hi there"}],
-  "tools": []
-}
-```
+Both route to an engine by the request's `backend` field.
 
-Result (stdout):
+## Layout (`src/`)
 
-```json
-{ "ok": true, "output": "Hello! ...", "sessionId": "...", "model": "openai-codex/gpt-5.5", "traceId": "..." }
+```
+src/
+  cli.ts              entrypoint: stdin/stdout (subprocess transport)
+  server.ts           entrypoint: HTTP sidecar on :8765
+  protocol.ts         the /run wire contract (request, result, events, capabilities)
+  engines/
+    pi.ts             legacy engine: drive the Pi SDK in-process
+    rivet.ts          engine: drive a harness over ACP via a rivet sandbox-agent daemon
+  tracing/
+    otel.ts           turn a run into OpenTelemetry spans nested under /invoke
+  tools/
+    client.ts         the one /tools/call HTTP client
+    mcp-bridge.ts     build the MCP server config that exposes tools to a harness
+    mcp-server.ts     the stdio MCP server itself (launched per session by the daemon)
+  extensions/
+    agenta.ts         the Pi extension (tracing + tools), bundled into dist/ for Pi to load
 ```
 
-## Tracing
+## Engines
+
+- **`pi`** (`engines/pi.ts`) — the legacy path. Drives the Pi SDK directly in-process.
+- **`rivet`** (`engines/rivet.ts`) — drives any harness (`pi`, `claude`) over the Agent
+  Client Protocol through a rivet `sandbox-agent` daemon, either local or in a Daytona
+  sandbox. This is the default on the platform.
+
+The engine is a deployment choice (`backend` on the wire / `AGENT_BACKEND` env), not a
+harness. Harness choice (pi/claude) and sandbox (local/daytona) are per-run config the
+Python service sends.
 
-When the request carries a `trace` block, the run is traced into Agenta as
-OpenTelemetry spans and nested under the caller's `/invoke` span, so the agent's whole
-run is part of the same trace (the way completion/chat nest their LLM spans). The
-Python service fills `trace` in from the live workflow span; see
-`docs/design/agent-workflows/wp-1-pi-tracing/tracing-in-the-agent-service.md`.
+## Result
 
 ```json
 {
-  "prompt": "Hi there",
-  "trace": {
-    "traceparent": "00-<32hex trace>-<16hex span>-01",
-    "endpoint": "https://host/api/otlp/v1/traces",
-    "authorization": "ApiKey ...",
-    "captureContent": true
-  }
+  "ok": true,
+  "output": "Rome",
+  "messages": [{ "role": "assistant", "content": "Rome" }],
+  "events": [{ "type": "message", "text": "Rome" }, { "type": "done" }],
+  "usage": { "input": 1297, "output": 5, "total": 1302, "cost": 0.0066 },
+  "stopReason": "end_turn",
+  "capabilities": { "mcpTools": false, "images": true, "...": "..." },
+  "sessionId": "...",
+  "model": "openai-codex/gpt-5.5",
+  "traceId": "..."
 }
 ```
 
-With no `trace` block the run is traced standalone using `AGENTA_HOST` /
-`AGENTA_API_KEY`, or not at all when neither is set. The extension lives in
-`src/agenta-otel.ts`.
+`runRivet` probes the harness's capabilities and branches on them (for example, tools go
+over MCP only when the harness advertises `mcpTools`); usage and the structured event log
+come back on every run.
 
-## Tools (WP-7)
+## Tracing
 
-The agent's runnable tools are resolved in the backend (not here) and arrive on the
-request as `customTools` plus a `toolCallback`. `buildCustomTools` in `src/runPi.ts`
-turns each spec into a Pi `customTool` whose `execute` does one
-`POST {toolCallback.endpoint}` (Agenta's `/tools/call`) with the `callRef` slug and the
-threaded `authorization`. Pi drives the loop and runs the tool in-process; the provider
-key and connection auth stay server-side behind `/tools/call` and never enter this
-sandbox. See `docs/design/agent-workflows/wp-7-tools/README.md`.
+When the request carries a `trace` block, the run is exported to Agenta as OpenTelemetry
+spans nested under the caller's `/invoke` span. The Pi path self-instruments via the
+bundled extension (`extensions/agenta.ts`); other harnesses are traced from the rivet ACP
+event stream (`tracing/otel.ts`). The Python `tracing` module fills `trace` in from the
+live workflow span.
 
-```json
-{
-  "prompt": "What is my GitHub username?",
-  "customTools": [
-    {
-      "name": "github__GET_THE_AUTHENTICATED_USER",
-      "description": "Gets the authenticated GitHub user.",
-      "inputSchema": {"type": "object", "properties": {}},
-      "callRef": "tools.composio.github.GET_THE_AUTHENTICATED_USER.github-tvn"
-    }
-  ],
-  "toolCallback": {
-    "endpoint": "https://host/api/tools/call",
-    "authorization": "ApiKey ..."
-  }
-}
+## Tools
+
+Tools are resolved in the Python backend and arrive on the request as `customTools` plus a
+`toolCallback`. Delivery is capability-routed: the Pi extension registers them natively;
+other harnesses get them over MCP through `tools/mcp-bridge.ts` + `tools/mcp-server.ts`.
+Either way each call POSTs back to Agenta's `/tools/call` (`tools/client.ts`), so the
+provider key and connection auth stay server-side.
+
+## The extension bundle
+
+`scripts/build-extension.mjs` esbuild-bundles `src/extensions/agenta.ts` into one
+self-contained `dist/extensions/agenta.js` that Pi can load anywhere (host, the sidecar, a
+Daytona snapshot). The dev image bakes it; rebuild after editing the extension or the
+tracer:
+
+```bash
+pnpm run build:extension
 ```
 
 ## Auth
 
-`AuthStorage.create()` reads `~/.pi/agent/auth.json`. Log in once with `pnpm exec pi`
-then `/login`, or set `OPENAI_API_KEY` / `ANTHROPIC_API_KEY`.
+Provider keys arrive as `request.secrets` (resolved from the project vault) or fall back to
+the harness's own login: Pi reads `~/.pi/agent/auth.json` (`pnpm exec pi` then `/login`),
+Claude Code reads `~/.claude`. Set `OPENAI_API_KEY` / `ANTHROPIC_API_KEY` to override.
+
+## config/
+
+`config/AGENTS.md` and `config/agent.json` are a fallback "hello-world" agent, used only
+when a request arrives with no config. In practice the playground always sends the agent
+revision's config, so these are rarely hit.
 
 ## Local use
 
 ```bash
 pnpm install
-echo '{"agentsMd":"You are a hello-world agent.","prompt":"Hi"}' | pnpm run run:cli
+echo '{"backend":"pi","messages":[{"role":"user","content":"Hi"}]}' | pnpm run run:cli
 ```
-
-## Config
-
-The live config comes from the agent revision in the playground: a `prompt-template`
-whose system message is the AGENTS.md, with the model and the picked tools under
-`llm_config`. The Python service (`services/oss/src/agent.py`) reads that and fills the
-request. `config/AGENTS.md` and `config/agent.json` are only the file fallback used when
-the request carries no config.
diff --git a/services/agent/scripts/build-extension.mjs b/services/agent/scripts/build-extension.mjs
index 229d805040..debdae88d7 100644
--- a/services/agent/scripts/build-extension.mjs
+++ b/services/agent/scripts/build-extension.mjs
@@ -12,7 +12,7 @@ import { fileURLToPath } from "node:url";
 const root = join(dirname(fileURLToPath(import.meta.url)), "..");
 
 await build({
-  entryPoints: [join(root, "src/piExtension.ts")],
+  entryPoints: [join(root, "src/extensions/agenta.ts")],
   outfile: join(root, "dist/extensions/agenta.js"),
   bundle: true,
   platform: "node",
diff --git a/services/agent/src/cli.ts b/services/agent/src/cli.ts
index 8de7d8a97c..5eb771c78b 100644
--- a/services/agent/src/cli.ts
+++ b/services/agent/src/cli.ts
@@ -7,8 +7,8 @@
  * long-lived RPC adapter can replace it later behind the same Python-side port.
  */
 import type { AgentRunRequest, AgentRunResult } from "./protocol.ts";
-import { runPi } from "./runPi.ts";
-import { runRivet } from "./runRivet.ts";
+import { runPi } from "./engines/pi.ts";
+import { runRivet } from "./engines/rivet.ts";
 
 // Engine: `rivet` drives a harness over ACP via a rivet daemon; `pi` (default) is the
 // legacy in-process Pi path. The request's `backend` wins, then the AGENT_BACKEND env.
diff --git a/services/agent/src/runPi.ts b/services/agent/src/engines/pi.ts
similarity index 97%
rename from services/agent/src/runPi.ts
rename to services/agent/src/engines/pi.ts
index 1ae732f555..85a9a3b930 100644
--- a/services/agent/src/runPi.ts
+++ b/services/agent/src/engines/pi.ts
@@ -5,7 +5,7 @@
  * AGENTS.md in memory, resolves the model, sends one user turn, and returns the structured
  * result (final text, messages, events, usage, capabilities). It also turns the
  * backend-resolved runnable tools (WP-7) into Pi customTools that route back through
- * Agenta's /tools/call. The rivet backend (`runRivet.ts`) is the ACP path; both serve the
+ * Agenta's /tools/call. The rivet engine (`engines/rivet.ts`) is the ACP path; both serve the
  * same `/run` contract (see `protocol.ts`).
  *
  * Auth: provider keys arrive as `request.secrets` (applied to the env) or fall back to the
@@ -30,7 +30,7 @@ import {
   SettingsManager,
 } from "@earendil-works/pi-coding-agent";
 
-import { createAgentaOtel } from "./agenta-otel.ts";
+import { createAgentaOtel } from "../tracing/otel.ts";
 import {
   type AgentEvent,
   type AgentRunRequest,
@@ -40,8 +40,8 @@ import {
   type ResolvedToolSpec,
   type ToolCallbackContext,
   resolvePromptText,
-} from "./protocol.ts";
-import { EMPTY_OBJECT_SCHEMA, callAgentaTool } from "./toolClient.ts";
+} from "../protocol.ts";
+import { EMPTY_OBJECT_SCHEMA, callAgentaTool } from "../tools/client.ts";
 
 /** What the in-process Pi engine supports. Static (no daemon to probe, unlike rivet). */
 const PI_CAPABILITIES: HarnessCapabilities = {
diff --git a/services/agent/src/runRivet.ts b/services/agent/src/engines/rivet.ts
similarity index 98%
rename from services/agent/src/runRivet.ts
rename to services/agent/src/engines/rivet.ts
index 88e4c020a4..d8dbd354e0 100644
--- a/services/agent/src/runRivet.ts
+++ b/services/agent/src/engines/rivet.ts
@@ -2,9 +2,9 @@
  * WP-8 rivet harness driver.
  *
  * Drives a coding harness (Pi, Claude Code, ...) over the Agent Client Protocol (ACP)
- * through a rivet `sandbox-agent` daemon, instead of the bespoke Pi SDK calls in
- * runPi.ts. It serves the same /run contract (AgentRunRequest -> AgentRunResult), so
- * the Python side stays thin and the choice of harness/sandbox is config, not new code.
+ * through a rivet `sandbox-agent` daemon, instead of the bespoke Pi SDK calls in the pi
+ * engine. It serves the same /run contract (AgentRunRequest -> AgentRunResult), so the
+ * Python side stays thin and the choice of harness/sandbox is config, not new code.
  *
  * Per invoke (cold), mirroring the shipped code-evaluator DaytonaRunner pattern:
  *
@@ -19,7 +19,7 @@
  * harness (which engine). The ACP boundary is daemon-to-harness; the service-to-rivet
  * hop stays harness-agnostic behind the Harness port.
  *
- * Tracing is built here from the ACP event stream (see agenta-otel.ts createRivetOtel),
+ * Tracing is built here from the ACP event stream (see tracing/otel.ts createRivetOtel),
  * so it is uniform across every harness and always nests under the caller's /invoke
  * span. stdout is reserved for the JSON result (see cli.ts); logs go to stderr.
  */
@@ -44,8 +44,8 @@ import { SandboxAgent, InMemorySessionPersistDriver } from "sandbox-agent";
 import { local } from "sandbox-agent/local";
 import { daytona } from "sandbox-agent/daytona";
 
-import { createRivetOtel } from "./agenta-otel.ts";
-import { buildToolMcpServers } from "./toolBridge.ts";
+import { createRivetOtel } from "../tracing/otel.ts";
+import { buildToolMcpServers } from "../tools/mcp-bridge.ts";
 import {
   type AgentRunRequest,
   type AgentRunResult,
@@ -55,11 +55,11 @@ import {
   type ToolCallbackContext,
   messageText,
   resolvePromptText,
-} from "./protocol.ts";
+} from "../protocol.ts";
 
 const require = createRequire(import.meta.url);
-// services/agent/src/runRivet.ts -> services/agent
-const PKG_ROOT = dirname(dirname(fileURLToPath(import.meta.url)));
+// services/agent/src/engines/rivet.ts -> services/agent
+const PKG_ROOT = dirname(dirname(dirname(fileURLToPath(import.meta.url))));
 const ADAPTER_BIN_DIR = join(PKG_ROOT, "node_modules", ".bin");
 
 /** Map node platform/arch to the @sandbox-agent CLI binary package. */
diff --git a/services/agent/src/piExtension.ts b/services/agent/src/extensions/agenta.ts
similarity index 96%
rename from services/agent/src/piExtension.ts
rename to services/agent/src/extensions/agenta.ts
index 9af88d8ee9..884ae5b925 100644
--- a/services/agent/src/piExtension.ts
+++ b/services/agent/src/extensions/agenta.ts
@@ -26,9 +26,9 @@ import { writeFileSync } from "node:fs";
 
 import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
 
-import { createAgentaOtel } from "./agenta-otel.ts";
-import type { ResolvedToolSpec } from "./protocol.ts";
-import { EMPTY_OBJECT_SCHEMA, callAgentaTool } from "./toolClient.ts";
+import { createAgentaOtel } from "../tracing/otel.ts";
+import type { ResolvedToolSpec } from "../protocol.ts";
+import { EMPTY_OBJECT_SCHEMA, callAgentaTool } from "../tools/client.ts";
 
 function log(message: string): void {
   process.stderr.write(`[agenta-pi-ext] ${message}\n`);
diff --git a/services/agent/src/protocol.ts b/services/agent/src/protocol.ts
index 5e94bd9332..af5029234d 100644
--- a/services/agent/src/protocol.ts
+++ b/services/agent/src/protocol.ts
@@ -3,7 +3,7 @@
  *
  * The Python side mirrors these names in `services/oss/src/harness/wire.py`. Keeping the
  * request/result/event/capability types here (rather than in one runner that the other
- * imports from) is what lets `runPi.ts` and `runRivet.ts` stay peers.
+ * imports from) is what lets `engines/pi.ts` and `engines/rivet.ts` stay peers.
  */
 
 /** One piece of a message. `text` is all the playground sends today; the rest is plumbed. */
diff --git a/services/agent/src/server.ts b/services/agent/src/server.ts
index 6096198699..116a8e7578 100644
--- a/services/agent/src/server.ts
+++ b/services/agent/src/server.ts
@@ -13,8 +13,8 @@
 import { createServer, type IncomingMessage, type ServerResponse } from "node:http";
 
 import type { AgentRunRequest, AgentRunResult } from "./protocol.ts";
-import { runPi } from "./runPi.ts";
-import { runRivet } from "./runRivet.ts";
+import { runPi } from "./engines/pi.ts";
+import { runRivet } from "./engines/rivet.ts";
 
 const PORT = Number(process.env.PORT ?? 8765);
 
diff --git a/services/agent/src/toolClient.ts b/services/agent/src/tools/client.ts
similarity index 90%
rename from services/agent/src/toolClient.ts
rename to services/agent/src/tools/client.ts
index 330e63c611..db6a71538f 100644
--- a/services/agent/src/toolClient.ts
+++ b/services/agent/src/tools/client.ts
@@ -2,15 +2,15 @@
  * Shared Agenta /tools/call client.
  *
  * One implementation of the tool round-trip used by every delivery path:
- *  - runPi.ts buildCustomTools (in-process Pi customTools)
- *  - piExtension.ts registerTools (Pi under rivet/ACP, via the bundled extension)
- *  - toolBridgeServer.ts (the MCP stdio bridge for non-Pi harnesses)
+ *  - engines/pi.ts buildCustomTools (in-process Pi customTools)
+ *  - extensions/agenta.ts registerTools (Pi under rivet/ACP, via the bundled extension)
+ *  - tools/mcp-server.ts (the MCP stdio bridge for non-Pi harnesses)
  *
  * Each call POSTs the OpenAI-style envelope to Agenta's /tools/call, so the Composio key
  * and connection auth stay server-side. Keeping the request envelope and response parse in
  * one place means a change to the /tools/call contract is a one-line edit, not three.
  */
-export type { ResolvedToolSpec, ToolCallbackContext } from "./protocol.ts";
+export type { ResolvedToolSpec, ToolCallbackContext } from "../protocol.ts";
 
 /** Per-tool budget for the /tools/call round-trip. Surfaced as a tool error on timeout. */
 export const TOOL_CALL_TIMEOUT_MS = Number(
diff --git a/services/agent/src/toolBridge.ts b/services/agent/src/tools/mcp-bridge.ts
similarity index 78%
rename from services/agent/src/toolBridge.ts
rename to services/agent/src/tools/mcp-bridge.ts
index 56db4dfb52..b83e71fe6a 100644
--- a/services/agent/src/toolBridge.ts
+++ b/services/agent/src/tools/mcp-bridge.ts
@@ -1,29 +1,29 @@
 /**
  * WP-8 tool delivery over rivet/ACP.
  *
- * The Pi backend (runPi.ts) injected resolved runnable tools (WP-7) as in-process Pi
+ * The Pi engine (engines/pi.ts) injected resolved runnable tools (WP-7) as in-process Pi
  * customTools. Over ACP the harness only accepts tools through MCP, so the same
  * resolved specs are exposed as an MCP server whose tool bodies POST back to Agenta's
  * /tools/call (the provider key and connection auth stay server-side, exactly as in
  * the Pi path). `buildToolMcpServers` returns the ACP `mcpServers` entry to attach to
  * the session.
  *
- * Delivery: a stdio MCP bridge (toolBridgeServer.ts) launched by the daemon. The specs
- * and callback are passed to it as env, so nothing tool-specific is written to the
+ * Delivery: a stdio MCP bridge (mcp-server.ts) launched by the daemon. The specs and
+ * callback are passed to it as env, so nothing tool-specific is written to the
  * agent-visible filesystem.
  */
 import { existsSync } from "node:fs";
 import { dirname, join } from "node:path";
 import { fileURLToPath } from "node:url";
 
-import type { ResolvedToolSpec, ToolCallbackContext } from "./protocol.ts";
+import type { ResolvedToolSpec, ToolCallbackContext } from "../protocol.ts";
 
-export type { ResolvedToolSpec, ToolCallbackContext } from "./protocol.ts";
+export type { ResolvedToolSpec, ToolCallbackContext } from "../protocol.ts";
 
 const HERE = dirname(fileURLToPath(import.meta.url));
-// services/agent/src/toolBridge.ts -> services/agent/node_modules/.bin/tsx
-const TSX_BIN = join(HERE, "..", "node_modules", ".bin", "tsx");
-const SERVER = join(HERE, "toolBridgeServer.ts");
+// services/agent/src/tools/mcp-bridge.ts -> services/agent/node_modules/.bin/tsx
+const TSX_BIN = join(HERE, "..", "..", "node_modules", ".bin", "tsx");
+const SERVER = join(HERE, "mcp-server.ts");
 
 /** Resolve how to launch the bridge: an explicit override, else the local tsx bin. */
 function bridgeLauncher(): { command: string; args: string[] } {
diff --git a/services/agent/src/toolBridgeServer.ts b/services/agent/src/tools/mcp-server.ts
similarity index 94%
rename from services/agent/src/toolBridgeServer.ts
rename to services/agent/src/tools/mcp-server.ts
index 45a666f3de..09512bfa36 100644
--- a/services/agent/src/toolBridgeServer.ts
+++ b/services/agent/src/tools/mcp-server.ts
@@ -6,7 +6,7 @@
  * (WP-7) and routes each tool call back through Agenta's /tools/call — so the Composio
  * key and connection auth stay server-side, exactly as in the in-process Pi path.
  *
- * Launched by the rivet daemon as a session MCP server (see toolBridge.ts). It reads
+ * Launched by the rivet daemon as a session MCP server (see mcp-bridge.ts). It reads
  * everything from env so nothing tool-specific is written to the agent filesystem:
  *   AGENTA_TOOL_SPECS            JSON array of { name, description, inputSchema, callRef }
  *   AGENTA_TOOL_CALLBACK_ENDPOINT  full /tools/call URL
@@ -16,8 +16,8 @@
  * initialize, tools/list, tools/call; ignores notifications. stdout carries protocol
  * messages only; logs go to stderr.
  */
-import type { ResolvedToolSpec } from "./protocol.ts";
-import { EMPTY_OBJECT_SCHEMA, callAgentaTool } from "./toolClient.ts";
+import type { ResolvedToolSpec } from "../protocol.ts";
+import { EMPTY_OBJECT_SCHEMA, callAgentaTool } from "./client.ts";
 
 const SPECS: ResolvedToolSpec[] = JSON.parse(process.env.AGENTA_TOOL_SPECS ?? "[]");
 const ENDPOINT = process.env.AGENTA_TOOL_CALLBACK_ENDPOINT ?? "";
diff --git a/services/agent/src/agenta-otel.ts b/services/agent/src/tracing/otel.ts
similarity index 99%
rename from services/agent/src/agenta-otel.ts
rename to services/agent/src/tracing/otel.ts
index 65045a2dd9..d1de1019cf 100644
--- a/services/agent/src/agenta-otel.ts
+++ b/services/agent/src/tracing/otel.ts
@@ -52,7 +52,7 @@ import type {
 import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
 import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
 
-import type { AgentEvent, AgentUsage } from "./protocol.ts";
+import type { AgentEvent, AgentUsage } from "../protocol.ts";
 
 // ---------------------------------------------------------------------------
 // Shared, process-wide tracing infrastructure
@@ -798,7 +798,7 @@ export function createRivetOtel(init: RivetOtelInit): RivetOtel {
     if (kind === "usage_update") {
       // ACP usage_update carries only `used` (context tokens) and `cost.amount`. The
       // per-call input/output split is NOT on the stream; it rides on the PromptResponse,
-      // which runRivet.ts reads. Keep total + cost here and leave the split to the caller.
+      // which the rivet engine reads. Keep total + cost here and leave the split to the caller.
       const cost = update.cost?.amount;
       const total = update.used;
       usage = {

From 72850406a8fc3bc5b33003154aa471100b01733b Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Wed, 17 Jun 2026 15:46:37 +0200
Subject: [PATCH 07/10] feat(agent): dedicated agent-config playground element

Replace the loose model/agents_md/harness/sandbox params with one `agent`
config element (x-ag-type: agent_config) carrying instructions, model, tools,
harness, sandbox, and permission policy. The playground renders it through a new
AgentConfigControl that reuses the existing controls: the model selector, the
tool picker (so Composio and builtin tools are finally selectable on the agent),
the enum selects, and a textarea. The backend reads it via resolve_agent_config
and falls back to the old shape so existing revisions keep running.

Verified live: the element renders with the tool picker, and a GitHub Composio
tool runs end to end on pi+local.
---
 services/oss/src/agent/app.py                 |  31 +--
 services/oss/src/agent/inputs.py              |  61 ++++++
 services/oss/src/agent/schemas.py             |  70 ++++--
 .../SchemaControls/AgentConfigControl.tsx     | 207 ++++++++++++++++++
 .../SchemaControls/SchemaPropertyRenderer.tsx |  21 ++
 5 files changed, 346 insertions(+), 44 deletions(-)
 create mode 100644 web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/AgentConfigControl.tsx

diff --git a/services/oss/src/agent/app.py b/services/oss/src/agent/app.py
index 62e51b855b..aee738fbb1 100644
--- a/services/oss/src/agent/app.py
+++ b/services/oss/src/agent/app.py
@@ -17,7 +17,7 @@
 import agenta as ag
 
 from oss.src.agent.config import load_config, wrapper_dir
-from oss.src.agent.inputs import resolve_run_config, to_messages
+from oss.src.agent.inputs import resolve_agent_config, to_messages
 from oss.src.agent.schemas import AGENT_SCHEMAS
 from oss.src.agent.secrets import resolve_harness_secrets
 from oss.src.agent.tools import resolve_tools
@@ -68,38 +68,27 @@ async def _agent(
     parameters: Optional[Dict] = None,
 ):
     params = parameters or {}
-    model, agents_md, raw_tools = resolve_run_config(params, load_config())
-
-    if isinstance(raw_tools, dict):
-        raw_tools = [raw_tools]
-    elif not isinstance(raw_tools, list):
-        raw_tools = []
+    cfg = resolve_agent_config(params, load_config())
 
     msgs = to_messages(messages or (inputs or {}).get("messages") or [])
-    builtins, custom_tools, tool_callback = await resolve_tools(raw_tools)
-
-    harness_id = (
-        params.get("harness") or os.getenv("AGENTA_AGENT_HARNESS", "pi")
-    ).lower()
-    sandbox_id = (
-        params.get("sandbox") or os.getenv("AGENTA_AGENT_SANDBOX", "local")
-    ).lower()
+    builtins, custom_tools, tool_callback = await resolve_tools(cfg.tools)
+
     session_config = SessionConfig(
-        instructions=agents_md,
-        model=model,
-        harness=harness_id,
-        sandbox=sandbox_id,
+        instructions=cfg.instructions,
+        model=cfg.model,
+        harness=cfg.harness,
+        sandbox=cfg.sandbox,
         secrets=await resolve_harness_secrets(),
         builtin_tools=builtins,
         custom_tools=custom_tools,
         tool_callback=tool_callback,
-        permission_policy=(params.get("permission_policy") or "auto").lower(),
+        permission_policy=cfg.permission_policy,
         trace=trace_context(),
     )
 
     # The engine follows the selected harness/sandbox: a claude harness or a daytona
     # sandbox needs rivet, so the legacy pi path never silently swallows the selection.
-    harness = build_harness(select_backend(harness_id, sandbox_id))
+    harness = build_harness(select_backend(cfg.harness, cfg.sandbox))
     await harness.setup()
     try:
         session = harness.create_session(session_config)
diff --git a/services/oss/src/agent/inputs.py b/services/oss/src/agent/inputs.py
index 0f4ee23166..3ba0728fd9 100644
--- a/services/oss/src/agent/inputs.py
+++ b/services/oss/src/agent/inputs.py
@@ -1,11 +1,72 @@
 """Parse the playground/API request into a model, instructions, tools, and messages."""
 
+import os
+from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Tuple
 
 from oss.src.agent.config import AgentConfig
 from oss.src.harness.ports import Message
 
 
+@dataclass
+class RunConfig:
+    """The agent config for one run, resolved from the request and the file defaults."""
+
+    instructions: str
+    model: str
+    tools: List[Any] = field(default_factory=list)
+    harness: str = "pi"
+    sandbox: str = "local"
+    permission_policy: str = "auto"
+
+
+def _as_list(raw: Any) -> List[Any]:
+    """Coerce a tools value into a list (dict -> [dict]; non-list -> [])."""
+    if isinstance(raw, dict):
+        return [raw]
+    if isinstance(raw, list):
+        return raw
+    return []
+
+
+def resolve_agent_config(params: Dict[str, Any], config: AgentConfig) -> RunConfig:
+    """Resolve the full agent run config from the request parameters.
+
+    Prefers the dedicated ``agent`` config element (the ``agent_config`` control). Falls
+    back to the legacy shape (a ``prompt`` prompt-template plus loose ``harness`` /
+    ``sandbox`` / ``permission_policy`` params) so existing revisions keep working.
+    Unset harness/sandbox fall back to the env defaults, then to ``pi`` / ``local``.
+    """
+    agent = params.get("agent")
+    if isinstance(agent, dict):
+        return RunConfig(
+            instructions=agent.get("instructions") or config.agents_md,
+            model=agent.get("model") or config.model,
+            tools=_as_list(agent.get("tools")),
+            harness=(
+                agent.get("harness") or os.getenv("AGENTA_AGENT_HARNESS", "pi")
+            ).lower(),
+            sandbox=(
+                agent.get("sandbox") or os.getenv("AGENTA_AGENT_SANDBOX", "local")
+            ).lower(),
+            permission_policy=(agent.get("permission_policy") or "auto").lower(),
+        )
+
+    model, instructions, raw_tools = resolve_run_config(params, config)
+    return RunConfig(
+        instructions=instructions,
+        model=model,
+        tools=_as_list(raw_tools),
+        harness=(
+            params.get("harness") or os.getenv("AGENTA_AGENT_HARNESS", "pi")
+        ).lower(),
+        sandbox=(
+            params.get("sandbox") or os.getenv("AGENTA_AGENT_SANDBOX", "local")
+        ).lower(),
+        permission_policy=(params.get("permission_policy") or "auto").lower(),
+    )
+
+
 def _system_text(messages: Optional[List[Any]]) -> str:
     """Join the system-message content of a prompt-template into AGENTS.md text."""
     parts: List[str] = []
diff --git a/services/oss/src/agent/schemas.py b/services/oss/src/agent/schemas.py
index c6aa3cea68..a5e65de2ee 100644
--- a/services/oss/src/agent/schemas.py
+++ b/services/oss/src/agent/schemas.py
@@ -34,34 +34,50 @@
     },
 }
 
-# Parameters: the agent config the playground renders. We reuse the existing
-# `prompt-template` control (model selector + tool picker + message editor) instead
-# of a bespoke agent form: the `x-ag-type-ref: prompt-template` marker makes the
-# playground render the same prompt UI chat/completion use, so the tool picker comes
-# for free. The agent reads the system message as its AGENTS.md, `llm_config.model`
-# as the model, and `llm_config.tools` (the picker output) as its runnable tools.
-AGENT_PARAMETERS_SCHEMA = {
-    "$schema": _SCHEMA,
+# The agent config element: one composite control the playground renders for the whole
+# agent config, instead of reusing `prompt-template` plus loose params. The
+# `x-ag-type: agent_config` marker is what makes the playground render the AgentConfigControl
+# (web/packages/agenta-entity-ui/.../AgentConfigControl.tsx). The schema is inline (not an
+# `x-ag-type-ref`), so it needs no `/ag-types` registration; the control reuses the existing
+# model selector, tool picker, and enum selects. app.py reads this value (see inputs.py).
+_DEFAULT_AGENT_CONFIG = {
+    "instructions": _DEFAULT_AGENTS_MD,
+    "model": _DEFAULT_MODEL,
+    "tools": [],
+    "harness": "pi",
+    "sandbox": "local",
+    "permission_policy": "auto",
+}
+
+AGENT_CONFIG_SCHEMA = {
     "type": "object",
-    "additionalProperties": True,
+    "x-ag-type": "agent_config",
+    "title": "Agent",
+    "description": "The agent's instructions, model, tools, and runtime.",
     "properties": {
-        "prompt": {
-            "x-ag-type-ref": "prompt-template",
-            "type": "object",
+        "instructions": {
+            "type": "string",
+            "x-ag-type": "textarea",
+            "title": "Instructions",
+            "description": "The agent's system prompt (its AGENTS.md).",
+            "default": _DEFAULT_AGENTS_MD,
+        },
+        "model": {
+            "type": "string",
+            "x-parameter": "grouped_choice",
+            "title": "Model",
+            "default": _DEFAULT_MODEL,
+        },
+        "tools": {
+            "type": "array",
+            "title": "Tools",
             "description": (
-                "The agent's instructions (system message), model, and tools. Tools "
-                "are picked from connected providers (e.g. Composio) and run "
-                "server-side via /tools/call."
+                "Runnable tools the agent can call. Picked from connected providers "
+                "(e.g. Composio) and run server-side via /tools/call."
             ),
-            "default": {
-                "messages": [{"role": "system", "content": _DEFAULT_AGENTS_MD}],
-                "template_format": "mustache",
-                "llm_config": {"model": _DEFAULT_MODEL, "tools": []},
-            },
+            "items": {"type": "object", "additionalProperties": True},
+            "default": [],
         },
-        # The two orthogonal runtime axes, editable in the playground so a run can
-        # switch engine (pi/claude) or where it runs (local/daytona) without redeploy.
-        # Read in agent.py and threaded to the rivet harness; fall back to env defaults.
         "harness": {
             "type": "string",
             "title": "Harness",
@@ -87,6 +103,14 @@
             ),
         },
     },
+    "default": _DEFAULT_AGENT_CONFIG,
+}
+
+AGENT_PARAMETERS_SCHEMA = {
+    "$schema": _SCHEMA,
+    "type": "object",
+    "additionalProperties": True,
+    "properties": {"agent": AGENT_CONFIG_SCHEMA},
 }
 
 # Outputs: the final assistant message.
diff --git a/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/AgentConfigControl.tsx b/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/AgentConfigControl.tsx
new file mode 100644
index 0000000000..e2fd6a199d
--- /dev/null
+++ b/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/AgentConfigControl.tsx
@@ -0,0 +1,207 @@
+/**
+ * AgentConfigControl
+ *
+ * One composite control for the whole agent config, dispatched from
+ * `x-ag-type: "agent_config"` (see SchemaPropertyRenderer). It reuses the existing
+ * controls rather than inventing new ones: the model selector (GroupedChoiceControl), the
+ * tool picker (ToolSelectorPopover + ToolItemControl), enum selects (harness, sandbox,
+ * permission policy), and a textarea (instructions). The backend advertises the inline
+ * schema and reads this value (services/oss/src/agent/schemas.py + inputs.py).
+ */
+import {useCallback, useMemo} from "react"
+
+import type {SchemaProperty} from "@agenta/entities/shared"
+import {useDrillInUI} from "@agenta/ui/drill-in"
+import {cn} from "@agenta/ui/styles"
+
+import {EnumSelectControl} from "./EnumSelectControl"
+import {GroupedChoiceControl} from "./GroupedChoiceControl"
+import {TextInputControl} from "./TextInputControl"
+import {ToolItemControl} from "./ToolItemControl"
+import {ToolSelectorPopover, type ToolSelectionMeta} from "./ToolSelectorPopover"
+import {type ToolObj} from "./toolUtils"
+
+export interface AgentConfigControlProps {
+    schema?: SchemaProperty | null
+    label?: string
+    value?: Record<string, unknown> | null
+    onChange: (value: Record<string, unknown>) => void
+    description?: string
+    withTooltip?: boolean
+    disabled?: boolean
+    className?: string
+}
+
+/** Read the function name of a tool object (the gateway slug for Composio tools). */
+function toolName(tool: unknown): string | undefined {
+    if (!tool || typeof tool !== "object") return undefined
+    const fn = (tool as Record<string, unknown>).function
+    if (!fn || typeof fn !== "object") return undefined
+    const name = (fn as Record<string, unknown>).name
+    return typeof name === "string" ? name : undefined
+}
+
+export function AgentConfigControl({
+    schema,
+    value,
+    onChange,
+    withTooltip,
+    disabled,
+    className,
+}: AgentConfigControlProps) {
+    const {EditorProvider} = useDrillInUI()
+    const config = (value ?? {}) as Record<string, unknown>
+    const props = (schema?.properties ?? {}) as Record<string, SchemaProperty>
+
+    // Update a single field of the agent config, leaving the rest intact.
+    const setField = useCallback(
+        (key: string, fieldValue: unknown) => onChange({...config, [key]: fieldValue}),
+        [config, onChange],
+    )
+
+    // Tools live as a flat array on the agent config (the same tool-object shape the
+    // prompt control uses, so the backend resolver parses them identically).
+    const tools = useMemo(
+        () => (Array.isArray(config.tools) ? (config.tools as unknown[]) : []),
+        [config.tools],
+    )
+    const setTools = useCallback((next: unknown[]) => setField("tools", next), [setField])
+
+    const handleAddTool = useCallback(
+        (tool: ToolObj, meta?: ToolSelectionMeta) => {
+            const next =
+                meta && tool && typeof tool === "object" && !Array.isArray(tool)
+                    ? {
+                          ...(tool as Record<string, unknown>),
+                          agenta_metadata: {
+                              ...(((tool as Record<string, unknown>).agenta_metadata as
+                                  | Record<string, unknown>
+                                  | undefined) ?? {}),
+                              ...meta,
+                          },
+                      }
+                    : tool
+            setTools([...tools, next])
+        },
+        [tools, setTools],
+    )
+
+    const handleToolChange = useCallback(
+        (index: number, next: ToolObj) => {
+            const updated = [...tools]
+            updated[index] = next
+            setTools(updated)
+        },
+        [tools, setTools],
+    )
+
+    const handleToolDelete = useCallback(
+        (index: number) => setTools(tools.filter((_, i) => i !== index)),
+        [tools, setTools],
+    )
+
+    const handleRemoveToolByName = useCallback(
+        (name: string) => setTools(tools.filter((tool) => toolName(tool) !== name)),
+        [tools, setTools],
+    )
+
+    const selectedToolNames = useMemo(
+        () => new Set(tools.map(toolName).filter((n): n is string => Boolean(n))),
+        [tools],
+    )
+
+    return (
+        <div className={cn("flex flex-col gap-3", className)}>
+            <TextInputControl
+                schema={props.instructions}
+                label="Instructions"
+                value={(config.instructions as string | null) ?? null}
+                onChange={(v) => setField("instructions", v)}
+                description={props.instructions?.description as string | undefined}
+                withTooltip={withTooltip}
+                disabled={disabled}
+                multiline
+            />
+
+            <GroupedChoiceControl
+                schema={props.model}
+                label="Model"
+                value={(config.model as string | null) ?? null}
+                onChange={(v) => setField("model", v)}
+                withTooltip={withTooltip}
+                disabled={disabled}
+            />
+
+            {/* Tools */}
+            <div className="flex flex-col gap-2">
+                {tools.length > 0 && (
+                    <div className="flex flex-col gap-2">
+                        {tools.map((tool, index) => {
+                            const control = (
+                                <ToolItemControl
+                                    key={`tool-${index}`}
+                                    value={tool}
+                                    onChange={(v) => handleToolChange(index, v)}
+                                    onDelete={disabled ? undefined : () => handleToolDelete(index)}
+                                    disabled={disabled}
+                                />
+                            )
+                            return EditorProvider ? (
+                                <EditorProvider
+                                    key={`tool-editor-${index}`}
+                                    codeOnly
+                                    language="json"
+                                    showToolbar={false}
+                                    enableTokens={false}
+                                    id={`agent-tool-editor-${index}`}
+                                >
+                                    {control}
+                                </EditorProvider>
+                            ) : (
+                                control
+                            )
+                        })}
+                    </div>
+                )}
+                {!disabled && (
+                    <div>
+                        <ToolSelectorPopover
+                            onAddTool={handleAddTool}
+                            onRemoveTool={handleRemoveToolByName}
+                            selectedToolNames={selectedToolNames}
+                            selectedTools={tools as ToolObj[]}
+                            existingToolCount={tools.length}
+                        />
+                    </div>
+                )}
+            </div>
+
+            <EnumSelectControl
+                schema={props.harness}
+                label="Harness"
+                value={(config.harness as string | null) ?? null}
+                onChange={(v) => setField("harness", v)}
+                withTooltip={withTooltip}
+                disabled={disabled}
+            />
+
+            <EnumSelectControl
+                schema={props.sandbox}
+                label="Sandbox"
+                value={(config.sandbox as string | null) ?? null}
+                onChange={(v) => setField("sandbox", v)}
+                withTooltip={withTooltip}
+                disabled={disabled}
+            />
+
+            <EnumSelectControl
+                schema={props.permission_policy}
+                label="Permission policy"
+                value={(config.permission_policy as string | null) ?? null}
+                onChange={(v) => setField("permission_policy", v)}
+                withTooltip={withTooltip}
+                disabled={disabled}
+            />
+        </div>
+    )
+}
diff --git a/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/SchemaPropertyRenderer.tsx b/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/SchemaPropertyRenderer.tsx
index 2cfe8a9019..f38d9e7aed 100644
--- a/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/SchemaPropertyRenderer.tsx
+++ b/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/SchemaPropertyRenderer.tsx
@@ -19,6 +19,7 @@ import {formatLabel} from "@agenta/ui/drill-in"
 import {Typography} from "antd"
 import clsx from "clsx"
 
+import {AgentConfigControl} from "./AgentConfigControl"
 import {BooleanToggleControl} from "./BooleanToggleControl"
 import {CodeEditorControl} from "./CodeEditorControl"
 import {EnumSelectControl} from "./EnumSelectControl"
@@ -95,6 +96,7 @@ function getControlType(
     | "grouped_choice"
     | "feedback_config"
     | "fields_tags_editor"
+    | "agent_config"
     | "hidden"
     | "unknown" {
     if (forceType) return forceType
@@ -125,6 +127,9 @@ function getControlType(
     if (xAgTypeRef === "code" || xAgType === "code") {
         return "code"
     }
+    if (xAgTypeRef === "agent_config" || xAgType === "agent_config") {
+        return "agent_config"
+    }
 
     // When schema is null, fall back to value-based detection
     if (!schema) {
@@ -422,6 +427,22 @@ export const SchemaPropertyRenderer = memo(function SchemaPropertyRenderer({
                 />
             )
 
+        case "agent_config":
+            // Render the whole agent config (instructions, model, tools, runtime) as one
+            // composite control that reuses the model selector, tool picker, and enums.
+            return (
+                <AgentConfigControl
+                    schema={resolvedSchema}
+                    label={displayLabel}
+                    value={value as Record<string, unknown> | null}
+                    onChange={(v) => onChange(v)}
+                    description={tooltipDesc}
+                    withTooltip={withTooltip}
+                    disabled={disabled}
+                    className={className}
+                />
+            )
+
         case "prompt":
             // Render prompt object with message cards and LLM config
             return (

From 91ee25b3f4b75ff45b5910a71d544c7a61e890b6 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Wed, 17 Jun 2026 15:47:04 +0200
Subject: [PATCH 08/10] fix(agent): relay Pi tool calls through the runner on
 Daytona

Tools worked locally but failed on Daytona. The in-sandbox Pi extension POSTed
each tool call to Agenta's /tools/call, but a firewalled or private backend does
not expose that to the remote cloud sandbox (the same reason tracing is built
from the event stream on Daytona rather than in-sandbox OTLP). The sandbox has
internet but cannot reach the backend, so the call failed and the model gave up.

Route the call through the runner, which can reach Agenta. The extension writes
the request to a file in a sandbox dir and polls for the response; the runner
watches the dir over the daemon filesystem API, calls /tools/call, and writes the
result back (tools/relay.ts). Local runs keep the direct path.

Verified programmatically: rivet+pi+daytona with a GitHub Composio tool now
returns the real login (was 'the tool failed twice'); local is unchanged.
---
 services/agent/src/engines/rivet.ts     |  18 ++++
 services/agent/src/extensions/agenta.ts |  73 +++++++++++++--
 services/agent/src/tools/relay.ts       | 119 ++++++++++++++++++++++++
 3 files changed, 200 insertions(+), 10 deletions(-)
 create mode 100644 services/agent/src/tools/relay.ts

diff --git a/services/agent/src/engines/rivet.ts b/services/agent/src/engines/rivet.ts
index d8dbd354e0..f056849855 100644
--- a/services/agent/src/engines/rivet.ts
+++ b/services/agent/src/engines/rivet.ts
@@ -46,6 +46,7 @@ import { daytona } from "sandbox-agent/daytona";
 
 import { createRivetOtel } from "../tracing/otel.ts";
 import { buildToolMcpServers } from "../tools/mcp-bridge.ts";
+import { startToolRelay } from "../tools/relay.ts";
 import {
   type AgentRunRequest,
   type AgentRunResult,
@@ -644,6 +645,14 @@ export async function runRivet(request: AgentRunRequest): Promise<AgentRunResult
     piExtEnv.AGENTA_USAGE_OUT = usageOutPath;
   }
 
+  // Daytona can't reach a firewalled Agenta from inside the sandbox, so relay the Pi
+  // extension's tool calls through the runner via the sandbox filesystem (see tools/relay).
+  const toolSpecsForRun = (request.customTools as ResolvedToolSpec[]) ?? [];
+  const relayDir = `${cwd}/.agenta-tools`;
+  const useToolRelay =
+    isPi && isDaytona && toolSpecsForRun.length > 0 && !!request.toolCallback?.endpoint;
+  if (useToolRelay) piExtEnv.AGENTA_TOOL_RELAY_DIR = relayDir;
+
   log(`harness=${harness} sandbox=${sandboxId} cwd=${cwd}`);
 
   // Persist events in-process so a follow-up turn can resume by session id.
@@ -661,6 +670,8 @@ export async function runRivet(request: AgentRunRequest): Promise<AgentRunResult
   // the model is resolved, so the chat span carries the harness's actual model rather
   // than the requested one. Declared here so the catch can flush a partial trace.
   let otel: ReturnType<typeof createRivetOtel> | undefined;
+  // Daytona tool relay loop (started once the session exists, stopped after the prompt).
+  let toolRelay: { stop: () => Promise<void> } | undefined;
 
   try {
     // On Daytona, push the harness login, the extension, and AGENTS.md into the remote
@@ -675,6 +686,7 @@ export async function runRivet(request: AgentRunRequest): Promise<AgentRunResult
         if (DAYTONA_PI_INSTALL) await installPiInSandbox(sandbox);
       }
       await sandbox.mkdirFs({ path: cwd }).catch(() => {});
+      if (useToolRelay) await sandbox.mkdirFs({ path: relayDir }).catch(() => {});
       if (agentsMd) await sandbox.writeFsFile({ path: `${cwd}/AGENTS.md` }, agentsMd);
     } else if (agentsMd) {
       writeFileSync(join(cwd, "AGENTS.md"), agentsMd, "utf-8");
@@ -744,7 +756,12 @@ export async function runRivet(request: AgentRunRequest): Promise<AgentRunResult
       if (req?.id) session.respondPermission(req.id, reply as any).catch(() => {});
     });
 
+    if (useToolRelay) {
+      toolRelay = startToolRelay(sandbox, relayDir, request.toolCallback as ToolCallbackContext);
+    }
+
     const result = await session.prompt([{ type: "text", text: turnText }]);
+    await toolRelay?.stop();
     const stopReason = (result as any)?.stopReason;
     log(`prompt stopReason=${stopReason}`);
 
@@ -785,6 +802,7 @@ export async function runRivet(request: AgentRunRequest): Promise<AgentRunResult
     await otel?.flush().catch(() => {});
     return { ok: false, error: conciseError(err, harness) };
   } finally {
+    await toolRelay?.stop().catch(() => {});
     await sandbox.destroySandbox().catch(() => {});
     await sandbox.dispose().catch(() => {});
     rmSync(cwd, { recursive: true, force: true });
diff --git a/services/agent/src/extensions/agenta.ts b/services/agent/src/extensions/agenta.ts
index 884ae5b925..e410b9d46e 100644
--- a/services/agent/src/extensions/agenta.ts
+++ b/services/agent/src/extensions/agenta.ts
@@ -17,23 +17,78 @@
  *   AGENTA_TOOL_SPECS             JSON [{ name, description, inputSchema, callRef }]
  *   AGENTA_TOOL_CALLBACK_ENDPOINT full /tools/call URL
  *   AGENTA_TOOL_CALLBACK_AUTH     Authorization header for the callback
+ *   AGENTA_TOOL_RELAY_DIR         set on Daytona: relay tool calls through the runner via
+ *                                 files here, since the sandbox can't reach Agenta directly
  *
  * Bundled self-contained (esbuild) so its OpenTelemetry deps resolve wherever Pi loads
  * it (local, the docker sidecar, a Daytona snapshot). Default export is the Pi
  * ExtensionFactory.
  */
-import { writeFileSync } from "node:fs";
+import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
 
 import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
 
 import { createAgentaOtel } from "../tracing/otel.ts";
 import type { ResolvedToolSpec } from "../protocol.ts";
 import { EMPTY_OBJECT_SCHEMA, callAgentaTool } from "../tools/client.ts";
+import {
+  RELAY_POLL_MS,
+  RELAY_REQ_SUFFIX,
+  RELAY_RES_SUFFIX,
+  RELAY_TIMEOUT_MS,
+  sanitizeRelayId,
+  sleep,
+  type RelayResponse,
+} from "../tools/relay.ts";
 
 function log(message: string): void {
   process.stderr.write(`[agenta-pi-ext] ${message}\n`);
 }
 
+/**
+ * Daytona tool call: the in-sandbox process can't reach Agenta, so write the request to a
+ * file the runner watches and poll for the response it writes back (see tools/relay.ts).
+ * Throws on abort, an `ok: false` response, or timeout; relay files are removed on every exit.
+ */
+async function relayToolCall(
+  dir: string,
+  callRef: string,
+  toolCallId: string,
+  params: unknown,
+  signal?: AbortSignal,
+): Promise<string> {
+  const id = sanitizeRelayId(toolCallId);
+  const reqPath = `${dir}/${id}${RELAY_REQ_SUFFIX}`;
+  const resPath = `${dir}/${id}${RELAY_RES_SUFFIX}`;
+  try {
+    mkdirSync(dir, { recursive: true });
+  } catch {
+    // The runner also creates it; a race here is harmless.
+  }
+  writeFileSync(reqPath, JSON.stringify({ callRef, toolCallId, args: params ?? {} }), "utf-8");
+  const deadline = Date.now() + RELAY_TIMEOUT_MS;
+  try {
+    while (Date.now() < deadline) {
+      if (signal?.aborted) throw new Error("aborted");
+      let res: RelayResponse | undefined;
+      try {
+        if (existsSync(resPath)) res = JSON.parse(readFileSync(resPath, "utf-8"));
+      } catch {
+        // The file can exist before the runner has finished writing it: a read/parse
+        // failure means "not ready yet", so keep polling until the deadline.
+      }
+      if (res?.ok) return res.text ?? "";
+      if (res) throw new Error(res.error || `tool relay failed for ${callRef}`);
+      await sleep(RELAY_POLL_MS);
+    }
+    throw new Error(`tool relay timed out for ${callRef}`);
+  } finally {
+    for (const path of [reqPath, resPath]) {
+      try { unlinkSync(path); } catch { /* best-effort cleanup */ }
+    }
+  }
+}
+
 /** Register the resolved tools (from env) as Pi tools that call back to Agenta. */
 function registerTools(pi: ExtensionAPI): void {
   const raw = process.env.AGENTA_TOOL_SPECS;
@@ -48,6 +103,9 @@ function registerTools(pi: ExtensionAPI): void {
     return;
   }
   const authorization = process.env.AGENTA_TOOL_CALLBACK_AUTH;
+  // Daytona: the in-sandbox process can't reach Agenta, so tool calls are relayed through
+  // the runner via files in this dir. Unset for local runs (direct /tools/call).
+  const relayDir = process.env.AGENTA_TOOL_RELAY_DIR;
 
   for (const spec of specs) {
     pi.registerTool({
@@ -57,19 +115,14 @@ function registerTools(pi: ExtensionAPI): void {
       // Pi accepts plain JSON Schema here (non-TypeBox validation path).
       parameters: (spec.inputSchema as any) ?? EMPTY_OBJECT_SCHEMA,
       async execute(toolCallId: string, params: unknown, signal?: AbortSignal) {
-        const text = await callAgentaTool(
-          endpoint,
-          authorization,
-          spec.callRef,
-          toolCallId,
-          params,
-          signal,
-        );
+        const text = relayDir
+          ? await relayToolCall(relayDir, spec.callRef, toolCallId, params, signal)
+          : await callAgentaTool(endpoint, authorization, spec.callRef, toolCallId, params, signal);
         return { content: [{ type: "text", text }], details: { callRef: spec.callRef } };
       },
     } as any);
   }
-  log(`registered ${specs.length} tool(s) -> ${endpoint}`);
+  log(`registered ${specs.length} tool(s) -> ${relayDir ? `relay ${relayDir}` : endpoint}`);
 }
 
 /** The Pi ExtensionFactory: tools + (env-driven) tracing + usage writeback. */
diff --git a/services/agent/src/tools/relay.ts b/services/agent/src/tools/relay.ts
new file mode 100644
index 0000000000..c182e4e7f9
--- /dev/null
+++ b/services/agent/src/tools/relay.ts
@@ -0,0 +1,119 @@
+/**
+ * Daytona tool relay.
+ *
+ * On Daytona the harness runs in a remote cloud sandbox that can reach the public internet
+ * but NOT a firewalled / private Agenta backend (the same reason tracing is built from the
+ * event stream there instead of in-sandbox OTLP). So the in-sandbox Pi extension cannot
+ * POST tool calls to Agenta's /tools/call directly.
+ *
+ * The runner CAN reach Agenta (it resolved the tools and holds the callback), and it can
+ * reach the sandbox filesystem over the daemon API. So tool calls are relayed through the
+ * runner via files in a sandbox dir:
+ *
+ *   extension: write `<id>.req.json` {callRef, toolCallId, args}  ──▶  poll `<id>.res.json`
+ *   runner:    poll the dir, read `<id>.req.json` ──▶ /tools/call ──▶ write `<id>.res.json`
+ *
+ * Local runs keep the direct path (the in-process / local-daemon extension reaches Agenta);
+ * the relay is only wired when AGENTA_TOOL_RELAY_DIR is set (Daytona + Pi + tools).
+ */
+import { callAgentaTool } from "./client.ts";
+import type { ToolCallbackContext } from "../protocol.ts";
+
+export const RELAY_REQ_SUFFIX = ".req.json";
+export const RELAY_RES_SUFFIX = ".res.json";
+export const RELAY_POLL_MS = Number(process.env.AGENTA_TOOL_RELAY_POLL_MS) || 300; // ms; default if unset/NaN/0
+export const RELAY_TIMEOUT_MS = Number(process.env.AGENTA_TOOL_RELAY_TIMEOUT_MS) || 60000; // ms; default if unset/NaN/0
+
+export interface RelayRequest {
+  callRef: string; // passed through to callAgentaTool by the runner
+  toolCallId: string; // harness tool-call id; the runner falls back to the file id if absent
+  args: unknown; // tool arguments, forwarded as-is
+}
+export interface RelayResponse {
+  ok: boolean; // true when the runner's callAgentaTool call returned
+  text?: string; // tool output, set by the runner when ok
+  error?: string; // failure message, set by the runner when not ok
+}
+
+/** Filename-safe, bounded id: chars outside [A-Za-z0-9_-] become "_", max 120, "tool" if empty. */
+export function sanitizeRelayId(id: string): string {
+  return id.replace(/[^A-Za-z0-9_-]/g, "_").slice(0, 120) || "tool";
+}
+
+export const sleep = (ms: number): Promise<void> => new Promise((r) => setTimeout(r, ms));
+
+/**
+ * Runner-side relay loop. Every RELAY_POLL_MS it lists `relayDir` in the sandbox and, for
+ * each unseen `<id>.req.json`, calls /tools/call via `callback` (endpoint + authorization)
+ * and writes `<id>.res.json` ({ok, text} or {ok: false, error}) for the waiting extension.
+ * `stop()` ends the loop and drains in-flight executions; call it once the prompt resolves.
+ */
+export function startToolRelay(
+  sandbox: any,
+  relayDir: string,
+  callback: ToolCallbackContext,
+): { stop: () => Promise<void> } {
+  let active = true; // cleared by stop() to end the poll loop
+  const seen = new Set<string>(); // request filenames already dispatched (never retried)
+  const inflight: Promise<void>[] = []; // every handle() promise; awaited when the loop exits
+
+  const handle = async (reqName: string): Promise<void> => {
+    const id = reqName.slice(0, -RELAY_REQ_SUFFIX.length); // strip the request suffix
+    let res: RelayResponse;
+    try {
+      const bytes = await sandbox.readFsFile({ path: `${relayDir}/${reqName}` });
+      const raw = typeof bytes === "string" ? bytes : new TextDecoder().decode(bytes);
+      const req = JSON.parse(raw) as RelayRequest;
+      const text = await callAgentaTool(
+        callback.endpoint,
+        callback.authorization,
+        req.callRef,
+        req.toolCallId ?? id,
+        req.args,
+      );
+      res = { ok: true, text };
+    } catch (err) {
+      res = { ok: false, error: err instanceof Error ? err.message : String(err) };
+    }
+    try {
+      await sandbox.writeFsFile(
+        { path: `${relayDir}/${id}${RELAY_RES_SUFFIX}` },
+        JSON.stringify(res),
+      );
+    } catch {
+      // The extension will time out and surface a tool error; nothing else to do here.
+    }
+  };
+
+  const loop = (async () => {
+    while (active) {
+      try {
+        const ls = await sandbox.runProcess({
+          command: "ls",
+          args: ["-1", relayDir],
+          timeoutMs: 10_000,
+        });
+        const names = String(ls?.stdout ?? "")
+          .split("\n")
+          .map((s) => s.trim())
+          .filter(Boolean);
+        for (const name of names) {
+          if (!name.endsWith(RELAY_REQ_SUFFIX) || seen.has(name)) continue;
+          seen.add(name);
+          inflight.push(handle(name));
+        }
+      } catch {
+        // Transient (dir not created yet, or a poll raced sandbox teardown): retry.
+      }
+      await sleep(RELAY_POLL_MS);
+    }
+    await Promise.allSettled(inflight);
+  })();
+
+  return {
+    stop: async () => {
+      active = false;
+      await loop.catch(() => {});
+    },
+  };
+}

From 7be759eed14c0fc49d070690468bf9d32358916d Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Wed, 17 Jun 2026 21:34:00 +0200
Subject: [PATCH 09/10] docs(agent): restructure agent-workflows into a
 reviewable wiki

Move the raw work-package material (wp-1..wp-8, harness-port-redesign,
research) into scratch/ and add clean top-level pages a reviewer can read
top to bottom: a README index, architecture, ports-and-adapters, sessions,
and adapters/{pi,claude-code}. Update the three in-code references that
pointed at the moved doc paths.
---
 docs/design/agent-workflows/README.md         | 153 ++++-----------
 .../agent-workflows/adapters/claude-code.md   |  94 +++++++++
 docs/design/agent-workflows/adapters/pi.md    | 112 +++++++++++
 docs/design/agent-workflows/architecture.md   | 180 ++++++++++++++++++
 .../agent-workflows/ports-and-adapters.md     | 172 +++++++++++++++++
 .../harness-port-redesign/README.md           |   0
 .../harness-port-redesign/implementation.md   |   0
 .../harness-port-redesign/plan.md             |   0
 .../harness-port-redesign/proposal.md         |   0
 .../harness-port-redesign/research.md         |   0
 .../harness-port-redesign/status.md           |   0
 .../{ => scratch}/research/auth-secrets.md    |   0
 .../{ => scratch}/research/daytona-sandbox.md |   0
 .../research/diskless-in-memory-config.md     |   0
 .../{ => scratch}/research/open-questions.md  |   0
 .../research/otel-instrumentation.md          |   0
 .../{ => scratch}/research/pi-interaction.md  |   0
 .../{ => scratch}/research/sandbox-sharing.md |   0
 .../{ => scratch}/wp-1-pi-tracing/README.md   |   0
 .../integrating-the-tracing-extension.md      |   0
 .../wp-1-pi-tracing/poc/.env.example          |   0
 .../wp-1-pi-tracing/poc/README.md             |   0
 .../wp-1-pi-tracing/poc/agenta-otel.ts        |   0
 .../wp-1-pi-tracing/poc/package.json          |   0
 .../wp-1-pi-tracing/poc/pnpm-lock.yaml        |   0
 .../{ => scratch}/wp-1-pi-tracing/poc/run.ts  |   0
 .../tracing-in-the-agent-service.md           |   0
 .../wp-2-agent-service/README.md              |   0
 .../wp-2-agent-service/implementation-plan.md |   0
 .../{ => scratch}/wp-2-agent-service/qa.md    |   0
 .../wp-3-daytona-sandbox/README.md            |   0
 .../wp-3-daytona-sandbox/poc/README.md        |   0
 .../poc/bench_coldstart.py                    |   0
 .../poc/build_snapshot.py                     |   0
 .../wp-3-daytona-sandbox/poc/cleanup.py       |   0
 .../wp-3-daytona-sandbox/poc/run_agent.py     |   0
 .../wp-4-multi-message-output/README.md       |   0
 .../wp-5-chat-vs-completion/README.md         |   0
 .../wp-6-workflow-type-and-template/README.md |   0
 .../{ => scratch}/wp-7-tools/README.md        |   0
 .../wp-8-rivet-acp-runtime/README.md          |   0
 .../wp-8-rivet-acp-runtime/architecture.md    |   0
 .../wp-8-rivet-acp-runtime/context.md         |   0
 .../isolation-and-fork.md                     |   0
 .../wp-8-rivet-acp-runtime/plan.md            |   0
 .../poc/build_rivet_snapshot.py               |   0
 .../poc/commit_agent_config.py                |   0
 .../poc/debug-events.ts                       |   0
 .../wp-8-rivet-acp-runtime/poc/dump-full.ts   |   0
 .../wp-8-rivet-acp-runtime/poc/package.json   |   0
 .../wp-8-rivet-acp-runtime/poc/spike.ts       |   0
 .../wp-8-rivet-acp-runtime/research.md        |   0
 .../wp-8-rivet-acp-runtime/status.md          |   0
 docs/design/agent-workflows/sessions.md       |  93 +++++++++
 services/agent/src/tracing/otel.ts            |   2 +-
 services/oss/src/harness/__init__.py          |   2 +-
 services/oss/src/harness/ports.py             |   2 +-
 57 files changed, 687 insertions(+), 123 deletions(-)
 create mode 100644 docs/design/agent-workflows/adapters/claude-code.md
 create mode 100644 docs/design/agent-workflows/adapters/pi.md
 create mode 100644 docs/design/agent-workflows/architecture.md
 create mode 100644 docs/design/agent-workflows/ports-and-adapters.md
 rename docs/design/agent-workflows/{ => scratch}/harness-port-redesign/README.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/harness-port-redesign/implementation.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/harness-port-redesign/plan.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/harness-port-redesign/proposal.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/harness-port-redesign/research.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/harness-port-redesign/status.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/research/auth-secrets.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/research/daytona-sandbox.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/research/diskless-in-memory-config.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/research/open-questions.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/research/otel-instrumentation.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/research/pi-interaction.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/research/sandbox-sharing.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-1-pi-tracing/README.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-1-pi-tracing/integrating-the-tracing-extension.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-1-pi-tracing/poc/.env.example (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-1-pi-tracing/poc/README.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-1-pi-tracing/poc/agenta-otel.ts (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-1-pi-tracing/poc/package.json (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-1-pi-tracing/poc/pnpm-lock.yaml (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-1-pi-tracing/poc/run.ts (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-1-pi-tracing/tracing-in-the-agent-service.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-2-agent-service/README.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-2-agent-service/implementation-plan.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-2-agent-service/qa.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-3-daytona-sandbox/README.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-3-daytona-sandbox/poc/README.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-3-daytona-sandbox/poc/bench_coldstart.py (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-3-daytona-sandbox/poc/build_snapshot.py (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-3-daytona-sandbox/poc/cleanup.py (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-3-daytona-sandbox/poc/run_agent.py (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-4-multi-message-output/README.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-5-chat-vs-completion/README.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-6-workflow-type-and-template/README.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-7-tools/README.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-8-rivet-acp-runtime/README.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-8-rivet-acp-runtime/architecture.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-8-rivet-acp-runtime/context.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-8-rivet-acp-runtime/isolation-and-fork.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-8-rivet-acp-runtime/plan.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-8-rivet-acp-runtime/poc/build_rivet_snapshot.py (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-8-rivet-acp-runtime/poc/commit_agent_config.py (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-8-rivet-acp-runtime/poc/debug-events.ts (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-8-rivet-acp-runtime/poc/dump-full.ts (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-8-rivet-acp-runtime/poc/package.json (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-8-rivet-acp-runtime/poc/spike.ts (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-8-rivet-acp-runtime/research.md (100%)
 rename docs/design/agent-workflows/{ => scratch}/wp-8-rivet-acp-runtime/status.md (100%)
 create mode 100644 docs/design/agent-workflows/sessions.md

diff --git a/docs/design/agent-workflows/README.md b/docs/design/agent-workflows/README.md
index dc3451cfda..57c8a3ad25 100644
--- a/docs/design/agent-workflows/README.md
+++ b/docs/design/agent-workflows/README.md
@@ -1,130 +1,43 @@
-# Agent Workflows
+# Agent workflows
 
-Status: context draft. Research and design to follow.
+This folder documents a proof of concept: running a coding agent as an Agenta workflow.
 
-## Summary
+Agenta runs prompt workflows today (completion, chat, the LLM judge). Each calls a model
+once and returns one answer. An agent is different. It runs a loop, calls tools across many
+turns, and returns a final answer. This PoC adds the agent as a new workflow type behind the
+same `/invoke` contract, traced into the same spans, configured from the same playground.
 
-Add a new workflow type to the backend: **agents**. Today the backend runs
-prompt-style workflows (completion, chat, LLM-as-a-judge). Agents are different. An
-agent runs inside a sandbox, executes tools over multiple turns, returns a multi-message
-output, and is instrumented end to end. Agents run on a **pi.dev** harness by default,
-and the same harness can run locally so a configuration pulled from the server behaves
-the same on a developer machine.
+It proves one specific claim: that the **agent** and the **place it runs** are both config,
+not code. You change a dropdown to swap Pi for Claude Code, or local for a Daytona cloud
+sandbox, and nothing above the seam changes.
 
-This document only captures context. It does not propose a solution yet. The research
-topics in [Open research topics](#open-research-topics) will be assigned to subagents and
-written up in sibling files.
+## Read in this order
 
-## What an agent is
+1. **[Architecture](architecture.md)**. How a request flows from the playground to the model
+   and back: the relay of programs, the two containers, and the vocabulary. Start here.
+2. **[Ports and adapters](ports-and-adapters.md)**. The seam that keeps the relay swappable:
+   the two seams, the wire contract, and how the service picks an engine and a transport.
+3. **[Sessions](sessions.md)**. How a multi-turn conversation holds together today (cold
+   replay), and the two paths open to us tomorrow.
+4. **[The Pi adapter](adapters/pi.md)**. The default harness, which traces itself and takes
+   tools natively through a Pi extension.
+5. **[The Claude Code adapter](adapters/claude-code.md)**. The second harness, which proves
+   the swap and is the template for any MCP-capable agent.
 
-An agent is a configured, sandboxed, instrumented runtime that:
+## What this PoC includes and defers
 
-- Boots a sandbox through startup hooks that lay down files and inject secrets.
-- Runs a harness (pi by default, configurable) that drives the model and its tools.
-- Produces a multi-message output rather than a single completion.
-- Carries a `session_id` so a run can be identified and, later, have its state stored.
-- Emits instrumentation through pi instruments for tracing and observability.
+It includes the agent workflow behind `/invoke`, two harnesses (Pi and Claude Code), two
+sandboxes (local and Daytona), backend-resolved tools that keep credentials server-side, and
+tracing that nests the agent's run under the caller's span.
 
-## Agent configuration
+It defers the things a production rollout will need: a warm daemon and server-owned session
+storage (see [Sessions](sessions.md)), live streaming to the client over the HTTP edge, the
+multi-tenant filesystem jail for a shared daemon, and registering the agent as a first-class
+backend workflow type with its own builtin URI. Each is called out where it belongs.
 
-The agent configuration is what gets stored on the server, versioned as a workflow
-revision, and pulled down to run locally. It includes:
+## The `scratch/` folder
 
-- **`AGENTS.md`** — the agent's instructions.
-- **Skills** — the skills available to the agent.
-- **Model** — the model the agent runs on.
-- **Tools** — the tools the agent has access to.
-- **Files** — files that are part of the config and are laid into the sandbox by the
-  startup hook.
-- **Secrets** — for example an OpenAI key, injected into the sandbox by the startup
-  hook.
-- **Harness** — which harness runs the agent. Defaults to pi; configurable.
-
-## Runtime model
-
-- **Sandbox.** Agents run in a Daytona sandbox, or any sandbox provider that works with
-  our port. The sandbox is initialized by startup hooks: file setup, then secrets setup.
-- **Harness.** The harness (pi by default) is the layer that exposes tools and drives the
-  agent loop. It is configurable per agent.
-- **Output.** A run returns multiple messages, not one completion.
-- **Instrumentation.** Runs are instrumented with pi instruments.
-- **Sessions.** Each run has a `session_id`. Future work adds session storage alongside
-  global storage so session state can persist across runs.
-
-## Local execution parity
-
-The same harness that runs server-side must run locally on pi.dev abstractions (tools and
-the rest). A user can pull an agent's configuration from the server and run it locally
-with the same behavior. Local-server parity is a first-class requirement, not an
-afterthought.
-
-## What the research established
-
-Full write-ups live in [`research/`](research/). The load-bearing conclusions:
-
-- **pi.dev is "Pi"**, an open-source TypeScript/Node agent harness by Earendil Inc. (MIT,
-  ~v0.79.4). It is local-first (a CLI/SDK/RPC, not a hosted service) and moves fast (0.x,
-  roughly weekly releases). There is no Python SDK.
-  See [`research/pi-interaction.md`](research/pi-interaction.md),
-  [`research/open-questions.md`](research/open-questions.md).
-- **Pi can run fully diskless.** Via the SDK's `createAgentSession`, AGENTS.md
-  (`systemPromptOverride`/`agentsFilesOverride`), skills (`skillsOverride`), tools
-  (`customTools`), LLM auth (`setRuntimeApiKey` / `AuthStorage.inMemory()` / env), and
-  session/settings/model state (`*.inMemory()`) are all in-memory. The only forced disk
-  write is bash output spillover to `os.tmpdir()`, redirected with `TMPDIR` to a per-run
-  tmpfs. See [`research/diskless-in-memory-config.md`](research/diskless-in-memory-config.md).
-- **"pi instruments" is not a product.** Pi emits no OTel by itself. Instrumentation is a
-  Pi extension on the `pi.on(...)` event bus that turns lifecycle events into OTLP spans.
-  Agenta already ingests OTLP at `POST /otlp/v1/traces` with adapters for GenAI semconv
-  and OpenInference, so `gen_ai.*` spans flow with little new backend code. Watch the
-  token-attribute drift (`input_tokens`/`output_tokens` vs the mapped
-  `prompt_tokens`/`completion_tokens`). See
-  [`research/otel-instrumentation.md`](research/otel-instrumentation.md).
-- **The harness seam is ours to build.** Pi's own "harness" concept is not a swap point
-  for Codex or Claude Code. The recommended shape is a thin TypeScript wrapper that drives
-  Pi's SDK with the in-memory overrides above and exposes our own protocol on a port. That
-  wrapper is the "works with our port" contract, the swappable-harness boundary, and the
-  local/server parity point. See [`research/auth-secrets.md`](research/auth-secrets.md).
-- **One shared sandbox is viable for v1.** Daytona supports one long-lived sandbox reused
-  across runs. It does not support swapping a volume per execution (volumes mount at create
-  time only). Per-run isolation comes from process memory plus a per-run tmpfs, not a
-  volume, which the diskless finding makes clean. Concurrency is contended, so bound it.
-  See [`research/sandbox-sharing.md`](research/sandbox-sharing.md),
-  [`research/daytona-sandbox.md`](research/daytona-sandbox.md).
-
-## POC work packages
-
-The POC runs as parallel tracks. Each has its own folder with scope and a definition of
-done. WP-1 and WP-2 run against a local Pi install first (no Daytona). WP-3 takes the
-sandbox path in parallel. WP-4 and WP-5 are design tasks that feed the WP-2 interface. WP-6 registers the agent as a
-backend workflow type and template, and defines its configuration and connection to the
-running agent.
-
-- [`wp-1-pi-tracing/`](wp-1-pi-tracing/README.md) — install Pi locally and send its agent
-  telemetry to Agenta as clean, structured traces.
-- [`wp-2-agent-service/`](wp-2-agent-service/README.md) — a new service that wraps Pi and
-  exposes a completion/chat-style interface, with auth and AGENTS.md set up in memory.
-- [`wp-3-daytona-sandbox/`](wp-3-daytona-sandbox/README.md) — create a Daytona sandbox with
-  Pi installed, inject files and secrets, run an agent, and stream output back.
-- [`wp-4-multi-message-output/`](wp-4-multi-message-output/README.md) — define how an
-  agent's multi-message output is shaped, streamed, stored, and surfaced.
-- [`wp-5-chat-vs-completion/`](wp-5-chat-vs-completion/README.md) — decide the interface
-  contract; start with chat that takes a single input.
-- [`wp-6-workflow-type-and-template/`](wp-6-workflow-type-and-template/README.md) — register
-  the agent as a new backend workflow type and template; define its config (model) and the
-  connection to the running agent.
-- [`wp-7-tools/`](wp-7-tools/README.md) — make runnable tools part of the agent config; resolve
-  Composio actions into Pi tools and route tool calls back through the existing
-  `POST /tools/call`, with MCP and workflow-as-tool as future adapters.
-- [`wp-8-rivet-acp-runtime/`](wp-8-rivet-acp-runtime/README.md) — re-platform the service onto
-  `rivet-dev/sandbox-agent` so the agent is driven over ACP and the harness (Pi, Claude Code,
-  Codex) becomes a config value, running locally first; tools, Daytona, and the folder jail deferred.
-
-## Related work
-
-- [`harness-port-redesign/`](harness-port-redesign/README.md) — research and proposal for
-  evolving the `Harness` and `Runtime` ports to learn from the rivet `sandbox-agent` SDK
-  (sessions, structured event streaming, capabilities, attachments, lifecycle). Follows on
-  from WP-8.
-- [`../prompt-runtime-unification/`](../prompt-runtime-unification/README.md) — the
-  prompt-side runtime that "future agent-style services" were already anticipated against.
+`scratch/` holds the raw working material from the build: the original work-package folders
+(WP-1 through WP-8), the port redesign notes, the research write-ups, and the proof-of-concept
+spikes. The pages above supersede it. It stays for history and for the running POC code, and
+it is not meant to be read as the design.
diff --git a/docs/design/agent-workflows/adapters/claude-code.md b/docs/design/agent-workflows/adapters/claude-code.md
new file mode 100644
index 0000000000..ba76e203bd
--- /dev/null
+++ b/docs/design/agent-workflows/adapters/claude-code.md
@@ -0,0 +1,94 @@
+# The Claude Code adapter
+
+Claude Code is the second harness. It proves the central claim of this PoC: that swapping
+the agent is one config value. Where the [Pi adapter](pi.md) does much of its work inside Pi
+through an extension, Claude does its work through standard ACP. That makes Claude the
+template for any MCP-capable harness rivet can drive.
+
+Read the [architecture](../architecture.md) and [ports and adapters](../ports-and-adapters.md)
+pages first.
+
+## Running Claude
+
+The daemon resolves the harness id `claude` to the `claude-agent-acp` adapter, which starts
+the `claude` CLI. One operational detail is worth calling out, because it caused a real bug.
+The daemon does not ship the `claude` CLI. It downloads it over HTTPS the first time a run
+asks for Claude. The sidecar image is a slim Node image with no root certificates, so that
+HTTPS download failed until we added `ca-certificates` to the image. With the certs in
+place, the download verifies and Claude runs.
+
+Auth is config, like everything else. Claude authenticates with `ANTHROPIC_API_KEY` from the
+project vault when present, or with an OAuth token (`CLAUDE_CODE_OAUTH_TOKEN`) otherwise. The
+runner turns the common failures into one clear line, so a user sees "add the project's
+Anthropic key" rather than a stack trace.
+
+## Tools over MCP
+
+Claude advertises the `mcpTools` capability, so the runner delivers tools to Claude the
+standard ACP way, over MCP. This is the branch that the [capability probe](../ports-and-adapters.md)
+chooses: deliver over MCP when the harness reports `mcpTools`, rather than when the harness
+name matches a hard-coded value.
+
+The mechanism is a small stdio MCP server (`tools/mcp-server.ts`) that the daemon launches
+and attaches to the session. Its tool bodies POST back to Agenta's `/tools/call` with the
+same WP-7 envelope the Pi path uses. The resolved specs and the callback endpoint reach the
+MCP server through its environment, so nothing tool-specific is written to a file the agent
+can read. The safety property is identical to Pi's: the provider key and the connection auth
+stay server-side, and the agent only ever asks Agenta to run a named tool.
+
+## Permissions
+
+Claude gates tool use behind a permission prompt. In an Agenta run there is no human at the
+keyboard to answer it, so the runner answers for it. By default it auto-approves, because the
+tools are backend-resolved and trusted. The per-run permission policy (or an env override)
+can flip this to deny, which rejects tool use instead. This is handled on
+`session.onPermissionRequest`, a hook Pi does not need because Pi does not gate tools this
+way.
+
+## Tracing from the event stream
+
+Claude does not self-instrument the way Pi does, because we do not load an Agenta extension
+into Claude. So the runner builds the trace itself, from the ACP event stream. It subscribes
+to the session's `session/update` notifications and turns them into the same span tree Pi
+produces:
+
+```
+invoke_agent            (AGENT)
+  turn 0                (CHAIN)
+    chat <model>        (LLM)
+    execute_tool <name> (TOOL)   one per ACP tool_call
+```
+
+This is the general path. Any harness rivet drives that does not bring its own
+instrumentation gets traced this way. Pi is the exception that traces itself; Claude is the
+rule.
+
+## Usage and output
+
+Claude reports usage in two places, so the runner reads both. The per-call input and output
+token split rides on the ACP `PromptResponse`, and the cost rides on the `usage_update`
+event. The runner combines them into the run total, which then rolls onto the workflow span
+the same way Pi's writeback total does.
+
+Output needs one small piece of care. Claude streams text deltas and also periodically
+streams a full cumulative snapshot of the message so far. If the runner naively appended
+everything, the answer would double. The runner detects a snapshot (a chunk that is a
+superset of what it already has) and replaces rather than appends, so the final text is
+correct whether a chunk is a delta or a snapshot.
+
+## Models
+
+Claude ignores a model id meant for another provider. Ask it for `gpt-5.5` and it keeps its
+own default. The runner handles this honestly: when the harness does not accept the requested
+model, the chat span is labelled `chat` rather than falsely claiming a model the run did not
+use.
+
+## What Claude demonstrates
+
+Claude is the proof that the seam works. Adding it required no change above the port and no
+new Python class. It also exercises the capability-driven branches the design is built on:
+tools over MCP because it reports `mcpTools`, a permission answer because it gates tools, and
+event-stream tracing because it does not self-instrument. A future harness that rivet can
+drive would reuse this exact path. A future harness that rivet cannot drive would instead get
+its own engine beside `engines/pi.ts` and `engines/rivet.ts`, behind the same `/run`
+contract.
diff --git a/docs/design/agent-workflows/adapters/pi.md b/docs/design/agent-workflows/adapters/pi.md
new file mode 100644
index 0000000000..7292ac0522
--- /dev/null
+++ b/docs/design/agent-workflows/adapters/pi.md
@@ -0,0 +1,112 @@
+# The Pi adapter
+
+Pi is the default harness. This page explains how we run it, how it gets its tools, and how
+it traces itself. Pi is the richer of the two adapters because Pi has an extension API we
+can use, so much of the work happens inside Pi rather than around it.
+
+Read the [architecture](../architecture.md) and [ports and adapters](../ports-and-adapters.md)
+pages first. This page assumes the relay and the wire contract.
+
+## Two ways Pi runs
+
+Pi runs through one of two engines, both behind the same port:
+
+- **Over ACP, through rivet** (`engines/rivet.ts` with `harness: pi`). This is the main
+  path and the one the rest of this page describes. The rivet daemon starts the `pi-acp`
+  adapter, which starts the `pi` CLI.
+- **In-process** (`engines/pi.ts`). This drives the Pi SDK directly inside the sidecar, with
+  no daemon, no adapter, and no ACP. It is the simplest local path and a fallback. The last
+  section covers it.
+
+## The ACP path: pi-acp plus a bundled extension
+
+On the ACP path, the daemon resolves the harness id `pi` to the `pi-acp` adapter. One detail
+matters: `pi-acp` does not bundle Pi. It spawns the `pi` CLI from `PATH`, so the runner
+points it at our pinned `pi` binary (`PI_ACP_PI_COMMAND`) and puts our `node_modules/.bin`
+on the daemon's `PATH`.
+
+The interesting part is what we load into Pi. We ship a single **Pi extension**
+(`extensions/agenta.ts`, bundled to `dist/extensions/agenta.js`) and install it into Pi's
+agent directory. Pi loads it on every run. This one extension does two jobs: it delivers our
+tools the Pi-native way, and it traces the run. Both are driven entirely by environment
+variables, so the extension stays inert when none are set and is safe to install globally.
+
+## Tools, the Pi-native way
+
+Pi 0.79.4 does not support MCP. So we do not deliver tools over MCP to Pi. Instead the
+extension reads the resolved tool specs from `AGENTA_TOOL_SPECS` and registers each one with
+Pi directly through `pi.registerTool`. Pi then sees them as native tools and runs the loop.
+
+Each registered tool's body does one thing: it POSTs the call back to Agenta's `/tools/call`
+with the tool's `callRef` (the WP-7 envelope). The model picks the tool and supplies the
+arguments; Agenta runs the actual tool server-side. This is the key safety property: the
+Composio key and the connection auth never enter the sandbox. The agent only ever asks
+Agenta to run a named tool.
+
+On Daytona the in-sandbox process cannot reach Agenta directly, so the extension writes each
+tool request to a file (`AGENTA_TOOL_RELAY_DIR`) and the runner, which can reach Agenta,
+relays it to `/tools/call` and writes the answer back. Same envelope, different delivery.
+
+## Tracing: Pi instruments itself
+
+Pi emits lifecycle events on an in-process event bus (`pi.on(...)`). The extension hooks
+those events and turns them into OpenTelemetry spans, the same span tree completion and chat
+already produce:
+
+```
+invoke_agent            (AGENT)
+  turn N                (CHAIN)
+    chat <model>        (LLM)    real token usage from the provider call
+    execute_tool <name> (TOOL)   one per tool the turn ran
+```
+
+The runner passes the caller's `traceparent` to the extension as `AGENTA_TRACEPARENT`. The
+extension starts `invoke_agent` as a child of that span, so the whole Pi run joins the same
+trace as the `/invoke` request. Because Pi self-instruments with real provider data, its
+spans carry true per-call token counts, not estimates.
+
+This is why the rivet engine does not also build spans for Pi. It would double them. The
+engine emits its own spans only for harnesses that do not self-instrument (see the
+[Claude Code adapter](claude-code.md)).
+
+## Usage writeback: the one extra hop
+
+Pi reports no token usage over ACP. It only has the numbers in-process. And the Pi spans and
+the workflow span ship to Agenta in separate batches, so Agenta cannot roll Pi's per-call
+tokens up onto the workflow span on its own.
+
+The fix is a small handoff. On `agent_end`, the extension writes the run's token and cost
+totals to a file (`AGENTA_USAGE_OUT`). The runner reads that file after the prompt finishes
+and returns the totals on the `/run` result. The Python service then stamps them on the live
+workflow span. The result is that `_agent` shows the agent's real tokens and cost even
+though the Pi spans and the workflow span shipped in separate batches.
+
+## Models and output
+
+Pi exposes provider-prefixed model ids, like `openai-codex/gpt-5.5`. The runner normalizes a
+requested id to Pi's own id: it tries the value as given, and on rejection it matches by the
+part after the provider prefix. If nothing matches, Pi keeps its default and the run still
+answers.
+
+For output, Pi streams pure text deltas over ACP (`agent_message_chunk`). The runner
+appends them in order to build the final answer.
+
+## Daytona notes
+
+Two things differ on Daytona. The rivet `-full` image ships the `pi-acp` adapter but not the
+`pi` CLI, so the runner either installs `pi` into the sandbox at session time or runs from a
+pre-baked snapshot that already has it (the snapshot path avoids a slow per-run install).
+And auth comes from the provider key in the sandbox env when present, or from an uploaded
+`auth.json` (the developer's OAuth login) when no key is set.
+
+## The in-process engine
+
+The legacy engine (`engines/pi.ts`) skips rivet entirely. It drives Pi's `createAgentSession`
+directly, with everything in memory: AGENTS.md injected through the resource loader, the
+session and settings managers in memory, and a throwaway working directory. It registers the
+same WP-7 tools as Pi `customTools` (the same POST-back-to-`/tools/call` body) and traces
+with the same extension logic, just wired in process rather than loaded from disk.
+
+It returns the same `/run` result as the rivet path, which is the whole point of the port:
+the workflow author cannot tell which engine ran. It exists for the simplest local case and
+as a path that does not depend on the rivet daemon being present.
diff --git a/docs/design/agent-workflows/architecture.md b/docs/design/agent-workflows/architecture.md
new file mode 100644
index 0000000000..62cf1bfe85
--- /dev/null
+++ b/docs/design/agent-workflows/architecture.md
@@ -0,0 +1,180 @@
+# Architecture
+
+This page explains how an agent runs inside Agenta, from the moment a request arrives
+to the moment the answer comes back. Read it first. The other pages go deeper into the
+[ports and adapters](ports-and-adapters.md), [sessions](sessions.md), and the two
+shipped adapters ([Pi](adapters/pi.md), [Claude Code](adapters/claude-code.md)).
+
+## What an agent workflow is
+
+Agenta already runs prompt workflows: completion, chat, and the LLM judge. Each one calls
+a model once and returns one answer. An agent is different. It runs a loop. It reads its
+instructions, calls a model, runs a tool, reads the result, and calls the model again. It
+keeps going until the task is done, then returns the final answer.
+
+This PoC adds the agent as a new kind of workflow. It sits behind the same `/invoke`
+endpoint every other workflow uses, traces into the same spans, and reads its config from
+the same playground.
+
+The loop itself is not the hard part. Open-source coding agents already run the loop well.
+The hard part is running one of those agents *as an Agenta workflow*: behind the standard
+contract, traced into the standard spans, with the agent and the place it runs both
+swappable by config. That is the problem this architecture solves.
+
+## The core idea: a relay of programs
+
+The system is a relay. Each program starts the next one and passes work down the line. The
+prompt travels down the relay, and the answer travels back up.
+
+Here is the whole relay for a normal local run:
+
+```
+ browser / playground
+     │   POST /invoke
+     ▼
+ ┌───────────────────────────────────────────────────
+ │ CONTAINER 1: "services"   (Python / FastAPI)
+ │   the Agenta backend. Parses the request,
+ │   gathers config, and calls the runner.
+ └───────────────────────────────────────────────────
+     │   POST http://agent-pi:8765/run
+     ▼
+ ┌───────────────────────────────────────────────────
+ │ CONTAINER 2: "agent-pi"   (Node / TypeScript)
+ │   the sidecar.  server.ts → engines/rivet.ts
+ │
+ │   rivet daemon                  (subprocess)
+ │     └── ACP adapter: pi-acp     (subprocess)
+ │           └── pi                (subprocess)   ← the harness
+ └───────────────────────────────────────────────────
+     │   HTTPS
+     ▼
+ OpenAI / Anthropic   (the model)
+```
+
+Two containers carry the request. Inside the second one, a small tree of processes does
+the work. Each box has a clear job, and the next sections name them.
+
+## The two containers
+
+The deployment runs two containers that matter here. Both stay up all the time. You can
+see both in `hosting/docker-compose/ee/docker-compose.dev.yml`.
+
+The **`services`** container runs the Python backend. Every Agenta workflow lives here,
+including the agent. When you run an agent in the playground, the request lands in this
+container. The handler reads the config (which agent, which model, the instructions, the
+tools, the provider keys), builds one request, and calls the runner over HTTP.
+
+The **`agent-pi`** container is the sidecar. It runs a small Node web server on port 8765.
+Its only job is to receive a `POST /run`, drive the agent, and return the result. The
+`services` container reaches it on the internal network at `http://agent-pi:8765`.
+
+"Sidecar" just names a small helper container that runs next to a main one. Two reasons
+justify the split. The agent code is TypeScript and the backend is Python, so they want
+different runtimes. And the sidecar deliberately holds none of the stack's secrets (it has
+no `env_file`), so a sandboxed agent cannot read the platform's Stripe or Composio keys.
+
+## Inside the sidecar: the process tree
+
+The sidecar does not run the agent itself. When a `/run` request arrives, its TypeScript
+starts a chain of child processes, and each one starts the next.
+
+1. **The rivet daemon** (`sandbox-agent server`). Our code spawns it as a child process.
+   It is a binary from the open-source [`rivet-dev/sandbox-agent`](https://github.com/rivet-dev/sandbox-agent)
+   project (Apache-2.0). Think of it as a manager. You tell it "run agent `pi` with this
+   prompt," and it handles the work of launching the agent and streaming results back.
+
+2. **The ACP adapter** (`pi-acp`, or `claude-agent-acp` for Claude). The daemon spawns it
+   as a child process. It is a translator. It speaks ACP on the side facing the daemon and
+   the agent's own protocol on the side facing the agent.
+
+3. **The harness** (`pi`, or the `claude` CLI). The adapter spawns it as a child process.
+   This is the real coding agent. It reads the instructions, calls the model, runs tools,
+   and loops until the task is done.
+
+All three run as processes inside the `agent-pi` container. They are not separate
+containers. They form a parent-child-grandchild tree.
+
+## The vocabulary, defined once
+
+| Term | What it is |
+| --- | --- |
+| **Harness** | The coding agent program. Pi, Claude Code, and Codex are harnesses. Each is a CLI that takes instructions, calls a model, runs tools, and loops. "Harness" is our umbrella word for "the agent engine." |
+| **ACP** (Agent Client Protocol) | A shared language for talking to any coding agent. Without it, each agent has its own API and you write custom glue per agent. With it, you speak one protocol and the agent on the far end is swappable. This is why one config value flips `pi` to `claude`. |
+| **ACP adapter** | The translator that makes one specific agent speak ACP. Pi does not speak ACP on its own, so `pi-acp` wraps it. Claude has `claude-agent-acp`. |
+| **rivet daemon** | The manager that starts the adapter and harness, hides *where* they run, and streams their events back over ACP. We use it; we did not write it. |
+| **Sandbox** | *Where* the agent's process tree runs. `local` means processes inside the sidecar. `daytona` means a throwaway cloud machine. |
+| **Sidecar** | The always-on helper container (`agent-pi`) that drives runs. Not the sandbox. The sidecar starts the sandbox. |
+
+## Two axes you can change: harness and sandbox
+
+The whole point of the relay is that two pieces swap independently, by config, with no code
+change. The playground exposes both as dropdowns.
+
+- **Harness** chooses *which* agent runs: `pi` or `claude`. It becomes the rivet `agent`
+  value, which selects the ACP adapter.
+- **Sandbox** chooses *where* the agent's process tree runs: `local` or `daytona`.
+
+The two are orthogonal. You can run `pi` locally, `claude` locally, or `pi` on Daytona, and
+each is one dropdown change. The request also carries a **permission policy** (`auto` or
+`deny`) that decides how a permission-gating harness like Claude handles tool prompts in a
+run with no human watching.
+
+## Local versus Daytona: the same tree, a different place
+
+The relay above is `sandbox: local`. The daemon, adapter, and harness all run as processes
+inside the `agent-pi` container, on our own server.
+
+Switch to `sandbox: daytona` and one thing changes. That same tree runs in a Daytona cloud
+sandbox instead. Daytona starts a throwaway remote machine, the daemon and adapter and
+harness run there, and the sidecar talks to them over HTTP. Everything else is identical.
+
+So the sidecar is not the sandbox. The sidecar is the always-on driver. The sandbox is the
+place the agent runs, which is either "processes inside the sidecar" (`local`) or "a cloud
+machine the sidecar talks to" (`daytona`).
+
+## The lifecycle: cold per run
+
+Nothing in the process tree stays alive between runs. Only the two containers stay up.
+Every invoke starts a fresh daemon, which starts a fresh adapter, which starts a fresh
+harness. The run does its work, returns its answer, and then the runner tears the whole
+tree down (`destroySandbox` and `dispose` in a `finally` block). The next invoke builds the
+tree again from scratch.
+
+This is the **cold** model. It is simple and well isolated, and it has one consequence
+worth stating up front: because no session is held between turns, a multi-turn conversation
+replays its history on every turn. [Sessions](sessions.md) covers what that means today and
+how a warm model could change it tomorrow.
+
+## The other engine: in-process Pi
+
+The relay above describes the **rivet engine**, the default in the deployed stack and the
+path the rest of these docs assume. The runner also ships a second engine: **legacy
+in-process Pi**. It drives the Pi SDK directly inside the sidecar, with no daemon,
+adapter, or ACP in between. It exists for the simplest local case and as a fallback
+that does not depend on the rivet daemon.
+
+Both engines sit behind the same Python port and serve the same `/run` contract, so the
+choice between them is a deployment detail, not a difference the workflow author sees. The
+[ports and adapters](ports-and-adapters.md) page explains how one neutral seam holds both.
+
+## How a request flows, end to end
+
+Putting it together, a single agent run on `pi` / `local` goes like this:
+
+1. The playground sends `POST /invoke` to the `services` container.
+2. The Python handler (`agent/app.py`) reads the config, resolves the tools and provider
+   keys, and builds a `SessionConfig`.
+3. It picks the engine (`rivet`) and the transport (HTTP to the sidecar), then sends one
+   `POST /run`.
+4. The sidecar's rivet engine starts the daemon, which starts `pi-acp`, which starts `pi`.
+5. `pi` reads the instructions, calls the model, runs any tools, and streams events back up
+   the relay. Those events become trace spans nested under the `/invoke` span (the
+   [Pi adapter](adapters/pi.md) page explains who emits them).
+6. The harness finishes. The runner reads the final text and the token usage, tears the
+   tree down, and returns one `/run` result.
+7. The Python handler records the usage on the workflow span and returns the assistant
+   message as the `/invoke` response.
+
+The next pages explain the seam that makes step 3 engine-agnostic, the session model behind
+steps 4 to 6, and exactly how each adapter implements step 5.
diff --git a/docs/design/agent-workflows/ports-and-adapters.md b/docs/design/agent-workflows/ports-and-adapters.md
new file mode 100644
index 0000000000..1914c1e9f8
--- /dev/null
+++ b/docs/design/agent-workflows/ports-and-adapters.md
@@ -0,0 +1,172 @@
+# Ports and adapters
+
+The [architecture](architecture.md) page showed the relay of programs. This page shows the
+seam that keeps that relay swappable. It explains why the seam exists, what shapes it
+defines, and how the service picks an engine and a transport at run time.
+
+## The problem the seam solves
+
+We want three things at once. We want to run more than one coding agent (Pi today, Claude
+Code today, others later). We want to run them in more than one place (local today, Daytona
+today, other sandboxes later). And we never want either choice to leak into the workflow
+code that sits above it.
+
+A neutral port solves this. The Python service talks to one small interface. Everything
+agent-specific and place-specific lives in adapters behind that interface. Rivet, which
+does most of the heavy lifting, is one adapter behind the port, not the port itself. That
+keeps the door open for a future agent that rivet cannot drive (see the
+[Claude Code adapter](adapters/claude-code.md) page for how a non-rivet engine would slot
+in).
+
+We learned the shape of this port by studying the rivet SDK. Rivet splits its surface into
+three planes, and that split is the main lesson:
+
+| Plane | What it covers | In our port? |
+| --- | --- | --- |
+| Runtime / sandbox | Where the agent runs, and its lifecycle | Yes, as the **Environment** seam |
+| Agent session | The prompt, the config, the event stream | Yes, as the **Harness** and **AgentSession** seams |
+| System | Filesystem, process, desktop control | No. This is provisioning, used only inside an adapter, never shown to the workflow author |
+
+The first two planes became our two seams. The third we keep out of the port on purpose: a
+workflow author configures an agent, not a filesystem.
+
+## Seam one: the Environment (where it runs)
+
+The `Environment` seam answers one question: where does the harness process run? The
+`LocalEnvironment` runs it as a subprocess on this host. It has a `start` and a `dispose`
+lifecycle and one real method, `exec`, which runs a command and feeds it the request on
+stdin.
+
+Daytona does not need a separate Python `Environment`. The rivet engine selects the Daytona
+sandbox inside the TypeScript runner, below the port, so "run on a cloud machine" is an
+adapter detail rather than a second Python class. The `Environment` seam stays thin on
+purpose.
+
+## Seam two: the Harness and the AgentSession (the conversation)
+
+The `Harness` seam is the heart of the port. It is the agent engine, and rivet and the
+legacy Pi path are both adapters behind it.
+
+```python
+class Harness(ABC):
+    async def setup(self) -> None: ...
+    async def shutdown(self) -> None: ...
+    async def invoke(self, request, *, on_event=None) -> AgentResult: ...
+    async def destroy_session(self, session_id) -> None: ...   # cold: a no-op
+    def create_session(self, config) -> AgentSession: ...
+```
+
+`invoke` is the single transport call: one cold run in, one structured result out. On top of
+it sits the `AgentSession`, the first-class abstraction borrowed from rivet:
+
+```python
+class AgentSession:
+    async def prompt(self, messages, *, on_event=None) -> AgentResult: ...
+    async def destroy(self) -> None: ...
+```
+
+The workflow handler always works through the session: `create_session(config)`, then
+`session.prompt(messages)`, then `session.destroy()`. Under the cold model the session
+holds no warm daemon, so each `prompt` is a fresh `invoke` that replays the supplied
+history. The abstraction is real and stable even though the lifecycle behind it is cold.
+[Sessions](sessions.md) explains why we kept it cold and what a warm session would change.
+
+## The engine is config, not a class
+
+A reader expecting three Python classes (one per agent) will be surprised. There are two
+*transports*, and the *engine* is a value they pass, not a class hierarchy.
+
+The two transports differ only in how they reach the TypeScript runner:
+
+- **`SubprocessHarness`** spawns the runner's CLI through an `Environment` and hands it the
+  request on stdin. This is the local, no-Docker path.
+- **`HttpHarness`** sends a `POST /run` to the sidecar. This is the deployed path.
+
+Each transport carries a `backend` value (`rivet` or `pi`) that tells the runner which
+engine to use. So the choice of *agent engine* is one string on the wire, and the choice of
+*how Python reaches the runner* is the transport. Collapsing the engine into config is what
+replaced the old `PiHarness` / `PiHttpHarness` / `RivetHarness` trio with two transports
+and one wire contract.
+
+## How the service picks an engine and a transport
+
+The handler makes both choices on every request, in `agent/app.py`.
+
+It picks the **engine** with `select_backend(harness, sandbox)`. The rule is simple: use
+`rivet` when `AGENTA_AGENT_RUNTIME=rivet` is set, or when the harness is anything other than
+`pi`, or when the sandbox is anything other than `local`. The legacy in-process Pi engine
+only knows how to run Pi locally, so any Claude or Daytona selection forces `rivet` rather
+than silently dropping the choice.
+
+It picks the **transport** with `build_harness(backend)`. If `AGENTA_AGENT_PI_URL` is set
+(the Docker deployment), it uses `HttpHarness` against the sidecar. If it is unset (a local
+checkout), it uses `SubprocessHarness` and spawns the runner directly.
+
+Engine and transport are deployment concerns. Harness and sandbox are workflow config. The
+seam keeps the two kinds of choice from tangling.
+
+## The wire contract: one `/run` shape
+
+Both transports send the same camelCase JSON and parse the same result back. The shape
+lives once in `harness/wire.py` on the Python side and `protocol.ts` on the TypeScript
+side. This contract is the actual boundary of the system.
+
+**Request** (the `SessionConfig` plus the conversation):
+
+| Field | Meaning |
+| --- | --- |
+| `harness`, `sandbox` | The two swap axes |
+| `sessionId` | Continue a prior run by replaying its history |
+| `agentsMd` | The agent's instructions, written as `AGENTS.md` |
+| `model` | The requested model id |
+| `messages` | The conversation so far; the runner sends the latest turn and replays the rest |
+| `secrets` | Provider API keys as env vars, resolved from the project vault |
+| `tools`, `customTools`, `toolCallback` | The resolved runnable tools and where they call back |
+| `permissionPolicy` | `auto` or `deny` for a permission-gating harness |
+| `trace` | The Agenta trace context, so the run nests under the `/invoke` span |
+
+**Result** (the reply plus structured run metadata):
+
+| Field | Meaning |
+| --- | --- |
+| `output` | The final assistant text (what the playground renders) |
+| `messages` | The structured assistant messages |
+| `events` | The structured event log for the turn (see below) |
+| `usage` | Token and cost totals, rolled onto the workflow span |
+| `stopReason` | Why the turn ended |
+| `capabilities` | What the harness was probed to support this run |
+| `sessionId`, `model`, `traceId` | Identifiers for the run |
+
+## The shared vocabulary: capabilities, content blocks, events
+
+Three neutral types travel on that wire. They are ours, not rivet's, so a non-rivet adapter
+implements them too.
+
+**Capabilities** describe what a harness can do: `mcpTools`, `images`, `usage`,
+`streamingDeltas`, `permissions`, and the rest. The rivet engine probes them live from the
+daemon and returns them in the result. This is what removed the brittle `if harness == "pi"`
+branches: the runner now branches on a probed capability flag, which reflects what the
+harness actually reported. For example, it delivers tools over MCP only when the harness
+reports `mcpTools`, instead of guessing from the name.
+
+**Content blocks** mirror ACP: a message's content is either a plain string or a list of
+`text` / `image` / `resource` blocks. Today the playground sends only text. The image and
+resource kinds are plumbed through the types so an image-capable harness can take them once
+the playground sends them.
+
+**Events** are the structured stream. Each event is one of `message`, `thought`,
+`tool_call`, `tool_result`, `usage`, `error`, or `done`. The runner builds this log from the
+harness as the run proceeds and returns it on the result. An `on_event` sink can also
+receive the events. Today the transports deliver the whole log at once after the run, since
+`/run` is request-and-response; live streaming over the HTTP edge is a documented follow-on.
+This event vocabulary is also what makes a Vercel-AI-style stream easy to add later, because
+the event kinds line up with that protocol's parts almost one to one.
+
+## Why this shape
+
+The port mirrors rivet's vocabulary but keeps the types ours. That gives us rivet's rich
+session, capability probe, and event stream without making the port a rivet wrapper. The
+single neutral seam carries two engines today (rivet over ACP, legacy in-process Pi) and has
+room for a third tomorrow. The cost of that flexibility is one extra hop and one wire
+contract to keep in sync across two languages, which the `wire.py` / `protocol.ts` pairing
+confines to one file on each side.
diff --git a/docs/design/agent-workflows/harness-port-redesign/README.md b/docs/design/agent-workflows/scratch/harness-port-redesign/README.md
similarity index 100%
rename from docs/design/agent-workflows/harness-port-redesign/README.md
rename to docs/design/agent-workflows/scratch/harness-port-redesign/README.md
diff --git a/docs/design/agent-workflows/harness-port-redesign/implementation.md b/docs/design/agent-workflows/scratch/harness-port-redesign/implementation.md
similarity index 100%
rename from docs/design/agent-workflows/harness-port-redesign/implementation.md
rename to docs/design/agent-workflows/scratch/harness-port-redesign/implementation.md
diff --git a/docs/design/agent-workflows/harness-port-redesign/plan.md b/docs/design/agent-workflows/scratch/harness-port-redesign/plan.md
similarity index 100%
rename from docs/design/agent-workflows/harness-port-redesign/plan.md
rename to docs/design/agent-workflows/scratch/harness-port-redesign/plan.md
diff --git a/docs/design/agent-workflows/harness-port-redesign/proposal.md b/docs/design/agent-workflows/scratch/harness-port-redesign/proposal.md
similarity index 100%
rename from docs/design/agent-workflows/harness-port-redesign/proposal.md
rename to docs/design/agent-workflows/scratch/harness-port-redesign/proposal.md
diff --git a/docs/design/agent-workflows/harness-port-redesign/research.md b/docs/design/agent-workflows/scratch/harness-port-redesign/research.md
similarity index 100%
rename from docs/design/agent-workflows/harness-port-redesign/research.md
rename to docs/design/agent-workflows/scratch/harness-port-redesign/research.md
diff --git a/docs/design/agent-workflows/harness-port-redesign/status.md b/docs/design/agent-workflows/scratch/harness-port-redesign/status.md
similarity index 100%
rename from docs/design/agent-workflows/harness-port-redesign/status.md
rename to docs/design/agent-workflows/scratch/harness-port-redesign/status.md
diff --git a/docs/design/agent-workflows/research/auth-secrets.md b/docs/design/agent-workflows/scratch/research/auth-secrets.md
similarity index 100%
rename from docs/design/agent-workflows/research/auth-secrets.md
rename to docs/design/agent-workflows/scratch/research/auth-secrets.md
diff --git a/docs/design/agent-workflows/research/daytona-sandbox.md b/docs/design/agent-workflows/scratch/research/daytona-sandbox.md
similarity index 100%
rename from docs/design/agent-workflows/research/daytona-sandbox.md
rename to docs/design/agent-workflows/scratch/research/daytona-sandbox.md
diff --git a/docs/design/agent-workflows/research/diskless-in-memory-config.md b/docs/design/agent-workflows/scratch/research/diskless-in-memory-config.md
similarity index 100%
rename from docs/design/agent-workflows/research/diskless-in-memory-config.md
rename to docs/design/agent-workflows/scratch/research/diskless-in-memory-config.md
diff --git a/docs/design/agent-workflows/research/open-questions.md b/docs/design/agent-workflows/scratch/research/open-questions.md
similarity index 100%
rename from docs/design/agent-workflows/research/open-questions.md
rename to docs/design/agent-workflows/scratch/research/open-questions.md
diff --git a/docs/design/agent-workflows/research/otel-instrumentation.md b/docs/design/agent-workflows/scratch/research/otel-instrumentation.md
similarity index 100%
rename from docs/design/agent-workflows/research/otel-instrumentation.md
rename to docs/design/agent-workflows/scratch/research/otel-instrumentation.md
diff --git a/docs/design/agent-workflows/research/pi-interaction.md b/docs/design/agent-workflows/scratch/research/pi-interaction.md
similarity index 100%
rename from docs/design/agent-workflows/research/pi-interaction.md
rename to docs/design/agent-workflows/scratch/research/pi-interaction.md
diff --git a/docs/design/agent-workflows/research/sandbox-sharing.md b/docs/design/agent-workflows/scratch/research/sandbox-sharing.md
similarity index 100%
rename from docs/design/agent-workflows/research/sandbox-sharing.md
rename to docs/design/agent-workflows/scratch/research/sandbox-sharing.md
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/README.md b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/README.md
similarity index 100%
rename from docs/design/agent-workflows/wp-1-pi-tracing/README.md
rename to docs/design/agent-workflows/scratch/wp-1-pi-tracing/README.md
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/integrating-the-tracing-extension.md b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/integrating-the-tracing-extension.md
similarity index 100%
rename from docs/design/agent-workflows/wp-1-pi-tracing/integrating-the-tracing-extension.md
rename to docs/design/agent-workflows/scratch/wp-1-pi-tracing/integrating-the-tracing-extension.md
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/poc/.env.example b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/.env.example
similarity index 100%
rename from docs/design/agent-workflows/wp-1-pi-tracing/poc/.env.example
rename to docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/.env.example
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/poc/README.md b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/README.md
similarity index 100%
rename from docs/design/agent-workflows/wp-1-pi-tracing/poc/README.md
rename to docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/README.md
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/poc/agenta-otel.ts b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/agenta-otel.ts
similarity index 100%
rename from docs/design/agent-workflows/wp-1-pi-tracing/poc/agenta-otel.ts
rename to docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/agenta-otel.ts
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/poc/package.json b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/package.json
similarity index 100%
rename from docs/design/agent-workflows/wp-1-pi-tracing/poc/package.json
rename to docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/package.json
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/poc/pnpm-lock.yaml b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/pnpm-lock.yaml
similarity index 100%
rename from docs/design/agent-workflows/wp-1-pi-tracing/poc/pnpm-lock.yaml
rename to docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/pnpm-lock.yaml
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/poc/run.ts b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/run.ts
similarity index 100%
rename from docs/design/agent-workflows/wp-1-pi-tracing/poc/run.ts
rename to docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/run.ts
diff --git a/docs/design/agent-workflows/wp-1-pi-tracing/tracing-in-the-agent-service.md b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/tracing-in-the-agent-service.md
similarity index 100%
rename from docs/design/agent-workflows/wp-1-pi-tracing/tracing-in-the-agent-service.md
rename to docs/design/agent-workflows/scratch/wp-1-pi-tracing/tracing-in-the-agent-service.md
diff --git a/docs/design/agent-workflows/wp-2-agent-service/README.md b/docs/design/agent-workflows/scratch/wp-2-agent-service/README.md
similarity index 100%
rename from docs/design/agent-workflows/wp-2-agent-service/README.md
rename to docs/design/agent-workflows/scratch/wp-2-agent-service/README.md
diff --git a/docs/design/agent-workflows/wp-2-agent-service/implementation-plan.md b/docs/design/agent-workflows/scratch/wp-2-agent-service/implementation-plan.md
similarity index 100%
rename from docs/design/agent-workflows/wp-2-agent-service/implementation-plan.md
rename to docs/design/agent-workflows/scratch/wp-2-agent-service/implementation-plan.md
diff --git a/docs/design/agent-workflows/wp-2-agent-service/qa.md b/docs/design/agent-workflows/scratch/wp-2-agent-service/qa.md
similarity index 100%
rename from docs/design/agent-workflows/wp-2-agent-service/qa.md
rename to docs/design/agent-workflows/scratch/wp-2-agent-service/qa.md
diff --git a/docs/design/agent-workflows/wp-3-daytona-sandbox/README.md b/docs/design/agent-workflows/scratch/wp-3-daytona-sandbox/README.md
similarity index 100%
rename from docs/design/agent-workflows/wp-3-daytona-sandbox/README.md
rename to docs/design/agent-workflows/scratch/wp-3-daytona-sandbox/README.md
diff --git a/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/README.md b/docs/design/agent-workflows/scratch/wp-3-daytona-sandbox/poc/README.md
similarity index 100%
rename from docs/design/agent-workflows/wp-3-daytona-sandbox/poc/README.md
rename to docs/design/agent-workflows/scratch/wp-3-daytona-sandbox/poc/README.md
diff --git a/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/bench_coldstart.py b/docs/design/agent-workflows/scratch/wp-3-daytona-sandbox/poc/bench_coldstart.py
similarity index 100%
rename from docs/design/agent-workflows/wp-3-daytona-sandbox/poc/bench_coldstart.py
rename to docs/design/agent-workflows/scratch/wp-3-daytona-sandbox/poc/bench_coldstart.py
diff --git a/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/build_snapshot.py b/docs/design/agent-workflows/scratch/wp-3-daytona-sandbox/poc/build_snapshot.py
similarity index 100%
rename from docs/design/agent-workflows/wp-3-daytona-sandbox/poc/build_snapshot.py
rename to docs/design/agent-workflows/scratch/wp-3-daytona-sandbox/poc/build_snapshot.py
diff --git a/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/cleanup.py b/docs/design/agent-workflows/scratch/wp-3-daytona-sandbox/poc/cleanup.py
similarity index 100%
rename from docs/design/agent-workflows/wp-3-daytona-sandbox/poc/cleanup.py
rename to docs/design/agent-workflows/scratch/wp-3-daytona-sandbox/poc/cleanup.py
diff --git a/docs/design/agent-workflows/wp-3-daytona-sandbox/poc/run_agent.py b/docs/design/agent-workflows/scratch/wp-3-daytona-sandbox/poc/run_agent.py
similarity index 100%
rename from docs/design/agent-workflows/wp-3-daytona-sandbox/poc/run_agent.py
rename to docs/design/agent-workflows/scratch/wp-3-daytona-sandbox/poc/run_agent.py
diff --git a/docs/design/agent-workflows/wp-4-multi-message-output/README.md b/docs/design/agent-workflows/scratch/wp-4-multi-message-output/README.md
similarity index 100%
rename from docs/design/agent-workflows/wp-4-multi-message-output/README.md
rename to docs/design/agent-workflows/scratch/wp-4-multi-message-output/README.md
diff --git a/docs/design/agent-workflows/wp-5-chat-vs-completion/README.md b/docs/design/agent-workflows/scratch/wp-5-chat-vs-completion/README.md
similarity index 100%
rename from docs/design/agent-workflows/wp-5-chat-vs-completion/README.md
rename to docs/design/agent-workflows/scratch/wp-5-chat-vs-completion/README.md
diff --git a/docs/design/agent-workflows/wp-6-workflow-type-and-template/README.md b/docs/design/agent-workflows/scratch/wp-6-workflow-type-and-template/README.md
similarity index 100%
rename from docs/design/agent-workflows/wp-6-workflow-type-and-template/README.md
rename to docs/design/agent-workflows/scratch/wp-6-workflow-type-and-template/README.md
diff --git a/docs/design/agent-workflows/wp-7-tools/README.md b/docs/design/agent-workflows/scratch/wp-7-tools/README.md
similarity index 100%
rename from docs/design/agent-workflows/wp-7-tools/README.md
rename to docs/design/agent-workflows/scratch/wp-7-tools/README.md
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/README.md b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/README.md
similarity index 100%
rename from docs/design/agent-workflows/wp-8-rivet-acp-runtime/README.md
rename to docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/README.md
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/architecture.md b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/architecture.md
similarity index 100%
rename from docs/design/agent-workflows/wp-8-rivet-acp-runtime/architecture.md
rename to docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/architecture.md
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/context.md b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/context.md
similarity index 100%
rename from docs/design/agent-workflows/wp-8-rivet-acp-runtime/context.md
rename to docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/context.md
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/isolation-and-fork.md b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/isolation-and-fork.md
similarity index 100%
rename from docs/design/agent-workflows/wp-8-rivet-acp-runtime/isolation-and-fork.md
rename to docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/isolation-and-fork.md
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/plan.md b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/plan.md
similarity index 100%
rename from docs/design/agent-workflows/wp-8-rivet-acp-runtime/plan.md
rename to docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/plan.md
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/build_rivet_snapshot.py b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/poc/build_rivet_snapshot.py
similarity index 100%
rename from docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/build_rivet_snapshot.py
rename to docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/poc/build_rivet_snapshot.py
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/commit_agent_config.py b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/poc/commit_agent_config.py
similarity index 100%
rename from docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/commit_agent_config.py
rename to docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/poc/commit_agent_config.py
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/debug-events.ts b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/poc/debug-events.ts
similarity index 100%
rename from docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/debug-events.ts
rename to docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/poc/debug-events.ts
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/dump-full.ts b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/poc/dump-full.ts
similarity index 100%
rename from docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/dump-full.ts
rename to docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/poc/dump-full.ts
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/package.json b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/poc/package.json
similarity index 100%
rename from docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/package.json
rename to docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/poc/package.json
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/spike.ts b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/poc/spike.ts
similarity index 100%
rename from docs/design/agent-workflows/wp-8-rivet-acp-runtime/poc/spike.ts
rename to docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/poc/spike.ts
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/research.md b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/research.md
similarity index 100%
rename from docs/design/agent-workflows/wp-8-rivet-acp-runtime/research.md
rename to docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/research.md
diff --git a/docs/design/agent-workflows/wp-8-rivet-acp-runtime/status.md b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/status.md
similarity index 100%
rename from docs/design/agent-workflows/wp-8-rivet-acp-runtime/status.md
rename to docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/status.md
diff --git a/docs/design/agent-workflows/sessions.md b/docs/design/agent-workflows/sessions.md
new file mode 100644
index 0000000000..2af252ee48
--- /dev/null
+++ b/docs/design/agent-workflows/sessions.md
@@ -0,0 +1,93 @@
+# Sessions: today and tomorrow
+
+A session is how a multi-turn conversation holds together across runs. This page explains
+what a session is in the PoC today, why we built it the simple way on purpose, and the two
+paths open to us tomorrow.
+
+## What a session is today
+
+Today a session is a `session_id` and the message history that goes with it. It is not a
+live process kept warm between turns. Every turn is a fresh, cold run: the runner starts the
+daemon, the adapter, and the harness, runs one turn, and tears all three down.
+
+Because nothing stays warm, the conversation has to be rebuilt on each turn. This works by
+**replay**. The playground holds the full message history and sends it back with every turn.
+The runner takes that history, flattens the prior turns into a short transcript, and puts
+the transcript in front of the new message before it prompts the harness:
+
+```
+Conversation so far:
+user: what is the capital of France?
+assistant: Paris.
+
+Continue the conversation. The user now says:
+and of Germany?
+```
+
+The transcript is capped (by `AGENTA_AGENT_HISTORY_MAX_CHARS`) so the replayed tokens stay
+bounded on long conversations. The `session_id` rides along on the trace (as `session.id`
+and `gen_ai.conversation.id`) and comes back on the result, so a follow-up turn can carry
+it forward.
+
+## The session is already a first-class object
+
+Even though the lifecycle is cold, the port models a session as a real object. The workflow
+handler does not call `invoke` directly. It calls:
+
+```python
+session = harness.create_session(config)
+result = await session.prompt(messages)
+await session.destroy()
+```
+
+`AgentSession` is the rivet-shaped abstraction described on the
+[ports and adapters](ports-and-adapters.md) page. Under the cold model, `prompt` is a fresh
+`invoke` that replays history and `destroy` is a no-op. The abstraction is stable. Only the
+mechanism behind it is cold. This matters because it gives a future session store a clean
+place to attach, with no change to the handler above it.
+
+## Why we kept it cold on purpose
+
+Rivet can do real, warm sessions. Its SDK has `createSession`, `resumeSession`, and the ACP
+`session/load` call, all backed by a persistence driver. The usual way to continue a
+conversation is to keep one daemon warm and replay events into it with `session/load`.
+
+We chose not to do that yet. A warm daemon shared across runs reopens hard questions that
+the cold model sidesteps: a per-session channel for secrets and trace context, and a
+filesystem jail so two tenants sharing a daemon cannot read each other's files. The cold
+model gives strong isolation for free, because each run is born and dies alone. For a PoC
+that proves the agent workflow end to end, that trade is the right one.
+
+The cost is the replay above. Replay spends tokens re-sending history, and it cannot restore
+in-harness state that a transcript does not capture (a partly built plan, a tool's cached
+result). For short conversations this is invisible. For long or stateful ones it is the
+thing a warm model would fix.
+
+## Tomorrow: two paths
+
+There are two ways to grow past cold replay, and they are not the same.
+
+**Path one: a server-side session store, still cold.** Keep one daemon per turn, but move
+the history out of the playground and into the platform. A `SessionStore` (backed by the
+backend database, or by a file for a standalone run) holds the event history. To continue,
+the service replays the persisted history into a fresh cold sandbox, exactly as today, but
+the platform owns the record instead of the client. This keeps the strong isolation of the
+cold model and still gives durable, server-owned sessions. It is the smaller step, and the
+`AgentSession` object is already the place it attaches.
+
+**Path two: a warm daemon with `session/load`.** Keep a daemon alive between turns and use
+the ACP `session/load` call to restore the real in-harness session without transcript replay.
+This is the richer model. It restores state a transcript cannot, and it opens the door to
+`session/fork` for trying several variations of a turn. It also requires the per-session
+secret channel and the filesystem jail we deferred, so it is the larger step.
+
+The likely order is path one first, then path two if and when stateful, long-running agents
+need it. Path one is an additive feature behind the existing port. Path two is a change to
+the runtime model.
+
+## The open question
+
+Path one leaves one decision for the team: where the event history lives. The default
+assumption is the backend database on the platform and a file for a standalone run, which
+mirrors how the rest of Agenta splits platform storage from local runs. Settling that is the
+first step whenever session persistence moves from "documented" to "built."
diff --git a/services/agent/src/tracing/otel.ts b/services/agent/src/tracing/otel.ts
index d1de1019cf..d1129e7ba9 100644
--- a/services/agent/src/tracing/otel.ts
+++ b/services/agent/src/tracing/otel.ts
@@ -3,7 +3,7 @@
  * OpenTelemetry spans and exports them (OTLP/HTTP protobuf) to Agenta.
  *
  * This is the service build of the WP-1 POC extension
- * (docs/design/agent-workflows/wp-1-pi-tracing/poc/agenta-otel.ts). It keeps the
+ * (docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/agenta-otel.ts). It keeps the
  * span tree and the load-bearing attribute choices identical, and adds three
  * things the service needs that the single-run POC did not:
  *
diff --git a/services/oss/src/harness/__init__.py b/services/oss/src/harness/__init__.py
index 066e364c76..a6b553c33f 100644
--- a/services/oss/src/harness/__init__.py
+++ b/services/oss/src/harness/__init__.py
@@ -2,7 +2,7 @@
 
 Nothing here is Agenta-specific. The Agenta workflow integration (the ``/invoke`` handler,
 tool resolution, secrets, tracing) lives in ``oss.src.agent``. Two seams (see
-``docs/design/agent-workflows/harness-port-redesign/``):
+``docs/design/agent-workflows/ports-and-adapters.md``):
 
 - ``Harness``: the agent engine. ``SubprocessHarness`` and ``HttpHarness`` (``transports.py``)
   reach the TypeScript runner over a subprocess or HTTP. The engine that runs behind them
diff --git a/services/oss/src/harness/ports.py b/services/oss/src/harness/ports.py
index bbf7f59e4c..40e57b0174 100644
--- a/services/oss/src/harness/ports.py
+++ b/services/oss/src/harness/ports.py
@@ -2,7 +2,7 @@
 
 These interfaces keep the service environment-agnostic and engine-agnostic. The shapes
 are borrowed from the rivet ``sandbox-agent`` SDK (see
-``docs/design/agent-workflows/harness-port-redesign/``) but stay ours, so rivet is one
+``docs/design/agent-workflows/ports-and-adapters.md``) but stay ours, so rivet is one
 adapter behind the seam and a non-rivet engine (the legacy in-process Pi path) fits the
 same port.
 

From 8b006339a8505070666a1db0a79c5239b9290aac Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Thu, 18 Jun 2026 20:10:16 +0200
Subject: [PATCH 10/10] feat(agent): move the agent runtime into the SDK behind
 backend/harness ports

Relocate the neutral runtime to agenta.sdk.agents (dtos / interfaces / adapters /
utils): the Backend / Environment / Sandbox / Session / Harness ports, the
RivetBackend / InProcessPiBackend / LocalBackend backends, the Pi / Claude / Agenta
harness adapters (which own the per-harness config mapping), and the /run wire.
Rewire the agent service onto the ports and delete services/oss/src/harness. Add
SDK + service unit and golden tests, and update the agent-workflows docs to the
as-built design.
---
 docs/design/agent-workflows/README.md         |  14 +-
 .../design/agent-workflows/adapters/agenta.md |  64 ++
 .../agent-workflows/adapters/claude-code.md   |  15 +-
 docs/design/agent-workflows/adapters/pi.md    |  67 ++-
 .../agent-workflows/agent-protocol-rfc.md     | 526 +++++++++++++++++
 docs/design/agent-workflows/architecture.md   |  23 +-
 .../agent-workflows/ports-and-adapters.md     | 222 +++----
 .../scratch/harness-port-redesign/README.md   |   2 +
 .../harness-port-redesign/implementation.md   |  56 +-
 .../scratch/harness-port-redesign/plan.md     |   2 +
 .../scratch/harness-port-redesign/proposal.md |   2 +
 .../scratch/harness-port-redesign/research.md |   2 +
 .../scratch/harness-port-redesign/status.md   |  17 +-
 .../research/diskless-in-memory-config.md     |   1 +
 .../scratch/research/open-questions.md        |   1 +
 .../scratch/research/pi-interaction.md        |   1 +
 .../scratch/sdk-local-backend/status.md       |  81 +++
 .../integrating-the-tracing-extension.md      |   1 +
 .../tracing-in-the-agent-service.md           |   1 +
 .../scratch/wp-2-agent-service/README.md      |   1 +
 .../wp-2-agent-service/implementation-plan.md |   1 +
 .../scratch/wp-7-tools/README.md              |   1 +
 .../scratch/wp-8-rivet-acp-runtime/README.md  |   1 +
 .../wp-8-rivet-acp-runtime/architecture.md    |   1 +
 .../scratch/wp-8-rivet-acp-runtime/context.md |   1 +
 .../scratch/wp-8-rivet-acp-runtime/plan.md    |   1 +
 .../scratch/wp-8-rivet-acp-runtime/status.md  |   1 +
 docs/design/agent-workflows/sessions.md       |  17 +-
 .../agent-workflows/streaming-and-sessions.md | 481 +++++++++++++++
 sdks/python/agenta/__init__.py                |  17 +
 sdks/python/agenta/sdk/agents/__init__.py     |  91 +++
 .../agenta/sdk/agents/adapters/__init__.py    |  23 +
 .../sdk/agents/adapters/agenta_builtins.py    |  90 +++
 .../agenta/sdk/agents/adapters/harnesses.py   | 168 ++++++
 .../agenta/sdk/agents/adapters/in_process.py  | 170 ++++++
 .../agenta/sdk/agents/adapters/local.py       |  48 ++
 .../agenta/sdk/agents/adapters/rivet.py       | 186 ++++++
 sdks/python/agenta/sdk/agents/dtos.py         | 554 ++++++++++++++++++
 sdks/python/agenta/sdk/agents/errors.py       |  23 +
 sdks/python/agenta/sdk/agents/interfaces.py   | 279 +++++++++
 sdks/python/agenta/sdk/agents/streaming.py    |  91 +++
 .../agenta/sdk/agents/utils/__init__.py       |  19 +
 .../agenta/sdk/agents/utils/ts_runner.py      | 163 ++++++
 sdks/python/agenta/sdk/agents/utils/wire.py   |  88 +++
 .../agenta/tests/agents/test_streaming.py     | 167 ++++++
 .../oss/tests/pytest/unit/agents/__init__.py  |   1 +
 .../oss/tests/pytest/unit/agents/conftest.py  | 198 +++++++
 .../agents/golden/run_request.claude.json     |  27 +
 .../unit/agents/golden/run_request.pi.json    |  35 ++
 .../unit/agents/golden/run_result.error.json  |   4 +
 .../unit/agents/golden/run_result.ok.json     |  31 +
 .../unit/agents/test_dtos_agent_config.py     | 151 +++++
 .../agents/test_dtos_capabilities_events.py   |  81 +++
 .../unit/agents/test_dtos_content_blocks.py   |  90 +++
 .../unit/agents/test_dtos_harness_configs.py  |  74 +++
 .../unit/agents/test_environment_lifecycle.py | 127 ++++
 .../unit/agents/test_harness_adapters.py      | 267 +++++++++
 .../pytest/unit/agents/test_wire_contract.py  | 224 +++++++
 .../skills/agenta-getting-started/SKILL.md    |  21 +
 services/agent/src/cli.ts                     |  55 +-
 services/agent/src/engines/pi.ts              | 102 +++-
 services/agent/src/engines/rivet.ts           |  28 +-
 services/agent/src/protocol.ts                |  58 +-
 services/agent/src/server.ts                  |  72 ++-
 services/agent/src/tracing/otel.ts            | 131 ++++-
 services/agent/test/stream-events.test.ts     | 124 ++++
 services/oss/src/agent/__init__.py            |  11 +-
 services/oss/src/agent/app.py                 | 121 ++--
 services/oss/src/agent/inputs.py              | 128 ----
 services/oss/src/agent/schemas.py             |   7 +-
 services/oss/src/agent/tools.py               |   2 +-
 services/oss/src/agent/tracing.py             |   2 +-
 services/oss/src/harness/__init__.py          |  48 --
 services/oss/src/harness/environment.py       |  57 --
 services/oss/src/harness/ports.py             | 418 -------------
 services/oss/src/harness/transports.py        | 145 -----
 services/oss/src/harness/wire.py              |  73 ---
 .../oss/tests/pytest/unit/agent/__init__.py   |   1 +
 .../oss/tests/pytest/unit/agent/conftest.py   |  99 ++++
 .../pytest/unit/agent/test_invoke_handler.py  |  71 +++
 .../pytest/unit/agent/test_secrets_mapping.py |  24 +
 .../pytest/unit/agent/test_select_backend.py  |  61 ++
 .../tests/pytest/unit/agent/test_tool_refs.py |  75 +++
 83 files changed, 5894 insertions(+), 1131 deletions(-)
 create mode 100644 docs/design/agent-workflows/adapters/agenta.md
 create mode 100644 docs/design/agent-workflows/agent-protocol-rfc.md
 create mode 100644 docs/design/agent-workflows/scratch/sdk-local-backend/status.md
 create mode 100644 docs/design/agent-workflows/streaming-and-sessions.md
 create mode 100644 sdks/python/agenta/sdk/agents/__init__.py
 create mode 100644 sdks/python/agenta/sdk/agents/adapters/__init__.py
 create mode 100644 sdks/python/agenta/sdk/agents/adapters/agenta_builtins.py
 create mode 100644 sdks/python/agenta/sdk/agents/adapters/harnesses.py
 create mode 100644 sdks/python/agenta/sdk/agents/adapters/in_process.py
 create mode 100644 sdks/python/agenta/sdk/agents/adapters/local.py
 create mode 100644 sdks/python/agenta/sdk/agents/adapters/rivet.py
 create mode 100644 sdks/python/agenta/sdk/agents/dtos.py
 create mode 100644 sdks/python/agenta/sdk/agents/errors.py
 create mode 100644 sdks/python/agenta/sdk/agents/interfaces.py
 create mode 100644 sdks/python/agenta/sdk/agents/streaming.py
 create mode 100644 sdks/python/agenta/sdk/agents/utils/__init__.py
 create mode 100644 sdks/python/agenta/sdk/agents/utils/ts_runner.py
 create mode 100644 sdks/python/agenta/sdk/agents/utils/wire.py
 create mode 100644 sdks/python/agenta/tests/agents/test_streaming.py
 create mode 100644 sdks/python/oss/tests/pytest/unit/agents/__init__.py
 create mode 100644 sdks/python/oss/tests/pytest/unit/agents/conftest.py
 create mode 100644 sdks/python/oss/tests/pytest/unit/agents/golden/run_request.claude.json
 create mode 100644 sdks/python/oss/tests/pytest/unit/agents/golden/run_request.pi.json
 create mode 100644 sdks/python/oss/tests/pytest/unit/agents/golden/run_result.error.json
 create mode 100644 sdks/python/oss/tests/pytest/unit/agents/golden/run_result.ok.json
 create mode 100644 sdks/python/oss/tests/pytest/unit/agents/test_dtos_agent_config.py
 create mode 100644 sdks/python/oss/tests/pytest/unit/agents/test_dtos_capabilities_events.py
 create mode 100644 sdks/python/oss/tests/pytest/unit/agents/test_dtos_content_blocks.py
 create mode 100644 sdks/python/oss/tests/pytest/unit/agents/test_dtos_harness_configs.py
 create mode 100644 sdks/python/oss/tests/pytest/unit/agents/test_environment_lifecycle.py
 create mode 100644 sdks/python/oss/tests/pytest/unit/agents/test_harness_adapters.py
 create mode 100644 sdks/python/oss/tests/pytest/unit/agents/test_wire_contract.py
 create mode 100644 services/agent/skills/agenta-getting-started/SKILL.md
 create mode 100644 services/agent/test/stream-events.test.ts
 delete mode 100644 services/oss/src/agent/inputs.py
 delete mode 100644 services/oss/src/harness/__init__.py
 delete mode 100644 services/oss/src/harness/environment.py
 delete mode 100644 services/oss/src/harness/ports.py
 delete mode 100644 services/oss/src/harness/transports.py
 delete mode 100644 services/oss/src/harness/wire.py
 create mode 100644 services/oss/tests/pytest/unit/agent/__init__.py
 create mode 100644 services/oss/tests/pytest/unit/agent/conftest.py
 create mode 100644 services/oss/tests/pytest/unit/agent/test_invoke_handler.py
 create mode 100644 services/oss/tests/pytest/unit/agent/test_secrets_mapping.py
 create mode 100644 services/oss/tests/pytest/unit/agent/test_select_backend.py
 create mode 100644 services/oss/tests/pytest/unit/agent/test_tool_refs.py

diff --git a/docs/design/agent-workflows/README.md b/docs/design/agent-workflows/README.md
index 57c8a3ad25..90a12c2b2c 100644
--- a/docs/design/agent-workflows/README.md
+++ b/docs/design/agent-workflows/README.md
@@ -15,14 +15,17 @@ sandbox, and nothing above the seam changes.
 
 1. **[Architecture](architecture.md)**. How a request flows from the playground to the model
    and back: the relay of programs, the two containers, and the vocabulary. Start here.
-2. **[Ports and adapters](ports-and-adapters.md)**. The seam that keeps the relay swappable:
-   the two seams, the wire contract, and how the service picks an engine and a transport.
+2. **[Ports and adapters](ports-and-adapters.md)**. The ports that keep the relay swappable:
+   the backend, environment, and harness layers, where they live in the SDK, the wire
+   contract, and how the service picks a backend.
 3. **[Sessions](sessions.md)**. How a multi-turn conversation holds together today (cold
    replay), and the two paths open to us tomorrow.
 4. **[The Pi adapter](adapters/pi.md)**. The default harness, which traces itself and takes
    tools natively through a Pi extension.
 5. **[The Claude Code adapter](adapters/claude-code.md)**. The second harness, which proves
    the swap and is the template for any MCP-capable agent.
+6. **[The Agenta harness](adapters/agenta.md)**. Pi with an opinion: forced skills, forced
+   tools, and a base AGENTS.md preamble the author's instructions are appended to.
 
 ## What this PoC includes and defers
 
@@ -35,6 +38,13 @@ storage (see [Sessions](sessions.md)), live streaming to the client over the HTT
 multi-tenant filesystem jail for a shared daemon, and registering the agent as a first-class
 backend workflow type with its own builtin URI. Each is called out where it belongs.
 
+The first two of those, streaming and server-owned sessions, have a proposed design:
+[Streaming and sessions](streaming-and-sessions.md) for the rationale and trade-offs, and
+the [Agent protocol RFC](agent-protocol-rfc.md) for the normative spec of the endpoints and
+the wire format. They add a new `POST /messages` endpoint (Vercel-AI-format SSE stream, an
+optional `session_id`, and `UIMessage` inputs) plus a `load-session` endpoint, sitting next
+to the existing `/invoke`, which is unchanged.
+
 ## The `scratch/` folder
 
 `scratch/` holds the raw working material from the build: the original work-package folders
diff --git a/docs/design/agent-workflows/adapters/agenta.md b/docs/design/agent-workflows/adapters/agenta.md
new file mode 100644
index 0000000000..ca9fd4ea77
--- /dev/null
+++ b/docs/design/agent-workflows/adapters/agenta.md
@@ -0,0 +1,64 @@
+# The Agenta harness
+
+`AgentaHarness` is Pi with an opinion. It runs on the same engine as the [Pi
+adapter](pi.md) and produces a Pi-shaped config, so it inherits everything Pi does (native
+tools, the system-prompt layers, tracing). What it adds is a fixed set of Agenta-shipped
+extras that the agent author cannot turn off:
+
+- **Forced tools** — always unioned into the agent's resolved tools. At minimum `read`
+  (Pi only renders the skills section when `read` is enabled) and `bash` (so skills can run
+  their helper scripts).
+- **Forced skills** — Agenta-shipped Pi skills loaded on every run.
+- **A base AGENTS.md preamble** — the author's `instructions` are appended after it.
+- **A base persona** — forced onto Pi's `append_system`, with any author-supplied
+  `append_system` appended after it.
+
+Read the [architecture](../architecture.md), [ports and adapters](../ports-and-adapters.md),
+and [Pi adapter](pi.md) pages first. This page assumes them.
+
+## Where the forced bits live
+
+The forced *policy* lives in the SDK harness layer, in one editable module:
+`sdks/python/agenta/sdk/agents/adapters/agenta_builtins.py` (`AGENTA_PREAMBLE`,
+`AGENTA_FORCED_APPEND_SYSTEM`, `AGENTA_FORCED_TOOLS`, `AGENTA_FORCED_SKILLS`). `AgentaHarness`
+(`adapters/harnesses.py`) reads them in `_to_harness_config` and layers them onto the neutral
+`SessionConfig`, exactly where `PiHarness` and `ClaudeHarness` do their own translation.
+
+The forced skill *files* live with the runner that runs Pi, under
+`services/agent/skills/<name>/` (each a directory with a `SKILL.md`). Skills are real files on
+disk because they reference relative scripts and assets, so they cannot ride the wire as
+text. The contract between the two halves is the skill **name**: `AGENTA_FORCED_SKILLS` lists
+names, and each must match a committed directory under the runner's skills root.
+
+## How a skill reaches the model
+
+1. `AgentaHarness._to_harness_config` puts the forced skill names on the `skills` field of
+   the `/run` request (`AgentaAgentConfig.wire_tools`).
+2. The in-process Pi engine (`engines/pi.ts`) resolves each name against its bundled
+   `skills/` root (override with `AGENTA_AGENT_SKILLS_DIR`) and passes the directories to Pi's
+   `DefaultResourceLoader` as `additionalSkillPaths`, with `noSkills: true` so only the
+   bundled skills load (the run stays hermetic, like `noContextFiles`).
+3. Pi loads them, and because the forced `read` tool is enabled, surfaces them in the system
+   prompt. The model reads a skill's `SKILL.md` on demand (progressive disclosure).
+
+## Two prompt layers, kept distinct
+
+This follows Pi's own split (see `PiAgentConfig`): the **persona** ("who the agent is")
+belongs in `append_system`, and **project conventions** belong in `AGENTS.md`. So the Agenta
+persona is a forced `append_system`, while the Agenta base preamble plus the author's
+instructions are the `AGENTS.md`. An author's own `system` / `append_system` (via
+`AgentConfig.harness_options["pi"]`) still apply, layered after the forced persona.
+
+## Selecting it
+
+`agenta` is a harness option alongside `pi` and `claude` (the playground dropdown, the
+`harness` field). It runs on the in-process Pi backend (`InProcessPiBackend` now lists
+`HarnessType.AGENTA` as supported), so `select_backend` keeps `agenta` on the local Pi path.
+
+## Deferred
+
+Only the in-process Pi (local) path is wired. The ACP/rivet path (and therefore the Daytona
+sandbox) does not yet deliver the forced skills. Wiring it means teaching `runRivet` to read
+the `skills` field and lay the bundled skill directories into the sandbox via the existing
+bundled-file provisioning. Until then, `agenta` with a non-local sandbox raises
+`UnsupportedHarnessError` rather than silently running without its skills.
diff --git a/docs/design/agent-workflows/adapters/claude-code.md b/docs/design/agent-workflows/adapters/claude-code.md
index ba76e203bd..64a4e3c96c 100644
--- a/docs/design/agent-workflows/adapters/claude-code.md
+++ b/docs/design/agent-workflows/adapters/claude-code.md
@@ -85,10 +85,11 @@ use.
 
 ## What Claude demonstrates
 
-Claude is the proof that the seam works. Adding it required no change above the port and no
-new Python class. It also exercises the capability-driven branches the design is built on:
-tools over MCP because it reports `mcpTools`, a permission answer because it gates tools, and
-event-stream tracing because it does not self-instrument. A future harness that rivet can
-drive would reuse this exact path. A future harness that rivet cannot drive would instead get
-its own engine beside `engines/pi.ts` and `engines/rivet.ts`, behind the same `/run`
-contract.
+Claude is the proof that the seam works. Adding it took a `ClaudeHarness` (which holds its
+Pi-versus-Claude config mapping) and no change to the workflow handler above the ports; the
+same `RivetBackend` drives it. It also exercises the capability-driven branches the design is
+built on: tools over MCP because it reports `mcpTools`, a permission answer because it gates
+tools, and event-stream tracing because it does not self-instrument. A future harness that
+rivet can drive would reuse this exact path. A future harness that rivet cannot drive would
+instead get its own backend beside `RivetBackend` and `InProcessPiBackend`, behind the same
+`/run` contract.
diff --git a/docs/design/agent-workflows/adapters/pi.md b/docs/design/agent-workflows/adapters/pi.md
index 7292ac0522..abcd9ced87 100644
--- a/docs/design/agent-workflows/adapters/pi.md
+++ b/docs/design/agent-workflows/adapters/pi.md
@@ -47,6 +47,60 @@ On Daytona the in-sandbox process cannot reach Agenta directly, so the extension
 tool request to a file (`AGENTA_TOOL_RELAY_DIR`) and the runner, which can reach Agenta,
 relays it to `/tools/call` and writes the answer back. Same envelope, different delivery.
 
+## System prompt: AGENTS.md, SYSTEM, and APPEND_SYSTEM
+
+Pi builds its system prompt from three separate inputs, and they stack rather than compete:
+
+- **`AGENTS.md`** is project context. Pi wraps it in a `<project_context>` block and appends
+  it after the base prompt. It loads with no trust gate, and it is what `instructions` on the
+  neutral `AgentConfig` becomes. This is the right home for project conventions, commands,
+  and preferences.
+- **`APPEND_SYSTEM`** adds to Pi's built-in base prompt without replacing it. Reach for this
+  when you only want to add framing on top of Pi's default coding-assistant prompt.
+- **`SYSTEM`** replaces the base prompt outright. Pi throws away its default
+  "you are a coding assistant" persona, the tool list, and the built-in guidelines, and uses
+  your text instead. Use it only when a workflow needs a fundamentally different agent.
+
+The key fact: these are not either/or with `AGENTS.md`. Even when `SYSTEM` replaces the base
+prompt, Pi still appends the `AGENTS.md` context after it. So `AGENTS.md` stays the project
+layer, and `SYSTEM` / `APPEND_SYSTEM` only change Pi's base persona. For almost every agent,
+`AGENTS.md` alone is enough; the other two are a deliberate opt-in.
+
+### How to set them
+
+`SYSTEM` and `APPEND_SYSTEM` are Pi-specific, so they ride the neutral config's per-harness
+escape hatch, `AgentConfig.harness_options`. It is a bag keyed by harness name; each Harness
+adapter reads only its own slice:
+
+```python
+AgentConfig(
+    instructions="Project: a SQL analytics tool. Run `make lint` before finishing.",  # AGENTS.md
+    harness_options={
+        "pi": {
+            "system": "You are a SQL expert. Only answer with queries.",  # replaces base prompt
+            "append_system": "Always explain each query in one line.",     # adds to base prompt
+        }
+    },
+)
+```
+
+`PiHarness` lifts the `pi` slice onto `PiAgentConfig.system` / `append_system`, which emit
+`systemPrompt` / `appendSystemPrompt` on the `/run` wire. An empty or whitespace value is
+dropped, so it never reaches the runner as a real override.
+
+### Delivery status
+
+The **in-process Pi engine** honors both. It feeds them through the resource loader's
+`systemPromptOverride` / `appendSystemPromptOverride`, so the run stays hermetic: only what
+the request carries applies, never a `SYSTEM.md` or `APPEND_SYSTEM.md` left on disk.
+
+The **ACP (rivet) path does not deliver them yet**. It drives Pi through `pi-acp`, which gives
+us no per-run hook to set the prompt: a project `.pi/SYSTEM.md` is trust-gated, and the CLI
+`--system-prompt` flag cannot be set per session through the adapter. The engine logs a
+warning when these fields are set on that path so the gap is visible, not silent. `AGENTS.md`
+still applies there, because Pi loads context files regardless of trust. Wiring the ACP path
+(via project trust plus `.pi/SYSTEM.md`, or per-session CLI flags) is the remaining work.
+
 ## Tracing: Pi instruments itself
 
 Pi emits lifecycle events on an in-process event bus (`pi.on(...)`). The extension hooks
@@ -101,12 +155,13 @@ And auth comes from the provider key in the sandbox env when present, or from an
 
 ## The in-process engine
 
-The legacy engine (`engines/pi.ts`) skips rivet entirely. It drives Pi's `createAgentSession`
-directly, with everything in memory: AGENTS.md injected through the resource loader, the
-session and settings managers in memory, and a throwaway working directory. It registers the
-same WP-7 tools as Pi `customTools` (the same POST-back-to-`/tools/call` body) and traces
-with the same extension logic, just wired in process rather than loaded from disk.
+The in-process Pi engine (`engines/pi.ts`, selected by the `InProcessPiBackend`) skips rivet
+entirely. It drives Pi's `createAgentSession` directly, with everything in memory: AGENTS.md
+injected through the resource loader, the session and settings managers in memory, and a
+throwaway working directory. It registers the same tools as Pi `customTools` (the same
+POST-back-to-`/tools/call` body) and traces with the same extension logic, just wired in
+process rather than loaded from disk.
 
-It returns the same `/run` result as the rivet path, which is the whole point of the port:
+It returns the same `/run` result as the rivet path, which is the whole point of the ports:
 the workflow author cannot tell which engine ran. It exists for the simplest local case and
 as a path that does not depend on the rivet daemon being present.
diff --git a/docs/design/agent-workflows/agent-protocol-rfc.md b/docs/design/agent-workflows/agent-protocol-rfc.md
new file mode 100644
index 0000000000..4a517d5a72
--- /dev/null
+++ b/docs/design/agent-workflows/agent-protocol-rfc.md
@@ -0,0 +1,526 @@
+# RFC: Agenta Agent Protocol (`POST /messages`, Sessions and Streaming)
+
+| | |
+| --- | --- |
+| **Status** | Draft |
+| **Version** | 0.1 |
+| **Layer** | Frontend to backend, over HTTP/1.1 |
+| **Defines** | `POST /messages`, `POST /load-session` |
+| **Reuses** | The workflow response envelope (`WorkflowServiceResponse`) and revision resolution (`references`) |
+| **Companion** | [streaming-and-sessions.md](streaming-and-sessions.md) (design rationale and trade-offs) |
+
+## Abstract
+
+This document specifies the wire protocol between an Agenta client (typically a browser
+running the Vercel AI SDK `useChat` hook) and the Agenta backend for running an **agent**
+workflow. It defines a new endpoint, `POST /messages`, for stateful, streaming chat. The
+endpoint carries a session identifier in the request and response bodies, offers two response
+modes (a single JSON response and a Server-Sent Events stream in the Vercel UI Message Stream
+format), and takes the agent's inputs as a conversation (`messages`) plus named input
+variables (`inputs`). A second endpoint, `POST /load-session`, returns a conversation's
+history.
+
+`/messages` is a sibling of the existing workflow `/invoke`, not a change to it. The generic,
+stateless `/invoke` is untouched. `/messages` exists because the chat contract differs: the
+conversation is a first-class top-level member in the Vercel `UIMessage` shape, the response
+can stream, and a turn belongs to a session.
+
+## 1. Conventions and terminology
+
+The key words **MUST**, **MUST NOT**, **REQUIRED**, **SHALL**, **SHALL NOT**, **SHOULD**,
+**SHOULD NOT**, **RECOMMENDED**, **MAY**, and **OPTIONAL** in this document are to be
+interpreted as described in RFC 2119 and RFC 8174 when, and only when, they appear in all
+capitals.
+
+JSON is defined in RFC 8259. Server-Sent Events (SSE) follow the WHATWG HTML
+`text/event-stream` definition. All request and response bodies are UTF-8 encoded JSON unless
+a streaming content type is negotiated.
+
+| Term | Definition |
+| --- | --- |
+| **Agent** | A workflow that runs a multi-step loop (model, tool, model, ...) and emits a stream of events before producing a final answer. |
+| **Turn** | One request to `/messages`. A turn supplies new input and produces one assistant response (streamed or whole). |
+| **Session** | A server-named conversation that groups turns. Identified by a `session_id`. |
+| **`session_id`** | An opaque string that identifies a session within a project. Carried in the request and response bodies. |
+| **`UIMessage`** | A message in Vercel AI SDK v5/v6 form: `{ id, role, parts[] }`. See Appendix B. |
+| **Part** | One element of the UI Message Stream (for example `text-delta`, `tool-input-available`). See Section 6.2. |
+| **`inputs`** | The agent's inputs for a turn: the conversation `messages` plus named input variables. See Section 5. |
+| **Streaming edge** | The backend component that encodes the agent's internal `AgentEvent` stream into the UI Message Stream. |
+
+## 2. Protocol overview
+
+The protocol defines two endpoints:
+
+| Method | Path | Purpose |
+| --- | --- | --- |
+| `POST` | `/messages` | Run one agent turn. Returns one JSON response or an SSE stream, by content negotiation. |
+| `POST` | `/load-session` | Return the history of a session. |
+
+A turn carries an OPTIONAL `session_id`. The server resolves it per Section 4. A turn's
+response mode is selected by the `Accept` request header per Section 6.
+
+The agent's input for a turn is the conversation `data.messages` in `UIMessage` form, plus the
+named input variables in `data.inputs`. The agent configuration travels as on `/invoke`,
+either inline in `data.parameters` or resolved from `references` (Section 5).
+
+```
+            ┌─────────────────────────── client (useChat) ───────────────────────────┐
+            │                                                                          │
+   POST /messages  (Accept: text/event-stream)               POST /load-session        │
+            │                                                          │               │
+            ▼                                                          ▼               │
+   ┌──────────────────┐   AgentEvent stream    ┌───────────────────────────────────┐  │
+   │ agent run         │ ─────────────────────▶│ streaming edge → UI Message Stream │──┘
+   │ (harness loop)    │                        └───────────────────────────────────┘
+   └──────────────────┘                                   persists per turn
+            │                                                     │
+            └──────────────── trace store (ag.session.id) ◀───────┘   load-session reads here
+```
+
+## 3. Relationship to `/invoke`
+
+`/messages` is a new endpoint. It does not change `/invoke`. The generic, stateless workflow
+invoke keeps its exact request and response, and a client that does not run a chat agent never
+touches `/messages`.
+
+`/messages` reuses two things from the workflow contract so the backend does not fork: the
+response envelope (`WorkflowServiceResponse`, with the answer in `data.outputs`) and revision
+resolution (`references`). It diverges from `/invoke` in three ways, which is why it is its own
+endpoint:
+
+1. The conversation is a first-class member, `data.messages`, in the `UIMessage` shape, rather
+   than nested in `data.inputs.messages` as `{role, content}`.
+2. The response can stream as a UI Message Stream (Section 6.2).
+3. A turn belongs to a session (`session_id`, Section 4).
+
+A server **SHOULD** map a `/messages` request onto the same internal agent invocation that
+`/invoke` uses, after lifting `data.messages` and `data.inputs` into the handler's `messages`
+and `inputs` arguments.
+
+## 4. Session model
+
+### 4.1 Identity
+
+A `session_id` is an opaque string scoped to a project. The pair `(project_id, session_id)`
+**MUST** be unique. A bare `session_id` is not a global identifier.
+
+A client-supplied `session_id`:
+
+- **MUST** be treated as an opaque token. A server **MUST NOT** interpolate it into a storage
+  path, a query, or a trace attribute without escaping.
+- **SHOULD** be constrained by the server to a bounded length and a restricted character set.
+  A server **MAY** reject an id outside those bounds with `400 Bad Request`.
+
+### 4.2 Resolution
+
+On receiving a turn, the server resolves the session as follows:
+
+1. If the request omits `session_id`, the server **MUST** mint a new unique id, associate the
+   turn with it, and return that id (Section 6).
+2. If the request supplies a `session_id` that does not exist for the caller's project, the
+   server **MUST** create a session with that id and associate the turn with it.
+3. If the request supplies a `session_id` that exists for the caller's project, the server
+   **MUST** associate the turn with that existing session.
+4. If the request supplies a `session_id` that exists under a **different** project, the
+   server **MUST NOT** resume it. The server **MUST** treat it as case 2 within the caller's
+   own project, or reject the turn. A server **MUST NOT** disclose the existence of a session
+   the caller does not own.
+
+Rule 4 is the ownership boundary. "Resume if it exists" means "resume if it exists and
+belongs to the caller."
+
+### 4.3 Continuation semantics for this version
+
+In this version, associating a turn with a session records the turn under that session for
+tracing and later retrieval. The conversation context the model sees is supplied by the
+`messages` in the request (Section 5.2), not reconstructed from the server's record.
+
+A future version MAY make the server's record authoritative, at which point a turn carries
+only the new message and the server supplies the prior history. The request field is
+unchanged by that evolution. See [streaming-and-sessions.md](streaming-and-sessions.md).
+
+### 4.4 Concurrency
+
+Two turns that create the same new `(project_id, session_id)` concurrently **MUST** resolve
+to a single session. A server **SHOULD** enforce this with a unique constraint and treat the
+losing creation as a resume (case 3).
+
+## 5. Request format (`POST /messages`)
+
+### 5.1 Envelope
+
+```jsonc
+{
+  "session_id": "sess_123",       // OPTIONAL (Section 4)
+  "references":  { ... },         // OPTIONAL: selects the workflow revision (as on /invoke)
+  "data": {
+    "messages":   [ /* UIMessage[] */ ],    // REQUIRED: the conversation (Section 5.2)
+    "inputs":     { "<name>": <value> },     // OPTIONAL: named input variables (Section 5.3)
+    "parameters": { /* agent config */ }     // OPTIONAL (Section 5.4)
+  }
+}
+```
+
+`session_id` sits at the envelope top level, alongside the existing `trace_id` and `span_id`.
+It **MUST NOT** be required in a request header.
+
+`data.messages`, `data.inputs`, and `data.parameters` are siblings. They map onto the agent
+handler's `messages`, `inputs`, and `parameters` arguments. On `/invoke` the conversation is
+nested at `data.inputs.messages`; on `/messages` it is lifted out to `data.messages`, because
+the conversation is the primary input of this endpoint.
+
+### 5.2 `data.messages`
+
+`data.messages` is the conversation as an array of `UIMessage` objects (Appendix B). It is
+REQUIRED. The last element is the new user turn.
+
+In this version the client **MUST** send the full conversation in `data.messages`. Each
+element uses the parts-based `UIMessage` shape (Appendix B), not the `{role, content}` shape
+of `/invoke`.
+
+### 5.3 `data.inputs`
+
+`data.inputs` carries the agent's named input variables for the turn: the workflow's declared
+inputs and any per-turn context the caller supplies (for example a retrieved document or a
+record id). Keys are input names; values are arbitrary JSON. This is the same `inputs` as the
+workflow contract, with the conversation no longer nested inside it.
+
+`data.inputs` is OPTIONAL and MAY be sent on every turn, since its values can change between
+turns.
+
+### 5.4 `data.parameters` and `references`
+
+The agent configuration (instructions, model, tools, harness, sandbox, permission policy)
+travels as on `/invoke`: inline in `data.parameters.agent`, or resolved by the platform from
+`references` when the request targets a stored revision. This protocol does not change that
+resolution.
+
+### 5.5 Content negotiation
+
+The response mode is selected by the `Accept` request header:
+
+| `Accept` | Response |
+| --- | --- |
+| `application/json` (or absent) | Single JSON response (Section 6.1) |
+| `text/event-stream` | UI Message Stream over SSE (Section 6.2) |
+
+A server that cannot satisfy the `Accept` header **MUST** respond `406 Not Acceptable`.
+
+## 6. Response formats
+
+### 6.1 Single JSON response
+
+For `Accept: application/json`, the server returns `200 OK` with a body extending
+`WorkflowServiceResponse`:
+
+```jsonc
+{
+  "trace_id":   "...",
+  "span_id":    "...",
+  "session_id": "sess_123",                 // the resolved id (minted or echoed)
+  "status":     { "code": 200 },
+  "data":       { "outputs": { "role": "assistant", "content": "Berlin." } }
+}
+```
+
+The response **MUST** include `session_id`, set to the resolved session (Section 4). The
+assistant answer rides in `data.outputs` as today. Token usage is not in the body; it is
+recorded on the trace.
+
+### 6.2 UI Message Stream (SSE)
+
+For `Accept: text/event-stream`, the server returns `200 OK` and streams the run in the
+Vercel UI Message Stream format (AI SDK v5/v6).
+
+#### 6.2.1 Response headers
+
+The response **MUST** set:
+
+```
+content-type: text/event-stream
+x-vercel-ai-ui-message-stream: v1
+```
+
+and **SHOULD** set:
+
+```
+cache-control: no-cache
+connection: keep-alive
+x-accel-buffering: no
+```
+
+`x-accel-buffering: no` disables proxy buffering so parts flush immediately.
+
+#### 6.2.2 Framing
+
+Each part is one SSE event: the literal bytes `data: `, followed by the part as compact JSON
+(no insignificant whitespace), followed by `\n\n`.
+
+```
+data: {"type":"text-delta","id":"t1","delta":"Hello"}\n\n
+```
+
+The stream **MUST** terminate with the literal line `data: [DONE]\n\n`.
+
+#### 6.2.3 Part registry
+
+The parts a server emits, with their REQUIRED fields. Fields not listed are OPTIONAL and MAY
+be omitted.
+
+| `type` | Required fields | Meaning |
+| --- | --- | --- |
+| `start` | none | Begin a message. Carries `messageId` and `messageMetadata` (Section 6.2.4). |
+| `start-step` | none | Begin a step of the agent loop. |
+| `finish-step` | none | End the current step. |
+| `finish` | none | End the message. Carries `finishReason`, `messageMetadata`. |
+| `text-start` | `id` | Begin a text block. |
+| `text-delta` | `id`, `delta` | Append `delta` to the text block `id`. |
+| `text-end` | `id` | End the text block. |
+| `reasoning-start` | `id` | Begin a reasoning block. |
+| `reasoning-delta` | `id`, `delta` | Append to the reasoning block. |
+| `reasoning-end` | `id` | End the reasoning block. |
+| `tool-input-start` | `toolCallId`, `toolName` | A tool call begins. |
+| `tool-input-delta` | `toolCallId`, `inputTextDelta` | Append a fragment of the tool arguments (note: `inputTextDelta`, not `delta`). |
+| `tool-input-available` | `toolCallId`, `toolName`, `input` | The full tool arguments are known. |
+| `tool-output-available` | `toolCallId`, `output` | The tool result. |
+| `tool-output-error` | `toolCallId`, `errorText` | The tool failed. |
+| `file` | `url`, `mediaType` | A file or image. `url` MAY be an `https:` or `data:` URL. |
+| `data-<name>` | `data` | An application-defined part (generative UI). MAY carry `id` and `transient`. |
+| `error` | `errorText` | A stream-level error (Section 8.2). |
+
+A server **MUST** order parts so that for any `id` or `toolCallId`, a `*-start` precedes its
+deltas, which precede its `*-end` or `*-available`. Text and reasoning deltas are
+concatenated by `id`. Tool parts are keyed by `toolCallId`.
+
+#### 6.2.4 Session id in the stream
+
+The server **MUST** convey the resolved `session_id` as `messageMetadata.sessionId` on the
+`start` part, which is the first part of the stream:
+
+```
+data: {"type":"start","messageId":"msg_1","messageMetadata":{"sessionId":"sess_123"}}
+```
+
+A server **MAY** additionally mirror `session_id` to a response header. The body remains the
+normative source.
+
+#### 6.2.5 Mapping from agent events
+
+The streaming edge consumes the agent's internal `AgentEvent` stream
+(`services/agent/src/protocol.ts:74`) and emits parts as follows:
+
+| `AgentEvent` | Parts |
+| --- | --- |
+| run start (synthesized) | `start` (with `messageId`, `messageMetadata.sessionId`), then `start-step` |
+| `message` | `text-start`, one or more `text-delta`, `text-end` |
+| `thought` | `reasoning-start`, `reasoning-delta`, `reasoning-end` |
+| `tool_call` | `tool-input-start`, then `tool-input-available` |
+| `tool_result` with `isError=false` | `tool-output-available` |
+| `tool_result` with `isError=true` | `tool-output-error` |
+| `usage` | `messageMetadata` on the `finish` part |
+| `error` | `error` (Section 8.2) |
+| `done` | `finish-step`, then `finish` (`finishReason` = `stopReason`), then `[DONE]` |
+
+A harness that reports `capabilities.streamingDeltas` produces token-level `text-delta`
+parts. A harness that does not report it produces a single `text-delta` carrying the whole
+text. The wire shape is identical, so the client does not distinguish them.
+
+The protocol streams deltas only. There is no full-message snapshot part. The client
+assembles the final `UIMessage` from the parts. The server **SHOULD** record the assembled
+turn on the trace (`ag.session.id`), which is the source `load-session` reads.
+
+## 7. The `load-session` endpoint (`POST /load-session`)
+
+Returns the history of a session so a client can rebuild a conversation it does not hold
+locally.
+
+### 7.1 Request
+
+```jsonc
+{ "session_id": "sess_123" }
+```
+
+`session_id` is REQUIRED. The server **MUST** apply the ownership rule of Section 4.2: if the
+session does not exist for the caller's project, the server **MUST** respond `404 Not Found`
+and **MUST NOT** reveal a session owned by another project.
+
+### 7.2 Response (default, `Accept: application/json`)
+
+The server returns `200 OK` with the conversation as `UIMessage` objects, the shape `useChat`
+accepts as its initial `messages`:
+
+```jsonc
+{
+  "session_id": "sess_123",
+  "messages": [
+    { "id": "m1", "role": "user",      "parts": [ { "type": "text", "text": "capital of France?" } ] },
+    { "id": "m2", "role": "assistant", "parts": [ { "type": "text", "text": "Paris." } ] }
+  ]
+}
+```
+
+### 7.3 Response (negotiated replay, `Accept: text/event-stream`)
+
+A server **MAY** support a delta replay of the stored history under
+`Accept: text/event-stream`, re-emitting the session as a UI Message Stream (Section 6.2).
+This is OPTIONAL. Whether the folded JSON form (Section 7.2) or the replay is the primary form
+is left open by this draft; a conformant client **SHOULD** request `application/json` for
+rebuilding a static view.
+
+## 8. Error handling
+
+### 8.1 Request and endpoint errors (JSON)
+
+Before a stream begins, the server reports errors with an HTTP status and the existing
+`status` envelope (`WorkflowServiceStatus`: `code`, `message`, `type`, `stacktrace`):
+
+| Status | Condition |
+| --- | --- |
+| `400 Bad Request` | Malformed body, or a `session_id` that violates Section 4.1. |
+| `401 Unauthorized` / `403 Forbidden` | Missing or invalid credentials. |
+| `404 Not Found` | `load-session` on a session that does not exist for the caller's project (including one owned by another project). |
+| `406 Not Acceptable` | The `Accept` header cannot be satisfied. |
+| `5xx` | Server failure before streaming starts. |
+
+### 8.2 In-stream errors
+
+A failure after the stream has started **MUST** be reported as an `error` part:
+
+```
+data: {"type":"error","errorText":"the agent run failed: ..."}
+```
+
+After emitting an `error` part, the server **SHOULD** terminate the stream. It **MAY** omit
+the `finish` part. It **SHOULD** still emit `[DONE]` to close the SSE channel cleanly. The
+client surfaces the error to the user.
+
+## 9. Security considerations
+
+- **Session ownership.** Section 4.2 rule 4 is a security requirement, not a convenience.
+  Because a client may supply a `session_id` the server has not seen before (case 2), a
+  server that keys sessions on `session_id` alone would let a caller read or extend another
+  tenant's conversation. Servers **MUST** key on `(project_id, session_id)` and scope every
+  resume, every `load-session`, and every existence check to the caller's project.
+- **Opaque ids.** A client-supplied `session_id` is untrusted input. See Section 4.1.
+- **Secrets.** Provider keys and tool credentials travel and resolve as in the current
+  contract. This protocol adds no new secret-bearing field. `inputs` is caller-supplied
+  input and **MUST NOT** be used to smuggle credentials in place of the existing `secrets`
+  and signed-credential mechanisms.
+- **Content negotiation and buffering.** A streaming response disables proxy buffering
+  (Section 6.2.1). Operators **MUST** ensure intermediaries do not re-buffer
+  `text/event-stream` responses, or streaming degrades to a single delayed flush.
+
+## 10. Interaction sequences
+
+### 10.1 New session, streaming turn
+
+```
+client                                  server
+  │  POST /messages                        │
+  │  Accept: text/event-stream             │
+  │  { data:{ messages:[...] } }           │   (no session_id)
+  │───────────────────────────────────────▶│
+  │                                         │  mint sess_123
+  │  200 text/event-stream                  │
+  │  data: {"type":"start",                 │
+  │         "messageMetadata":              │
+  │           {"sessionId":"sess_123"}}     │
+  │◀───────────────────────────────────────│
+  │  data: {"type":"start-step"} ...        │
+  │  ... tool / text parts ...              │
+  │  data: {"type":"finish"}                │
+  │  data: [DONE]                           │
+  │◀───────────────────────────────────────│
+  │  (client stores sess_123 for next turn) │
+```
+
+### 10.2 Returning to a known session
+
+```
+client                                  server
+  │  POST /load-session                     │
+  │  { "session_id": "sess_123" }           │
+  │───────────────────────────────────────▶│  check ownership
+  │  200 { messages: [ UIMessage, ... ] }   │
+  │◀───────────────────────────────────────│
+  │  (render history; hold it)              │
+  │                                         │
+  │  POST /messages                         │
+  │  Accept: text/event-stream              │
+  │  { session_id:"sess_123",               │
+  │    data:{ messages:[...full] } }        │
+  │───────────────────────────────────────▶│  resolve existing sess_123
+  │  200 text/event-stream → parts → [DONE] │
+  │◀───────────────────────────────────────│
+```
+
+## Appendix A: Full stream transcript
+
+One turn: the agent calls a weather tool, reads the result, and answers. Every `data:` line
+in order, each followed by a blank line.
+
+```
+data: {"type":"start","messageId":"msg_1","messageMetadata":{"sessionId":"sess_123"}}
+
+data: {"type":"start-step"}
+
+data: {"type":"tool-input-start","toolCallId":"call_1","toolName":"getWeather"}
+
+data: {"type":"tool-input-available","toolCallId":"call_1","toolName":"getWeather","input":{"city":"Paris"}}
+
+data: {"type":"tool-output-available","toolCallId":"call_1","output":{"weather":"sunny","temp":24}}
+
+data: {"type":"finish-step"}
+
+data: {"type":"start-step"}
+
+data: {"type":"text-start","id":"t1"}
+
+data: {"type":"text-delta","id":"t1","delta":"It is sunny "}
+
+data: {"type":"text-delta","id":"t1","delta":"and 24°C in Paris."}
+
+data: {"type":"text-end","id":"t1"}
+
+data: {"type":"finish-step"}
+
+data: {"type":"finish","messageMetadata":{"usage":{"input":820,"output":36,"cost":0.004}}}
+
+data: [DONE]
+```
+
+## Appendix B: `UIMessage` schema
+
+A message as accumulated by the client, sent in `data.messages`, and returned by `load-session`:
+
+```jsonc
+{
+  "id":   "m2",
+  "role": "user | assistant | system",
+  "parts": [
+    { "type": "text",        "text": "..." },
+    { "type": "reasoning",   "text": "..." },
+    { "type": "tool-<name>", "toolCallId": "...", "state": "output-available", "input": {}, "output": {} },
+    { "type": "file",        "url": "...", "mediaType": "image/png" },
+    { "type": "data-<name>", "data": { } },
+    { "type": "step-start" }
+  ],
+  "metadata": { }
+}
+```
+
+A `UIMessage` carries no top-level `content` string in v5/v6. All content lives in `parts`.
+
+## Appendix C: References
+
+- RFC 2119, RFC 8174: requirement keywords.
+- RFC 8259: JSON.
+- WHATWG HTML, Server-Sent Events: `text/event-stream`.
+- Vercel AI SDK UI Message Stream (v5/v6): https://ai-sdk.dev, and the chunk schema at
+  https://github.com/vercel/ai/blob/main/packages/ai/src/ui-message-stream/ui-message-chunks.ts
+- Current contract: `sdks/python/agenta/sdk/models/workflows.py`,
+  `sdks/python/agenta/sdk/decorators/routing.py` (Accept negotiation at `:236`).
+- Agent events and session id: `services/agent/src/protocol.ts:74`,
+  `sdks/python/agenta/sdk/agents/dtos.py`, `services/oss/src/agent/app.py`.
+- Design rationale and trade-offs:
+  [streaming-and-sessions.md](streaming-and-sessions.md).
diff --git a/docs/design/agent-workflows/architecture.md b/docs/design/agent-workflows/architecture.md
index 62cf1bfe85..069a4ba7fa 100644
--- a/docs/design/agent-workflows/architecture.md
+++ b/docs/design/agent-workflows/architecture.md
@@ -149,14 +149,15 @@ how a warm model could change it tomorrow.
 ## The other engine: in-process Pi
 
 The relay above describes the **rivet engine**, the default in the deployed stack and the
-path the rest of these docs assume. The runner also ships a second engine: **legacy
-in-process Pi**. It drives the
-Pi SDK directly inside the sidecar, with no daemon, adapter, or ACP in between. It exists
-for the simplest local case and as a fallback that does not depend on the rivet daemon.
+path the rest of these docs assume. The runner also ships a second engine: **in-process
+Pi**. It drives the Pi SDK directly inside the sidecar, with no daemon, adapter, or ACP in
+between. It exists for the simplest local case and as a fallback that does not depend on the
+rivet daemon.
 
-Both engines sit behind the same Python port and serve the same `/run` contract, so the
-choice between them is a deployment detail, not a difference the workflow author sees. The
-[ports and adapters](ports-and-adapters.md) page explains how one neutral seam holds both.
+The two engines are the two backends behind the same SDK ports: `RivetBackend` and
+`InProcessPiBackend`. Both serve the same `/run` contract, so which one runs is a deployment
+detail, not a difference the workflow author sees. The
+[ports and adapters](ports-and-adapters.md) page explains the ports and the backends.
 
 ## How a request flows, end to end
 
@@ -164,9 +165,11 @@ Putting it together, a single agent run on `pi` / `local` goes like this:
 
 1. The playground sends `POST /invoke` to the `services` container.
 2. The Python handler (`agent/app.py`) reads the config, resolves the tools and provider
-   keys, and builds a `SessionConfig`.
-3. It picks the engine (`rivet`) and the transport (HTTP to the sidecar), then sends one
-   `POST /run`.
+   keys, and builds a neutral `AgentConfig` and `SessionConfig` from the SDK runtime
+   (`agenta.sdk.agents`).
+3. It picks a backend (`RivetBackend` here) from the harness and sandbox, wraps it in an
+   `Environment` and a `Harness`, and the harness sends one `POST /run` over the backend's
+   transport (HTTP to the sidecar).
 4. The sidecar's rivet engine starts the daemon, which starts `pi-acp`, which starts `pi`.
 5. `pi` reads the instructions, calls the model, runs any tools, and streams events back up
    the relay. Those events become trace spans nested under the `/invoke` span (the
diff --git a/docs/design/agent-workflows/ports-and-adapters.md b/docs/design/agent-workflows/ports-and-adapters.md
index 1914c1e9f8..a86002c1a8 100644
--- a/docs/design/agent-workflows/ports-and-adapters.md
+++ b/docs/design/agent-workflows/ports-and-adapters.md
@@ -1,120 +1,147 @@
 # Ports and adapters
 
 The [architecture](architecture.md) page showed the relay of programs. This page shows the
-seam that keeps that relay swappable. It explains why the seam exists, what shapes it
-defines, and how the service picks an engine and a transport at run time.
+seam that keeps that relay swappable: the ports, where they live, and the adapters behind
+them.
 
-## The problem the seam solves
+## Where the runtime lives
 
-We want three things at once. We want to run more than one coding agent (Pi today, Claude
-Code today, others later). We want to run them in more than one place (local today, Daytona
-today, other sandboxes later). And we never want either choice to leak into the workflow
-code that sits above it.
+The neutral runtime is part of the published Python SDK, at
+`sdks/python/agenta/sdk/agents/`. An SDK user gets it as `agenta.sdk.agents` (with the main
+types re-exported as `ag.AgentConfig`, `ag.RivetBackend`, and so on). The Agenta service
+(`services/oss/src/agent/`) is a thin consumer of it: it resolves tools and secrets
+server-side, threads a trace context, and runs a turn through the same ports. Nothing in the
+SDK runtime calls the Agenta API, so the same code runs an agent standalone, with no Agenta
+backend at all.
 
-A neutral port solves this. The Python service talks to one small interface. Everything
-agent-specific and place-specific lives in adapters behind that interface. Rivet, which
-does most of the heavy lifting, is one adapter behind the port, not the port itself. That
-keeps the door open for a future agent that rivet cannot drive (see the
-[Claude Code adapter](adapters/claude-code.md) page for how a non-rivet engine would slot
-in).
+The package follows Agenta's hexagonal vocabulary, the same words the `api/` domains use:
 
-We learned the shape of this port by studying the rivet SDK. Rivet splits its surface into
-three planes, and that split is the main lesson:
-
-| Plane | What it covers | In our port? |
+| Layer | File | What it holds |
 | --- | --- | --- |
-| Runtime / sandbox | Where the agent runs, and its lifecycle | Yes, as the **Environment** seam |
-| Agent session | The prompt, the config, the event stream | Yes, as the **Harness** and **AgentSession** seams |
-| System | Filesystem, process, desktop control | No. This is provisioning, used only inside an adapter, never shown to the workflow author |
-
-The first two planes became our two seams. The third we keep out of the port on purpose: a
-workflow author configures an agent, not a filesystem.
+| DTOs | `dtos.py` | data contracts (Pydantic): `AgentConfig`, `SessionConfig`, `Message`, events, capabilities, the per-harness configs |
+| Ports | `interfaces.py` | the abstract contracts: `Backend`, `Environment`, `Sandbox`, `Session`, `Harness` |
+| Adapters | `adapters/` | the implementations: the backends and the harnesses |
+| Utils | `utils/` | shared plumbing for the runner-backed adapters (the `/run` wire and the transports) |
 
-## Seam one: the Environment (where it runs)
+## The three layers
 
-The `Environment` seam answers one question: where does the harness process run? The
-`LocalEnvironment` runs it as a subprocess on this host. It has a `start` and a `dispose`
-lifecycle and one real method, `exec`, which runs a command and feeds it the request on
-stdin.
+The runtime stacks three ports, lowest to highest: `Backend`, `Environment`, `Harness`.
 
-Daytona does not need a separate Python `Environment`. The rivet engine selects the Daytona
-sandbox inside the TypeScript runner, below the port, so "run on a cloud machine" is an
-adapter detail rather than a second Python class. The `Environment` seam stays thin on
-purpose.
+### Backend (the engine)
 
-## Seam two: the Harness and the AgentSession (the conversation)
-
-The `Harness` seam is the heart of the port. It is the agent engine, and rivet and the
-legacy Pi path are both adapters behind it.
+A `Backend` is the engine. It declares which harnesses it can drive, owns the sandbox and
+session lifecycle, and is pure plumbing: it takes an already-harness-shaped config and
+launches it. It carries no "how this harness works" logic.
 
 ```python
-class Harness(ABC):
-    async def setup(self) -> None: ...
-    async def shutdown(self) -> None: ...
-    async def invoke(self, request, *, on_event=None) -> AgentResult: ...
-    async def destroy_session(self, session_id) -> None: ...   # cold: a no-op
-    def create_session(self, config) -> AgentSession: ...
+class Backend(ABC):
+    supported_harnesses: ClassVar[FrozenSet[HarnessType]] = frozenset()
+    def supports(self, harness) -> bool: ...
+    async def create_sandbox(self) -> Sandbox: ...
+    async def create_session(self, sandbox, config, *, harness, secrets, trace, session_id) -> Session: ...
 ```
 
-`invoke` is the single transport call: one cold run in, one structured result out. On top of
-it sits the `AgentSession`, the first-class abstraction borrowed from rivet:
+Each backend is its own class and hard-codes what makes it that engine. There is no shared
+base beyond the ABC. Three backends exist:
+
+- **`RivetBackend`** drives a harness over ACP through the TypeScript rivet runner. It
+  supports Pi and Claude. Its `sandbox` axis (`local` or `daytona`) is a constructor
+  argument, because it is a real runtime choice.
+- **`InProcessPiBackend`** drives Pi in-process through the runner, with no rivet daemon. It
+  supports Pi only and runs locally only. It was the first backend and stays as the simplest
+  one, the reference to read when writing a new backend.
+- **`LocalBackend`** runs a harness on the user's own machine for standalone SDK use (Pi via
+  a bundled JS runner, Claude via the Python `claude-agent-sdk`). See
+  [`scratch/sdk-local-backend/status.md`](scratch/sdk-local-backend/status.md) for its build
+  state.
+
+`RivetBackend` and `InProcessPiBackend` are different engines that happen to share the
+`utils` wire and transport helpers; neither subclasses the other.
+
+### Environment (where it runs)
+
+An `Environment` wraps a backend and owns the sandbox policy: by default a fresh sandbox per
+session (the cold model, strong isolation). Share one `Environment` across harnesses to
+share its sandbox, or use one per harness to isolate them. The workflow handler builds an
+`Environment(backend)` and never touches the backend's sandbox calls directly.
+
+### Harness (the conversation, per harness type)
+
+A `Harness` wraps an `Environment` for one harness type (`PiHarness`, `ClaudeHarness`). It
+does two jobs. First, it validates at construction that the environment's backend can drive
+it; if not, it raises `UnsupportedHarnessError` immediately:
 
 ```python
-class AgentSession:
-    async def prompt(self, messages, *, on_event=None) -> AgentResult: ...
-    async def destroy(self) -> None: ...
+ClaudeHarness(Environment(InProcessPiBackend()))
+# UnsupportedHarnessError: InProcessPiBackend cannot drive harness 'claude'; it supports: pi
 ```
 
-The workflow handler always works through the session: `create_session(config)`, then
-`session.prompt(messages)`, then `session.destroy()`. Under the cold model the session
-holds no warm daemon, so each `prompt` is a fresh `invoke` that replays the supplied
-history. The abstraction is real and stable even though the lifecycle behind it is cold.
-[Sessions](sessions.md) explains why we kept it cold and what a warm session would change.
+Second, it holds the per-harness adaptation logic, the part that used to live in the
+TypeScript runner. `Harness._to_harness_config` maps the neutral `SessionConfig` into the
+harness's own config, and the two harnesses genuinely differ:
+
+- **`PiHarness`** keeps built-in tool names, delivers resolved tools natively (Pi has no
+  MCP), and forces the permission policy to `auto` because Pi does not gate tool use.
+- **`ClaudeHarness`** drops Pi built-ins (Claude has none), delivers tools over MCP, and
+  honors the permission policy because Claude gates tool use.
 
-## The engine is config, not a class
+Both normalize the resolved tool specs (a name, a description, a JSON-Schema `inputSchema`,
+the `callRef`). The backend below stays pure plumbing; this layer owns the harness knowledge.
 
-A reader expecting three Python classes (one per agent) will be surprised. There are two
-*transports*, and the *engine* is a value they pass, not a class hierarchy.
+A `make_harness(harness_type, environment)` factory maps the playground's harness string to
+the right class.
 
-The two transports differ only in how they reach the TypeScript runner:
+The workflow handler runs a turn through these ports:
+
+```python
+backend = select_backend(selection)          # RivetBackend or InProcessPiBackend
+harness = make_harness(selection.harness, Environment(backend))
+await harness.setup()
+result = await harness.prompt(session_config, messages)
+await harness.cleanup()
+```
 
-- **`SubprocessHarness`** spawns the runner's CLI through an `Environment` and hands it the
-  request on stdin. This is the local, no-Docker path.
-- **`HttpHarness`** sends a `POST /run` to the sidecar. This is the deployed path.
+## The configs
 
-Each transport carries a `backend` value (`rivet` or `pi`) that tells the runner which
-engine to use. So the choice of *agent engine* is one string on the wire, and the choice of
-*how Python reaches the runner* is the transport. Collapsing the engine into config is what
-replaced the old `PiHarness` / `PiHttpHarness` / `RivetHarness` trio with two transports
-and one wire contract.
+`AgentConfig` is the one neutral config the platform and playground speak: instructions
+(written as `AGENTS.md`), model, and provider-agnostic tool references.
+`AgentConfig.from_params` parses a downloaded config dict (the `agent` element, a `prompt`
+prompt-template, or a flat shape) so a standalone user runs exactly what the playground
+stores. `RunSelection` carries the run-time choices stored alongside it (harness, sandbox,
+permission policy); the caller reads it to pick a backend and a harness class.
 
-## How the service picks an engine and a transport
+`SessionConfig` bundles everything one run needs except where it runs: the `AgentConfig`,
+the provider secrets, the permission policy, the trace context, and the resolved tool
+delivery (built-in names, custom specs, the `/tools/call` callback). Sandbox is deliberately
+not in it; that is a backend and environment concern.
 
-The handler makes both choices on every request, in `agent/app.py`.
+The per-harness configs (`PiAgentConfig`, `ClaudeAgentConfig`) are what a backend plumbs.
+Each shapes its own tool and permission fields for the wire, so the difference between Pi's
+native tools and Claude's MCP tools lives in the config types, not in a runtime branch.
 
-It picks the **engine** with `select_backend(harness, sandbox)`. The rule is simple: use
-`rivet` when `AGENTA_AGENT_RUNTIME=rivet` is set, or when the harness is anything other than
-`pi`, or when the sandbox is anything other than `local`. The legacy in-process Pi engine
-only knows how to run Pi locally, so any Claude or Daytona selection forces `rivet` rather
-than silently dropping the choice.
+## How the service picks a backend
 
-It picks the **transport** with `build_harness(backend)`. If `AGENTA_AGENT_PI_URL` is set
-(the Docker deployment), it uses `HttpHarness` against the sidecar. If it is unset (a local
-checkout), it uses `SubprocessHarness` and spawns the runner directly.
+The handler chooses on every request, in `services/oss/src/agent/app.py`. `select_backend`
+returns a backend instance: `InProcessPiBackend` for Pi running locally, and `RivetBackend`
+otherwise (any other harness, a non-local sandbox, or `AGENTA_AGENT_RUNTIME=rivet`). The
+in-process Pi engine only knows how to run Pi locally, so anything else routes to rivet
+rather than silently dropping the choice.
 
-Engine and transport are deployment concerns. Harness and sandbox are workflow config. The
-seam keeps the two kinds of choice from tangling.
+The transport to the runner is a deployment detail each backend takes as a constructor
+argument: `AGENTA_AGENT_PI_URL` set (the Docker deployment) means HTTP to the sidecar; unset
+(a local checkout) means spawn the runner CLI from the wrapper directory.
 
 ## The wire contract: one `/run` shape
 
-Both transports send the same camelCase JSON and parse the same result back. The shape
-lives once in `harness/wire.py` on the Python side and `protocol.ts` on the TypeScript
-side. This contract is the actual boundary of the system.
+Both transports send the same camelCase JSON to the TypeScript runner and parse the same
+result back. The shape lives once in `utils/wire.py` on the Python side and `protocol.ts` on
+the TypeScript side. This contract is the actual boundary of the system.
 
-**Request** (the `SessionConfig` plus the conversation):
+**Request** (the harness-shaped config plus the conversation):
 
 | Field | Meaning |
 | --- | --- |
+| `backend` | The engine the runner uses (`rivet` or `pi`), set by the backend |
 | `harness`, `sandbox` | The two swap axes |
 | `sessionId` | Continue a prior run by replaying its history |
 | `agentsMd` | The agent's instructions, written as `AGENTS.md` |
@@ -139,34 +166,31 @@ side. This contract is the actual boundary of the system.
 
 ## The shared vocabulary: capabilities, content blocks, events
 
-Three neutral types travel on that wire. They are ours, not rivet's, so a non-rivet adapter
-implements them too.
+Three neutral types travel on that wire. They are ours, not any one engine's, so a non-rivet
+adapter implements them too.
 
-**Capabilities** describe what a harness can do: `mcpTools`, `images`, `usage`,
-`streamingDeltas`, `permissions`, and the rest. The rivet engine probes them live from the
+**Capabilities** describe what a harness can do: `mcp_tools`, `images`, `usage`,
+`streaming_deltas`, `permissions`, and the rest. The rivet runner probes them live from the
 daemon and returns them in the result. This is what removed the brittle `if harness == "pi"`
-branches: the runner now branches on a flag, where the live answer is. For example, it
-delivers tools over MCP only when the harness reports `mcpTools`, instead of guessing from
-the name.
+branches in the runner: it now branches on the probed flag, which holds the live answer. For
+example, it delivers tools over MCP only when the harness reports `mcp_tools`.
 
 **Content blocks** mirror ACP: a message's content is either a plain string or a list of
 `text` / `image` / `resource` blocks. Today the playground sends only text. The image and
-resource kinds are plumbed through the types so an image-capable harness can take them once
-the playground sends them.
+resource kinds are plumbed through the types so an image-capable harness can take them.
 
 **Events** are the structured stream. Each event is one of `message`, `thought`,
 `tool_call`, `tool_result`, `usage`, `error`, or `done`. The runner builds this log from the
 harness as the run proceeds and returns it on the result. An `on_event` sink can also
-receive the events. Today the transports deliver the whole log at once after the run, since
-`/run` is request-and-response; live streaming over the HTTP edge is a documented follow-on.
-This event vocabulary is also what makes a Vercel-AI-style stream easy to add later, because
-the event kinds line up with that protocol's parts almost one to one.
+receive them. Today the transports deliver the whole log at once after the run, since `/run`
+is request-and-response; live streaming over the HTTP edge is a documented follow-on.
 
 ## Why this shape
 
-The port mirrors rivet's vocabulary but keeps the types ours. That gives us rivet's rich
-session, capability probe, and event stream without making the port a rivet wrapper. The
-single neutral seam carries two engines today (rivet over ACP, legacy in-process Pi) and has
-room for a third tomorrow. The cost of that flexibility is one extra hop and one wire
-contract to keep in sync across two languages, which the `wire.py` / `protocol.ts` pairing
-contains in one place each.
+The port mirrors rivet's vocabulary but keeps the types ours, so rivet is one adapter behind
+the seam, not the seam itself. The same ports carry two working engines (rivet over ACP,
+in-process Pi) and have room for a standalone local engine. Making the engine a real
+`Backend` class, rather than a string the transport carries, is what lets a backend hard-code
+its own identity and lets a standalone SDK user construct one directly. The cost of the
+flexibility is one extra hop and one wire contract to keep in sync across two languages, which
+the `utils/wire.py` and `protocol.ts` pairing contains in one place each.
diff --git a/docs/design/agent-workflows/scratch/harness-port-redesign/README.md b/docs/design/agent-workflows/scratch/harness-port-redesign/README.md
index 3402652d15..3ba4dbd2c8 100644
--- a/docs/design/agent-workflows/scratch/harness-port-redesign/README.md
+++ b/docs/design/agent-workflows/scratch/harness-port-redesign/README.md
@@ -1,3 +1,5 @@
+> Superseded by the as-built design in [the design pages](../../README.md) and [scratch/sdk-local-backend/status.md](../sdk-local-backend/status.md). Kept for history.
+
 # Harness + Runtime port redesign
 
 Status: research and proposal, scope approved (full A to E arc, cold per invoke). Not
diff --git a/docs/design/agent-workflows/scratch/harness-port-redesign/implementation.md b/docs/design/agent-workflows/scratch/harness-port-redesign/implementation.md
index 084afb6e3d..a465409557 100644
--- a/docs/design/agent-workflows/scratch/harness-port-redesign/implementation.md
+++ b/docs/design/agent-workflows/scratch/harness-port-redesign/implementation.md
@@ -1,30 +1,38 @@
+> Superseded by the as-built design in [the design pages](../../README.md) and [scratch/sdk-local-backend/status.md](../sdk-local-backend/status.md). Kept for history.
+
 # Implementation notes
 
-How the approved A to E arc lands in code, with the cold + replay constraint. This is
-the as-built reference for the rewrite (kept in sync with the code).
+How the approved A to E arc was expected to land in code, with the cold + replay
+constraint. This was the working note for that effort. The as-built design later diverged:
+the neutral runtime moved into the SDK at `agenta.sdk.agents`, the engine became a `Backend`
+class rather than a wire string, and the old `Harness`/`AgentSession` shape was replaced by
+the `Backend` / `Environment` / `Harness` / `Session` ports. See the
+[design pages](../../README.md) for what shipped.
 
 ## Module layout
 
-### Python — two packages
+### Python — two packages (the plan at the time)
 
-The engine-agnostic runtime and the Agenta workflow integration are separate packages, so
-nothing in the runtime is Agenta-specific and the god-module is gone.
+The plan was to split the engine-agnostic runtime from the Agenta workflow integration, so
+that nothing in the runtime would be Agenta-specific and the god-module would be gone. The
+as-built design went further: the runtime moved out of the service entirely and into the SDK at
+`sdks/python/agenta/sdk/agents/` (`dtos.py`, `interfaces.py`, `errors.py`, `adapters/`,
+`utils/`). The package and file names below are the superseded plan, not what shipped.
 
-`services/oss/src/harness/` — the engine-agnostic runtime:
+The planned `services/oss/src/harness/` runtime package:
 
 | File | Holds |
 | --- | --- |
 | `ports.py` | The neutral types and the two seams. Types: `HarnessCapabilities`, `ContentBlock`, `Message`, `AgentEvent`, `TraceContext`, `ToolCallback`, `SessionConfig`, `AgentRequest`, `AgentResult`. Seams: `Environment` (where it runs) and `Harness` (the agent), plus the concrete `AgentSession`. |
-| `transports.py` | The two transports: `SubprocessHarness` (spawn the TS CLI) and `HttpHarness` (POST to the sidecar). Both share `wire.py`. Replaces `pi_harness.py`, `pi_http_harness.py`, `rivet_harness.py`. |
-| `environment.py` | `LocalEnvironment` (subprocess on this host). Replaces `local_runtime.py`. |
+| `transports.py` | The two transports: `SubprocessHarness` (spawn the TS CLI) and `HttpHarness` (POST to the sidecar). Both share `wire.py`. |
+| `environment.py` | `LocalEnvironment` (subprocess on this host). |
 | `wire.py` | Serializes an `AgentRequest` to the camelCase `/run` JSON and parses an `AgentResult` back. The wire shape lives once. |
 
 `services/oss/src/agent/` — the Agenta workflow app (was the single `agent.py` god-module):
 
 | File | Holds |
 | --- | --- |
-| `app.py` | The `/invoke` handler plus `select_backend` / `build_harness`. Thin: it orchestrates the modules below. |
-| `inputs.py` | Request parsing: `resolve_run_config`, `to_messages`, `_system_text`. |
+| `app.py` | The `/invoke` handler plus backend selection. Thin: it orchestrates the modules below. |
 | `tools.py` | Tool resolution through `/tools/resolve` (and slug parsing). |
 | `secrets.py` | Provider keys from the project vault. |
 | `tracing.py` | `trace_context` and `record_usage` (the OTel glue). |
@@ -32,11 +40,13 @@ nothing in the runtime is Agenta-specific and the god-module is gone.
 | `schemas.py` | The `/inspect` schemas. Gains the permission-policy parameter. |
 | `config.py` | The file-backed `AgentConfig` and the TS runner path. |
 
-The backend engine (legacy in-process Pi vs rivet ACP) is no longer a Python class. It is
-one env value (`AGENT_BACKEND`) the transport passes to the TS runner, so Python has two
-transports, not three backend adapters. The harness folder is named for the seam, not for
-Pi: harness choice (pi/claude) lives inside the runtime, which is why there is no
-`agent_claude` package.
+This plan modelled the backend engine as a wire string the transport carried, with two
+transports rather than per-engine classes. The as-built design rejected that: the engine is
+a `Backend` class (`RivetBackend`, `InProcessPiBackend`, `LocalBackend`) that hard-codes its
+own engine id and supported harnesses, and the HTTP vs subprocess delivery is a transport
+helper each backend takes as a constructor argument. Request parsing also moved onto the
+DTOs (`AgentConfig.from_params`, `RunSelection.from_params`) instead of a separate
+`inputs.py`.
 
 ### TypeScript (`services/agent/src/`) — grouped by role
 
@@ -55,7 +65,13 @@ The folder grouping (entrypoints + contract at the top, `engines/`, `tracing/`,
 `extensions/`) replaced a flat `src/` of ten files that had grown one work package at a
 time. No behavior change.
 
-## The seams
+## The seams (the planned shape)
+
+This was the seam shape this effort planned. The as-built design replaced it: there is no
+`invoke` transport verb and no `AgentSession` class. Instead a `Backend` owns the sandbox
+and session lifecycle, a `Harness` adapter (`PiHarness`, `ClaudeHarness`) holds the
+per-harness mapping over an `Environment`, and a `Session` port is the conversation
+(`prompt`, `destroy`). See [ports and adapters](../../ports-and-adapters.md).
 
 ```python
 class Harness(ABC):
@@ -70,10 +86,10 @@ class AgentSession:                 # sugar over invoke; the first-class session
     async def destroy(self) -> None: ...
 ```
 
-`invoke` is the single transport call (one cold run). `AgentSession` is the rivet-shaped
-abstraction on top: `create_session(config)` then `session.prompt(messages)`. Under cold +
-replay the session holds no warm daemon; continuation replays the caller-supplied history
-into a fresh run, exactly as WP-8 does today. Server-side persisted history is the
+In this plan `invoke` was the single transport call (one cold run) and `AgentSession` was
+the rivet-shaped abstraction on top: `create_session(config)` then `session.prompt(messages)`.
+Under cold + replay the session holds no warm daemon; continuation replays the caller-supplied
+history into a fresh run, exactly as WP-8 does today. Server-side persisted history is the
 deferred Phase C bit (see Deferred below).
 
 ## Capabilities: probed in TS, reported in the result
diff --git a/docs/design/agent-workflows/scratch/harness-port-redesign/plan.md b/docs/design/agent-workflows/scratch/harness-port-redesign/plan.md
index f7cbc39a99..c23f0ce820 100644
--- a/docs/design/agent-workflows/scratch/harness-port-redesign/plan.md
+++ b/docs/design/agent-workflows/scratch/harness-port-redesign/plan.md
@@ -1,3 +1,5 @@
+> Superseded by the as-built design in [the design pages](../../README.md) and [scratch/sdk-local-backend/status.md](../sdk-local-backend/status.md). Kept for history.
+
 # Build plan
 
 Scope set by the user (2026-06-17): full A to E arc, cold per invoke (no warm daemon).
diff --git a/docs/design/agent-workflows/scratch/harness-port-redesign/proposal.md b/docs/design/agent-workflows/scratch/harness-port-redesign/proposal.md
index 8adde2e18f..9d6c78f89a 100644
--- a/docs/design/agent-workflows/scratch/harness-port-redesign/proposal.md
+++ b/docs/design/agent-workflows/scratch/harness-port-redesign/proposal.md
@@ -1,3 +1,5 @@
+> Superseded by the as-built design in [the design pages](../../README.md) and [scratch/sdk-local-backend/status.md](../sdk-local-backend/status.md). Kept for history.
+
 # Proposal: evolve the ports toward a session shaped seam
 
 ## Principle
diff --git a/docs/design/agent-workflows/scratch/harness-port-redesign/research.md b/docs/design/agent-workflows/scratch/harness-port-redesign/research.md
index e8913189a6..7d12e19942 100644
--- a/docs/design/agent-workflows/scratch/harness-port-redesign/research.md
+++ b/docs/design/agent-workflows/scratch/harness-port-redesign/research.md
@@ -1,3 +1,5 @@
+> Superseded by the as-built design in [the design pages](../../README.md) and [scratch/sdk-local-backend/status.md](../sdk-local-backend/status.md). Kept for history.
+
 # Research: our ports vs the rivet SDK
 
 Source verified June 2026 against the installed `sandbox-agent@0.4.2` SDK
diff --git a/docs/design/agent-workflows/scratch/harness-port-redesign/status.md b/docs/design/agent-workflows/scratch/harness-port-redesign/status.md
index 479fc42a88..1f19d84982 100644
--- a/docs/design/agent-workflows/scratch/harness-port-redesign/status.md
+++ b/docs/design/agent-workflows/scratch/harness-port-redesign/status.md
@@ -1,3 +1,5 @@
+> Superseded by the as-built design in [the design pages](../../README.md) and [scratch/sdk-local-backend/status.md](../sdk-local-backend/status.md). Kept for history.
+
 # Status
 
 Source of truth for this design effort. Keep it current.
@@ -11,11 +13,16 @@ The as-built reference is [`implementation.md`](implementation.md); the comparis
 [`proposal.md`](proposal.md) and [`plan.md`](plan.md). Builds on the shipped WP-8 runtime
 ([`../wp-8-rivet-acp-runtime/status.md`](../wp-8-rivet-acp-runtime/status.md)).
 
-The new port (`Environment` + `Harness` + `AgentSession`, capabilities, content blocks,
-structured events/result) ships with both backends (rivet ACP, legacy in-process Pi) on
-two transports sharing one wire contract. Verified live: pi, rivet+pi+local,
-rivet+claude+local, rivet+pi+daytona; a playground run nests `invoke_agent` under the
-`/invoke` span with usage. A high-effort review found and fixed 10 issues.
+The port this effort shipped (`Environment` + `Harness` + `AgentSession`, capabilities,
+content blocks, structured events/result) drove both backends (rivet ACP, in-process Pi)
+over a shared wire contract. Verified live: pi, rivet+pi+local, rivet+claude+local,
+rivet+pi+daytona; a playground run nests `invoke_agent` under the `/invoke` span with usage.
+A high-effort review found and fixed 10 issues.
+
+The runtime later moved into the SDK at `agenta.sdk.agents` and the ports were reshaped
+into `Backend` / `Environment` / `Harness` / `Session`, with the engine modelled as a
+`Backend` class rather than a wire string. The names in this status file are from the
+original effort. See the [design pages](../../README.md) for the as-built shape.
 
 ## Recommendation in one line
 
diff --git a/docs/design/agent-workflows/scratch/research/diskless-in-memory-config.md b/docs/design/agent-workflows/scratch/research/diskless-in-memory-config.md
index a4f13732ca..5eb0848e84 100644
--- a/docs/design/agent-workflows/scratch/research/diskless-in-memory-config.md
+++ b/docs/design/agent-workflows/scratch/research/diskless-in-memory-config.md
@@ -1,3 +1,4 @@
+> **Historical record.** This is a work-package note. It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md).
 # Pi agent harness: diskless / in-memory config
 
 Research target: Pi coding agent (pi.dev, Earendil Inc.), npm
diff --git a/docs/design/agent-workflows/scratch/research/open-questions.md b/docs/design/agent-workflows/scratch/research/open-questions.md
index dd9d37fd47..f1883fd408 100644
--- a/docs/design/agent-workflows/scratch/research/open-questions.md
+++ b/docs/design/agent-workflows/scratch/research/open-questions.md
@@ -1,3 +1,4 @@
+> **Historical record.** This is a work-package note. It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md).
 # Agent Workflows: Daytona and pi.dev due-diligence
 
 Status: research only. Broad due-diligence to surface what the focused research topics
diff --git a/docs/design/agent-workflows/scratch/research/pi-interaction.md b/docs/design/agent-workflows/scratch/research/pi-interaction.md
index c5a1fee83c..d982693113 100644
--- a/docs/design/agent-workflows/scratch/research/pi-interaction.md
+++ b/docs/design/agent-workflows/scratch/research/pi-interaction.md
@@ -1,3 +1,4 @@
+> **Historical record.** This is a work-package note. It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md).
 # Research: Programmatically driving the pi.dev agent harness
 
 Status: research only. No code changed outside this file.
diff --git a/docs/design/agent-workflows/scratch/sdk-local-backend/status.md b/docs/design/agent-workflows/scratch/sdk-local-backend/status.md
new file mode 100644
index 0000000000..1f5506fbec
--- /dev/null
+++ b/docs/design/agent-workflows/scratch/sdk-local-backend/status.md
@@ -0,0 +1,81 @@
+# Status: SDK-owned agent runtime + local backend
+
+Source of truth for this effort and the handoff for whoever continues it. This is the only
+page in `docs/design/agent-workflows/` that describes things that do not fully exist yet; the
+design pages describe only what is built.
+
+## What this effort is
+
+Two things, layered on the agreed three-layer port redesign (Backend / Environment / Harness
+plus neutral and per-harness configs):
+
+1. Move the neutral agent runtime out of the service and into the published Python SDK, so an
+   SDK user can download an agent config and run it locally with no Agenta backend.
+2. Add a `LocalBackend` that runs a harness on the user's own machine (Pi via a bundled JS
+   runner, Claude via the Python `claude-agent-sdk`).
+
+## Current state (2026-06-18)
+
+### Done and verified (by import + wire-equivalence; live `/invoke` not re-run, see below)
+
+- **SDK runtime** at `sdks/python/agenta/sdk/agents/`, hexagonal layout:
+  - `dtos.py` — Pydantic data contracts: `AgentConfig` (+ `from_params`), `RunSelection`,
+    `SessionConfig`, `Message`, `ContentBlock`, `AgentEvent`, `AgentResult`,
+    `HarnessCapabilities`, `HarnessType`, `TraceContext`, `ToolCallback`,
+    `HarnessAgentConfig` + `PiAgentConfig` / `ClaudeAgentConfig`.
+  - `interfaces.py` — the ports (ABCs): `Backend`, `Environment`, `Sandbox`, `Session`,
+    `Harness`.
+  - `errors.py` — `UnsupportedHarnessError`.
+  - `adapters/rivet.py` — `RivetBackend` (engine hard-coded `rivet`; pi + claude; `sandbox`
+    kwarg) + `RivetSandbox` / `RivetSession`.
+  - `adapters/in_process.py` — `InProcessPiBackend` (engine hard-coded `pi`; pi only, local
+    only; the reference backend) + its sandbox/session.
+  - `adapters/local.py` — `LocalBackend`, STUB (raises `NotImplementedError`).
+  - `adapters/harnesses.py` — `PiHarness`, `ClaudeHarness`, `make_harness`; this holds the
+    per-harness adaptation (tool-spec normalization; Pi keeps built-ins and forces
+    `permissionPolicy=auto`; Claude drops built-ins and honors the policy).
+  - `utils/wire.py` — `request_to_wire` / `result_from_wire` (the `/run` shape).
+  - `utils/ts_runner.py` — `deliver_http` / `deliver_subprocess`.
+- **Public surface**: `ag.AgentConfig`, `ag.SessionConfig`, `ag.RunSelection`,
+  `ag.Environment`, `ag.RivetBackend`, `ag.InProcessPiBackend`, `ag.LocalBackend`,
+  `ag.PiHarness`, `ag.ClaudeHarness`, `ag.make_harness`. `ag.Message` is deliberately the
+  prompt type (unchanged); import the agents `Message` from `agenta.sdk.agents`.
+- **Service rewired**: `services/oss/src/agent/app.py` builds `AgentConfig.from_params` +
+  `RunSelection`, picks a backend via `select_backend`, runs through `Environment` +
+  `make_harness`. `tools.py` / `tracing.py` import the SDK `ToolCallback` / `TraceContext`.
+  `services/oss/src/agent/inputs.py` and the whole `services/oss/src/harness/` package were
+  deleted (their content now lives in the SDK).
+- The full `_agent` handler emits a `/run` payload byte-identical to the previous one, so the
+  TypeScript runner (`services/agent/`) is unchanged. `ruff format` + `ruff check` pass.
+
+### Not done yet (take over here)
+
+- **`LocalBackend` (the new feature).** Two mechanisms, one per harness:
+  - Pi → bundled JS runner. Needs a `pnpm` build step that bundles the in-process Pi engine
+    to a single JS file shipped inside the `agenta` wheel, and `LocalBackend` invoking it
+    with `node`. (Decision: bundle prebuilt JS in the wheel.)
+  - Claude → the pure-Python `claude-agent-sdk`, in-process, no TS bridge. (Decision: use
+    `claude-agent-sdk`, not a TS engine.)
+  Both need build/dependency setup to verify, which is why they are not started.
+- **Live verification.** Everything above is verified by import + wire-equivalence only. A
+  real `/invoke` run on the dev stack (pi+local, rivet+pi, rivet+claude, rivet+pi+daytona)
+  has NOT been re-run since the refactor. Do this before treating the rewrite as shipped; see
+  the `debug-local-deployment` skill.
+
+## Locked decisions
+
+- Vocabulary follows `api/`: `dtos.py` (data), `interfaces.py` (ports/ABCs), `adapters/`
+  (implementations). A port is an interface; an adapter is an implementation.
+- Backends are NOT a class hierarchy. Each hard-codes its engine id and supported harnesses;
+  they share only the `utils` functions. `InProcessPiBackend` is the reference backend.
+- DTOs are Pydantic.
+- `Harness` (not the backend) owns the per-harness adaptation logic, especially tools.
+- Sandbox is a backend/environment concern, not a `SessionConfig` field.
+- The TS runner and the `/run` wire stay unchanged.
+
+## Dependency direction
+
+`service -> SDK`, never the reverse. The SDK runtime never calls the Agenta API. The service
+resolves tools (`/tools/resolve`), vault secrets (`/secrets/`), and the trace context, and
+hands the SDK already-resolved data on the `SessionConfig`. A standalone SDK user supplies
+their own inputs (env keys, their own tools, no tracing) and uses `LocalBackend`.
diff --git a/docs/design/agent-workflows/scratch/wp-1-pi-tracing/integrating-the-tracing-extension.md b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/integrating-the-tracing-extension.md
index 0be59d585c..dbb2c72a50 100644
--- a/docs/design/agent-workflows/scratch/wp-1-pi-tracing/integrating-the-tracing-extension.md
+++ b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/integrating-the-tracing-extension.md
@@ -1,3 +1,4 @@
+> **Historical record.** This is a work-package note. It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md).
 # Integrating the Pi tracing extension into the agent runtime
 
 Status: ready to integrate. Audience: whoever builds the Dockerized Pi agent runtime
diff --git a/docs/design/agent-workflows/scratch/wp-1-pi-tracing/tracing-in-the-agent-service.md b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/tracing-in-the-agent-service.md
index 977427469c..9c53d2f4bd 100644
--- a/docs/design/agent-workflows/scratch/wp-1-pi-tracing/tracing-in-the-agent-service.md
+++ b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/tracing-in-the-agent-service.md
@@ -1,3 +1,4 @@
+> **Historical record.** This is a work-package note. It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md).
 # Tracing the agent run into the response, like completion and chat
 
 Status: built and verified end to end against the dev box (2026-06-15). Audience:
diff --git a/docs/design/agent-workflows/scratch/wp-2-agent-service/README.md b/docs/design/agent-workflows/scratch/wp-2-agent-service/README.md
index c0a5731f6a..433e368998 100644
--- a/docs/design/agent-workflows/scratch/wp-2-agent-service/README.md
+++ b/docs/design/agent-workflows/scratch/wp-2-agent-service/README.md
@@ -1,3 +1,4 @@
+> **Historical record.** This is a work-package note. It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md).
 # WP-2: Agent service wrapping Pi
 
 Status: not started.
diff --git a/docs/design/agent-workflows/scratch/wp-2-agent-service/implementation-plan.md b/docs/design/agent-workflows/scratch/wp-2-agent-service/implementation-plan.md
index f905fd6d7a..881030e73b 100644
--- a/docs/design/agent-workflows/scratch/wp-2-agent-service/implementation-plan.md
+++ b/docs/design/agent-workflows/scratch/wp-2-agent-service/implementation-plan.md
@@ -1,3 +1,4 @@
+> **Historical record.** This is a work-package note. It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md).
 # WP-2 implementation plan: agent service wrapping Pi
 
 Status: MVP built and verified by curl (2026-06-15). Decisions below were taken; the
diff --git a/docs/design/agent-workflows/scratch/wp-7-tools/README.md b/docs/design/agent-workflows/scratch/wp-7-tools/README.md
index 483f5dc688..2e02f02f18 100644
--- a/docs/design/agent-workflows/scratch/wp-7-tools/README.md
+++ b/docs/design/agent-workflows/scratch/wp-7-tools/README.md
@@ -1,3 +1,4 @@
+> **Historical record.** This is a work-package note. It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md).
 # WP-7: Runnable tools as agent configuration
 
 Status: Composio MVP implemented. Resolution lives in `api`; the bridge routes Pi tool
diff --git a/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/README.md b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/README.md
index 716a97d60e..fa033f7848 100644
--- a/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/README.md
+++ b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/README.md
@@ -1,3 +1,4 @@
+> **Historical record.** This is a work-package note. It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md).
 # WP-8: Rivet + ACP agent runtime
 
 Status: design ready to implement. Start at [`plan.md`](plan.md). Decisions and open
diff --git a/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/architecture.md b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/architecture.md
index a9e71321f3..9aa721406b 100644
--- a/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/architecture.md
+++ b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/architecture.md
@@ -1,3 +1,4 @@
+> **Historical record.** This is a work-package note. It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md).
 # Architecture
 
 ## Principle
diff --git a/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/context.md b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/context.md
index fe7d1ecac0..516648eec9 100644
--- a/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/context.md
+++ b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/context.md
@@ -1,3 +1,4 @@
+> **Historical record.** This is a work-package note. It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md).
 # Context: the code that exists today
 
 Read this to orient on the current service before changing it. All paths are in this repo
diff --git a/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/plan.md b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/plan.md
index 17a0051827..9a71a49b3e 100644
--- a/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/plan.md
+++ b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/plan.md
@@ -1,3 +1,4 @@
+> **Historical record.** This is a work-package note. It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md).
 # Plan
 
 Phased so each phase is demonstrable and reversible. Phases 0 to 2 deliver the four
diff --git a/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/status.md b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/status.md
index 836d60f6ee..1a70a2301d 100644
--- a/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/status.md
+++ b/docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/status.md
@@ -1,3 +1,4 @@
+> **Historical record.** This is a work-package note. It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md).
 # Status
 
 Source of truth for this WP. Keep it current.
diff --git a/docs/design/agent-workflows/sessions.md b/docs/design/agent-workflows/sessions.md
index 2af252ee48..20a30089b0 100644
--- a/docs/design/agent-workflows/sessions.md
+++ b/docs/design/agent-workflows/sessions.md
@@ -31,19 +31,20 @@ it forward.
 
 ## The session is already a first-class object
 
-Even though the lifecycle is cold, the port models a session as a real object. The workflow
-handler does not call `invoke` directly. It calls:
+Even though the lifecycle is cold, the ports model a session as a real object. The workflow
+handler works through a `Harness` over an `Environment`:
 
 ```python
-session = harness.create_session(config)
+harness = make_harness(harness_type, Environment(backend))
+session = await harness.create_session(config)
 result = await session.prompt(messages)
 await session.destroy()
 ```
 
-`AgentSession` is the rivet-shaped abstraction described on the
-[ports and adapters](ports-and-adapters.md) page. Under the cold model, `prompt` is a fresh
-`invoke` that replays history and `destroy` is a no-op. The abstraction is stable. Only the
-mechanism behind it is cold. This matters because it gives a future session store a clean
+`Session` is the conversation abstraction described on the
+[ports and adapters](ports-and-adapters.md) page. Under the cold model, `prompt` sends a
+fresh `/run` that replays history and `destroy` is a no-op. The abstraction is stable. Only
+the mechanism behind it is cold. This matters because it gives a future session store a clean
 place to attach, with no change to the handler above it.
 
 ## Why we kept it cold on purpose
@@ -73,7 +74,7 @@ backend database, or by a file for a standalone run) holds the event history. To
 the service replays the persisted history into a fresh cold sandbox, exactly as today, but
 the platform owns the record instead of the client. This keeps the strong isolation of the
 cold model and still gives durable, server-owned sessions. It is the smaller step, and the
-`AgentSession` object is already the place it attaches.
+`Session` object is already the place it attaches.
 
 **Path two: a warm daemon with `session/load`.** Keep a daemon alive between turns and use
 the ACP `session/load` call to restore the real in-harness session, no transcript replay.
diff --git a/docs/design/agent-workflows/streaming-and-sessions.md b/docs/design/agent-workflows/streaming-and-sessions.md
new file mode 100644
index 0000000000..443aad4963
--- /dev/null
+++ b/docs/design/agent-workflows/streaming-and-sessions.md
@@ -0,0 +1,481 @@
+# RFC: Streaming and sessions for the agent interface
+
+Status: **Proposed**. Audience: the frontend lead who will build against this, and the
+backend engineer who will implement it. This RFC adds two things to the existing workflow
+interface. It does not replace it.
+
+This is the design document: the why, the options, and the trade-offs. The normative wire
+spec (endpoints, message formats, MUST/SHOULD rules) lives in the
+[Agent protocol RFC](agent-protocol-rfc.md). Read this one for the reasoning and that one to build against.
+
+## Why this exists
+
+Today every workflow, including the agent, runs behind one request and one response. The
+playground sends `POST /invoke`, waits, and renders the final answer. That works for a
+prompt that calls a model once. It does not work for an agent.
+
+An agent runs a loop. It thinks, calls a tool, reads the result, and calls the model again,
+sometimes for a minute or more before it has a final answer. Two things break under the
+single-response model:
+
+1. **The user sees nothing until the end.** No tokens, no "the agent is calling a tool
+   now," no thinking. For a long run this reads as a hang.
+2. **Multi-turn conversation is the client's job.** The client holds the whole history and
+   replays it on every turn (see [sessions.md](sessions.md)). The platform does not own the
+   conversation, so the client cannot reconnect, reload, or share it.
+
+This RFC addresses both. It streams the agent's work to the browser as it happens, in the
+[Vercel AI SDK](https://ai-sdk.dev) wire format so the frontend can use `useChat` directly.
+And it gives the agent a named **session** so a conversation can be grouped, reloaded, and
+later moved server-side. The streaming piece lands in full. The session piece lands as the
+identifier and the load endpoint now, with server-owned history as the next step.
+
+## What we are adding, in one paragraph
+
+We add a new endpoint, `POST /messages`, for the chat agent. It sits next to the existing
+`/invoke`, which does not change. `/messages` carries an optional `session_id` and offers two
+response modes. Ask for JSON and you get a single response, like `/invoke` gives today. Ask
+with `Accept: text/event-stream` and the same call streams the run as Vercel UI-message parts
+over SSE. Pass a `session_id`, and the platform ties the turn to a named conversation: it
+records the turn under that id and returns the id. A second endpoint, `load-session`, returns
+a session's history so the client can rebuild the conversation in the UI.
+
+Why a new endpoint and not a flag on `/invoke`? The chat contract differs enough to stand on
+its own. The conversation is a first-class `messages` input in the Vercel `UIMessage` shape,
+the response can stream, and a turn belongs to a session. Overloading `/invoke` with all of
+that would blur the simple, stateless workflow call. A sibling endpoint keeps each contract
+clean.
+
+For now the client still sends the full message history on every turn, exactly as it does
+today. The `session_id` rides alongside that history. It names the conversation, it lets
+turns be grouped and reloaded, and it is the foothold for the larger step of moving history
+into the platform so the client sends only the new turn. That larger step is the
+[next direction](#what-stays-client-side-for-now), not part of this RFC.
+
+Three pieces, each additive:
+
+| Piece | Endpoint | What it does |
+| --- | --- | --- |
+| Session id | `POST /messages` | Names the conversation a turn belongs to; returns the id |
+| Streaming | `POST /messages` with `Accept: text/event-stream` | Streams the run in Vercel format |
+| Load | `POST /load-session` | Returns a session's history for the UI to rebuild |
+
+## Background: the contract we are extending
+
+The shapes below are the current contract. The RFC adds fields; it does not rename them.
+
+**Request** is `WorkflowInvokeRequest` (`sdks/python/agenta/sdk/models/workflows.py:257`).
+The body that matters is the `data` envelope (`workflows.py:206`): `inputs` (the template
+variables, including `messages` for a chat workflow), `parameters` (the agent config), and
+`trace`. The agent app reads `inputs.messages` and `parameters.agent`
+(`services/oss/src/agent/app.py:65`).
+
+**Response** is `WorkflowServiceResponse` (`workflows.py:321`). The assistant reply rides in
+`data.outputs` as `{"role": "assistant", "content": ...}`. The envelope also carries
+`trace_id` and `span_id` at the top level (`workflows.py:289`). Token usage is **not** in
+the response. It lives on the trace span and the client reads it from tracing.
+
+**The agent run** already produces a structured event stream internally. The runner emits
+`AgentEvent`s as the run proceeds (`services/agent/src/protocol.ts:74`):
+
+```ts
+type AgentEvent =
+  | { type: "message";     text: string }
+  | { type: "thought";     text: string }
+  | { type: "tool_call";   id?: string; name?: string; input?: unknown }
+  | { type: "tool_result"; id?: string; output?: string; isError?: boolean }
+  | { type: "usage";       input?; output?; total?; cost? }
+  | { type: "error";       message: string }
+  | { type: "done";        stopReason?: string };
+```
+
+Today the runner buffers these and returns the whole log on the result, because `/invoke`
+is request-and-response. An `on_event` sink already exists to receive them live
+(`Harness.invoke(..., on_event=...)`, `ports-and-adapters.md`). **Streaming is the act of
+wiring that sink to the HTTP edge and encoding each event as a Vercel part.** The event
+kinds line up with the Vercel parts almost one to one, which is why this is an encoder, not
+a rewrite.
+
+`session_id` already flows through the agent runner (`SessionConfig.session_id`,
+`AgentResult.session_id`) and rides on the trace as `ag.session.id`. It just never reached
+the HTTP body. The new `/messages` endpoint carries it in the request and response body.
+
+## The session model
+
+A session is a named conversation identified by a `session_id`. The id appears in the
+request body and the response body, never in a request header. For now it names and records the
+conversation. It does not yet hold the context the model sees, because the client still
+sends the full history (see [what stays client-side](#what-stays-client-side-for-now)).
+
+### How a session id is resolved
+
+```
+client sends session_id?
+├── no  → server mints a new id, records the turn under it, returns the id
+└── yes → does a session with this id exist for this project?
+          ├── no  → create the session with the client's id, record the turn
+          └── yes → record the turn under the existing session
+```
+
+This is an upsert keyed by `(project_id, session_id)`. The same call creates or continues.
+"Continue" means the turn is recorded under that session. The conversation context still
+comes from the messages the client sends, not from the server's record. That changes when
+history moves server-side.
+
+### Client lifecycle
+
+```
+New conversation
+  1. client generates session_id (or omits it and adopts the one the server returns)
+  2. POST /messages { session_id, full history }   → stream
+  3. reuse session_id for every later turn
+
+Returning to a known conversation (new page load, another device)
+  1. POST /load-session { session_id }           → history
+  2. render it, and hold it to send on the next turn
+  3. POST /messages { session_id, full history } → stream continues it
+```
+
+A fresh client holds no history. `load-session` is how it gets the conversation back, both
+to render it and to have it to resend on the next turn.
+
+### What stays client-side for now
+
+The client still sends the full message history on every turn, the same as today. The
+`session_id` rides alongside it. The server does not yet read its own record to build the
+model's context, so today the history on the wire is authoritative.
+
+Moving that history into the platform is the next step, not this RFC. When it lands, a
+request with a `session_id` carries only the new turn and the platform supplies the rest.
+That is what makes reconnect and sharing cheap, and it is why the `session_id` belongs in
+the contract now even though the payload has not shrunk yet. [sessions.md](sessions.md)
+covers the server-owned-history work in full.
+
+### My notes on the session decisions
+
+The four rules you proposed are sound and they match how `useChat` already works. Three
+things to lock down before building:
+
+- **Scope every id to the project, and check ownership on resume.** "Resume if it exists"
+  must mean "resume if it exists *and belongs to this caller*." Otherwise a client can pass
+  another tenant's `session_id` and read their conversation. If the id exists under a
+  different project, treat it as not found and do not resume. The unique key is
+  `(project_id, session_id)`, not `session_id` alone.
+- **Validate client-supplied ids.** Accepting a client id means the client controls that
+  part of the id space. Bound the length and the charset and treat the id as an opaque
+  token. Never interpolate it into a storage path or a query without escaping. The Vercel
+  docs raise the same path-traversal warning.
+- **Prefer a client-generated id for the `useChat` path, keep server-minting for the
+  rest.** `useChat` takes a fixed `id` up front and round-trips it. If the server mints a
+  *different* id, the client has to adopt it after the first turn, which is awkward in that
+  hook. So for the browser, let the client generate the id and send it from turn one. Keep
+  server-minting for callers that do not care (curl, the SDK, a script). Both paths are
+  supported. This is the one place I would steer the frontend rather than leave it open.
+
+## Streaming: the Vercel UI Message Stream
+
+We stream in the format `useChat` consumes, so the frontend gets messages, tool calls,
+reasoning, and status with no custom parser. This section is the part to build against.
+
+### How a client asks for a stream
+
+Negotiation uses the standard `Accept` header, which the SDK route already honors
+(`routing.py:236`):
+
+- `Accept: application/json` (or no header): the current single JSON response. Unchanged.
+- `Accept: text/event-stream`: the Vercel stream described below.
+
+The `useChat` transport sets this header in one line (see [the frontend
+wiring](#frontend-wiring)). The header `x-vercel-ai-ui-message-stream: v1` is a **response**
+header the server sets, not something the client sends. You were right that headers are the
+wrong place for `session_id`. They are the right place for content negotiation.
+
+### What the format is
+
+The Vercel UI Message Stream (AI SDK v5 and v6) is plain SSE. Each part is one event:
+
+```
+data: <compact json>\n\n
+```
+
+and the stream ends with a literal `data: [DONE]\n\n`. A message is a list of **parts**, and
+the part types are:
+
+| Part family | Parts | Carries |
+| --- | --- | --- |
+| Lifecycle | `start`, `start-step`, `finish-step`, `finish` | message id, step boundaries, finish reason |
+| Text | `text-start`, `text-delta`, `text-end` | streamed assistant text, grouped by an `id` |
+| Reasoning | `reasoning-start`, `reasoning-delta`, `reasoning-end` | the model's thinking |
+| Tool input | `tool-input-start`, `tool-input-delta`, `tool-input-available` | `toolCallId`, `toolName`, the arguments |
+| Tool output | `tool-output-available`, `tool-output-error` | `toolCallId`, the result or an error |
+| File | `file` | `url`, `mediaType` (a data: URL works) |
+| Data / generative UI | `data-<name>` | any JSON, rendered by a custom component on the client |
+| Error | `error` | `errorText` |
+
+One field name is easy to get wrong: text and reasoning deltas use `delta`, but tool input
+deltas use `inputTextDelta`.
+
+**Tool calls** stream as a start, optional argument deltas, then the assembled input:
+
+```
+data: {"type":"tool-input-start","toolCallId":"call_1","toolName":"getWeather"}
+data: {"type":"tool-input-available","toolCallId":"call_1","toolName":"getWeather","input":{"city":"Paris"}}
+```
+
+**Tool results** come back as their own part, keyed by the same `toolCallId`:
+
+```
+data: {"type":"tool-output-available","toolCallId":"call_1","output":{"weather":"sunny"}}
+```
+
+or, on failure, `{"type":"tool-output-error","toolCallId":"call_1","errorText":"..."}`.
+
+**Files** stream as a `file` part: `{"type":"file","url":"...","mediaType":"image/png"}`.
+The url can be an `https://` link or an inline `data:` URL.
+
+**Generative UI** is the `data-<name>` part. The server emits
+`{"type":"data-plan","data":{...}}` and the client renders a component for parts of type
+`data-plan`. Mark a part `"transient": true` to deliver it only to the `onData` callback
+without storing it on the message. This is the extension point for agent-specific UI (a plan
+view, a diff, a progress card). We do not need it for v1, but the format gives it to us for
+free.
+
+### Does the stream also send the whole message at the end?
+
+No. This was your open question, so to be precise: the protocol streams deltas only. There
+is no final full-snapshot event. The client assembles the parts into the final `UIMessage`
+as they arrive, and `finish` then `[DONE]` close it out. The complete message exists
+server-side too (we need it to persist the turn), but we do not re-emit it on the wire.
+
+So the two modes differ cleanly:
+
+- **Non-streaming** (`Accept: application/json`): one JSON response with the whole answer in
+  `data.outputs`, exactly as today.
+- **Streaming** (`Accept: text/event-stream`): deltas, no final snapshot, the client
+  assembles. The turn is recorded on the trace as it is today, which is also what
+  `load-session` reads back.
+
+### Mapping our events to Vercel parts
+
+The streaming edge consumes the `on_event` sink and encodes each `AgentEvent` as one or more
+parts. The mapping:
+
+| Our `AgentEvent` | Vercel parts emitted |
+| --- | --- |
+| run starts (synthesized) | `start` (carries `messageId` and `messageMetadata.sessionId`), then `start-step` |
+| `message` | `text-start` → `text-delta` → `text-end` |
+| `thought` | `reasoning-start` → `reasoning-delta` → `reasoning-end` |
+| `tool_call` | `tool-input-start` then `tool-input-available` |
+| `tool_result` (`isError` false) | `tool-output-available` |
+| `tool_result` (`isError` true) | `tool-output-error` |
+| `usage` | `messageMetadata` on the `finish` part |
+| `error` | `error` |
+| `done` | `finish-step`, then `finish` (`finishReason` = `stopReason`), then `[DONE]` |
+
+Two implementation notes:
+
+- **Steps.** The agent loop's turns map to `start-step` / `finish-step` pairs. Each model
+  call that ends in a tool call closes one step; the post-tool continuation opens the next.
+  The edge synthesizes these boundaries around our native events.
+- **Deltas when we have them.** Our `message` event today carries whole text, not token
+  deltas. When the harness reports `capabilities.streamingDeltas`, the edge forwards real
+  deltas. When it does not, it emits `text-start`, one `text-delta` with the full text, and
+  `text-end`. The wire shape is identical either way, so the frontend does not care.
+
+### Where the session id rides in the stream
+
+The stream's "body" is the event sequence, so `session_id` cannot be a plain top-level
+field the way it is in the JSON response. It rides on the first event, as metadata on
+`start`:
+
+```
+data: {"type":"start","messageId":"msg_abc","messageMetadata":{"sessionId":"sess_123"}}
+```
+
+The client reads it from the assembled message's metadata. For the server-minted case, this
+is how the client learns the id. For the client-generated case, it is a confirming echo. We
+will also mirror it in a response header, which costs nothing and serves non-`useChat`
+callers, but the body is the source of truth.
+
+## The contract
+
+### `POST /messages`
+
+Carries `session_id` (optional) at the envelope top level, alongside `trace_id` and
+`span_id`. The conversation is a first-class `data.messages` member in the `UIMessage` shape;
+`data.inputs` holds the named input variables.
+
+Request:
+
+```jsonc
+{
+  "session_id": "sess_123",          // optional; omit to let the server mint one
+  "data": {
+    "messages":   [ /* the full conversation so far, as UIMessage[] */ ],
+    "inputs":     { /* named input variables, no longer holds messages */ },
+    "parameters": { "agent": { "instructions": "...", "model": "...", "tools": [ ... ] } }
+  }
+}
+```
+
+For now `data.messages` carries the full history, the same as today, and the `session_id`
+rides alongside it. When history moves server-side, this shrinks to the new turn only and
+the platform supplies the rest. The field stays the same either way.
+
+Non-streaming response (`Accept: application/json`) adds `session_id` to the envelope:
+
+```jsonc
+{
+  "trace_id": "...",
+  "span_id":  "...",
+  "session_id": "sess_123",
+  "status":   { "code": 200 },
+  "data":     { "outputs": { "role": "assistant", "content": "Berlin." } }
+}
+```
+
+Streaming response (`Accept: text/event-stream`) sets these headers:
+
+```
+content-type: text/event-stream
+cache-control: no-cache
+x-vercel-ai-ui-message-stream: v1
+x-accel-buffering: no
+```
+
+and emits the part sequence above, with `session_id` in the `start` metadata. The
+[appendix](#appendix-a-stream-transcript) shows a full transcript.
+
+### `POST /load-session`
+
+Returns a session's history so the client can rebuild the conversation before its next turn.
+
+Request:
+
+```jsonc
+{ "session_id": "sess_123" }
+```
+
+Response: the conversation as Vercel `UIMessage`s, the exact shape `useChat` takes as its
+initial `messages`:
+
+```jsonc
+{
+  "session_id": "sess_123",
+  "messages": [
+    { "id": "m1", "role": "user",      "parts": [ { "type": "text", "text": "capital of France?" } ] },
+    { "id": "m2", "role": "assistant", "parts": [ { "type": "text", "text": "Paris." } ] }
+  ]
+}
+```
+
+**Open: folded messages or a delta replay?** You described it as "all events from the
+beginning," and the return shape is not settled. Two options:
+
+- **Folded `UIMessage`s** (shown above). The client renders them at once, and `useChat`
+  takes them directly as its initial `messages`. Fast, no animation. This is the simpler
+  path and the natural fit for rebuilding the UI on load.
+- **A delta replay** behind `Accept: text/event-stream` on this same endpoint: re-emit the
+  stored stream part by part. This reuses the streaming encoder and matches "all events,"
+  but it animates the whole history on every load, which is rarely what a reload wants. It
+  earns its keep mainly when resuming a run that is still in flight.
+
+Leaving this open. The endpoint can serve both by content negotiation, the same way
+`/messages` does, so we do not have to choose now.
+
+**Where the history comes from.** Every turn's events are already persisted as spans keyed
+by `ag.session.id` (`api/.../tracing`). So `load-session` can fold those spans into its
+response with no new storage. A dedicated session store is the durable evolution
+([sessions.md](sessions.md), path one), and it slots in behind the same response shape.
+
+### Frontend wiring
+
+The frontend points `useChat` at our endpoint and customizes the body and headers through
+the transport. This is the whole integration:
+
+```ts
+const transport = new DefaultChatTransport({
+  api: "/messages",
+  headers: { Accept: "text/event-stream" },
+  prepareSendMessagesRequest: ({ id, messages }) => ({
+    body: {
+      session_id: id,                  // client-generated, stable across turns
+      data: {
+        messages,                      // full history for now; shrinks to the new turn later
+        inputs: { /* named variables */ },
+        parameters: { agent: agentConfig },
+      },
+    },
+  }),
+});
+
+const { messages, sendMessage, status } = useChat({ id: sessionId, transport });
+```
+
+To rebuild a known conversation on load, fetch `load-session` and pass the result to
+`useChat({ id, messages })`.
+
+## Out of scope for v1
+
+We are forward-compatible with these, but they are not in this RFC:
+
+- **Resuming an in-flight stream** after a dropped connection. Vercel supports it with a
+  `GET /messages/{session_id}/stream` and resumable-stream storage. Worth adding once runs
+  get long, but the reload-and-load-session path covers the common case first.
+- **Client file and image input.** Our `ContentBlock` already models `image` and `resource`
+  (`protocol.ts:10`), and Vercel sends files in the body, so the plumbing exists. Turning it
+  on is its own change.
+- **Generative UI components.** The `data-<name>` part is ready on the wire. Designing the
+  agent-specific parts (plan, diff, progress) and their React components is a later step.
+- **Session deletion and forking.** A `DELETE` for cleanup and a `fork` for branching a
+  conversation (`session/fork`, [sessions.md](sessions.md), path two) come with the warm
+  daemon, not here.
+
+## Appendix A: stream transcript
+
+One agent turn: the model calls a weather tool, reads the result, and answers. Every `data:`
+line in order, blank line (`\n\n`) after each.
+
+```
+data: {"type":"start","messageId":"msg_1","messageMetadata":{"sessionId":"sess_123"}}
+
+data: {"type":"start-step"}
+
+data: {"type":"tool-input-start","toolCallId":"call_1","toolName":"getWeather"}
+
+data: {"type":"tool-input-available","toolCallId":"call_1","toolName":"getWeather","input":{"city":"Paris"}}
+
+data: {"type":"tool-output-available","toolCallId":"call_1","output":{"weather":"sunny","temp":24}}
+
+data: {"type":"finish-step"}
+
+data: {"type":"start-step"}
+
+data: {"type":"text-start","id":"t1"}
+
+data: {"type":"text-delta","id":"t1","delta":"It is sunny "}
+
+data: {"type":"text-delta","id":"t1","delta":"and 24°C in Paris."}
+
+data: {"type":"text-end","id":"t1"}
+
+data: {"type":"finish-step"}
+
+data: {"type":"finish","messageMetadata":{"usage":{"input":820,"output":36,"cost":0.004}}}
+
+data: [DONE]
+```
+
+## Appendix B: sources
+
+- The current contract: `sdks/python/agenta/sdk/models/workflows.py`,
+  `sdks/python/agenta/sdk/decorators/routing.py` (SSE negotiation at `:236`),
+  `api/oss/src/core/workflows/service.py`.
+- The agent events and session id: `services/agent/src/protocol.ts:74`,
+  `sdks/python/agenta/sdk/agents/dtos.py`, `services/oss/src/agent/app.py`.
+- Sessions today and tomorrow: [sessions.md](sessions.md).
+- Vercel UI Message Stream (v5/v6): the `useChat`, stream-protocol, tool-usage,
+  generative-UI, persistence, and transport pages at https://ai-sdk.dev, and the chunk
+  schema at
+  https://github.com/vercel/ai/blob/main/packages/ai/src/ui-message-stream/ui-message-chunks.ts.
+```
diff --git a/sdks/python/agenta/__init__.py b/sdks/python/agenta/__init__.py
index df014c4e00..dc01c3396a 100644
--- a/sdks/python/agenta/__init__.py
+++ b/sdks/python/agenta/__init__.py
@@ -52,6 +52,23 @@
 from .sdk.utils.logging import get_module_logger  # noqa: F401
 from .sdk.utils.preinit import PreInitObject  # noqa: F401
 
+# Agent runtime (the agents subsystem). `Message` is intentionally not re-exported here:
+# `agenta.Message` already names the prompt message type; import the agents one from
+# `agenta.sdk.agents` when needed.
+from .sdk.agents import (  # noqa: F401
+    AgentaHarness,
+    AgentConfig,
+    ClaudeHarness,
+    Environment,
+    InProcessPiBackend,
+    LocalBackend,
+    PiHarness,
+    RivetBackend,
+    RunSelection,
+    SessionConfig,
+    make_harness,
+)
+
 DEFAULT_AGENTA_SINGLETON_INSTANCE = AgentaSingleton()
 
 types = client_types
diff --git a/sdks/python/agenta/sdk/agents/__init__.py b/sdks/python/agenta/sdk/agents/__init__.py
new file mode 100644
index 0000000000..38c5daca39
--- /dev/null
+++ b/sdks/python/agenta/sdk/agents/__init__.py
@@ -0,0 +1,91 @@
+"""Agenta agent runtime: run a coding harness (Pi, Claude, ...) as a swappable port.
+
+Layers (Agenta's hexagonal vocabulary):
+
+- ``dtos.py`` — data contracts (``AgentConfig``, ``SessionConfig``, ``Message``, ...).
+- ``interfaces.py`` — the ports (ABCs): ``Backend``, ``Environment``, ``Sandbox``,
+  ``Session``, ``Harness``.
+- ``adapters/`` — implementations: ``RivetBackend`` / ``InProcessPiBackend`` / ``LocalBackend``
+  and ``PiHarness`` / ``ClaudeHarness`` / ``AgentaHarness``.
+- ``utils/`` — shared plumbing (the ``/run`` wire and the transports to the TS runner).
+
+Standalone usage::
+
+    import agenta as ag
+    from agenta.sdk.agents import Message
+
+    cfg = ag.ConfigManager.get_from_registry(app_slug="my-agent")
+    agent = ag.AgentConfig.from_params(cfg)
+    harness = ag.PiHarness(ag.Environment(ag.RivetBackend()))
+    result = await harness.prompt(ag.SessionConfig(agent=agent), [Message(role="user", content="hi")])
+"""
+
+from .adapters import (
+    AgentaHarness,
+    ClaudeHarness,
+    InProcessPiBackend,
+    LocalBackend,
+    PiHarness,
+    RivetBackend,
+    make_harness,
+)
+from .dtos import (
+    AgentaAgentConfig,
+    AgentConfig,
+    AgentEvent,
+    AgentResult,
+    ClaudeAgentConfig,
+    ContentBlock,
+    HarnessAgentConfig,
+    HarnessCapabilities,
+    HarnessType,
+    Message,
+    PermissionPolicy,
+    PiAgentConfig,
+    RunSelection,
+    SessionConfig,
+    ToolCallback,
+    TraceContext,
+    to_messages,
+)
+from .errors import UnsupportedHarnessError
+from .interfaces import Backend, Environment, Harness, Sandbox, Session
+from .streaming import AgentRun
+
+__all__ = [
+    # DTOs
+    "AgentConfig",
+    "RunSelection",
+    "SessionConfig",
+    "HarnessAgentConfig",
+    "PiAgentConfig",
+    "ClaudeAgentConfig",
+    "AgentaAgentConfig",
+    "HarnessType",
+    "HarnessCapabilities",
+    "ContentBlock",
+    "Message",
+    "to_messages",
+    "AgentEvent",
+    "AgentResult",
+    "AgentRun",
+    "TraceContext",
+    "ToolCallback",
+    "PermissionPolicy",
+    # Interfaces (ports)
+    "Backend",
+    "Sandbox",
+    "Session",
+    "Environment",
+    "Harness",
+    # Errors
+    "UnsupportedHarnessError",
+    # Adapters
+    "RivetBackend",
+    "InProcessPiBackend",
+    "LocalBackend",
+    "PiHarness",
+    "ClaudeHarness",
+    "AgentaHarness",
+    "make_harness",
+]
diff --git a/sdks/python/agenta/sdk/agents/adapters/__init__.py b/sdks/python/agenta/sdk/agents/adapters/__init__.py
new file mode 100644
index 0000000000..e8002be7af
--- /dev/null
+++ b/sdks/python/agenta/sdk/agents/adapters/__init__.py
@@ -0,0 +1,23 @@
+"""Adapters: concrete implementations of the agent runtime ports.
+
+- Backend adapters: ``RivetBackend`` (rivet over ACP), ``InProcessPiBackend`` (in-process Pi,
+  the reference backend), ``LocalBackend`` (standalone SDK runs; not yet implemented).
+- Harness adapters: ``PiHarness``, ``ClaudeHarness``, ``AgentaHarness`` (+ ``make_harness``).
+
+Shared plumbing for the runner-backed adapters lives in ``agents/utils``.
+"""
+
+from .harnesses import AgentaHarness, ClaudeHarness, PiHarness, make_harness
+from .in_process import InProcessPiBackend
+from .local import LocalBackend
+from .rivet import RivetBackend
+
+__all__ = [
+    "RivetBackend",
+    "InProcessPiBackend",
+    "LocalBackend",
+    "PiHarness",
+    "ClaudeHarness",
+    "AgentaHarness",
+    "make_harness",
+]
diff --git a/sdks/python/agenta/sdk/agents/adapters/agenta_builtins.py b/sdks/python/agenta/sdk/agents/adapters/agenta_builtins.py
new file mode 100644
index 0000000000..b5fae23bd2
--- /dev/null
+++ b/sdks/python/agenta/sdk/agents/adapters/agenta_builtins.py
@@ -0,0 +1,90 @@
+"""The Agenta harness's forced defaults: the things ``AgentaHarness`` always applies.
+
+``AgentaHarness`` is Pi with an opinion. It is the same engine as :class:`PiHarness`, but
+every run carries a fixed set of Agenta-shipped extras the author cannot turn off:
+
+- a base **persona** appended to Pi's system prompt (``AGENTA_FORCED_APPEND_SYSTEM``),
+- a base **AGENTS.md preamble** the author's instructions are appended to (``AGENTA_PREAMBLE``),
+- a set of **forced tools** (``AGENTA_FORCED_TOOLS``), and
+- a set of **forced skills** (``AGENTA_FORCED_SKILLS``).
+
+The forced *policy* lives here (harness knowledge). The forced skill *files* live with the
+runner that runs Pi, under ``services/agent/skills/<name>/``; the contract between the two is
+the skill directory **name**, so each entry in ``AGENTA_FORCED_SKILLS`` must match a committed
+directory there.
+
+Two layers, kept distinct on purpose (matching Pi's own split, see :class:`PiAgentConfig`):
+the *persona* is an ``append_system`` (changes Pi's base prompt), while *project conventions*
+belong in ``AGENTS.md``. ``AGENTA_PREAMBLE`` is the AGENTS.md layer; ``AGENTA_FORCED_APPEND_SYSTEM``
+is the persona layer.
+"""
+
+from __future__ import annotations
+
+from typing import List, Optional
+
+# The base AGENTS.md preamble. The author's own ``instructions`` are appended after this, so
+# the final AGENTS.md is ``AGENTA_PREAMBLE`` + the author's project conventions.
+#
+# TODO(product): replace this placeholder with the real Agenta AGENTS.md preamble.
+AGENTA_PREAMBLE = """\
+# Agenta agent
+
+You are an agent running on the Agenta platform. The instructions below are Agenta's
+baseline; the user's own instructions follow and take precedence where they are more
+specific.
+
+- Prefer the tools and skills provided to you over guessing.
+- When a skill matches the task, read its SKILL.md fully before acting.
+- Keep answers grounded in what the tools and skills actually return."""
+
+# The base persona, always appended to Pi's built-in system prompt (never replaces it). This
+# is the "who the agent is" layer, distinct from the AGENTS.md project-context layer above.
+#
+# TODO(product): replace this placeholder with the real Agenta persona framing.
+AGENTA_FORCED_APPEND_SYSTEM = """\
+You are an Agenta agent. Be precise, cite what your tools and skills return, and do not
+fabricate results."""
+
+# Built-in tools every Agenta run forces on, unioned with the agent's resolved tools.
+# ``read`` is mandatory: Pi only renders the skills section into the system prompt when the
+# ``read`` tool is available. ``bash`` lets skills run their helper scripts.
+AGENTA_FORCED_TOOLS: List[str] = ["read", "bash"]
+
+# Built-in skills every Agenta run forces on. Each name must match a committed directory under
+# the runner's ``services/agent/skills/<name>/`` (the runner resolves names to those dirs).
+#
+# TODO(product): grow this with the real Agenta skill set.
+AGENTA_FORCED_SKILLS: List[str] = ["agenta-getting-started"]
+
+
+def _join(*parts: Optional[str]) -> Optional[str]:
+    """Join the non-empty parts with a blank line, or ``None`` when nothing remains."""
+    kept = [part.strip() for part in parts if part and part.strip()]
+    if not kept:
+        return None
+    return "\n\n".join(kept)
+
+
+def compose_instructions(user: Optional[str]) -> Optional[str]:
+    """Return the AGENTS.md text the harness ships: ``AGENTA_PREAMBLE`` first, then the
+    author's ``user`` instructions after a blank line (omitted when empty or ``None``)."""
+    return _join(AGENTA_PREAMBLE, user)
+
+
+def compose_append_system(user: Optional[str]) -> Optional[str]:
+    """Return the ``append_system`` the harness ships: ``AGENTA_FORCED_APPEND_SYSTEM`` first,
+    then the author's own ``append_system`` after a blank line (omitted when empty)."""
+    return _join(AGENTA_FORCED_APPEND_SYSTEM, user)
+
+
+def force_tools(builtin_tools: List[str]) -> List[str]:
+    """Union the resolved built-in tools with the forced set, order-stable and de-duplicated
+    (resolved tools first, then any forced tools not already present)."""
+    seen = set()
+    out: List[str] = []
+    for name in list(builtin_tools) + AGENTA_FORCED_TOOLS:
+        if name and name not in seen:
+            seen.add(name)
+            out.append(name)
+    return out
diff --git a/sdks/python/agenta/sdk/agents/adapters/harnesses.py b/sdks/python/agenta/sdk/agents/adapters/harnesses.py
new file mode 100644
index 0000000000..31e52d73da
--- /dev/null
+++ b/sdks/python/agenta/sdk/agents/adapters/harnesses.py
@@ -0,0 +1,168 @@
+"""Adapters of the :class:`~agenta.sdk.agents.interfaces.Harness` port: one per harness type.
+
+This is where the per-harness adaptation lives (the logic that used to sit in the TS runner):
+turning the neutral :class:`SessionConfig` into the harness's own config, especially the
+*tools*. The harnesses genuinely differ, so the adapters do different work:
+
+- **Pi** takes built-in tools by name *and* resolved tool specs, delivered natively (Pi has
+  no MCP). Pi does not gate tool use, so the permission policy does not apply.
+- **Claude** has no built-in tools (they are a Pi concept), delivers tools over MCP, and
+  gates tool use, so the permission policy applies.
+- **Agenta** is Pi with an opinion: the same engine and config shape, plus a fixed set of
+  forced tools, skills, a base AGENTS.md preamble, and a persona (see :mod:`.agenta_builtins`).
+
+The backend below stays pure plumbing; this layer owns the harness knowledge.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List, Type
+
+from agenta.sdk.utils.logging import get_module_logger
+
+from ..dtos import (
+    AgentaAgentConfig,
+    ClaudeAgentConfig,
+    HarnessType,
+    PiAgentConfig,
+    SessionConfig,
+)
+from ..interfaces import Environment, Harness
+from .agenta_builtins import (
+    AGENTA_FORCED_SKILLS,
+    compose_append_system,
+    compose_instructions,
+    force_tools,
+)
+
+log = get_module_logger(__name__)
+
+_EMPTY_OBJECT_SCHEMA: Dict[str, Any] = {"type": "object", "properties": {}}
+
+
+def _opt_str(value: Any) -> str | None:
+    """Return ``value`` unchanged when it is a string with non-whitespace content, else
+    ``None``, so an empty or malformed option never reaches the wire as a real override."""
+    if isinstance(value, str) and value.strip():
+        return value
+    return None
+
+
+def _normalize_tool_specs(specs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Coerce resolved tool specs into the shape every harness expects.
+
+    Drops malformed entries (no name) and fills the defaults the harness runtimes need: a
+    description (falls back to the name) and a JSON-Schema ``inputSchema`` (an empty object
+    when none was resolved). ``callRef`` is preserved so the call routes back to Agenta.
+    """
+    normalized: List[Dict[str, Any]] = []
+    for spec in specs or []:
+        if not isinstance(spec, dict):
+            continue
+        name = spec.get("name")
+        if not name:
+            continue
+        normalized.append(
+            {
+                "name": name,
+                "description": spec.get("description") or name,
+                "inputSchema": spec.get("inputSchema") or dict(_EMPTY_OBJECT_SCHEMA),
+                "callRef": spec.get("callRef"),
+            }
+        )
+    return normalized
+
+
+class PiHarness(Harness):
+    harness_type = HarnessType.PI
+
+    def _to_harness_config(self, config: SessionConfig) -> PiAgentConfig:
+        # Pi delivers tools natively: built-in names plus resolved specs registered through
+        # the Pi extension. Pi does not gate tool use, so the permission policy is dropped.
+        # Pi reads its own slice of the neutral harness_options bag: `system` replaces Pi's
+        # base prompt, `append_system` extends it (both leave AGENTS.md untouched).
+        pi_options = config.agent.harness_options.get(HarnessType.PI.value, {})
+        return PiAgentConfig(
+            agents_md=config.agent.instructions,
+            model=config.agent.model,
+            builtin_tools=list(config.builtin_tools),
+            custom_tools=_normalize_tool_specs(config.custom_tools),
+            tool_callback=config.tool_callback,
+            system=_opt_str(pi_options.get("system")),
+            append_system=_opt_str(pi_options.get("append_system")),
+        )
+
+
+class ClaudeHarness(Harness):
+    harness_type = HarnessType.CLAUDE
+
+    def _to_harness_config(self, config: SessionConfig) -> ClaudeAgentConfig:
+        # Built-in tools are a Pi concept Claude cannot honor: warn and drop them rather than
+        # ship the names. Custom tools go over MCP, and because Claude gates tool use the
+        # permission policy is carried through.
+        if config.builtin_tools:
+            log.warning(
+                "ClaudeHarness ignores %d built-in tool(s); built-ins are a Pi concept",
+                len(config.builtin_tools),
+            )
+        return ClaudeAgentConfig(
+            agents_md=config.agent.instructions,
+            model=config.agent.model,
+            custom_tools=_normalize_tool_specs(config.custom_tools),
+            tool_callback=config.tool_callback,
+            permission_policy=config.permission_policy,
+        )
+
+
+class AgentaHarness(Harness):
+    """Pi with an Agenta opinion. Same engine as :class:`PiHarness`, but every run carries the
+    forced Agenta extras (see :mod:`.agenta_builtins`): a base AGENTS.md preamble the author's
+    instructions are appended to, a forced persona ``append_system``, forced tools, and forced
+    skills. The author's own Pi ``harness_options`` (``system`` / ``append_system``) still
+    apply, layered after the forced bits."""
+
+    harness_type = HarnessType.AGENTA
+
+    def _to_harness_config(self, config: SessionConfig) -> AgentaAgentConfig:
+        # The author's Pi options still apply; the Agenta harness reads the same `pi` slice as
+        # PiHarness (it drives Pi) and layers its forced extras on top.
+        pi_options = config.agent.harness_options.get(HarnessType.PI.value, {})
+        return AgentaAgentConfig(
+            agents_md=compose_instructions(config.agent.instructions),
+            model=config.agent.model,
+            builtin_tools=force_tools(list(config.builtin_tools)),
+            custom_tools=_normalize_tool_specs(config.custom_tools),
+            tool_callback=config.tool_callback,
+            system=_opt_str(pi_options.get("system")),
+            append_system=compose_append_system(
+                _opt_str(pi_options.get("append_system"))
+            ),
+            skills=list(AGENTA_FORCED_SKILLS),
+        )
+
+
+_HARNESSES: Dict[HarnessType, Type[Harness]] = {
+    HarnessType.PI: PiHarness,
+    HarnessType.CLAUDE: ClaudeHarness,
+    HarnessType.AGENTA: AgentaHarness,
+}
+
+
+def make_harness(
+    harness_type: "HarnessType | str", environment: Environment
+) -> Harness:
+    """Construct the Harness for a harness type over an environment.
+
+    Maps the playground/config string to the right class. Raises ``ValueError`` when the type
+    has no registered adapter, and :class:`~agenta.sdk.agents.errors.UnsupportedHarnessError`
+    if the environment's backend cannot drive it.
+    """
+    resolved = HarnessType.coerce(harness_type)
+    try:
+        cls = _HARNESSES[resolved]
+    except KeyError as exc:
+        known = ", ".join(sorted(h.value for h in _HARNESSES))
+        raise ValueError(
+            f"unknown harness '{resolved.value}'; known harnesses: {known}"
+        ) from exc
+    return cls(environment)
diff --git a/sdks/python/agenta/sdk/agents/adapters/in_process.py b/sdks/python/agenta/sdk/agents/adapters/in_process.py
new file mode 100644
index 0000000000..bfd1528bd7
--- /dev/null
+++ b/sdks/python/agenta/sdk/agents/adapters/in_process.py
@@ -0,0 +1,170 @@
+"""InProcessPiBackend: drive Pi in-process through the TS runner, no rivet daemon.
+
+This was the first backend implementation and stays as the simplest one: a single harness
+(Pi), a single place (local), the legacy in-process Pi engine (``engines/pi.ts``). It is the
+reference to read when writing a new backend.
+
+It is its own class and hard-codes its differences (the ``pi`` engine, Pi-only support,
+local-only). It is deliberately NOT a subclass of ``RivetBackend``; the two are different
+engines that happen to share the ``utils`` wire and transport helpers.
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any, AsyncIterator, Dict, List, Mapping, Optional, Sequence
+
+from ..dtos import (
+    AgentResult,
+    EventSink,
+    HarnessAgentConfig,
+    HarnessType,
+    Message,
+    TraceContext,
+)
+from ..interfaces import Backend, Sandbox, Session
+from ..streaming import AgentRun
+from ..utils import (
+    deliver_http,
+    deliver_http_stream,
+    deliver_subprocess,
+    deliver_subprocess_stream,
+    request_to_wire,
+    result_from_wire,
+)
+
+_DEFAULT_COMMAND = ["pnpm", "exec", "tsx", "src/cli.ts"]
+
+
+class InProcessSandbox(Sandbox):
+    """The local host. In-process Pi runs here directly; provisioning files are only
+    buffered in ``self.files`` (AGENTS.md rides the wire today)."""
+
+    def __init__(self) -> None:
+        self.files: Dict[str, bytes] = {}  # buffered provisioning files; see add_files
+
+    async def add_files(self, files: Mapping[str, bytes]) -> None:
+        self.files.update(files)  # merge in place; a repeated key keeps the newest bytes
+
+
+class InProcessPiSession(Session):
+    """One turn-per-prompt Pi session driven in-process by the TS runner."""
+
+    def __init__(
+        self,
+        backend: "InProcessPiBackend",
+        config: HarnessAgentConfig,
+        *,
+        secrets: Optional[Mapping[str, str]],
+        trace: Optional[TraceContext],
+        session_id: Optional[str],
+    ) -> None:
+        self._backend = backend
+        self._config = config
+        self._secrets = dict(secrets or {})  # private copy; ``None`` becomes ``{}``
+        self._trace = trace
+        self._session_id = session_id
+
+    @property
+    def id(self) -> Optional[str]:
+        return self._session_id  # refreshed by _absorb_result after each run
+
+    def _wire_payload(self, messages: Sequence[Message]) -> Dict[str, Any]:
+        """The ``/run`` request JSON for this turn (shared by ``prompt`` and ``stream``)."""
+        return request_to_wire(
+            engine=InProcessPiBackend._ENGINE,
+            harness=HarnessType.PI,  # NOTE(review): always PI, also for AGENTA runs; confirm
+            sandbox="local",
+            config=self._config,
+            messages=messages,
+            secrets=self._secrets,
+            trace=self._trace,
+            session_id=self._session_id,
+        )
+
+    def _absorb_result(self, result: AgentResult) -> None:
+        """Carry the run's session id forward so a follow-up turn resumes it."""
+        if result.session_id:
+            self._session_id = result.session_id
+
+    async def prompt(
+        self,
+        messages: Sequence[Message],
+        *,
+        on_event: Optional[EventSink] = None,
+    ) -> AgentResult:
+        data = await self._backend._deliver(self._wire_payload(messages))
+        result = result_from_wire(data)
+        self._absorb_result(result)
+        if on_event:  # events are replayed from the finished result, not delivered live
+            for event in result.events:
+                try:
+                    on_event(event)
+                except Exception:  # pylint: disable=broad-except
+                    pass  # best-effort: a failing sink must not fail the turn
+        return result
+
+    def stream(self, messages: Sequence[Message]) -> AgentRun:
+        """Run one turn over the streaming transport, yielding events live (see AgentRun)."""
+        records = self._backend._deliver_stream(self._wire_payload(messages))
+        return AgentRun(records).on_result(self._absorb_result)
+
+
+class InProcessPiBackend(Backend):
+    """The in-process Pi engine: drives the Pi SDK directly in the TS runner. Pi only, local
+    only, no rivet daemon."""
+
+    # Agenta is Pi with an opinion: same in-process engine, so this backend drives it too.
+    supported_harnesses = frozenset({HarnessType.PI, HarnessType.AGENTA})
+    _ENGINE = "pi"  # hard-coded engine identity
+
+    def __init__(
+        self,
+        *,
+        url: Optional[str] = None,
+        command: Optional[Sequence[str]] = None,
+        cwd: Optional[str] = None,
+        timeout: float = float(os.getenv("AGENTA_AGENT_TIMEOUT", "180")),
+    ) -> None:
+        self._url = url
+        self._command: List[str] = list(command or _DEFAULT_COMMAND)
+        self._cwd = cwd
+        self._timeout = timeout
+
+    async def create_sandbox(self) -> InProcessSandbox:
+        return InProcessSandbox()
+
+    async def create_session(
+        self,
+        sandbox: Sandbox,
+        config: HarnessAgentConfig,
+        *,
+        harness: HarnessType,
+        secrets: Optional[Mapping[str, str]] = None,
+        trace: Optional[TraceContext] = None,
+        session_id: Optional[str] = None,
+    ) -> InProcessPiSession:
+        return InProcessPiSession(
+            self,
+            config,
+            secrets=secrets,
+            trace=trace,
+            session_id=session_id,
+        )
+
+    async def _deliver(self, payload: Dict[str, Any]) -> Dict[str, Any]:
+        if self._url:
+            return await deliver_http(self._url, payload, timeout=self._timeout)
+        env = {**os.environ, "AGENT_BACKEND": self._ENGINE}
+        return await deliver_subprocess(
+            self._command, payload, cwd=self._cwd, env=env, timeout=self._timeout
+        )
+
+    def _deliver_stream(self, payload: Dict[str, Any]) -> AsyncIterator[Dict[str, Any]]:
+        """The live counterpart of ``_deliver``: an NDJSON record stream from the runner."""
+        if self._url:
+            return deliver_http_stream(self._url, payload, timeout=self._timeout)
+        env = {**os.environ, "AGENT_BACKEND": self._ENGINE}
+        return deliver_subprocess_stream(
+            self._command, payload, cwd=self._cwd, env=env, timeout=self._timeout
+        )
diff --git a/sdks/python/agenta/sdk/agents/adapters/local.py b/sdks/python/agenta/sdk/agents/adapters/local.py
new file mode 100644
index 0000000000..5435ea4751
--- /dev/null
+++ b/sdks/python/agenta/sdk/agents/adapters/local.py
@@ -0,0 +1,48 @@
+"""LocalBackend: run a harness on this machine, no rivet daemon and no Agenta sidecar.
+
+This is the backend a standalone SDK user gets. It is two mechanisms, one per harness, which
+is exactly a backend's "plumbing per harness" job:
+
+- Pi   -> the bundled JS runner (the in-process Pi engine), shipped inside the wheel, run
+          with ``node``.
+- Claude -> the pure-Python ``claude-agent-sdk``, in-process, no TS bridge.
+
+NOT YET IMPLEMENTED. Tracked as Phase 3 (Pi) and Phase 4 (Claude) in
+``docs/design/agent-workflows/scratch/sdk-local-backend/plan.md``. The class is present so
+the adapter layout is complete and the port shape is visible; the methods raise until the
+bundling build step and the ``claude-agent-sdk`` wiring land.
+"""
+
+from __future__ import annotations
+
+from typing import Mapping, Optional
+
+from ..dtos import HarnessAgentConfig, HarnessType, TraceContext
+from ..interfaces import Backend, Sandbox, Session
+
+
+class LocalBackend(Backend):
+    """Run Pi (bundled JS) or Claude (``claude-agent-sdk``) on this machine."""
+
+    supported_harnesses = frozenset({HarnessType.PI, HarnessType.CLAUDE})
+
+    async def create_sandbox(self) -> Sandbox:
+        raise NotImplementedError(
+            "LocalBackend is not implemented yet (Phase 3: Pi via bundled JS, "
+            "Phase 4: Claude via claude-agent-sdk)."
+        )
+
+    async def create_session(
+        self,
+        sandbox: Sandbox,
+        config: HarnessAgentConfig,
+        *,
+        harness: HarnessType,
+        secrets: Optional[Mapping[str, str]] = None,
+        trace: Optional[TraceContext] = None,
+        session_id: Optional[str] = None,
+    ) -> Session:
+        raise NotImplementedError(
+            "LocalBackend is not implemented yet (Phase 3: Pi via bundled JS, "
+            "Phase 4: Claude via claude-agent-sdk)."
+        )
diff --git a/sdks/python/agenta/sdk/agents/adapters/rivet.py b/sdks/python/agenta/sdk/agents/adapters/rivet.py
new file mode 100644
index 0000000000..2316eb0dea
--- /dev/null
+++ b/sdks/python/agenta/sdk/agents/adapters/rivet.py
@@ -0,0 +1,186 @@
+"""RivetBackend: drive a harness over ACP via the TypeScript rivet runner.
+
+This backend hard-codes that it is the rivet engine. It reaches the same runner the deployed
+sidecar runs (HTTP when a ``url`` is set, otherwise a subprocess CLI), and the runner starts
+the rivet daemon, the ACP adapter, and the harness. Supports Pi and Claude. The ``sandbox``
+axis (``local`` / ``daytona``) is a real runtime choice, so it stays a constructor arg.
+
+It is its own class, not a subclass of any other backend; it shares only the ``utils`` wire
+and transport helpers.
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any, AsyncIterator, Dict, List, Mapping, Optional, Sequence
+
+from ..dtos import (
+    AgentResult,
+    EventSink,
+    HarnessAgentConfig,
+    HarnessType,
+    Message,
+    TraceContext,
+)
+from ..interfaces import Backend, Sandbox, Session
+from ..streaming import AgentRun
+from ..utils import (
+    deliver_http,
+    deliver_http_stream,
+    deliver_subprocess,
+    deliver_subprocess_stream,
+    request_to_wire,
+    result_from_wire,
+)
+
+_DEFAULT_COMMAND = ["pnpm", "exec", "tsx", "src/cli.ts"]
+
+
+class RivetSandbox(Sandbox):
+    """Carries the sandbox axis for the run. The real sandbox (a local daemon or a Daytona
+    VM) is created inside the TS runner; here we hold the axis and buffer provisioning files
+    (today AGENTS.md rides the wire, so this is informational)."""
+
+    def __init__(self, sandbox_id: str) -> None:
+        self.sandbox_id = sandbox_id  # the axis value, e.g. "local" or "daytona"
+        self.files: Dict[str, bytes] = {}  # in-memory buffer filled by add_files
+
+    async def add_files(self, files: Mapping[str, bytes]) -> None:
+        self.files.update(files)  # merged into the buffer; a repeated path is overwritten
+
+
+class RivetSession(Session):
+    """One turn-per-prompt session. Each prompt sends one ``/run`` (cold + replay)."""
+
+    def __init__(
+        self,
+        backend: "RivetBackend",
+        sandbox: RivetSandbox,
+        config: HarnessAgentConfig,
+        *,
+        harness: HarnessType,
+        secrets: Optional[Mapping[str, str]],
+        trace: Optional[TraceContext],
+        session_id: Optional[str],
+    ) -> None:
+        self._backend = backend
+        self._sandbox = sandbox
+        self._config = config
+        self._harness = harness
+        self._secrets = dict(secrets or {})  # private copy; None becomes {}
+        self._trace = trace
+        self._session_id = session_id
+
+    @property
+    def id(self) -> Optional[str]:
+        return self._session_id  # updated by _absorb_result when a run reports an id
+
+    def _wire_payload(self, messages: Sequence[Message]) -> Dict[str, Any]:
+        """The ``/run`` request JSON for this turn (shared by ``prompt`` and ``stream``)."""
+        return request_to_wire(
+            engine=RivetBackend._ENGINE,
+            harness=self._harness,
+            sandbox=self._sandbox.sandbox_id,
+            config=self._config,
+            messages=messages,
+            secrets=self._secrets,
+            trace=self._trace,
+            session_id=self._session_id,
+        )
+
+    def _absorb_result(self, result: AgentResult) -> None:
+        """Carry the run's session id forward so a follow-up turn resumes it."""
+        if result.session_id:  # an empty or missing id keeps the current one
+            self._session_id = result.session_id
+
+    async def prompt(
+        self,
+        messages: Sequence[Message],
+        *,
+        on_event: Optional[EventSink] = None,
+    ) -> AgentResult:  # one-shot turn: the whole result arrives before any event is emitted
+        data = await self._backend._deliver(self._wire_payload(messages))
+        result = result_from_wire(data)
+        self._absorb_result(result)
+        _emit_events(result, on_event)  # replayed from the finished result, not live
+        return result
+
+    def stream(self, messages: Sequence[Message]) -> AgentRun:
+        """Run one turn over the streaming transport, yielding events live (see AgentRun)."""
+        records = self._backend._deliver_stream(self._wire_payload(messages))
+        return AgentRun(records).on_result(self._absorb_result)  # same id carry-over
+
+
+class RivetBackend(Backend):
+    """The rivet engine: a harness over ACP through the TS runner. Pi and Claude."""
+
+    supported_harnesses = frozenset({HarnessType.PI, HarnessType.CLAUDE})
+    _ENGINE = "rivet"  # hard-coded engine identity, not a constructor arg
+
+    def __init__(
+        self,
+        *,
+        sandbox: str = "local",
+        url: Optional[str] = None,
+        command: Optional[Sequence[str]] = None,
+        cwd: Optional[str] = None,
+        timeout: float = float(os.getenv("AGENTA_AGENT_TIMEOUT", "180")),
+    ) -> None:  # NOTE(review): the timeout default reads the env once, at import time
+        self._sandbox = sandbox
+        self._url = url
+        self._command: List[str] = list(command or _DEFAULT_COMMAND)  # own copy
+        self._cwd = cwd
+        self._timeout = timeout
+
+    async def create_sandbox(self) -> RivetSandbox:
+        return RivetSandbox(self._sandbox)  # a new holder per call, carrying only the axis
+
+    async def create_session(
+        self,
+        sandbox: Sandbox,
+        config: HarnessAgentConfig,
+        *,
+        harness: HarnessType,
+        secrets: Optional[Mapping[str, str]] = None,
+        trace: Optional[TraceContext] = None,
+        session_id: Optional[str] = None,
+    ) -> RivetSession:
+        if not isinstance(sandbox, RivetSandbox):  # only this backend's own sandbox type
+            raise TypeError("RivetBackend.create_session requires a RivetSandbox")
+        return RivetSession(
+            self,
+            sandbox,
+            config,
+            harness=harness,
+            secrets=secrets,
+            trace=trace,
+            session_id=session_id,
+        )
+
+    async def _deliver(self, payload: Dict[str, Any]) -> Dict[str, Any]:
+        if self._url:  # HTTP when a url is set, otherwise the subprocess CLI
+            return await deliver_http(self._url, payload, timeout=self._timeout)
+        env = {**os.environ, "AGENT_BACKEND": self._ENGINE}  # parent env + engine marker
+        return await deliver_subprocess(
+            self._command, payload, cwd=self._cwd, env=env, timeout=self._timeout
+        )
+
+    def _deliver_stream(self, payload: Dict[str, Any]) -> AsyncIterator[Dict[str, Any]]:
+        """The live counterpart of ``_deliver``: an NDJSON record stream from the runner."""
+        if self._url:  # same transport choice as _deliver
+            return deliver_http_stream(self._url, payload, timeout=self._timeout)
+        env = {**os.environ, "AGENT_BACKEND": self._ENGINE}  # parent env + engine marker
+        return deliver_subprocess_stream(
+            self._command, payload, cwd=self._cwd, env=env, timeout=self._timeout
+        )
+
+
+def _emit_events(result: AgentResult, on_event: Optional[EventSink]) -> None:
+    """Replay the result's event log to a live sink (the one-shot transports batch it)."""
+    if not on_event:  # no sink registered: nothing to replay
+        return
+    for event in result.events:
+        try:
+            on_event(event)
+        except Exception:  # pylint: disable=broad-except
+            pass  # best-effort: a raising sink is ignored and the replay continues
diff --git a/sdks/python/agenta/sdk/agents/dtos.py b/sdks/python/agenta/sdk/agents/dtos.py
new file mode 100644
index 0000000000..578ece4901
--- /dev/null
+++ b/sdks/python/agenta/sdk/agents/dtos.py
@@ -0,0 +1,554 @@
+"""Data contracts for the agent runtime (the DTO layer).
+
+Everything the ports and adapters pass around: harness identity, capabilities, content
+blocks, messages, run events, the run result, trace/tool-callback plumbing, the neutral
+``AgentConfig``, the per-harness configs a backend plumbs, and the ``SessionConfig`` bundle.
+
+These are Pydantic models (the SDK already depends on Pydantic), kept neutral: an adapter
+translates them to and from its engine's own shapes at its edge.
+"""
+
+from __future__ import annotations
+
+from enum import Enum
+from typing import Any, Callable, ClassVar, Dict, List, Optional, Tuple, Union
+
+from pydantic import BaseModel, Field
+
+
+# ---------------------------------------------------------------------------
+# Harness identity
+# ---------------------------------------------------------------------------
+
+
+class HarnessType(str, Enum):
+    """The coding agent program a run drives. A backend declares which it supports."""
+
+    PI = "pi"
+    CLAUDE = "claude"
+    AGENTA = "agenta"
+
+    @classmethod
+    def coerce(cls, value: "HarnessType | str") -> "HarnessType":
+        """Return ``value`` as a member; loose strings are matched case-insensitively."""
+        if not isinstance(value, cls):
+            value = cls(str(value).lower())
+        return value
+
+
+# Permission policy for harness tool use in a headless run. ``auto`` approves (tools are
+# backend-resolved and trusted, no human to prompt); ``deny`` rejects.
+PermissionPolicy = str  # "auto" | "deny"
+
+
+# ---------------------------------------------------------------------------
+# Capabilities
+# ---------------------------------------------------------------------------
+
+
+class HarnessCapabilities(BaseModel):
+    """What a harness can do, probed by the backend (rivet ``AgentCapabilities``).
+
+    Adapters branch on these flags rather than the harness name (no ``if pi``): deliver
+    tools over MCP only when ``mcp_tools`` is set, skip image blocks without ``images``.
+    """
+
+    text_messages: bool = True  # the only flag that defaults to on
+    images: bool = False  # without it, image blocks are skipped
+    file_attachments: bool = False
+    mcp_tools: bool = False  # tools are delivered over MCP only when set
+    tool_calls: bool = False
+    reasoning: bool = False
+    plan_mode: bool = False
+    permissions: bool = False
+    usage: bool = False
+    streaming_deltas: bool = False
+    session_lifecycle: bool = False
+
+    @classmethod
+    def from_wire(
+        cls, data: Optional[Dict[str, Any]]
+    ) -> Optional["HarnessCapabilities"]:
+        """Parse the camelCase capability object an adapter returns; a non-dict yields ``None``."""
+        if not isinstance(data, dict):
+            return None  # covers None and any other non-dict value
+        return cls(  # a missing key falls back to the same default the field declares
+            text_messages=bool(data.get("textMessages", True)),
+            images=bool(data.get("images", False)),
+            file_attachments=bool(data.get("fileAttachments", False)),
+            mcp_tools=bool(data.get("mcpTools", False)),
+            tool_calls=bool(data.get("toolCalls", False)),
+            reasoning=bool(data.get("reasoning", False)),
+            plan_mode=bool(data.get("planMode", False)),
+            permissions=bool(data.get("permissions", False)),
+            usage=bool(data.get("usage", False)),
+            streaming_deltas=bool(data.get("streamingDeltas", False)),
+            session_lifecycle=bool(data.get("sessionLifecycle", False)),
+        )
+
+
+# ---------------------------------------------------------------------------
+# Turn input: content blocks and messages
+# ---------------------------------------------------------------------------
+
+
+class ContentBlock(BaseModel):
+    """One piece of a message, mirroring the ACP content-block kinds.
+
+    ``text`` is the only kind callers send today; ``image`` and ``resource`` are plumbed so
+    an image-capable harness can take them. A bare string normalizes to a single ``text``
+    block on the wire.
+    """
+
+    type: str  # "text" | "image" | "resource"
+    text: Optional[str] = None
+    data: Optional[str] = None  # base64 payload, used when type != "text"
+    mime_type: Optional[str] = None
+    uri: Optional[str] = None
+
+    def to_wire(self) -> Dict[str, Any]:
+        """Emit the wire dict: ``type`` always, every other field only when it is set."""
+        block: Dict[str, Any] = {"type": self.type}
+        optional = (
+            ("text", self.text),
+            ("data", self.data),
+            ("mimeType", self.mime_type),
+            ("uri", self.uri),
+        )
+        block.update({key: value for key, value in optional if value is not None})
+        return block
+
+    @classmethod
+    def from_raw(cls, raw: Any) -> "ContentBlock":
+        """Coerce a loose block (string or dict) into a ContentBlock."""
+        if isinstance(raw, ContentBlock):
+            return raw
+        if isinstance(raw, dict):
+            mime_type = raw.get("mimeType") or raw.get("mime_type")
+            return cls(
+                type=str(raw.get("type", "text")),
+                text=raw.get("text"),
+                data=raw.get("data"),
+                mime_type=mime_type,
+                uri=raw.get("uri"),
+            )
+        # Bare strings pass through as text; any other value is stringified.
+        return cls(type="text", text=raw if isinstance(raw, str) else str(raw))
+
+
+# A message's content is either a plain string or a list of content blocks.
+MessageContent = Union[str, List[ContentBlock]]
+
+
+class Message(BaseModel):
+    """A chat message in the conversation. ``content`` is text or content blocks.
+
+    This is the runtime's own message type, distinct from the SDK's prompt ``Message``
+    (``agenta.Message``); the two serve different layers.
+    """
+
+    role: str
+    content: MessageContent = ""
+
+    def to_wire(self) -> Dict[str, Any]:
+        """Emit ``role`` plus ``content``; block content is serialized block by block."""
+        content: Any = self.content
+        if not isinstance(content, str):
+            content = [block.to_wire() for block in content]
+        return {"role": self.role, "content": content}
+
+    @classmethod
+    def from_raw(cls, raw: Any) -> Optional["Message"]:
+        """Coerce a loose message dict; a missing or null content becomes empty text."""
+        if isinstance(raw, Message):
+            return raw
+        if not isinstance(raw, dict) or "role" not in raw:
+            return None  # not message-shaped (the playground sends role + content)
+        content = raw.get("content")
+        if isinstance(content, list):
+            content = [ContentBlock.from_raw(block) for block in content]
+        return cls(role=str(raw["role"]), content="" if content is None else content)
+
+
+def to_messages(raw: Optional[List[Any]]) -> List[Message]:
+    """Coerce a list of loose message dicts into :class:`Message` objects.
+
+    ``None`` is treated as an empty list, and items that :meth:`Message.from_raw`
+    rejects (it returns ``None`` for them) are dropped.
+    """
+    coerced = (Message.from_raw(item) for item in raw or [])
+    return [message for message in coerced if message is not None]
+
+
+# ---------------------------------------------------------------------------
+# Run events
+# ---------------------------------------------------------------------------
+
+
+class AgentEvent(BaseModel):
+    """One structured event from a run, mapped from an ACP ``session/update``.
+
+    ``type`` is one of ``message``, ``thought``, ``tool_call``, ``tool_result``, ``usage``,
+    ``error``, ``done``. ``data`` carries the rest verbatim.
+    """
+
+    type: str
+    data: Dict[str, Any] = Field(default_factory=dict)
+
+    @classmethod
+    def from_wire(cls, raw: Any) -> Optional["AgentEvent"]:
+        """Wrap a wire dict as an event; ``None`` for a non-dict or a falsy ``type``."""
+        well_formed = isinstance(raw, dict) and bool(raw.get("type"))
+        return cls(type=str(raw["type"]), data=raw) if well_formed else None
+
+
+# A live event sink. Synchronous: adapters invoke it as events arrive (or as a batch).
+EventSink = Callable[[AgentEvent], None]
+
+
+# ---------------------------------------------------------------------------
+# Cross-boundary plumbing
+# ---------------------------------------------------------------------------
+
+
+class TraceContext(BaseModel):
+    """Agenta trace context threaded into a harness run, so it nests under the caller's
+    workflow span. All fields optional; with none set the run traces standalone (or not at
+    all), the standalone-SDK case."""
+
+    traceparent: Optional[str] = None
+    baggage: Optional[str] = None
+    endpoint: Optional[str] = None  # OTLP traces URL
+    authorization: Optional[str] = None  # full Authorization header value
+    capture_content: bool = True  # the only field renamed on the wire (captureContent)
+
+    def to_wire(self) -> Dict[str, Any]:  # every key is always present; unset ones are None
+        return {
+            "traceparent": self.traceparent,
+            "baggage": self.baggage,
+            "endpoint": self.endpoint,
+            "authorization": self.authorization,
+            "captureContent": self.capture_content,
+        }
+
+
+class ToolCallback(BaseModel):
+    """How a harness routes a tool call back through Agenta's ``/tools/call``. The provider
+    key and connection auth stay server-side. Empty for a standalone run with no
+    Agenta-resolved tools."""
+
+    endpoint: str  # full ``/tools/call`` URL
+    authorization: Optional[str] = None  # optional; emitted as None when unset
+
+    def to_wire(self) -> Dict[str, Any]:  # both keys are always present
+        return {"endpoint": self.endpoint, "authorization": self.authorization}
+
+
+# ---------------------------------------------------------------------------
+# Run result
+# ---------------------------------------------------------------------------
+
+
+class AgentResult(BaseModel):
+    """A run's reply plus structured metadata. ``output`` is the final assistant text;
+    ``usage`` rolls token/cost onto a workflow span; ``capabilities`` is what the harness
+    was probed to support."""
+
+    output: str = ""  # final assistant text
+    messages: List[Message] = Field(default_factory=list)
+    events: List[AgentEvent] = Field(default_factory=list)  # the run's event log
+    usage: Optional[Dict[str, Any]] = None  # token/cost, rolled onto a workflow span
+    stop_reason: Optional[str] = None
+    capabilities: Optional[HarnessCapabilities] = None  # what the harness was probed for
+    session_id: Optional[str] = None  # a session carries this forward to resume the run
+    model: Optional[str] = None
+    trace_id: Optional[str] = None
+
+
+# ---------------------------------------------------------------------------
+# The neutral agent definition + run selection
+# ---------------------------------------------------------------------------
+
+
+class AgentConfig(BaseModel):
+    """What an agent IS, independent of where or how it runs. ``instructions`` becomes
+    ``AGENTS.md``. ``tools`` are provider-agnostic references; resolving them into runnable
+    specs is the caller's job (the Agenta service does it server-side).
+
+    ``harness_options`` is the neutral config's one escape hatch: a map keyed by harness
+    name (``"pi"``, ``"claude"``) whose value is a free-form bag of knobs only that harness
+    understands, for example Pi's ``system`` / ``append_system`` prompt overrides. The
+    config stays harness-agnostic because each Harness adapter reads only its own slice and
+    ignores the rest; a key for a harness that is not running is simply never looked at.
+    """
+
+    instructions: Optional[str] = None  # becomes AGENTS.md
+    model: Optional[str] = None
+    tools: List[Any] = Field(default_factory=list)  # provider-agnostic references
+    harness_options: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
+
+    @classmethod
+    def from_params(
+        cls,
+        params: Dict[str, Any],
+        *,
+        defaults: Optional["AgentConfig"] = None,
+    ) -> "AgentConfig":
+        """Build an :class:`AgentConfig` from a request/config dict.
+
+        Accepts three shapes, in priority order: the dedicated ``agent`` element, the
+        playground ``prompt`` prompt-template (system message -> instructions, ``llm_config``
+        -> model + tools), and a flat ``{model, agents_md, tools}``. Unset fields fall back
+        to ``defaults``. ``harness_options`` is read from the ``agent`` element (or the flat
+        request) when present.
+        """
+        base = defaults or cls()  # no defaults given: fall back to an all-default config
+        instructions, model, tools = _parse_agent_fields(params, base)
+        return cls(
+            instructions=instructions,
+            model=model,
+            tools=_as_list(tools),  # one dict -> [dict]; any other non-list value -> []
+            harness_options=_parse_harness_options(params, base),
+        )
+
+
+class RunSelection(BaseModel):
+    """The run-time choices stored next to the agent config: which harness, which sandbox,
+    the permission policy. Read by the caller to pick a backend and harness class;
+    deliberately not part of the neutral :class:`AgentConfig`."""
+
+    harness: str = "pi"
+    sandbox: str = "local"
+    permission_policy: PermissionPolicy = "auto"
+
+    @classmethod
+    def from_params(
+        cls,
+        params: Dict[str, Any],
+        *,
+        default_harness: str = "pi",
+        default_sandbox: str = "local",
+    ) -> "RunSelection":
+        """Read the ``agent`` element when it is a dict, else ``params``; lower-case all."""
+        agent = params.get("agent")
+        source = agent if isinstance(agent, dict) else params
+        harness = str(source.get("harness") or default_harness).lower()
+        sandbox = str(source.get("sandbox") or default_sandbox).lower()
+        policy = str(source.get("permission_policy") or "auto").lower()
+        return cls(harness=harness, sandbox=sandbox, permission_policy=policy)
+
+
+# ---------------------------------------------------------------------------
+# Per-harness configs (what an adapter consumes)
+# ---------------------------------------------------------------------------
+
+
+class HarnessAgentConfig(BaseModel):
+    """Base for a harness-specific config. A Harness produces one of these from the neutral
+    config; a backend plumbs it as-is, with no business logic about how the harness works.
+
+    The two subclasses differ in their *shape*, not just their identity, because the
+    harnesses differ: Pi takes built-in tool names plus native tool specs and never gates
+    tool use; Claude has no built-ins, delivers tools over MCP, and gates tool use behind a
+    permission policy. ``wire_tools`` is where each config emits its own tool/permission
+    fields for the ``/run`` payload.
+    """
+
+    harness: ClassVar[HarnessType]  # declared here, assigned by each concrete subclass
+
+    agents_md: Optional[str] = None
+    model: Optional[str] = None
+    tool_callback: Optional[ToolCallback] = None
+
+    def wire_tools(self) -> Dict[str, Any]:
+        """The tool + permission fields this harness contributes to the ``/run`` payload."""
+        raise NotImplementedError  # the base has no tool shape; subclasses override this
+
+    def wire_prompt(self) -> Dict[str, Any]:
+        """The system-prompt fields this harness contributes to the ``/run`` payload. Empty
+        by default; a harness that exposes prompt overrides (Pi) emits them here."""
+        return {}
+
+
+class PiAgentConfig(HarnessAgentConfig):
+    """Pi's config. Built-in tools by name plus resolved specs delivered natively (Pi has no
+    MCP; the runner registers them through the Pi extension). Pi does not gate tool use, so
+    no permission policy applies.
+
+    ``system`` and ``append_system`` are Pi's two system-prompt layers, distinct from
+    ``agents_md``. ``system`` *replaces* Pi's built-in base prompt outright (Pi's ``SYSTEM.md``
+    / ``--system-prompt``); ``append_system`` *adds* to the base prompt without replacing it
+    (Pi's ``APPEND_SYSTEM.md`` / ``--append-system-prompt``). Both are independent of
+    ``agents_md``: Pi still appends the AGENTS.md project context after the system prompt
+    either way, so AGENTS.md remains the right home for project conventions and these are
+    only for changing or extending Pi's base persona."""
+
+    harness: ClassVar[HarnessType] = HarnessType.PI
+
+    builtin_tools: List[str] = Field(default_factory=list)
+    custom_tools: List[Dict[str, Any]] = Field(default_factory=list)
+    system: Optional[str] = None
+    append_system: Optional[str] = None
+
+    def wire_tools(self) -> Dict[str, Any]:
+        """Emit Pi's tool fields: built-in names, native specs, callback, fixed policy."""
+        callback = self.tool_callback.to_wire() if self.tool_callback else None
+        return {
+            "tools": list(self.builtin_tools),
+            "customTools": list(self.custom_tools),
+            "toolCallback": callback,
+            "permissionPolicy": "auto",  # Pi never gates tool use
+        }
+
+    def wire_prompt(self) -> Dict[str, Any]:
+        """Emit only the prompt overrides that are set; an unset layer adds no key."""
+        overrides = {
+            "systemPrompt": self.system,
+            "appendSystemPrompt": self.append_system,
+        }
+        return {key: text for key, text in overrides.items() if text is not None}
+
+
+class ClaudeAgentConfig(HarnessAgentConfig):
+    """Claude's config. No Pi built-ins; tools are delivered over MCP, and
+    ``permission_policy`` answers Claude's tool-use prompts in a headless run."""
+
+    harness: ClassVar[HarnessType] = HarnessType.CLAUDE
+
+    custom_tools: List[Dict[str, Any]] = Field(default_factory=list)
+    permission_policy: PermissionPolicy = "auto"
+
+    def wire_tools(self) -> Dict[str, Any]:
+        """Emit Claude's tool fields: no built-ins, the specs, callback, chosen policy."""
+        callback = self.tool_callback.to_wire() if self.tool_callback else None
+        return {
+            "tools": [],  # Claude has no Pi built-in tools
+            "customTools": list(self.custom_tools),
+            "toolCallback": callback,
+            "permissionPolicy": self.permission_policy,
+        }
+
+
+class AgentaAgentConfig(PiAgentConfig):
+    """The Agenta harness's config. It *is* a Pi config (same engine, same tool delivery and
+    system-prompt layers), plus the forced ``skills`` the Agenta harness always ships.
+
+    ``skills`` are skill directory names the runner resolves against its bundled
+    ``services/agent/skills/`` root and loads into Pi's resource loader, so they appear in the
+    system prompt on every run."""
+
+    harness: ClassVar[HarnessType] = HarnessType.AGENTA
+
+    skills: List[str] = Field(default_factory=list)
+
+    def wire_tools(self) -> Dict[str, Any]:
+        """Pi's tool fields plus ``skills``, the forced skill names the runner loads."""
+        return dict(super().wire_tools(), skills=list(self.skills))
+
+
+# ---------------------------------------------------------------------------
+# The session bundle
+# ---------------------------------------------------------------------------
+
+
+class SessionConfig(BaseModel):
+    """Everything one run needs except where it runs.
+
+    ``agent`` is the neutral definition. ``secrets`` are provider keys injected as harness
+    env, never written to the agent filesystem. The ``builtin_tools`` / ``custom_tools`` /
+    ``tool_callback`` triple is the resolved tool delivery (Agenta produces it server-side;
+    empty for a bare standalone run). Sandbox is intentionally absent: it is a
+    backend/environment concern."""
+
+    agent: AgentConfig  # the neutral definition (the only required field)
+    secrets: Dict[str, str] = Field(default_factory=dict)  # provider keys -> harness env
+    permission_policy: PermissionPolicy = "auto"
+    trace: Optional[TraceContext] = None
+    session_id: Optional[str] = None
+    builtin_tools: List[str] = Field(default_factory=list)  # resolved tool delivery (1/3)
+    custom_tools: List[Dict[str, Any]] = Field(default_factory=list)  # (2/3)
+    tool_callback: Optional[ToolCallback] = None  # (3/3)
+
+
+# ---------------------------------------------------------------------------
+# Parsing helpers (ported from the agent service's inputs.py)
+# ---------------------------------------------------------------------------
+
+
+def _as_list(raw: Any) -> List[Any]:
+    """Normalize a loose value to a list: wrap a dict, keep a list, drop the rest."""
+    # A list is returned as-is (the same object, not a copy).
+    if isinstance(raw, list):
+        return raw
+    return [raw] if isinstance(raw, dict) else []
+
+
+def _parse_harness_options(
+    params: Dict[str, Any],
+    defaults: AgentConfig,
+) -> Dict[str, Dict[str, Any]]:
+    """Pull the per-harness options bag from a request/config dict, falling back to defaults.
+
+    Reads ``harness_options`` from the ``agent`` element when present, else from the flat
+    request. Keeps only well-formed entries (a harness name mapping to an options dict) and
+    lower-cases the harness key so it matches :class:`HarnessType` values.
+    """
+    agent = params.get("agent")
+    source = agent if isinstance(agent, dict) else params
+    raw = source.get("harness_options")
+    pairs = raw.items() if isinstance(raw, dict) else ()
+    # Non-dict option bags are skipped; if two keys lower-case alike, the later one wins.
+    options: Dict[str, Dict[str, Any]] = {
+        str(name).lower(): dict(opts) for name, opts in pairs if isinstance(opts, dict)
+    }
+    # Nothing usable in the request: hand back a copy of the defaults instead.
+    return options or dict(defaults.harness_options)
+
+
+def _system_text(messages: Optional[List[Any]]) -> str:
+    """Join the system messages' text into AGENTS.md text, skipping non-string parts."""
+    parts: List[Any] = []
+    for message in messages or []:
+        if not isinstance(message, dict) or message.get("role") != "system":
+            continue
+        content = message.get("content")
+        if isinstance(content, str):
+            parts.append(content)
+        elif isinstance(content, list):
+            parts.extend(
+                block.get("text", "")
+                for block in content
+                if isinstance(block, dict) and block.get("type") == "text"
+            )
+    return "\n\n".join(part for part in parts if isinstance(part, str) and part)
+
+
+def _parse_agent_fields(
+    params: Dict[str, Any],
+    defaults: AgentConfig,
+) -> Tuple[Optional[str], Optional[str], Any]:
+    """Pull (instructions, model, tools) from a request/config dict, with fallbacks."""
+    agent = params.get("agent")
+    prompt_cfg = params.get("prompt")
+    if isinstance(agent, dict):
+        # Shape 1: the dedicated ``agent`` element wins over everything else.
+        instructions = agent.get("instructions") or defaults.instructions
+        model = agent.get("model") or defaults.model
+        raw_tools = agent.get("tools")
+    elif isinstance(prompt_cfg, dict):
+        # Shape 2: the playground prompt-template.
+        llm_config = prompt_cfg.get("llm_config") or {}
+        model = llm_config.get("model") or defaults.model
+        instructions = _system_text(prompt_cfg.get("messages")) or defaults.instructions
+        raw_tools = llm_config.get("tools")
+        if raw_tools is None:
+            raw_tools = prompt_cfg.get("tools")
+    else:
+        # Shape 3: the flat ``{model, agents_md, tools}`` request.
+        model = params.get("model") or defaults.model
+        instructions = params.get("agents_md") or defaults.instructions
+        raw_tools = params.get("tools")
+    # Every shape falls back to the default tools, but only when unset: ``[]`` is kept.
+    if raw_tools is None:
+        raw_tools = defaults.tools
+    return instructions, model, raw_tools
diff --git a/sdks/python/agenta/sdk/agents/errors.py b/sdks/python/agenta/sdk/agents/errors.py
new file mode 100644
index 0000000000..dfe412253d
--- /dev/null
+++ b/sdks/python/agenta/sdk/agents/errors.py
@@ -0,0 +1,23 @@
+"""Typed errors for the agent runtime."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .dtos import HarnessType
+
+if TYPE_CHECKING:
+    from .interfaces import Backend
+
+
+class UnsupportedHarnessError(RuntimeError):
+    """Raised when a harness is asked to run on a backend that cannot drive it."""
+
+    def __init__(self, harness: HarnessType, backend: "Backend") -> None:
+        self.harness = harness
+        self.backend = backend
+        names = ", ".join(sorted(h.value for h in backend.supported_harnesses))
+        super().__init__(
+            f"{type(backend).__name__} cannot drive harness '{harness.value}'; "
+            f"it supports: {names or '(none)'}"
+        )
diff --git a/sdks/python/agenta/sdk/agents/interfaces.py b/sdks/python/agenta/sdk/agents/interfaces.py
new file mode 100644
index 0000000000..3822c11b12
--- /dev/null
+++ b/sdks/python/agenta/sdk/agents/interfaces.py
@@ -0,0 +1,279 @@
+"""The ports of the agent runtime: the abstract contracts (Agenta calls these interfaces).
+
+The ports, lowest to highest:
+
+- ``Backend`` is the engine. It declares which harnesses it can drive
+  (``supported_harnesses``), owns sandbox + session lifecycle, and is pure plumbing: it
+  takes an already-harness-shaped config and launches it. Adapters: ``RivetBackend``,
+  ``InProcessPiBackend``, ``LocalBackend``.
+- ``Sandbox`` is where a session's process tree lives, plus the provisioning verb
+  (``add_files``).
+- ``Session`` is one conversation (``prompt``, ``destroy``).
+- ``Environment`` sits above a backend and owns the sandbox policy.
+
+The ``Harness`` port (with its ``PiHarness`` / ``ClaudeHarness`` adapters) sits above an
+``Environment`` and validates against ``Backend.supported_harnesses``.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import ClassVar, FrozenSet, Mapping, Optional, Sequence
+
+from .dtos import (
+    AgentResult,
+    EventSink,
+    HarnessAgentConfig,
+    HarnessType,
+    Message,
+    SessionConfig,
+    TraceContext,
+)
+from .errors import UnsupportedHarnessError
+from .streaming import AgentRun
+
+
+# ---------------------------------------------------------------------------
+# Sandbox and Session
+# ---------------------------------------------------------------------------
+
+
+class Sandbox(ABC):
+    """The place a session's process tree lives, with its provisioning verb and teardown.
+
+    Before a session prompts, ``add_files`` is how files get dropped into the sandbox
+    (AGENTS.md, a bundled extension, an uploaded login). That provisioning is driven by
+    the runtime and is never surfaced to whoever authors the agent config.
+
+    Both methods are no-ops here, so an adapter overrides only what it needs.
+    """
+
+    async def add_files(self, files: Mapping[str, bytes]) -> None:
+        """Lay ``files`` into the sandbox. The base implementation does nothing."""
+
+    async def destroy(self) -> None:
+        """Dispose of the sandbox. The base implementation does nothing."""
+
+
+class Session(ABC):
+    """A single conversation with a harness that runs inside a sandbox."""
+
+    @property
+    @abstractmethod
+    def id(self) -> Optional[str]:
+        """Session id assigned by the engine; a follow-up turn carries it to resume."""
+
+    @abstractmethod
+    async def prompt(
+        self,
+        messages: Sequence[Message],
+        *,
+        on_event: Optional[EventSink] = None,
+    ) -> AgentResult:
+        """One-shot path: execute a single turn and hand back its structured result."""
+
+    @abstractmethod
+    def stream(self, messages: Sequence[Message]) -> AgentRun:
+        """Live path: execute a single turn while events cross the boundary as they occur.
+
+        The returned :class:`~agenta.sdk.agents.streaming.AgentRun` is an async-iterable
+        of ``AgentEvent``. Once it has been consumed it also holds the terminal
+        ``AgentResult``, which makes this the streaming counterpart of
+        :meth:`prompt`.
+        """
+
+    async def destroy(self) -> None:
+        """Release whatever the session holds. Does nothing under cold + replay."""
+
+
+# ---------------------------------------------------------------------------
+# Backend (the engine)
+# ---------------------------------------------------------------------------
+
+
+class Backend(ABC):
+    """The engine: declares the harnesses it can drive and owns sandbox + session lifecycle.
+
+    Concrete backends stand alone. Each hard-codes what makes it that engine (its engine
+    id, its supported harnesses), and they have no common base other than this ABC.
+    """
+
+    #: Authoritative set of harnesses this engine is able to run.
+    supported_harnesses: ClassVar[FrozenSet[HarnessType]] = frozenset()
+
+    def supports(self, harness: HarnessType) -> bool:
+        """Tell whether ``harness`` is listed in :attr:`supported_harnesses`."""
+        drivable = self.supported_harnesses
+        return harness in drivable
+
+    async def setup(self) -> None:
+        """Start the backend. The base implementation does nothing."""
+
+    async def shutdown(self) -> None:
+        """Free the backend's resources. The base implementation does nothing."""
+
+    @abstractmethod
+    async def create_sandbox(self) -> Sandbox:
+        """Produce a sandbox in which this backend can run a session."""
+
+    @abstractmethod
+    async def create_session(
+        self,
+        sandbox: Sandbox,
+        config: HarnessAgentConfig,
+        *,
+        harness: HarnessType,
+        secrets: Optional[Mapping[str, str]] = None,
+        trace: Optional[TraceContext] = None,
+        session_id: Optional[str] = None,
+    ) -> Session:
+        """Start a session inside ``sandbox`` from an already-harness-shaped ``config``."""
+
+
+# ---------------------------------------------------------------------------
+# Environment (sandbox policy over a backend)
+# ---------------------------------------------------------------------------
+
+
+class Environment:
+    """Sandbox policy layered on top of a backend.
+
+    With the default ``sandbox_per_session=True`` every session gets its own fresh sandbox
+    (the cold model, strong isolation). With ``False`` one sandbox is kept and reused for
+    many sessions; harnesses that share one ``Environment`` share that sandbox too.
+    """
+
+    def __init__(self, backend: Backend, *, sandbox_per_session: bool = True) -> None:
+        self._backend = backend
+        self._sandbox_per_session = sandbox_per_session
+        self._shared: Optional[Sandbox] = None
+
+    @property
+    def backend(self) -> Backend:
+        return self._backend
+
+    async def setup(self) -> None:
+        await self._backend.setup()
+
+    async def shutdown(self) -> None:
+        if (shared := self._shared) is not None:
+            await shared.destroy()
+            self._shared = None
+        await self._backend.shutdown()
+
+    async def _sandbox(self) -> Sandbox:
+        if not self._sandbox_per_session:
+            if self._shared is None:
+                self._shared = await self._backend.create_sandbox()
+            return self._shared
+        return await self._backend.create_sandbox()
+
+    async def create_session(
+        self,
+        config: HarnessAgentConfig,
+        *,
+        harness: HarnessType,
+        session_config: SessionConfig,
+        provisioning: Optional[Mapping[str, bytes]] = None,
+    ) -> Session:
+        """Obtain a sandbox according to the policy, provision it, then open a session."""
+        target = await self._sandbox()
+        if provisioning:
+            await target.add_files(provisioning)
+        return await self._backend.create_session(
+            target,
+            config,
+            harness=harness,
+            secrets=session_config.secrets,
+            trace=session_config.trace,
+            session_id=session_config.session_id,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Harness (the port; adapters live in adapters/harnesses.py)
+# ---------------------------------------------------------------------------
+
+
+class Harness(ABC):
+    """A harness-type-specific wrapper over an :class:`Environment`.
+
+    Maps the neutral :class:`~agenta.sdk.agents.dtos.SessionConfig` onto this harness's own
+    config and checks at construction that the environment's backend can drive it (raising
+    :class:`UnsupportedHarnessError` if not); the backend itself stays pure plumbing.
+    """
+
+    harness_type: ClassVar[HarnessType]
+
+    def __init__(self, environment: Environment) -> None:
+        if not environment.backend.supports(self.harness_type):
+            raise UnsupportedHarnessError(self.harness_type, environment.backend)
+        self._env = environment
+
+    @property
+    def environment(self) -> Environment:
+        return self._env
+
+    async def setup(self) -> None:
+        await self._env.setup()  # delegates to the environment
+
+    async def cleanup(self) -> None:
+        await self._env.shutdown()  # delegates to the environment
+
+    @abstractmethod
+    def _to_harness_config(self, config: SessionConfig) -> HarnessAgentConfig:
+        """Map the neutral config into this harness's own config (the mapping logic)."""
+
+    def _provisioning(self, config: SessionConfig) -> Mapping[str, bytes]:
+        """Files this harness needs laid into the sandbox before the run."""
+        instructions = config.agent.instructions
+        if instructions and instructions.strip():  # whitespace-only means "no file"
+            return {"AGENTS.md": instructions.encode("utf-8")}
+        return {}
+
+    async def create_session(self, config: SessionConfig) -> Session:
+        return await self._env.create_session(
+            self._to_harness_config(config),
+            harness=self.harness_type,
+            session_config=config,
+            provisioning=self._provisioning(config),
+        )
+
+    async def prompt(
+        self,
+        config: SessionConfig,
+        messages: Sequence[Message],
+        *,
+        on_event: Optional[EventSink] = None,
+    ) -> AgentResult:
+        """Convenience: open a session, run one turn, and destroy it (the cold path)."""
+        session = await self.create_session(config)
+        try:
+            result = await session.prompt(messages, on_event=on_event)
+            if result.session_id:  # carried onto ``config`` for the follow-up turn
+                config.session_id = result.session_id
+            return result
+        finally:
+            await session.destroy()
+
+    async def stream(
+        self,
+        config: SessionConfig,
+        messages: Sequence[Message],
+    ) -> AgentRun:
+        """Cold streaming counterpart of :meth:`prompt`: open a session, stream one turn.
+
+        The terminal result's session id is copied onto ``config``; the run's cleanup hook
+        destroys the session however the stream ends (drain, ``break``, or cancellation).
+        """
+
+        def _absorb(result: AgentResult) -> None:
+            if result.session_id:
+                config.session_id = result.session_id
+
+        session = await self.create_session(config)
+        try:
+            run = session.stream(messages)
+        except Exception:  # no run owns the cleanup hook yet, so release it here
+            await session.destroy()
+            raise
+        return run.on_result(_absorb).on_cleanup(session.destroy)
diff --git a/sdks/python/agenta/sdk/agents/streaming.py b/sdks/python/agenta/sdk/agents/streaming.py
new file mode 100644
index 0000000000..e631d0ecdc
--- /dev/null
+++ b/sdks/python/agenta/sdk/agents/streaming.py
@@ -0,0 +1,91 @@
+"""Live streaming surface: ``AgentRun`` turns the runner's NDJSON record stream into a live
+``AgentEvent`` async-iterable plus the one terminal ``AgentResult``.
+
+A streaming transport (``utils.deliver_*_stream``) yields the runner's ``StreamRecord`` lines:
+``{"kind":"event", ...}`` for every event the moment it is built, then exactly one
+``{"kind":"result", ...}`` terminal record. ``AgentRun`` wraps that source so a caller can::
+
+    run = session.stream(messages)
+    async for event in run:
+        ...               # event is an AgentEvent, flushed live
+    result = run.result()  # the terminal AgentResult (session_id, usage, stop_reason, ...)
+
+This lives in its own module (not ``dtos``) because parsing the terminal record reuses
+``utils.wire.result_from_wire``, which imports the DTOs — keeping ``AgentRun`` above both
+avoids an import cycle.
+"""
+
+from __future__ import annotations
+
+from typing import (
+    Any,
+    AsyncIterator,
+    Awaitable,
+    Callable,
+    Dict,
+    List,
+    Optional,
+)
+
+from .dtos import AgentEvent, AgentResult
+from .utils import result_from_wire
+
+# Hooks: a result hook sees the terminal result once; a cleanup runs when iteration ends
+# (drain, break, or cancel).
+ResultHook = Callable[[AgentResult], None]
+Cleanup = Callable[[], Awaitable[None]]
+
+
+class AgentRun:
+    """Async-iterable of a run's live ``AgentEvent``s that also carries its ``AgentResult``.
+
+    Iterate it once. Each ``{"kind":"event"}`` record is yielded as an ``AgentEvent``; the
+    ``{"kind":"result"}`` record is parsed (raising the run's error when ``ok`` is false,
+    just like the one-shot path) and ends iteration. ``result()`` returns it afterwards.
+    """
+
+    def __init__(self, records: AsyncIterator[Dict[str, Any]]) -> None:
+        self._records = records
+        self._result: Optional[AgentResult] = None
+        self._result_hooks: List[ResultHook] = []
+        self._cleanups: List[Cleanup] = []
+
+    def on_result(self, hook: ResultHook) -> "AgentRun":
+        """Register a callback to run when the terminal result arrives (chainable)."""
+        self._result_hooks.append(hook)
+        return self
+
+    def on_cleanup(self, cleanup: Cleanup) -> "AgentRun":
+        """Register an async cleanup for when iteration ends; failures are logged (chainable)."""
+        self._cleanups.append(cleanup)
+        return self
+
+    async def __aiter__(self) -> AsyncIterator[AgentEvent]:
+        try:
+            async for record in self._records:
+                kind = record.get("kind")
+                if kind == "event":
+                    event = AgentEvent.from_wire(record.get("event"))
+                    if event is not None:
+                        yield event
+                elif kind == "result":
+                    # result_from_wire raises on ok=false — surface it to the consumer.
+                    self._result = result_from_wire(record.get("result") or {})
+                    for hook in self._result_hooks:
+                        hook(self._result)
+                    return
+        finally:
+            for cleanup in self._cleanups:
+                try:
+                    await cleanup()
+                except Exception:  # pylint: disable=broad-except
+                    import logging  # best-effort: log it, keep running the rest
+                    logging.getLogger(__name__).exception("AgentRun cleanup failed")
+
+    def result(self) -> AgentResult:
+        """The terminal result. Available only after the stream is fully consumed."""
+        if self._result is None:
+            raise RuntimeError(
+                "AgentRun result is not available until the stream is fully consumed"
+            )
+        return self._result
diff --git a/sdks/python/agenta/sdk/agents/utils/__init__.py b/sdks/python/agenta/sdk/agents/utils/__init__.py
new file mode 100644
index 0000000000..620e3b1b7e
--- /dev/null
+++ b/sdks/python/agenta/sdk/agents/utils/__init__.py
@@ -0,0 +1,19 @@
+"""Shared plumbing for the runner-backed adapters: the ``/run`` wire shape and the two
+transports to the TypeScript runner."""
+
+from .ts_runner import (
+    deliver_http,
+    deliver_http_stream,
+    deliver_subprocess,
+    deliver_subprocess_stream,
+)
+from .wire import request_to_wire, result_from_wire
+
+__all__ = [
+    "request_to_wire",
+    "result_from_wire",
+    "deliver_http",
+    "deliver_subprocess",
+    "deliver_http_stream",
+    "deliver_subprocess_stream",
+]
diff --git a/sdks/python/agenta/sdk/agents/utils/ts_runner.py b/sdks/python/agenta/sdk/agents/utils/ts_runner.py
new file mode 100644
index 0000000000..f7a5497d1c
--- /dev/null
+++ b/sdks/python/agenta/sdk/agents/utils/ts_runner.py
@@ -0,0 +1,163 @@
+"""Transports to the TypeScript runner: HTTP (a running sidecar) or subprocess (a CLI).
+
+Shared by the runner-backed adapters. Each adapter chooses a transport and hard-codes its
+own engine id on the payload (via ``utils.wire``); this module only delivers the JSON.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+from typing import Any, AsyncIterator, Dict, Optional, Sequence
+
+_DEFAULT_TIMEOUT = float(os.getenv("AGENTA_AGENT_TIMEOUT", "180"))
+
+
+async def deliver_http(
+    base_url: str,
+    payload: Dict[str, Any],
+    *,
+    timeout: float = _DEFAULT_TIMEOUT,
+) -> Dict[str, Any]:
+    """POST ``payload`` to the runner's ``/run`` and return its decoded JSON body."""
+    import httpx  # imported lazily: only the HTTP transports use it
+
+    endpoint = f"{base_url.rstrip('/')}/run"
+    async with httpx.AsyncClient(timeout=timeout) as http:
+        reply = await http.post(endpoint, json=payload)
+    status = reply.status_code
+    if status < 500:
+        return reply.json()
+    # Only a 5xx is raised here; any other status is decoded as JSON above.
+    raise RuntimeError(f"Agent runner HTTP {status}: {reply.text[:1000]}")
+
+
+async def deliver_subprocess(
+    command: Sequence[str],
+    payload: Dict[str, Any],
+    *,
+    cwd: Optional[str] = None,
+    env: Optional[Dict[str, str]] = None,
+    timeout: float = _DEFAULT_TIMEOUT,
+) -> Dict[str, Any]:
+    """Spawn the runner CLI, feed the request on stdin, and parse the JSON on stdout."""
+    body = json.dumps(payload).encode("utf-8")  # serialize before spawning anything
+    proc = await asyncio.create_subprocess_exec(
+        *command,
+        cwd=cwd,
+        env=env,
+        stdin=asyncio.subprocess.PIPE,
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.PIPE,
+    )
+    try:
+        stdout, stderr = await asyncio.wait_for(
+            proc.communicate(input=body), timeout=timeout
+        )
+    except asyncio.TimeoutError as exc:
+        if proc.returncode is None:  # kill() raises if the child already exited
+            proc.kill()
+        await proc.wait()
+        raise RuntimeError(
+            f"Agent runner timed out after {timeout}s: {' '.join(command)}"
+        ) from exc
+
+    out, err = (raw.decode("utf-8", "replace") for raw in (stdout, stderr))
+    if not out.strip():
+        raise RuntimeError(
+            f"Agent runner returned no output. exit={proc.returncode} stderr={err[-2000:]}"
+        )
+    try:
+        return json.loads(out)
+    except json.JSONDecodeError as exc:
+        raise RuntimeError(
+            f"Agent runner returned invalid JSON. stdout={out[:500]} stderr={err[-1000:]}"
+        ) from exc
+
+
+# ---------------------------------------------------------------------------
+# Streaming transports (NDJSON): one parsed record per line, live.
+#
+# Each yields the runner's ``StreamRecord`` lines as they arrive — ``{"kind":"event",...}``
+# for every event the moment it is built, then exactly one ``{"kind":"result",...}`` terminal
+# record. The caller (a ``Session.stream``) turns these into live ``AgentEvent``s and the
+# terminal ``AgentResult``. Cancellation closes the underlying connection / kills the child.
+# ---------------------------------------------------------------------------
+
+
+async def deliver_http_stream(
+    base_url: str,
+    payload: Dict[str, Any],
+    *,
+    timeout: float = _DEFAULT_TIMEOUT,
+) -> AsyncIterator[Dict[str, Any]]:
+    """Ask ``/run`` for NDJSON via POST and yield every parsed record as soon as it lands.
+
+    When the generator is closed or cancelled the ``async with`` blocks close the
+    connection; the runner observes that as a client disconnect and turns it into run
+    cancellation.
+    """
+    import httpx  # imported lazily: only the HTTP transports use it
+
+    url = f"{base_url.rstrip('/')}/run"
+    ndjson = {"Accept": "application/x-ndjson"}
+    async with httpx.AsyncClient(timeout=timeout) as http:
+        async with http.stream("POST", url, json=payload, headers=ndjson) as reply:
+            # A 5xx carries no records: read the body and report it as the failure.
+            if reply.status_code >= 500:
+                detail = (await reply.aread())[:1000]
+                raise RuntimeError(f"Agent runner HTTP {reply.status_code}: {detail!r}")
+            # One record per non-blank line, parsed as each line arrives.
+            async for raw in reply.aiter_lines():
+                text = raw.strip()
+                if not text:
+                    continue
+                yield json.loads(text)
+
+
+async def deliver_subprocess_stream(
+    command: Sequence[str],
+    payload: Dict[str, Any],
+    *,
+    cwd: Optional[str] = None,
+    env: Optional[Dict[str, str]] = None,
+    timeout: float = _DEFAULT_TIMEOUT,
+) -> AsyncIterator[Dict[str, Any]]:
+    """Spawn the runner CLI in ``--stream`` mode and yield each NDJSON record from stdout.
+
+    The ``finally`` kills a still-running child so a dropped stream leaves none behind.
+    """
+    body = json.dumps(payload).encode("utf-8")  # serialize before spawning anything
+    proc = await asyncio.create_subprocess_exec(
+        *command,
+        "--stream",
+        cwd=cwd,
+        env=env,
+        stdin=asyncio.subprocess.PIPE,
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.DEVNULL,  # unread here; a PIPE could fill and stall
+    )
+    assert proc.stdin is not None and proc.stdout is not None
+    proc.stdin.write(body)
+    proc.stdin.close()
+    loop = asyncio.get_running_loop()  # preferred over get_event_loop() in a coroutine
+    deadline = loop.time() + timeout  # one budget for the whole stream, not per line
+    try:
+        while True:
+            remaining = deadline - loop.time()
+            if remaining <= 0:
+                raise RuntimeError(
+                    f"Agent runner stream timed out after {timeout}s: {' '.join(command)}"
+                )
+            raw = await asyncio.wait_for(proc.stdout.readline(), timeout=remaining)
+            if not raw:  # EOF
+                break
+            line = raw.decode("utf-8", "replace").strip()
+            if line:
+                yield json.loads(line)
+        await proc.wait()
+    finally:
+        if proc.returncode is None:
+            proc.kill()
+            await proc.wait()
diff --git a/sdks/python/agenta/sdk/agents/utils/wire.py b/sdks/python/agenta/sdk/agents/utils/wire.py
new file mode 100644
index 0000000000..e21ae6268d
--- /dev/null
+++ b/sdks/python/agenta/sdk/agents/utils/wire.py
@@ -0,0 +1,88 @@
+"""The ``/run`` wire contract: our DTOs <-> the runner's camelCase JSON.
+
+Shared by the runner-backed adapters (rivet, in-process Pi). The TS side mirrors these names
+in ``services/agent/src/protocol.ts``, and the contract is pinned by shared golden fixtures
+under ``sdks/python/oss/tests/pytest/unit/agents/golden/`` (see ``test_wire_contract.py``).
+The caller passes the engine id explicitly, since each adapter hard-codes its own.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional, Sequence
+
+from ..dtos import (
+    AgentEvent,
+    AgentResult,
+    HarnessAgentConfig,
+    HarnessCapabilities,
+    HarnessType,
+    Message,
+    TraceContext,
+)
+
+
+def request_to_wire(
+    *,
+    engine: str,
+    harness: HarnessType,
+    sandbox: str,
+    config: HarnessAgentConfig,
+    messages: Sequence[Message],
+    secrets: Optional[Dict[str, str]] = None,
+    trace: Optional[TraceContext] = None,
+    session_id: Optional[str] = None,
+) -> Dict[str, Any]:
+    """Serialize a single turn as the ``/run`` request JSON.
+
+    ``config.wire_tools()`` supplies the tool + permission fields, letting every harness
+    shape its own (Pi: built-ins + native specs, no gating; Claude: MCP specs + permission
+    policy). ``config.wire_prompt()`` contributes whatever system-prompt overrides the
+    harness exposes (Pi's ``systemPrompt`` / ``appendSystemPrompt``) and is empty otherwise.
+    """
+    base: Dict[str, Any] = {
+        "backend": engine,
+        "harness": harness.value,
+        "sandbox": sandbox,
+        "sessionId": session_id,
+        "agentsMd": config.agents_md,
+        "model": config.model,
+        "messages": [turn.to_wire() for turn in messages],
+        "secrets": dict(secrets or {}),
+        "trace": trace.to_wire() if trace else None,
+    }
+    # Harness-shaped fields are merged last, so they win on any key collision.
+    return {**base, **config.wire_tools(), **config.wire_prompt()}
+
+
+def result_from_wire(data: Dict[str, Any]) -> AgentResult:
+    """Build an :class:`AgentResult` from a ``/run`` result JSON.
+
+    A failed run (falsy ``ok``) raises ``RuntimeError`` carrying the runner's ``error``, so
+    the caller surfaces a clear message instead of handing the model an empty reply.
+    """
+    if not data.get("ok"):
+        raise RuntimeError(f"Agent run failed: {data.get('error')}")
+
+    messages: List[Message] = [
+        parsed
+        for parsed in map(Message.from_raw, data.get("messages") or [])
+        if parsed is not None
+    ]
+
+    events: List[AgentEvent] = [
+        parsed
+        for parsed in map(AgentEvent.from_wire, data.get("events") or [])
+        if parsed is not None
+    ]
+
+    return AgentResult(
+        output=data.get("output") or "",
+        messages=messages,
+        events=events,
+        usage=data.get("usage"),
+        stop_reason=data.get("stopReason"),
+        capabilities=HarnessCapabilities.from_wire(data.get("capabilities")),
+        session_id=data.get("sessionId"),
+        model=data.get("model"),
+        trace_id=data.get("traceId"),
+    )
diff --git a/sdks/python/agenta/tests/agents/test_streaming.py b/sdks/python/agenta/tests/agents/test_streaming.py
new file mode 100644
index 0000000000..bd378a2ece
--- /dev/null
+++ b/sdks/python/agenta/tests/agents/test_streaming.py
@@ -0,0 +1,167 @@
+"""Tests for the live streaming boundary: ``AgentRun`` and the NDJSON subprocess transport.
+
+Two layers:
+
+- ``AgentRun`` over a fake record source — pure, fast: events are yielded live, the terminal
+  result is captured, hooks/cleanup fire, and an ``ok:false`` terminal raises.
+- ``deliver_subprocess_stream`` against a fake NDJSON emitter — proves records arrive
+  incrementally (not buffered then dumped) and that closing the stream kills the child.
+
+A final integration test drives the real ``cli.ts --stream`` when ``pnpm`` is available.
+
+Run: ``uv run pytest agenta/tests/agents/test_streaming.py`` from ``sdks/python``.
+"""
+
+from __future__ import annotations
+
+import shutil
+import sys
+import time
+from pathlib import Path
+from typing import Any, Dict, List
+
+import pytest
+
+from agenta.sdk.agents import AgentRun
+from agenta.sdk.agents.utils import deliver_subprocess_stream
+
+
+async def _from_list(records: List[Dict[str, Any]]):
+    for item in records:  # async generator: a stand-in record source for AgentRun
+        yield item
+
+
+# --- AgentRun ---------------------------------------------------------------
+
+
+async def test_agentrun_yields_events_then_captures_result() -> None:
+    captured: Dict[str, Any] = {}
+    cleanup_calls: List[bool] = []
+
+    async def _mark_cleaned() -> None:
+        cleanup_calls.append(True)
+
+    terminal = {"ok": True, "output": "Hi", "sessionId": "s1", "stopReason": "end_turn"}
+    records = [
+        {"kind": "event", "event": {"type": "message_start", "id": "m0"}},
+        {
+            "kind": "event",
+            "event": {"type": "message_delta", "id": "m0", "delta": "Hi"},
+        },
+        {"kind": "event", "event": {"type": "message_end", "id": "m0"}},
+        {"kind": "result", "result": terminal},
+    ]
+
+    def _remember(result) -> None:
+        captured["id"] = result.session_id
+
+    # Both registration methods return the run, so they chain into one expression.
+    run = AgentRun(_from_list(records)).on_result(_remember).on_cleanup(_mark_cleaned)
+
+    events = []
+    async for event in run:
+        events.append(event)
+
+    # Events arrive in order; the terminal record is captured rather than yielded.
+    outcome = run.result()
+    assert [e.type for e in events] == ["message_start", "message_delta", "message_end"]
+    assert outcome.output == "Hi"
+    assert outcome.session_id == "s1"
+    assert outcome.stop_reason == "end_turn"
+    assert captured == {"id": "s1"}  # on_result fired with the terminal result
+    assert cleanup_calls == [True]  # cleanup ran when iteration ended
+
+
+async def test_agentrun_raises_on_error_terminal() -> None:
+    first = {"kind": "event", "event": {"type": "message_start", "id": "m0"}}
+    failure = {"kind": "result", "result": {"ok": False, "error": "boom"}}
+    run = AgentRun(_from_list([first, failure]))
+
+    # An ok=false terminal record must surface to the consumer as RuntimeError.
+    with pytest.raises(RuntimeError, match="boom"):
+        async for _event in run:
+            pass
+
+
+async def test_agentrun_result_unavailable_before_drain() -> None:
+    untouched = AgentRun(_from_list([{"kind": "event", "event": {"type": "done"}}]))
+    with pytest.raises(RuntimeError, match="not available"):
+        untouched.result()  # never iterated, so no terminal result has been captured
+
+
+# --- deliver_subprocess_stream (fake NDJSON emitter) ------------------------
+
+# Emits 3 event lines with a small gap, then one terminal result line. `-u` + flush so the
+# parent observes each line as it is written, not at process exit.
+_EMITTER = r"""
+import sys, time, json
+for i in range(3):
+    sys.stdout.write(json.dumps({"kind":"event","event":{"type":"message_delta","id":"m","delta":"d%d"%i}})+"\n")
+    sys.stdout.flush()
+    time.sleep(0.05)
+sys.stdout.write(json.dumps({"kind":"result","result":{"ok":True,"output":"d0d1d2","sessionId":"s1"}})+"\n")
+sys.stdout.flush()
+"""
+
+
+async def test_subprocess_stream_is_incremental() -> None:
+    emitter = [sys.executable, "-u", "-c", _EMITTER]
+    arrivals: List[float] = []
+    kinds: List[str] = []
+    async for record in deliver_subprocess_stream(emitter, {}):
+        arrivals.append(time.monotonic())
+        kinds.append(record["kind"])
+
+    assert kinds == ["event", "event", "event", "result"], (
+        "events precede the single terminal result"
+    )
+    assert kinds.count("result") == 1, "exactly one terminal record"
+    # Incremental, not buffered-then-dumped: the first event lands well before the result.
+    spread = arrivals[-1] - arrivals[0]
+    assert spread >= 0.1, (
+        "records were spread out over time, not delivered in one batch"
+    )
+
+
+# Emits one event, then blocks for a long time. Closing the stream must kill it promptly.
+_HANGING_EMITTER = r"""
+import sys, time, json
+sys.stdout.write(json.dumps({"kind":"event","event":{"type":"message_delta","id":"m","delta":"x"}})+"\n")
+sys.stdout.flush()
+time.sleep(60)
+"""
+
+
+async def test_subprocess_stream_cancellation_kills_child() -> None:
+    hanging = [sys.executable, "-u", "-c", _HANGING_EMITTER]
+    stream = deliver_subprocess_stream(hanging, {})
+    opening = await stream.__anext__()
+    assert opening["kind"] == "event"
+
+    before = time.monotonic()
+    await stream.aclose()  # triggers the generator's finally: kill + wait on the child
+    took = time.monotonic() - before
+    assert took < 5, "aclose() killed the child instead of waiting out its 60s sleep"
+
+
+# --- Real cli.ts --stream boundary (integration) ----------------------------
+
+
+@pytest.mark.skipif(shutil.which("pnpm") is None, reason="pnpm not available")
+async def test_cli_stream_terminal_only_on_empty_request() -> None:
+    root = Path(__file__).resolve().parents[5]
+    workdir = str(root / "services" / "agent")
+    cli = ["pnpm", "exec", "tsx", "src/cli.ts"]
+    seen = [record async for record in deliver_subprocess_stream(cli, {}, cwd=workdir)]
+
+    # An empty request fails before any event, so the stream is exactly one result record.
+    assert len(seen) == 1, seen
+    only = seen[0]
+    assert only["kind"] == "result"
+    assert only["result"]["ok"] is False
+
+    # AgentRun surfaces that failure as a RuntimeError, just like the one-shot path.
+    failing = AgentRun(deliver_subprocess_stream(cli, {}, cwd=workdir))
+    with pytest.raises(RuntimeError):
+        async for _event in failing:
+            pass
diff --git a/sdks/python/oss/tests/pytest/unit/agents/__init__.py b/sdks/python/oss/tests/pytest/unit/agents/__init__.py
new file mode 100644
index 0000000000..4db23c7442
--- /dev/null
+++ b/sdks/python/oss/tests/pytest/unit/agents/__init__.py
@@ -0,0 +1 @@
+# Unit tests for the agent runtime (agenta.sdk.agents).
diff --git a/sdks/python/oss/tests/pytest/unit/agents/conftest.py b/sdks/python/oss/tests/pytest/unit/agents/conftest.py
new file mode 100644
index 0000000000..a434fdacc5
--- /dev/null
+++ b/sdks/python/oss/tests/pytest/unit/agents/conftest.py
@@ -0,0 +1,198 @@
+"""Shared fakes and fixtures for the agent-runtime unit tests.
+
+The fakes implement the real ports (``Backend`` / ``Sandbox`` / ``Session`` from
+``agenta.sdk.agents.interfaces``) so the port contract keeps them honest: if a port grows an
+abstract method, the fake fails to instantiate and these tests flag that the fake needs
+updating. They record what they receive so a test can assert on lifecycle and translation
+without a runner, a sandbox, an LLM, or the network.
+
+Everything is exposed through fixtures because pytest's prepend import mode makes a plain
+``from .fakes import ...`` brittle across components; a fixture factory sidesteps that.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, Dict, List, Mapping, Optional, Sequence
+
+import pytest
+
+from agenta.sdk.agents import (
+    AgentResult,
+    Environment,
+    HarnessType,
+)
+from agenta.sdk.agents.interfaces import Backend, Sandbox, Session
+from agenta.sdk.agents.streaming import AgentRun
+
+
+class FakeSandbox(Sandbox):
+    """In-memory ``Sandbox`` fake: records the files added and whether it was destroyed."""
+
+    def __init__(self) -> None:
+        self.files: Dict[str, bytes] = {}  # merged add_files() payloads
+        self.destroyed = False  # flipped by destroy()
+
+    async def add_files(self, files: Mapping[str, bytes]) -> None:
+        self.files.update(files)  # same-key entries are overwritten
+
+    async def destroy(self) -> None:
+        self.destroyed = True
+
+
+class FakeSession(Session):
+    """Returns a canned result, records prompts, and tracks teardown. Can be told to raise."""
+
+    def __init__(
+        self,
+        *,
+        result: AgentResult,
+        session_id: Optional[str] = None,
+        raise_on_prompt: bool = False,
+    ) -> None:
+        self._result = result  # canned result for prompt() and stream()
+        self._session_id = session_id  # surfaced through the `id` property
+        self._raise = raise_on_prompt  # makes prompt() raise / stream() fail
+        self.prompts: List[List[Any]] = []  # one entry per prompt()/stream() call
+        self.destroyed = False  # flipped by destroy()
+
+    @property
+    def id(self) -> Optional[str]:
+        return self._session_id
+
+    async def prompt(self, messages, *, on_event=None) -> AgentResult:
+        self.prompts.append(list(messages))  # shallow copy of this call's messages
+        if self._raise:
+            raise RuntimeError("boom from fake session")
+        if on_event:
+            for event in self._result.events:  # replay the canned events
+                on_event(event)
+        return self._result
+
+    def stream(self, messages) -> AgentRun:
+        # Mirror the runner's NDJSON stream: an event record per event, then one terminal
+        # result record (the shape `result_from_wire`/`AgentRun` expect).
+        self.prompts.append(list(messages))  # recorded at call time, before iteration
+        result = self._result
+        raising = self._raise
+
+        async def _records():
+            if raising:
+                yield {
+                    "kind": "result",
+                    "result": {"ok": False, "error": "boom from fake session"},
+                }
+                return  # failure stream: no event records, only the result
+            for event in result.events:
+                yield {"kind": "event", "event": event.data}
+            yield {
+                "kind": "result",
+                "result": {
+                    "ok": True,
+                    "output": result.output,
+                    "sessionId": result.session_id,  # camelCase key on the wire
+                },
+            }
+
+        return AgentRun(_records())
+
+    async def destroy(self) -> None:
+        self.destroyed = True
+
+
+class FakeBackend(Backend):
+    """``Backend`` fake that hands out fake sandboxes/sessions and records every lifecycle call."""
+
+    def __init__(
+        self,
+        *,
+        supported: Sequence[HarnessType] = (HarnessType.PI, HarnessType.CLAUDE),
+        result: Optional[AgentResult] = None,
+        result_session_id: Optional[str] = None,
+        raise_on_prompt: bool = False,
+    ) -> None:
+        # Instance attribute shadows the ClassVar so `supports()` reflects this fake.
+        self.supported_harnesses = frozenset(supported)
+        self._result = result if result is not None else AgentResult(output="ok")
+        self._result_session_id = result_session_id  # id given to each session
+        self._raise = raise_on_prompt  # forwarded to every FakeSession
+        self.sandboxes: List[FakeSandbox] = []  # every sandbox handed out, in order
+        self.sessions: List[FakeSession] = []  # every session handed out, in order
+        self.created_sessions: List[Dict[str, Any]] = []  # create_session() args
+        self.setup_calls = 0  # number of setup() calls
+        self.shutdown_calls = 0  # number of shutdown() calls
+
+    async def setup(self) -> None:
+        self.setup_calls += 1
+
+    async def shutdown(self) -> None:
+        self.shutdown_calls += 1
+
+    async def create_sandbox(self) -> FakeSandbox:
+        sandbox = FakeSandbox()
+        self.sandboxes.append(sandbox)  # kept so tests can inspect it later
+        return sandbox
+
+    async def create_session(
+        self,
+        sandbox,
+        config,
+        *,
+        harness,
+        secrets=None,
+        trace=None,
+        session_id=None,
+    ) -> FakeSession:
+        self.created_sessions.append(  # record the arguments exactly as received
+            {
+                "sandbox": sandbox,
+                "config": config,
+                "harness": harness,
+                "secrets": secrets,
+                "trace": trace,
+                "session_id": session_id,
+            }
+        )
+        session = FakeSession(
+            result=self._result,
+            session_id=self._result_session_id,  # not the requested session_id
+            raise_on_prompt=self._raise,
+        )
+        self.sessions.append(session)
+        return session
+
+
+@pytest.fixture
+def make_backend():
+    """Provide a factory that builds a :class:`FakeBackend` from keyword arguments."""
+
+    def _build(**options) -> FakeBackend:
+        return FakeBackend(**options)
+
+    return _build
+
+
+@pytest.fixture
+def make_env(make_backend):
+    """Provide a factory that wraps a new :class:`FakeBackend` in an :class:`Environment`.
+
+    Only the Environment is returned; the fake is reachable as ``env.backend`` for assertions.
+    """
+
+    def _build(*, sandbox_per_session: bool = True, **backend_options) -> Environment:
+        fake = make_backend(**backend_options)
+        return Environment(fake, sandbox_per_session=sandbox_per_session)
+
+    return _build
+
+
+@pytest.fixture
+def golden():
+    """Provide a loader for the checked-in golden ``/run`` JSON fixtures, by file name."""
+    fixtures_dir = Path(__file__).with_name("golden")
+
+    def _read(name: str) -> Dict[str, Any]:
+        return json.loads(fixtures_dir.joinpath(name).read_text(encoding="utf-8"))
+
+    return _read
diff --git a/sdks/python/oss/tests/pytest/unit/agents/golden/run_request.claude.json b/sdks/python/oss/tests/pytest/unit/agents/golden/run_request.claude.json
new file mode 100644
index 0000000000..9c6315110e
--- /dev/null
+++ b/sdks/python/oss/tests/pytest/unit/agents/golden/run_request.claude.json
@@ -0,0 +1,27 @@
+{
+  "backend": "rivet",
+  "harness": "claude",
+  "sandbox": "local",
+  "sessionId": null,
+  "agentsMd": "You are a helpful assistant.",
+  "model": "claude-sonnet-4-6",
+  "messages": [
+    {"role": "user", "content": "hi"}
+  ],
+  "secrets": {"ANTHROPIC_API_KEY": "sk-ant"},
+  "trace": null,
+  "tools": [],
+  "customTools": [
+    {
+      "name": "get_user",
+      "description": "Get a user",
+      "inputSchema": {"type": "object", "properties": {}},
+      "callRef": "tools__composio__github__GET_THE_AUTHENTICATED_USER__github-tvn"
+    }
+  ],
+  "toolCallback": {
+    "endpoint": "https://api.example/tools/call",
+    "authorization": "Access tok-123"
+  },
+  "permissionPolicy": "deny"
+}
diff --git a/sdks/python/oss/tests/pytest/unit/agents/golden/run_request.pi.json b/sdks/python/oss/tests/pytest/unit/agents/golden/run_request.pi.json
new file mode 100644
index 0000000000..ae1dbae468
--- /dev/null
+++ b/sdks/python/oss/tests/pytest/unit/agents/golden/run_request.pi.json
@@ -0,0 +1,35 @@
+{
+  "backend": "pi",
+  "harness": "pi",
+  "sandbox": "local",
+  "sessionId": "sess-1",
+  "agentsMd": "You are a helpful assistant.",
+  "model": "openai-codex/gpt-5.5",
+  "messages": [
+    {"role": "user", "content": "hi"}
+  ],
+  "secrets": {"OPENAI_API_KEY": "sk-test"},
+  "trace": {
+    "traceparent": "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01",
+    "baggage": null,
+    "endpoint": "https://otlp.example/v1/traces",
+    "authorization": "Access tok-123",
+    "captureContent": true
+  },
+  "tools": ["read", "write"],
+  "customTools": [
+    {
+      "name": "get_user",
+      "description": "Get a user",
+      "inputSchema": {"type": "object", "properties": {}},
+      "callRef": "tools__composio__github__GET_THE_AUTHENTICATED_USER__github-tvn"
+    }
+  ],
+  "toolCallback": {
+    "endpoint": "https://api.example/tools/call",
+    "authorization": "Access tok-123"
+  },
+  "permissionPolicy": "auto",
+  "systemPrompt": "You are Pi.",
+  "appendSystemPrompt": "Be terse."
+}
diff --git a/sdks/python/oss/tests/pytest/unit/agents/golden/run_result.error.json b/sdks/python/oss/tests/pytest/unit/agents/golden/run_result.error.json
new file mode 100644
index 0000000000..9791d5a4ea
--- /dev/null
+++ b/sdks/python/oss/tests/pytest/unit/agents/golden/run_result.error.json
@@ -0,0 +1,4 @@
+{
+  "ok": false,
+  "error": "model exploded"
+}
diff --git a/sdks/python/oss/tests/pytest/unit/agents/golden/run_result.ok.json b/sdks/python/oss/tests/pytest/unit/agents/golden/run_result.ok.json
new file mode 100644
index 0000000000..0943d2d047
--- /dev/null
+++ b/sdks/python/oss/tests/pytest/unit/agents/golden/run_result.ok.json
@@ -0,0 +1,31 @@
+{
+  "ok": true,
+  "output": "Hello!",
+  "messages": [
+    {"role": "assistant", "content": "Hello!"}
+  ],
+  "events": [
+    {"type": "message", "text": "Hello!"},
+    {"type": "usage", "input": 10, "output": 5, "total": 15, "cost": 0.001},
+    {"type": "done", "stopReason": "end_turn"},
+    {"text": "an event with no type, dropped on parse"}
+  ],
+  "usage": {"input": 10, "output": 5, "total": 15, "cost": 0.001},
+  "stopReason": "end_turn",
+  "capabilities": {
+    "textMessages": true,
+    "images": false,
+    "fileAttachments": false,
+    "mcpTools": true,
+    "toolCalls": true,
+    "reasoning": true,
+    "planMode": false,
+    "permissions": false,
+    "usage": true,
+    "streamingDeltas": false,
+    "sessionLifecycle": false
+  },
+  "sessionId": "sess-42",
+  "model": "gpt-5.5",
+  "traceId": "trace-abc"
+}
diff --git a/sdks/python/oss/tests/pytest/unit/agents/test_dtos_agent_config.py b/sdks/python/oss/tests/pytest/unit/agents/test_dtos_agent_config.py
new file mode 100644
index 0000000000..0b7c4744ee
--- /dev/null
+++ b/sdks/python/oss/tests/pytest/unit/agents/test_dtos_agent_config.py
@@ -0,0 +1,151 @@
+"""``AgentConfig.from_params`` (the three request shapes) and ``RunSelection.from_params``.
+
+The handler parses whatever the playground or a stored config sends into a neutral
+``AgentConfig`` plus a ``RunSelection``. This file locks the three accepted shapes, the
+defaults fall-through, the ``harness_options`` escape hatch, and the run-selection parsing.
+"""
+
+from __future__ import annotations
+
+from agenta.sdk.agents import AgentConfig, RunSelection
+
+_DEFAULTS = AgentConfig(instructions="default-md", model="default-model", tools=["d"])
+
+
+# ----------------------------------------------------------- AgentConfig shapes
+
+
+def test_from_params_agent_element_shape():
+    config = AgentConfig.from_params(
+        {
+            "agent": {
+                "instructions": "I",
+                "model": "M",
+                "tools": [{"type": "builtin", "name": "read"}],
+                "harness_options": {"pi": {"system": "S"}},
+            }
+        },
+        defaults=_DEFAULTS,
+    )
+    assert config.instructions == "I"
+    assert config.model == "M"
+    assert config.tools == [{"type": "builtin", "name": "read"}]
+    assert config.harness_options == {"pi": {"system": "S"}}
+
+
+def test_from_params_prompt_template_shape():
+    config = AgentConfig.from_params(
+        {
+            "prompt": {
+                "messages": [
+                    {"role": "system", "content": "You are helpful."},
+                    {"role": "user", "content": "ignored for instructions"},
+                ],
+                "llm_config": {"model": "M", "tools": ["t"]},
+            }
+        },
+        defaults=_DEFAULTS,
+    )
+    assert config.instructions == "You are helpful."  # system message -> instructions
+    assert config.model == "M"
+    assert config.tools == ["t"]
+
+
+def test_from_params_prompt_template_joins_multiple_system_messages():
+    config = AgentConfig.from_params(
+        {
+            "prompt": {
+                "messages": [
+                    {"role": "system", "content": "First."},
+                    {
+                        "role": "system",
+                        "content": [{"type": "text", "text": "Second."}],
+                    },
+                ],
+                "llm_config": {"model": "M"},
+            }
+        }
+    )
+    assert config.instructions == "First.\n\nSecond."
+
+
+def test_from_params_flat_shape():
+    config = AgentConfig.from_params(
+        {"model": "M", "agents_md": "A", "tools": [{"name": "x"}]},
+        defaults=_DEFAULTS,
+    )
+    assert config.instructions == "A"
+    assert config.model == "M"
+    assert config.tools == [{"name": "x"}]
+
+
+def test_from_params_falls_back_to_defaults():
+    config = AgentConfig.from_params({}, defaults=_DEFAULTS)
+    assert config.instructions == "default-md"
+    assert config.model == "default-model"
+    assert config.tools == ["d"]
+
+
+def test_from_params_coerces_single_tool_dict_to_list():
+    config = AgentConfig.from_params({"agent": {"tools": {"name": "solo"}}})
+    assert config.tools == [{"name": "solo"}]
+
+
+def test_harness_options_drops_malformed_and_lowercases_keys():
+    config = AgentConfig.from_params(
+        {
+            "agent": {
+                "harness_options": {
+                    "PI": {"system": "S"},  # key lower-cased
+                    "claude": "not a dict",  # dropped
+                }
+            }
+        }
+    )
+    assert config.harness_options == {"pi": {"system": "S"}}
+
+
+def test_harness_options_falls_back_to_defaults_when_absent():
+    defaults = AgentConfig(harness_options={"pi": {"system": "D"}})
+    config = AgentConfig.from_params(
+        {"agent": {"instructions": "I"}}, defaults=defaults
+    )
+    assert config.harness_options == {"pi": {"system": "D"}}
+
+
+# -------------------------------------------------------------- RunSelection
+
+
+def test_run_selection_defaults():
+    sel = RunSelection.from_params({})
+    assert (sel.harness, sel.sandbox, sel.permission_policy) == ("pi", "local", "auto")
+
+
+def test_run_selection_reads_agent_subdict_and_lowercases():
+    sel = RunSelection.from_params(
+        {
+            "agent": {
+                "harness": "Claude",
+                "sandbox": "Daytona",
+                "permission_policy": "Deny",
+            }
+        }
+    )
+    assert (sel.harness, sel.sandbox, sel.permission_policy) == (
+        "claude",
+        "daytona",
+        "deny",
+    )
+
+
+def test_run_selection_honors_custom_defaults():
+    overrides = {"default_harness": "claude", "default_sandbox": "daytona"}
+    selection = RunSelection.from_params({}, **overrides)
+    # With nothing in the request, both values come from the caller-supplied defaults.
+    assert selection.harness == "claude"
+    assert selection.sandbox == "daytona"
+
+
+def test_run_selection_reads_flat_request():
+    sel = RunSelection.from_params({"harness": "claude"})
+    assert sel.harness == "claude"
diff --git a/sdks/python/oss/tests/pytest/unit/agents/test_dtos_capabilities_events.py b/sdks/python/oss/tests/pytest/unit/agents/test_dtos_capabilities_events.py
new file mode 100644
index 0000000000..5d6ce90e8c
--- /dev/null
+++ b/sdks/python/oss/tests/pytest/unit/agents/test_dtos_capabilities_events.py
@@ -0,0 +1,81 @@
+"""Capabilities, events, and the small cross-boundary DTOs.
+
+Capabilities are what lets adapters branch on a flag instead of the harness name, so their
+camelCase parsing is contract-critical. Events feed tracing; the trace/tool-callback DTOs
+plumb the run into Agenta.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from agenta.sdk.agents import (
+    AgentEvent,
+    HarnessCapabilities,
+    HarnessType,
+    ToolCallback,
+    TraceContext,
+)
+
+
+def test_capabilities_none_and_non_dict_pass_through_as_none():
+    assert HarnessCapabilities.from_wire(None) is None
+    assert HarnessCapabilities.from_wire("nope") is None
+
+
+def test_capabilities_defaults_text_messages_true():
+    caps = HarnessCapabilities.from_wire({})
+    assert caps is not None
+    assert caps.text_messages is True  # the one flag that defaults on
+    assert caps.mcp_tools is False
+    assert caps.images is False
+
+
+def test_capabilities_map_camelcase_flags():
+    wire = {"mcpTools": True, "fileAttachments": True, "sessionLifecycle": True}
+    parsed = HarnessCapabilities.from_wire(wire)
+    # Each camelCase wire key lands on its snake_case attribute.
+    assert parsed.mcp_tools is True
+    assert parsed.file_attachments is True
+    assert parsed.session_lifecycle is True
+
+
+def test_agent_event_requires_type():
+    assert AgentEvent.from_wire({"text": "no type"}) is None
+    assert AgentEvent.from_wire({"type": ""}) is None  # falsy type
+    assert AgentEvent.from_wire("not a dict") is None
+
+
+def test_agent_event_keeps_full_payload_in_data():
+    event = AgentEvent.from_wire(
+        {"type": "tool_call", "name": "search", "input": {"q": "x"}}
+    )
+    assert event.type == "tool_call"
+    # `data` carries the whole wire payload verbatim, including the type key.
+    assert event.data == {"type": "tool_call", "name": "search", "input": {"q": "x"}}
+
+
+def test_trace_context_to_wire_emits_all_keys_camelcase():
+    wire = TraceContext(traceparent="tp", endpoint="ep").to_wire()
+    assert wire == {
+        "traceparent": "tp",
+        "baggage": None,
+        "endpoint": "ep",
+        "authorization": None,
+        "captureContent": True,  # defaults on, camelCase
+    }
+
+
+def test_tool_callback_to_wire():
+    assert ToolCallback(endpoint="e", authorization="a").to_wire() == {
+        "endpoint": "e",
+        "authorization": "a",
+    }
+
+
+def test_harness_type_coerce():
+    assert HarnessType.coerce(HarnessType.PI) is HarnessType.PI
+    assert HarnessType.coerce("PI") is HarnessType.PI  # case-insensitive
+    assert HarnessType.coerce("claude") is HarnessType.CLAUDE
+    with pytest.raises(ValueError):
+        HarnessType.coerce("bogus")
diff --git a/sdks/python/oss/tests/pytest/unit/agents/test_dtos_content_blocks.py b/sdks/python/oss/tests/pytest/unit/agents/test_dtos_content_blocks.py
new file mode 100644
index 0000000000..5c8ba74ade
--- /dev/null
+++ b/sdks/python/oss/tests/pytest/unit/agents/test_dtos_content_blocks.py
@@ -0,0 +1,90 @@
+"""Content blocks and messages: loose-input coercion and wire serialization.
+
+The playground sends loose dicts and bare strings; the runtime coerces them and emits
+camelCase on the wire. These round-trips lock that coercion.
+"""
+
+from __future__ import annotations
+
+from agenta.sdk.agents import ContentBlock, Message, to_messages
+
+
+def test_content_block_from_string():
+    block = ContentBlock.from_raw("hello")
+    assert block.type == "text"
+    assert block.text == "hello"
+
+
+def test_content_block_from_dict_accepts_both_mime_spellings():
+    camel = ContentBlock.from_raw(
+        {"type": "image", "data": "b64", "mimeType": "image/png"}
+    )
+    snake = ContentBlock.from_raw(
+        {"type": "image", "data": "b64", "mime_type": "image/png"}
+    )
+    assert camel.mime_type == "image/png"
+    assert snake.mime_type == "image/png"
+
+
+def test_content_block_passthrough_and_fallback():
+    existing = ContentBlock(type="text", text="x")
+    assert ContentBlock.from_raw(existing) is existing
+    # A non-string, non-dict value stringifies into a text block.
+    assert ContentBlock.from_raw(42).text == "42"
+
+
+def test_content_block_to_wire_omits_none_and_uses_camelcase():
+    block = ContentBlock(type="image", data="b64", mime_type="image/png")
+    wire = block.to_wire()
+    assert wire == {"type": "image", "data": "b64", "mimeType": "image/png"}
+    assert "text" not in wire  # None fields are omitted
+
+
+def test_text_block_round_trips():
+    assert ContentBlock(type="text", text="hi").to_wire() == {
+        "type": "text",
+        "text": "hi",
+    }
+
+
+def test_message_from_raw_requires_role():
+    assert Message.from_raw({"content": "no role"}) is None
+    assert Message.from_raw("not a dict") is None
+    msg = Message.from_raw({"role": "user", "content": "hi"})
+    assert msg is not None and msg.role == "user" and msg.content == "hi"
+
+
+def test_message_from_raw_coerces_block_list():
+    msg = Message.from_raw(
+        {"role": "user", "content": [{"type": "text", "text": "a"}, "b"]}
+    )
+    assert isinstance(msg.content, list)
+    assert [b.text for b in msg.content] == ["a", "b"]
+
+
+def test_message_to_wire_string_and_blocks():
+    assert Message(role="user", content="hi").to_wire() == {
+        "role": "user",
+        "content": "hi",
+    }
+    blocks = Message(role="user", content=[ContentBlock(type="text", text="a")])
+    assert blocks.to_wire() == {
+        "role": "user",
+        "content": [{"type": "text", "text": "a"}],
+    }
+
+
+def test_to_messages_filters_invalid_entries():
+    raw_entries = [
+        {"role": "user", "content": "hi"},
+        {"content": "no role"},  # dropped
+        None,  # dropped
+        {"role": "assistant", "content": "yo"},
+    ]
+    surviving_roles = [message.role for message in to_messages(raw_entries)]
+    # Only the two well-formed entries survive, in their original order.
+    assert surviving_roles == ["user", "assistant"]
+
+
+def test_to_messages_handles_none():
+    assert to_messages(None) == []
diff --git a/sdks/python/oss/tests/pytest/unit/agents/test_dtos_harness_configs.py b/sdks/python/oss/tests/pytest/unit/agents/test_dtos_harness_configs.py
new file mode 100644
index 0000000000..5d96bccad4
--- /dev/null
+++ b/sdks/python/oss/tests/pytest/unit/agents/test_dtos_harness_configs.py
@@ -0,0 +1,74 @@
+"""Per-harness configs: how each shapes its own tool/prompt fields for the ``/run`` payload.
+
+These are the per-harness halves of the wire contract. ``test_wire_contract`` checks the full
+payload against the golden; this file pins each config's contribution in isolation so a failure
+points straight at the harness whose shape changed.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from agenta.sdk.agents import (
+    ClaudeAgentConfig,
+    HarnessAgentConfig,
+    PiAgentConfig,
+    ToolCallback,
+)
+
+_CALLBACK = ToolCallback(endpoint="https://api.example/tools/call", authorization="A")
+
+
+def test_pi_wire_tools_is_native_and_never_gates():
+    config = PiAgentConfig(
+        builtin_tools=["read"],
+        custom_tools=[{"name": "t"}],
+        tool_callback=_CALLBACK,
+    )
+    assert config.wire_tools() == {
+        "tools": ["read"],
+        "customTools": [{"name": "t"}],
+        "toolCallback": {
+            "endpoint": "https://api.example/tools/call",
+            "authorization": "A",
+        },
+        "permissionPolicy": "auto",  # Pi never gates tool use
+    }
+
+
+def test_pi_wire_tools_without_callback():
+    assert PiAgentConfig().wire_tools()["toolCallback"] is None
+
+
+def test_pi_wire_prompt_emits_only_set_overrides():
+    assert PiAgentConfig().wire_prompt() == {}
+    assert PiAgentConfig(system="s").wire_prompt() == {"systemPrompt": "s"}
+    assert PiAgentConfig(append_system="a").wire_prompt() == {"appendSystemPrompt": "a"}
+    assert PiAgentConfig(system="", append_system="a").wire_prompt() == {
+        "systemPrompt": "",  # an explicit empty string is still an override here
+        "appendSystemPrompt": "a",
+    }
+
+
+def test_claude_wire_tools_has_no_builtins_and_carries_policy():
+    config = ClaudeAgentConfig(
+        custom_tools=[{"name": "t"}],
+        tool_callback=_CALLBACK,
+        permission_policy="deny",
+    )
+    wire = config.wire_tools()
+    assert wire["tools"] == []  # Claude has no Pi built-ins
+    assert wire["customTools"] == [{"name": "t"}]
+    assert wire["permissionPolicy"] == "deny"
+
+
+def test_claude_defaults_to_auto_policy_and_empty_prompt():
+    assert ClaudeAgentConfig().wire_tools()["permissionPolicy"] == "auto"
+    assert ClaudeAgentConfig().wire_prompt() == {}  # Claude exposes no prompt overrides
+
+
+def test_base_config_wire_tools_is_abstract():
+    # The base class does not know any engine's tool shape.
+    with pytest.raises(NotImplementedError):
+        HarnessAgentConfig().wire_tools()
+    assert HarnessAgentConfig().wire_prompt() == {}
diff --git a/sdks/python/oss/tests/pytest/unit/agents/test_environment_lifecycle.py b/sdks/python/oss/tests/pytest/unit/agents/test_environment_lifecycle.py
new file mode 100644
index 0000000000..c84761885f
--- /dev/null
+++ b/sdks/python/oss/tests/pytest/unit/agents/test_environment_lifecycle.py
@@ -0,0 +1,127 @@
+"""Environment sandbox policy and the cold ``Harness.prompt`` lifecycle.
+
+These lock the isolation guarantees the design docs promise: a fresh sandbox per session
+under the cold model, the session torn down in a ``finally`` even when the turn raises, the
+session id carried forward, and AGENTS.md provisioned only when there are instructions.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from agenta.sdk.agents import (
+    AgentConfig,
+    AgentResult,
+    HarnessType,
+    Message,
+    PiHarness,
+    SessionConfig,
+)
+
+
+def _config(instructions="hi") -> SessionConfig:
+    return SessionConfig(agent=AgentConfig(instructions=instructions, model="m"))
+
+
+# ------------------------------------------------------------- Environment policy
+
+
+async def test_fresh_sandbox_per_session(make_env):
+    env = make_env(sandbox_per_session=True)
+    config = _config()
+
+    await env.create_session(
+        PiHarness(env)._to_harness_config(config),
+        harness=HarnessType.PI,
+        session_config=config,
+    )
+    await env.create_session(
+        PiHarness(env)._to_harness_config(config),
+        harness=HarnessType.PI,
+        session_config=config,
+    )
+
+    assert len(env.backend.sandboxes) == 2  # a new sandbox each time (cold model)
+
+
+async def test_shared_sandbox_when_not_per_session(make_env):
+    env = make_env(sandbox_per_session=False)
+    config = _config()
+
+    for _ in range(2):
+        await env.create_session(
+            PiHarness(env)._to_harness_config(config),
+            harness=HarnessType.PI,
+            session_config=config,
+        )
+
+    assert len(env.backend.sandboxes) == 1  # one sandbox reused
+    await env.shutdown()
+    assert env.backend.sandboxes[0].destroyed is True  # shutdown tears it down
+    assert env.backend.shutdown_calls == 1
+
+
+async def test_provisioning_writes_agents_md_only_when_present(make_env):
+    env = make_env()
+    harness = PiHarness(env)
+
+    assert harness._provisioning(_config("hello")) == {"AGENTS.md": b"hello"}
+    assert harness._provisioning(_config("")) == {}
+    assert harness._provisioning(_config("   ")) == {}
+    assert harness._provisioning(_config(None)) == {}
+
+
+async def test_create_session_adds_files_when_provisioned(make_env):
+    env = make_env()
+    config = _config("project conventions")
+
+    await PiHarness(env).create_session(config)
+
+    assert env.backend.sandboxes[0].files == {"AGENTS.md": b"project conventions"}
+
+
+# ------------------------------------------------------- Cold Harness.prompt path
+
+
+async def test_prompt_runs_and_tears_down(make_env):
+    env = make_env(result=AgentResult(output="done"))
+    harness = PiHarness(env)
+
+    result = await harness.prompt(_config(), [Message(role="user", content="hi")])
+
+    assert result.output == "done"
+    assert env.backend.sessions[0].destroyed is True  # torn down on the happy path
+
+
+async def test_prompt_destroys_session_even_when_it_raises(make_env):
+    env = make_env(raise_on_prompt=True)
+    harness = PiHarness(env)
+
+    with pytest.raises(RuntimeError, match="boom"):
+        await harness.prompt(_config(), [Message(role="user", content="hi")])
+
+    assert env.backend.sessions[0].destroyed is True  # finally still runs
+
+
+async def test_prompt_carries_session_id_forward(make_env):
+    env = make_env(
+        result=AgentResult(output="x", session_id="sess-new"),
+        result_session_id="sess-new",
+    )
+    harness = PiHarness(env)
+    config = _config()
+
+    await harness.prompt(config, [Message(role="user", content="hi")])
+
+    assert config.session_id == "sess-new"  # next turn can resume it
+
+
+async def test_prompt_leaves_session_id_when_result_has_none(make_env):
+    env = make_env(result=AgentResult(output="x", session_id=None))
+    harness = PiHarness(env)
+    config = _config()
+    config.session_id = "prior"
+
+    await harness.prompt(config, [Message(role="user", content="hi")])
+
+    assert config.session_id == "prior"  # unchanged
diff --git a/sdks/python/oss/tests/pytest/unit/agents/test_harness_adapters.py b/sdks/python/oss/tests/pytest/unit/agents/test_harness_adapters.py
new file mode 100644
index 0000000000..d7a03aeed5
--- /dev/null
+++ b/sdks/python/oss/tests/pytest/unit/agents/test_harness_adapters.py
@@ -0,0 +1,267 @@
+"""Harness adapters: the neutral ``SessionConfig`` -> per-harness config translation.
+
+Pi and Claude genuinely differ (Pi takes built-ins and never gates tool use; Claude has no
+built-ins, delivers tools over MCP, and gates on a permission policy). Agenta is Pi with a
+fixed opinion: a forced preamble, persona, tools, and skills. These tests lock that the
+translation honors those differences and that ``make_harness`` validates support.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from agenta.sdk.agents import (
+    AgentaAgentConfig,
+    AgentaHarness,
+    AgentConfig,
+    ClaudeAgentConfig,
+    ClaudeHarness,
+    HarnessType,
+    PiAgentConfig,
+    PiHarness,
+    SessionConfig,
+    ToolCallback,
+    UnsupportedHarnessError,
+    make_harness,
+)
+from agenta.sdk.agents.adapters import harnesses
+from agenta.sdk.agents.adapters.agenta_builtins import (
+    AGENTA_FORCED_APPEND_SYSTEM,
+    AGENTA_FORCED_SKILLS,
+    AGENTA_FORCED_TOOLS,
+    AGENTA_PREAMBLE,
+)
+from agenta.sdk.agents.adapters.harnesses import _normalize_tool_specs, _opt_str
+
+_CALLBACK = ToolCallback(endpoint="https://api.example/tools/call", authorization=None)
+
+
+def _session_config(**kwargs) -> SessionConfig:
+    agent = kwargs.pop("agent", AgentConfig(instructions="hi", model="m"))
+    return SessionConfig(agent=agent, **kwargs)
+
+
+# --------------------------------------------------------------------------- Pi
+
+
+def test_pi_keeps_builtins_and_native_tools(make_env):
+    harness = PiHarness(make_env(supported=[HarnessType.PI]))
+    config = _session_config(
+        builtin_tools=["read", "write"],
+        custom_tools=[{"name": "t", "callRef": "ref"}],
+        tool_callback=_CALLBACK,
+    )
+
+    result = harness._to_harness_config(config)
+
+    assert isinstance(result, PiAgentConfig)
+    assert result.builtin_tools == ["read", "write"]
+    assert result.custom_tools[0]["name"] == "t"
+    assert result.tool_callback is _CALLBACK
+    assert result.agents_md == "hi"
+    assert result.model == "m"
+
+
+def test_pi_reads_its_harness_options_slice(make_env):
+    harness = PiHarness(make_env(supported=[HarnessType.PI]))
+    agent = AgentConfig(
+        instructions="hi",
+        harness_options={
+            "pi": {"system": "You are Pi.", "append_system": "Be terse."},
+            "claude": {"system": "ignored for Pi"},
+        },
+    )
+    config = _session_config(agent=agent)
+
+    result = harness._to_harness_config(config)
+
+    assert result.system == "You are Pi."
+    assert result.append_system == "Be terse."
+    # The Pi prompt overrides reach the wire.
+    assert result.wire_prompt() == {
+        "systemPrompt": "You are Pi.",
+        "appendSystemPrompt": "Be terse.",
+    }
+
+
+def test_pi_drops_blank_harness_options(make_env):
+    harness = PiHarness(make_env(supported=[HarnessType.PI]))
+    agent = AgentConfig(
+        instructions="hi",
+        harness_options={"pi": {"system": "   ", "append_system": ""}},
+    )
+
+    result = harness._to_harness_config(_session_config(agent=agent))
+
+    assert result.system is None
+    assert result.append_system is None
+    assert result.wire_prompt() == {}
+
+
+# ------------------------------------------------------------------------- Agenta
+
+
+def test_agenta_forces_skills_tools_preamble_and_persona(make_env):
+    harness = AgentaHarness(make_env(supported=[HarnessType.AGENTA]))
+    config = _session_config(
+        agent=AgentConfig(instructions="My project rules.", model="m"),
+        builtin_tools=["web_search"],
+        custom_tools=[{"name": "t", "callRef": "ref"}],
+        tool_callback=_CALLBACK,
+    )
+
+    result = harness._to_harness_config(config)
+
+    assert isinstance(result, AgentaAgentConfig)
+    # AGENTS.md is the base preamble with the author's instructions appended after it.
+    assert result.agents_md.startswith(AGENTA_PREAMBLE)
+    assert result.agents_md.endswith("My project rules.")
+    # Forced tools are unioned in (and `read` is present so Pi renders the skills section).
+    for forced in AGENTA_FORCED_TOOLS:
+        assert forced in result.builtin_tools
+    assert "web_search" in result.builtin_tools
+    assert "read" in result.builtin_tools
+    # Forced skills ride the config and reach the wire.
+    assert result.skills == list(AGENTA_FORCED_SKILLS)
+    assert result.wire_tools()["skills"] == list(AGENTA_FORCED_SKILLS)
+    # The persona is forced onto append_system; custom tools and callback pass through.
+    assert result.append_system.startswith(AGENTA_FORCED_APPEND_SYSTEM)
+    assert result.custom_tools[0]["name"] == "t"
+    assert result.tool_callback is _CALLBACK
+
+
+def test_agenta_forces_tools_without_duplicates(make_env):
+    harness = AgentaHarness(make_env(supported=[HarnessType.AGENTA]))
+    # `read` already configured: it must not be duplicated when forced.
+    config = _session_config(builtin_tools=["read"])
+
+    result = harness._to_harness_config(config)
+
+    assert result.builtin_tools.count("read") == 1
+
+
+def test_agenta_passes_through_user_pi_options(make_env):
+    harness = AgentaHarness(make_env(supported=[HarnessType.AGENTA]))
+    agent = AgentConfig(
+        instructions="hi",
+        harness_options={"pi": {"system": "You are Pi.", "append_system": "Be terse."}},
+    )
+
+    result = harness._to_harness_config(_session_config(agent=agent))
+
+    # `system` passes through; the author's `append_system` is appended after the forced persona.
+    assert result.system == "You are Pi."
+    assert result.append_system.startswith(AGENTA_FORCED_APPEND_SYSTEM)
+    assert result.append_system.endswith("Be terse.")
+
+
+def test_agenta_is_in_process_pi_supported():
+    from agenta.sdk.agents import InProcessPiBackend
+
+    assert InProcessPiBackend().supports(HarnessType.AGENTA)
+
+
+# ------------------------------------------------------------------------- Claude
+
+
+def test_claude_drops_builtins_and_warns(make_env, monkeypatch):
+    recorded = []
+    monkeypatch.setattr(
+        harnesses,
+        "log",
+        type("L", (), {"warning": lambda self, *a, **k: recorded.append(a)})(),
+    )
+    harness = ClaudeHarness(make_env(supported=[HarnessType.CLAUDE]))
+    config = _session_config(
+        builtin_tools=["read"],
+        custom_tools=[{"name": "t", "callRef": "ref"}],
+        permission_policy="deny",
+    )
+
+    result = harness._to_harness_config(config)
+
+    assert isinstance(result, ClaudeAgentConfig)
+    assert not hasattr(result, "builtin_tools")  # Claude has no built-in tools at all
+    assert result.custom_tools[0]["name"] == "t"
+    assert result.permission_policy == "deny"  # Claude carries the policy
+    assert recorded, "expected a warning when built-ins are dropped"
+
+
+def test_claude_no_warning_without_builtins(make_env, monkeypatch):
+    recorded = []
+    monkeypatch.setattr(
+        harnesses,
+        "log",
+        type("L", (), {"warning": lambda self, *a, **k: recorded.append(a)})(),
+    )
+    harness = ClaudeHarness(make_env(supported=[HarnessType.CLAUDE]))
+
+    harness._to_harness_config(_session_config(permission_policy="auto"))
+
+    assert recorded == []
+
+
+# --------------------------------------------------------------- _normalize_tool_specs
+
+
+def test_normalize_tool_specs_fills_defaults_and_drops_malformed():
+    specs = [
+        {"name": "keep", "callRef": "r1"},  # missing description + inputSchema
+        {"description": "no name"},  # dropped: no name
+        "not a dict",  # dropped: not a dict
+        {
+            "name": "full",
+            "description": "d",
+            "inputSchema": {"type": "object", "properties": {"x": {}}},
+            "callRef": "r2",
+        },
+    ]
+
+    out = _normalize_tool_specs(specs)
+
+    assert [s["name"] for s in out] == ["keep", "full"]
+    # description falls back to the name; inputSchema falls back to an empty object schema.
+    assert out[0]["description"] == "keep"
+    assert out[0]["inputSchema"] == {"type": "object", "properties": {}}
+    assert out[0]["callRef"] == "r1"
+    # provided values are preserved.
+    assert out[1]["description"] == "d"
+    assert out[1]["inputSchema"]["properties"] == {"x": {}}
+
+
+def test_normalize_tool_specs_empty():
+    assert _normalize_tool_specs([]) == []
+    assert _normalize_tool_specs(None) == []
+
+
+def test_opt_str_keeps_only_nonempty_strings():
+    assert _opt_str("hi") == "hi"
+    assert _opt_str("  ") is None
+    assert _opt_str("") is None
+    assert _opt_str(None) is None
+    assert _opt_str(123) is None
+
+
+# -------------------------------------------------------------------- make_harness
+
+
+def test_make_harness_maps_string_to_class(make_env):
+    env = make_env(supported=[HarnessType.PI, HarnessType.CLAUDE, HarnessType.AGENTA])
+    assert isinstance(make_harness("pi", env), PiHarness)
+    assert isinstance(make_harness("PI", env), PiHarness)  # coerced, case-insensitive
+    assert isinstance(make_harness("claude", env), ClaudeHarness)
+    assert isinstance(make_harness(HarnessType.CLAUDE, env), ClaudeHarness)
+    assert isinstance(make_harness("agenta", env), AgentaHarness)
+    assert isinstance(make_harness(HarnessType.AGENTA, env), AgentaHarness)
+
+
+def test_make_harness_unsupported_backend_raises(make_env):
+    env = make_env(supported=[HarnessType.PI])  # backend cannot drive Claude
+    with pytest.raises(UnsupportedHarnessError):
+        make_harness("claude", env)
+
+
+def test_make_harness_unknown_name_raises(make_env):
+    env = make_env(supported=[HarnessType.PI])
+    with pytest.raises(ValueError):
+        make_harness("bogus", env)
diff --git a/sdks/python/oss/tests/pytest/unit/agents/test_wire_contract.py b/sdks/python/oss/tests/pytest/unit/agents/test_wire_contract.py
new file mode 100644
index 0000000000..34687695ed
--- /dev/null
+++ b/sdks/python/oss/tests/pytest/unit/agents/test_wire_contract.py
@@ -0,0 +1,224 @@
+"""The ``/run`` wire contract: ``request_to_wire`` / ``result_from_wire``.
+
+This is the highest-value regression guard in the agent runtime. ``wire.py`` (the Python
+producer) and ``services/agent/src/protocol.ts`` (the TS consumer) are hand-mirrored, so the
+two can drift silently. The golden fixtures in ``golden/`` are the shared anchor: this file
+asserts the Python side against them, and the TS side asserts the same files (a later PR).
+
+If a field is added, renamed, or removed on the wire, a golden assertion here fails on
+purpose. Regenerate the golden deliberately, and update ``protocol.ts`` and ``KNOWN_REQUEST_KEYS``
+to match.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from agenta.sdk.agents import (
+    AgentaAgentConfig,
+    ClaudeAgentConfig,
+    HarnessType,
+    Message,
+    PiAgentConfig,
+    ToolCallback,
+    TraceContext,
+)
+from agenta.sdk.agents.utils.wire import request_to_wire, result_from_wire
+
+# The full set of top-level keys ``request_to_wire`` may emit. The TS ``AgentRunRequest``
+# interface must declare a superset of these. Adding a key here without adding it to
+# protocol.ts is exactly the drift this set exists to catch.
+KNOWN_REQUEST_KEYS = {
+    "backend",
+    "harness",
+    "sandbox",
+    "sessionId",
+    "agentsMd",
+    "model",
+    "messages",
+    "secrets",
+    "trace",
+    "tools",
+    "customTools",
+    "toolCallback",
+    "permissionPolicy",
+    "systemPrompt",
+    "appendSystemPrompt",
+    "skills",
+}
+
+_CUSTOM_TOOL = {
+    "name": "get_user",
+    "description": "Get a user",
+    "inputSchema": {"type": "object", "properties": {}},
+    "callRef": "tools__composio__github__GET_THE_AUTHENTICATED_USER__github-tvn",
+}
+_CALLBACK = ToolCallback(
+    endpoint="https://api.example/tools/call", authorization="Access tok-123"
+)
+
+
+def _pi_payload():
+    config = PiAgentConfig(
+        agents_md="You are a helpful assistant.",
+        model="openai-codex/gpt-5.5",
+        builtin_tools=["read", "write"],
+        custom_tools=[dict(_CUSTOM_TOOL)],
+        tool_callback=_CALLBACK,
+        system="You are Pi.",
+        append_system="Be terse.",
+    )
+    return request_to_wire(
+        engine="pi",
+        harness=HarnessType.PI,
+        sandbox="local",
+        config=config,
+        messages=[Message(role="user", content="hi")],
+        secrets={"OPENAI_API_KEY": "sk-test"},
+        trace=TraceContext(
+            traceparent="00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01",
+            endpoint="https://otlp.example/v1/traces",
+            authorization="Access tok-123",
+            capture_content=True,
+        ),
+        session_id="sess-1",
+    )
+
+
+def _claude_payload():
+    config = ClaudeAgentConfig(
+        agents_md="You are a helpful assistant.",
+        model="claude-sonnet-4-6",
+        custom_tools=[dict(_CUSTOM_TOOL)],
+        tool_callback=_CALLBACK,
+        permission_policy="deny",
+    )
+    return request_to_wire(
+        engine="rivet",
+        harness=HarnessType.CLAUDE,
+        sandbox="local",
+        config=config,
+        messages=[Message(role="user", content="hi")],
+        secrets={"ANTHROPIC_API_KEY": "sk-ant"},
+        trace=None,
+        session_id=None,
+    )
+
+
+def _agenta_payload():
+    config = AgentaAgentConfig(
+        agents_md="Agenta preamble + project rules.",
+        model="gpt-5.5",
+        builtin_tools=["read", "bash"],
+        custom_tools=[dict(_CUSTOM_TOOL)],
+        tool_callback=_CALLBACK,
+        append_system="You are an Agenta agent.",
+        skills=["agenta-getting-started"],
+    )
+    return request_to_wire(
+        engine="pi",
+        harness=HarnessType.AGENTA,
+        sandbox="local",
+        config=config,
+        messages=[Message(role="user", content="hi")],
+    )
+
+
+def test_request_to_wire_agenta_carries_skills_and_pi_shape():
+    payload = _agenta_payload()
+    assert set(payload) <= KNOWN_REQUEST_KEYS
+    # Agenta is a Pi config: same tool shape, never gates, exposes the prompt overrides...
+    assert payload["permissionPolicy"] == "auto"
+    assert payload["tools"] == ["read", "bash"]
+    assert payload["appendSystemPrompt"] == "You are an Agenta agent."
+    # ...plus the forced skills the runner loads.
+    assert payload["skills"] == ["agenta-getting-started"]
+
+
+def test_request_to_wire_pi_has_no_skills_key():
+    # Only the Agenta config emits `skills`; the plain Pi config must not.
+    assert "skills" not in _pi_payload()
+
+
+def test_request_to_wire_pi_matches_golden(golden):
+    assert _pi_payload() == golden("run_request.pi.json")
+
+
+def test_request_to_wire_claude_matches_golden(golden):
+    payload = _claude_payload()
+    assert payload == golden("run_request.claude.json")
+    # Claude-specific invariants the golden encodes, asserted explicitly so a failure reads clearly.
+    assert payload["tools"] == []  # Claude has no Pi built-ins
+    assert payload["permissionPolicy"] == "deny"  # Claude gates tool use
+    assert "systemPrompt" not in payload  # Claude exposes no prompt overrides
+    assert "appendSystemPrompt" not in payload
+
+
+def test_request_to_wire_has_no_prompt_key():
+    # The serializer emits `messages` only; the TS side derives the latest turn with
+    # `resolvePromptText`. This asymmetry is intentional and easy to break, so lock it.
+    payload = request_to_wire(
+        engine="pi",
+        harness=HarnessType.PI,
+        sandbox="local",
+        config=PiAgentConfig(),
+        messages=[Message(role="user", content="hi")],
+    )
+    assert "prompt" not in payload
+
+
+def test_request_to_wire_emits_only_known_keys():
+    pi = _pi_payload()
+    claude = _claude_payload()
+    assert set(pi) <= KNOWN_REQUEST_KEYS
+    assert set(claude) <= KNOWN_REQUEST_KEYS
+    # The Pi case must actually exercise the prompt-override keys, otherwise this guard would
+    # silently stop covering them.
+    assert {"systemPrompt", "appendSystemPrompt"} <= set(pi)
+
+
+def test_pi_permission_policy_is_always_auto():
+    # Pi never gates tool use: a default Pi config must serialize the `auto` policy.
+    payload = request_to_wire(
+        engine="pi",
+        harness=HarnessType.PI,
+        sandbox="local",
+        config=PiAgentConfig(),
+        messages=[Message(role="user", content="hi")],
+    )
+    assert payload["permissionPolicy"] == "auto"
+
+
+def test_result_from_wire_parses_ok(golden):
+    result = result_from_wire(golden("run_result.ok.json"))
+
+    assert result.output == "Hello!"
+    assert [m.role for m in result.messages] == ["assistant"]
+    # The event with no `type` is dropped on parse; the other three survive.
+    assert [e.type for e in result.events] == ["message", "usage", "done"]
+    assert result.events[0].data == {"type": "message", "text": "Hello!"}
+    assert result.usage == {"input": 10, "output": 5, "total": 15, "cost": 0.001}
+    assert result.stop_reason == "end_turn"
+    assert result.session_id == "sess-42"
+    assert result.model == "gpt-5.5"
+    assert result.trace_id == "trace-abc"
+    # Capabilities come back camelCase and map onto snake_case flags.
+    assert result.capabilities is not None
+    assert result.capabilities.mcp_tools is True
+    assert result.capabilities.images is False
+    assert result.capabilities.text_messages is True
+
+
+def test_result_from_wire_raises_on_failure(golden):
+    with pytest.raises(RuntimeError, match="model exploded"):
+        result_from_wire(golden("run_result.error.json"))
+
+
+def test_result_from_wire_minimal_ok():
+    # A bare success: empty output, empty collections, no capabilities.
+    result = result_from_wire({"ok": True})
+    assert result.output == ""
+    assert result.messages == []
+    assert result.events == []
+    assert result.capabilities is None
+    assert result.session_id is None
diff --git a/services/agent/skills/agenta-getting-started/SKILL.md b/services/agent/skills/agenta-getting-started/SKILL.md
new file mode 100644
index 0000000000..44bc6a7a6b
--- /dev/null
+++ b/services/agent/skills/agenta-getting-started/SKILL.md
@@ -0,0 +1,21 @@
+---
+name: agenta-getting-started
+description: Baseline guidance for agents running on the Agenta platform. Use at the start of a task to recall how to work with the tools and skills Agenta provides and how to report results clearly.
+---
+
+# Agenta getting started
+
+This is a placeholder Agenta skill that ships with the `AgentaHarness`. It proves the
+forced-skill path end to end; replace its content with real Agenta guidance.
+
+## When to use
+
+Read this when you begin a task and want a reminder of the Agenta conventions below.
+
+## Conventions
+
+- Prefer the provided tools and skills over guessing; call a tool when one fits.
+- When another skill matches the task, read its `SKILL.md` fully before acting.
+- Keep answers grounded in what the tools and skills actually return. Do not fabricate
+  results or tool output.
+- Be concise. State what you did, what it returned, and what is left.
diff --git a/services/agent/src/cli.ts b/services/agent/src/cli.ts
index 5eb771c78b..7f45ebb714 100644
--- a/services/agent/src/cli.ts
+++ b/services/agent/src/cli.ts
@@ -6,15 +6,23 @@
  * to stderr. This is the one-shot "json adapter" the design doc describes; a
  * long-lived RPC adapter can replace it later behind the same Python-side port.
  */
-import type { AgentRunRequest, AgentRunResult } from "./protocol.ts";
+import type {
+  AgentRunRequest,
+  AgentRunResult,
+  EmitEvent,
+  StreamRecord,
+} from "./protocol.ts";
 import { runPi } from "./engines/pi.ts";
 import { runRivet } from "./engines/rivet.ts";
 
 // Engine: `rivet` drives a harness over ACP via a rivet daemon; `pi` (default) is the
 // legacy in-process Pi path. The request's `backend` wins, then the AGENT_BACKEND env.
-function runAgent(request: AgentRunRequest): Promise<AgentRunResult> {
+function runAgent(
+  request: AgentRunRequest,
+  emit?: EmitEvent,
+): Promise<AgentRunResult> {
   const backend = (request.backend ?? process.env.AGENT_BACKEND ?? "pi").toLowerCase();
-  return backend === "rivet" ? runRivet(request) : runPi(request);
+  return backend === "rivet" ? runRivet(request, emit) : runPi(request, emit);
 }
 
 async function readStdin(): Promise<string> {
@@ -25,29 +33,56 @@ async function readStdin(): Promise<string> {
   return Buffer.concat(chunks).toString("utf8");
 }
 
-function emit(result: AgentRunResult): void {
+// One-shot mode: the whole result as a single JSON document (the `/invoke` contract).
+function emitResult(result: AgentRunResult): void {
   process.stdout.write(JSON.stringify(result));
 }
 
+// Streaming mode (`--stream`): NDJSON on stdout, one record per line. Each event goes out as
+// an `{kind:"event"}` line as it is emitted; the run ends with one `{kind:"result"}` line.
+function writeRecord(record: StreamRecord): void {
+  process.stdout.write(`${JSON.stringify(record)}\n`);
+}
+
 async function main(): Promise<void> {
+  const stream = process.argv.includes("--stream");
   const raw = await readStdin();
 
   let request: AgentRunRequest;
   try {
     request = raw.trim() ? (JSON.parse(raw) as AgentRunRequest) : {};
   } catch (err) {
-    emit({ ok: false, error: `Invalid JSON on stdin: ${String(err)}` });
+    const failure: AgentRunResult = { ok: false, error: `Invalid JSON on stdin: ${String(err)}` };
+    if (stream) writeRecord({ kind: "result", result: failure });
+    else emitResult(failure);
     process.exit(1);
   }
 
+  if (!stream) {
+    try {
+      const result = await runAgent(request);
+      emitResult(result);
+      process.exit(result.ok ? 0 : 1);
+    } catch (err) {
+      emitResult({
+        ok: false,
+        error: err instanceof Error ? err.stack ?? err.message : String(err),
+      });
+      process.exit(1);
+    }
+    return;
+  }
+
+  const emit: EmitEvent = (event) => writeRecord({ kind: "event", event });
+  let result: AgentRunResult;
   try {
-    const result = await runAgent(request);
-    emit(result);
-    process.exit(result.ok ? 0 : 1);
+    result = await runAgent(request, emit);
   } catch (err) {
-    emit({ ok: false, error: err instanceof Error ? err.stack ?? err.message : String(err) });
-    process.exit(1);
+    result = { ok: false, error: err instanceof Error ? err.stack ?? err.message : String(err) };
   }
+  // Streaming delivered the events live, so don't echo them in the terminal record.
+  writeRecord({ kind: "result", result: { ...result, events: [] } });
+  process.exit(result.ok ? 0 : 1);
 }
 
 main();
diff --git a/services/agent/src/engines/pi.ts b/services/agent/src/engines/pi.ts
index 85a9a3b930..91b63e0a24 100644
--- a/services/agent/src/engines/pi.ts
+++ b/services/agent/src/engines/pi.ts
@@ -16,9 +16,10 @@
  * Important: stdout is reserved for the JSON result (see cli.ts). Everything here logs to
  * stderr so it never pollutes the result channel.
  */
-import { mkdtempSync, rmSync } from "node:fs";
+import { existsSync, mkdtempSync, rmSync, statSync } from "node:fs";
 import { tmpdir } from "node:os";
-import { join } from "node:path";
+import { dirname, isAbsolute, join } from "node:path";
+import { fileURLToPath } from "node:url";
 
 import {
   AuthStorage,
@@ -36,6 +37,7 @@ import {
   type AgentRunRequest,
   type AgentRunResult,
   type ChatMessage,
+  type EmitEvent,
   type HarnessCapabilities,
   type ResolvedToolSpec,
   type ToolCallbackContext,
@@ -62,6 +64,35 @@ function log(message: string): void {
   process.stderr.write(`[pi-wrapper] ${message}\n`);
 }
 
+// services/agent/src/engines/pi.ts -> services/agent. Bundled skills (the Agenta harness's
+// forced skills) live under services/agent/skills/<name>/. Overridable for non-default layouts.
+const PKG_ROOT = dirname(dirname(dirname(fileURLToPath(import.meta.url))));
+const SKILLS_ROOT = process.env.AGENTA_AGENT_SKILLS_DIR || join(PKG_ROOT, "skills");
+
+/**
+ * Map requested skill names to skill directories for Pi's resource loader. A relative name
+ * is looked up under SKILLS_ROOT; an absolute path is used as given. Only existing
+ * directories are returned: anything else is logged and skipped so a stale name never fails
+ * the run. The directory's contents (e.g. its SKILL.md) are not validated here.
+ */
+function resolveSkillDirs(names: string[] | undefined): string[] {
+  const resolved: string[] = [];
+  for (const skill of names ?? []) {
+    // Empty entries carry nothing to resolve.
+    if (!skill) continue;
+    const candidate = isAbsolute(skill) ? skill : join(SKILLS_ROOT, skill);
+    try {
+      const isDir = existsSync(candidate) && statSync(candidate).isDirectory();
+      if (isDir) resolved.push(candidate);
+      else log(`skipping unknown skill "${skill}" (no directory at ${candidate})`);
+    } catch {
+      // statSync can still throw after existsSync succeeds; treat that as a skip too.
+      log(`skipping skill "${skill}": cannot stat ${candidate}`);
+    }
+  }
+  return resolved;
+}
+
 /** Apply vault-resolved provider keys to the process env so Pi's model auth can see them. */
 function applySecrets(secrets: Record<string, string> | undefined): void {
   for (const [key, value] of Object.entries(secrets ?? {})) {
@@ -148,7 +179,10 @@ export function buildCustomTools(
   }));
 }
 
-export async function runPi(request: AgentRunRequest): Promise<AgentRunResult> {
+export async function runPi(
+  request: AgentRunRequest,
+  emit?: EmitEvent,
+): Promise<AgentRunResult> {
   const prompt = resolvePromptText(request);
   if (!prompt) {
     return { ok: false, error: "No user message to send (prompt/messages empty)." };
@@ -185,11 +219,28 @@ export async function runPi(request: AgentRunRequest): Promise<AgentRunResult> {
 
     // Inject AGENTS.md in memory and keep on-disk context files out of the run.
     const agentsMd = request.agentsMd?.trim();
+    // Pi's two system-prompt layers, carried on the request (PiAgentConfig.system /
+    // append_system). `systemPrompt` replaces Pi's base prompt; `appendSystemPrompt` adds to
+    // it. We feed them through the loader overrides so the run stays hermetic: only what the
+    // request carries applies, never a SYSTEM.md / APPEND_SYSTEM.md left on disk.
+    const systemPrompt = request.systemPrompt?.trim();
+    const appendSystemPrompt = request.appendSystemPrompt?.trim();
+    // Forced skills (the Agenta harness): load exactly the bundled dirs the request names.
+    // `noSkills` suppresses host/global discovery so the run is deterministic; the loader still
+    // merges `additionalSkillPaths` on top, so the bundled skills load. They only surface in
+    // the prompt when `read` is enabled (the harness forces it).
+    const skillDirs = resolveSkillDirs(request.skills);
+    if (skillDirs.length > 0) {
+      log(`skills: ${skillDirs.join(", ")}`);
+    }
     const loader = new DefaultResourceLoader({
       cwd,
       agentDir: getAgentDir(),
       noContextFiles: true,
-      appendSystemPromptOverride: () => [],
+      noSkills: true,
+      additionalSkillPaths: skillDirs,
+      systemPromptOverride: () => systemPrompt || undefined,
+      appendSystemPromptOverride: () => (appendSystemPrompt ? [appendSystemPrompt] : []),
       agentsFilesOverride: () => ({
         agentsFiles: agentsMd
           ? [{ path: "/virtual/AGENTS.md", content: agentsMd }]
@@ -227,14 +278,26 @@ export async function runPi(request: AgentRunRequest): Promise<AgentRunResult> {
     otel.config.provider = model.provider;
     otel.config.requestModel = model.id;
 
-    // Accumulate streamed text as the primary output channel.
+    // Accumulate streamed text as the primary output channel. On the streaming path, flush
+    // each Pi `text_delta` as a `message_delta` live (Pi deltas are already pure, so they
+    // emit verbatim); the block opens on the first delta and closes after the run.
     let streamed = "";
+    let piTextId: string | undefined;
     session.subscribe((event: any) => {
       if (
         event.type === "message_update" &&
         event.assistantMessageEvent?.type === "text_delta"
       ) {
-        streamed += event.assistantMessageEvent.delta ?? "";
+        const delta = event.assistantMessageEvent.delta ?? "";
+        if (!delta) return;
+        streamed += delta;
+        if (emit) {
+          if (piTextId === undefined) {
+            piTextId = "msg-0";
+            emit({ type: "message_start", id: piTextId });
+          }
+          emit({ type: "message_delta", id: piTextId, delta });
+        }
       }
     });
 
@@ -253,12 +316,28 @@ export async function runPi(request: AgentRunRequest): Promise<AgentRunResult> {
     // The structured stream is thinner here than on the rivet path: Pi's in-process tool
     // events feed the trace spans, while the result-level event log carries the final
     // message, usage, and stop reason (enough for the platform without double-plumbing).
+    //
+    // On the streaming path the events were flushed live via `emit`, so the result log stays
+    // empty; here we only close the open text block (or synthesize one when the text never
+    // streamed) and flush the tail usage/done events.
     const events: AgentEvent[] = [];
-    if (output) events.push({ type: "message", text: output });
-    if (usage.total > 0) {
-      events.push({ type: "usage", ...usage });
+    const emitOrLog = (event: AgentEvent): void => {
+      if (emit) emit(event);
+      else events.push(event);
+    };
+    if (emit) {
+      if (piTextId !== undefined) {
+        emit({ type: "message_end", id: piTextId });
+      } else if (output) {
+        emit({ type: "message_start", id: "msg-0" });
+        emit({ type: "message_delta", id: "msg-0", delta: output });
+        emit({ type: "message_end", id: "msg-0" });
+      }
+    } else if (output) {
+      events.push({ type: "message", text: output });
     }
-    events.push({ type: "done", stopReason });
+    if (usage.total > 0) emitOrLog({ type: "usage", ...usage });
+    emitOrLog({ type: "done", stopReason });
 
     const messages: ChatMessage[] = output
       ? [{ role: "assistant", content: output }]
@@ -271,7 +350,8 @@ export async function runPi(request: AgentRunRequest): Promise<AgentRunResult> {
       events,
       usage,
       stopReason,
-      capabilities: PI_CAPABILITIES,
+      // `streamingDeltas` is only honest when a live sink carried the deltas end-to-end.
+      capabilities: { ...PI_CAPABILITIES, streamingDeltas: !!emit },
       sessionId,
       model: `${model.provider}/${model.id}`,
       traceId: otel.config.traceId,
diff --git a/services/agent/src/engines/rivet.ts b/services/agent/src/engines/rivet.ts
index f056849855..62f978a5c8 100644
--- a/services/agent/src/engines/rivet.ts
+++ b/services/agent/src/engines/rivet.ts
@@ -51,6 +51,7 @@ import {
   type AgentRunRequest,
   type AgentRunResult,
   type ChatMessage,
+  type EmitEvent,
   type HarnessCapabilities,
   type ResolvedToolSpec,
   type ToolCallbackContext,
@@ -593,7 +594,11 @@ async function probeCapabilities(
   }
 }
 
-export async function runRivet(request: AgentRunRequest): Promise<AgentRunResult> {
+export async function runRivet(
+  request: AgentRunRequest,
+  emit?: EmitEvent,
+  signal?: AbortSignal,
+): Promise<AgentRunResult> {
   const harness = request.harness || process.env.AGENTA_AGENT_HARNESS || "pi";
   const sandboxId = request.sandbox || process.env.AGENTA_AGENT_SANDBOX || "local";
 
@@ -637,6 +642,15 @@ export async function runRivet(request: AgentRunRequest): Promise<AgentRunResult
     : mkdtempSync(join(tmpdir(), "agenta-rivet-"));
   const agentsMd = request.agentsMd?.trim();
 
+  // Pi's system-prompt overrides (systemPrompt / appendSystemPrompt) are honored on the
+  // in-process Pi engine via the resource loader. The ACP path drives Pi through pi-acp,
+  // which gives us no per-run hook to set them (a project SYSTEM.md is trust-gated, and CLI
+  // flags can't be set per session here), so they are not delivered yet. Warn rather than
+  // drop them silently. AGENTS.md still applies on this path regardless.
+  if (isPi && (request.systemPrompt?.trim() || request.appendSystemPrompt?.trim())) {
+    log("systemPrompt/appendSystemPrompt are not yet delivered on the ACP (rivet) Pi path; ignored");
+  }
+
   // Pi writes its run totals here on agent_end; we read them back and return them so the
   // caller can roll them onto the workflow span (separate OTLP batch, see piExtension).
   const usageOutPath = isPi ? `${cwd}/.agenta-usage.json` : undefined;
@@ -660,6 +674,9 @@ export async function runRivet(request: AgentRunRequest): Promise<AgentRunResult
   const sandbox = await SandboxAgent.start({
     sandbox: buildSandboxProvider(sandboxId, env, binaryPath, piExtEnv, secrets),
     persist,
+    // Propagate caller cancellation (a client disconnect on the streaming HTTP edge) so an
+    // in-flight run aborts instead of finishing unobserved. The `finally` still disposes.
+    ...(signal ? { signal } : {}),
     // Daytona's preview proxy authenticates with a per-sandbox cookie; carry it across
     // requests so ACP calls after the first don't 401. Harmless for local.
     ...(isDaytona ? { fetch: createCookieFetch() } : {}),
@@ -726,6 +743,7 @@ export async function runRivet(request: AgentRunRequest): Promise<AgentRunResult
       authorization: request.trace?.authorization,
       captureContent: request.trace?.captureContent,
       emitSpans: !isPi || isDaytona,
+      emit,
     });
     otel = run;
 
@@ -789,10 +807,14 @@ export async function runRivet(request: AgentRunRequest): Promise<AgentRunResult
       ok: true,
       output,
       messages: output ? [{ role: "assistant", content: output }] : [],
-      events: run.events(),
+      // Streaming already delivered every event live, so the terminal result carries none
+      // (re-sending would double them on the consumer).
+      events: emit ? [] : run.events(),
       usage,
       stopReason,
-      capabilities,
+      // `streamingDeltas` advertises end-to-end live deltas, which is only true when a live
+      // sink is wired. The one-shot path reports false even when the harness produces deltas.
+      capabilities: { ...capabilities, streamingDeltas: !!emit && capabilities.streamingDeltas },
       sessionId: session.id,
       model: model ?? request.model,
       traceId: run.traceId(),
diff --git a/services/agent/src/protocol.ts b/services/agent/src/protocol.ts
index af5029234d..98a33ad876 100644
--- a/services/agent/src/protocol.ts
+++ b/services/agent/src/protocol.ts
@@ -1,9 +1,12 @@
 /**
  * The `/run` wire contract, shared by both backends.
  *
- * The Python side mirrors these names in `services/oss/src/harness/wire.py`. Keeping the
- * request/result/event/capability types here (rather than in one runner that the other
- * imports from) is what lets `engines/pi.ts` and `engines/rivet.ts` stay peers.
+ * The Python side mirrors these names in `sdks/python/agenta/sdk/agents/utils/wire.py`.
+ * The contract is pinned by shared golden fixtures under
+ * `sdks/python/oss/tests/pytest/unit/agents/golden/`; a change here that drifts from those
+ * fixtures fails `test_wire_contract.py`. Keeping the request/result/event/capability types
+ * here (rather than in one runner that the other imports from) is what lets `engines/pi.ts`
+ * and `engines/rivet.ts` stay peers.
  */
 
 /** One piece of a message. `text` is all the playground sends today; the rest is plumbed. */
@@ -70,16 +73,33 @@ export interface HarnessCapabilities {
   sessionLifecycle?: boolean;
 }
 
-/** One structured run event. Mirrors the ACP `session/update` variants we surface. */
+/**
+ * One structured run event. Mirrors the ACP `session/update` variants we surface.
+ *
+ * Two text families coexist. The coalesced `message` / `thought` events carry the whole
+ * block and are what the one-shot `/run` result log holds (the non-streaming path has no
+ * per-token granularity to recover). The `*_start` / `*_delta` / `*_end` lifecycle events
+ * are emitted live on the streaming path; a consumer that sees the delta family for a block
+ * never also sees a coalesced `message` for it (see `createRivetOtel.finish`).
+ */
 export type AgentEvent =
   | { type: "message"; text: string }
   | { type: "thought"; text: string }
+  | { type: "message_start"; id: string }
+  | { type: "message_delta"; id: string; delta: string }
+  | { type: "message_end"; id: string }
+  | { type: "reasoning_start"; id: string }
+  | { type: "reasoning_delta"; id: string; delta: string }
+  | { type: "reasoning_end"; id: string }
   | { type: "tool_call"; id?: string; name?: string; input?: unknown }
   | { type: "tool_result"; id?: string; output?: string; isError?: boolean }
   | { type: "usage"; input?: number; output?: number; total?: number; cost?: number }
   | { type: "error"; message: string }
   | { type: "done"; stopReason?: string };
 
+/** A live event sink the engines call as each event is built. */
+export type EmitEvent = (event: AgentEvent) => void;
+
 /** Run token/cost totals, rolled up onto the caller's workflow span. */
 export interface AgentUsage {
   input: number;
@@ -101,6 +121,18 @@ export interface AgentRunRequest {
   secrets?: Record<string, string>;
   /** AGENTS.md text injected as the agent's instructions. */
   agentsMd?: string;
+  /**
+   * Pi only: replace Pi's built-in base system prompt outright (Pi's `systemPrompt` /
+   * `SYSTEM.md`). AGENTS.md is still appended after it, so this changes Pi's persona, not
+   * the project context. Leave unset to keep Pi's default coding-assistant prompt.
+   */
+  systemPrompt?: string;
+  /**
+   * Pi only: append to the base system prompt without replacing it (Pi's
+   * `appendSystemPrompt` / `APPEND_SYSTEM.md`). Use this to add framing on top of Pi's
+   * default prompt rather than rewrite it.
+   */
+  appendSystemPrompt?: string;
   /** Model id ("gpt-5.5") or "provider/id" ("openai-codex/gpt-5.5"). */
   model?: string;
   /** Explicit latest turn. Falls back to the last user message in `messages`. */
@@ -109,6 +141,12 @@ export interface AgentRunRequest {
   messages?: ChatMessage[];
   /** Built-in tools to enable. */
   tools?: string[];
+  /**
+   * Bundled skill directory names to force-load (the Agenta harness). Each name resolves
+   * against the runner's bundled `skills/` root and is loaded into Pi's resource loader, so
+   * it appears in the system prompt (Pi only renders skills when the `read` tool is enabled).
+   */
+  skills?: string[];
   /** Resolved runnable tools (WP-7). */
   customTools?: ResolvedToolSpec[];
   /** Where customTools route their calls back to. Required when customTools is set. */
@@ -140,6 +178,18 @@ export interface AgentRunResult {
   error?: string;
 }
 
+/**
+ * One line of the NDJSON stream the runner writes when a caller asks for live delivery
+ * (HTTP `Accept: application/x-ndjson`, or the CLI `--stream` flag). Every `event` record
+ * flushes the moment its `AgentEvent` is built; the run ends with exactly one `result`
+ * record carrying the same `AgentRunResult` the one-shot path returns (so the Python side
+ * parses it with the same `result_from_wire`). On the streaming path the terminal result's
+ * `events` is empty — the events were already delivered live.
+ */
+export type StreamRecord =
+  | { kind: "event"; event: AgentEvent }
+  | { kind: "result"; result: AgentRunResult };
+
 /** Flatten a message's content (string or content blocks) to its text. */
 export function messageText(content: string | ContentBlock[] | undefined): string {
   if (!content) return "";
diff --git a/services/agent/src/server.ts b/services/agent/src/server.ts
index 116a8e7578..aae23c4480 100644
--- a/services/agent/src/server.ts
+++ b/services/agent/src/server.ts
@@ -12,7 +12,12 @@
  */
 import { createServer, type IncomingMessage, type ServerResponse } from "node:http";
 
-import type { AgentRunRequest, AgentRunResult } from "./protocol.ts";
+import type {
+  AgentRunRequest,
+  AgentRunResult,
+  EmitEvent,
+  StreamRecord,
+} from "./protocol.ts";
 import { runPi } from "./engines/pi.ts";
 import { runRivet } from "./engines/rivet.ts";
 
@@ -24,11 +29,60 @@ const PORT = Number(process.env.PORT ?? 8765);
 // request shape (a rivet request carries `harness`/`sandbox`).
 const DEFAULT_BACKEND = (process.env.AGENT_BACKEND ?? "auto").toLowerCase();
 
-function runAgent(request: AgentRunRequest): Promise<AgentRunResult> {
+function runAgent(
+  request: AgentRunRequest,
+  emit?: EmitEvent,
+  signal?: AbortSignal,
+): Promise<AgentRunResult> {
   const backend = (request.backend ?? DEFAULT_BACKEND).toLowerCase();
-  if (backend === "rivet") return runRivet(request);
-  if (backend === "pi") return runPi(request);
-  return request.harness || request.sandbox ? runRivet(request) : runPi(request);
+  if (backend === "rivet") return runRivet(request, emit, signal);
+  if (backend === "pi") return runPi(request, emit);
+  return request.harness || request.sandbox
+    ? runRivet(request, emit, signal)
+    : runPi(request, emit);
+}
+
+/**
+ * Stream a run as NDJSON: one `{kind:"event"}` line per event the moment it is built, then
+ * exactly one terminal `{kind:"result"}` line (success or failure). Selected by the caller
+ * with `Accept: application/x-ndjson`; the one-shot `/run` path is left untouched.
+ */
+async function runAndStream(
+  req: IncomingMessage,
+  res: ServerResponse,
+  request: AgentRunRequest,
+): Promise<void> {
+  res.writeHead(200, {
+    "content-type": "application/x-ndjson",
+    "cache-control": "no-cache",
+    "x-accel-buffering": "no",
+    connection: "keep-alive",
+  });
+
+  // A client disconnect aborts the in-flight run rather than letting it finish unobserved.
+  // Listen on the response, not the request: the request body is already fully read, so its
+  // `close` can fire early on a keep-alive connection. `res` `close` fires when the response
+  // connection ends — after a normal `res.end()` (harmless: the run is already done) or when
+  // the client drops mid-stream (the case we want to cancel).
+  const controller = new AbortController();
+  res.on("close", () => controller.abort());
+
+  const writeRecord = (record: StreamRecord): void => {
+    if (res.writableEnded || res.destroyed) return; // ended or client gone: skip the write
+    res.write(JSON.stringify(record) + "\n"); // one NDJSON line per record
+  };
+  const emit: EmitEvent = (event) => writeRecord({ kind: "event", event });
+
+  let result: AgentRunResult;
+  try {
+    result = await runAgent(request, emit, controller.signal);
+  } catch (err) {
+    const message = err instanceof Error ? err.stack ?? err.message : String(err);
+    result = { ok: false, error: message };
+  }
+  // Streaming delivered the events live, so don't echo them in the terminal record.
+  writeRecord({ kind: "result", result: { ...result, events: [] } });
+  res.end();
 }
 
 function send(res: ServerResponse, status: number, body: unknown): void {
@@ -63,6 +117,14 @@ const server = createServer(async (req, res) => {
         return send(res, 400, { ok: false, error: `Invalid JSON: ${String(err)}` });
       }
 
+      const wantsStream = (req.headers["accept"] ?? "").includes(
+        "application/x-ndjson",
+      );
+      if (wantsStream) {
+        await runAndStream(req, res, request);
+        return;
+      }
+
       const result = await runAgent(request);
       return send(res, result.ok ? 200 : 500, result);
     }
diff --git a/services/agent/src/tracing/otel.ts b/services/agent/src/tracing/otel.ts
index d1129e7ba9..6f3ddb2848 100644
--- a/services/agent/src/tracing/otel.ts
+++ b/services/agent/src/tracing/otel.ts
@@ -52,7 +52,7 @@ import type {
 import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
 import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
 
-import type { AgentEvent, AgentUsage } from "../protocol.ts";
+import type { AgentEvent, AgentUsage, EmitEvent } from "../protocol.ts";
 
 // ---------------------------------------------------------------------------
 // Shared, process-wide tracing infrastructure
@@ -669,6 +669,14 @@ export interface RivetOtelInit extends Partial<RunConfig> {
    * the reply text and builds no spans, so the two do not double up.
    */
   emitSpans?: boolean;
+  /**
+   * Live event sink. When set, each `AgentEvent` is flushed here the moment it is built
+   * (in addition to being recorded in `events[]`), and the text/reasoning blocks are
+   * emitted as `*_start`/`*_delta`/`*_end` lifecycle events rather than coalesced at the
+   * end. When unset (the one-shot path), only the coalesced `message`/`thought` land in
+   * `events[]`. This split is what keeps a delta'd block from being re-sent in full.
+   */
+  emit?: EmitEvent;
 }
 
 export interface RivetOtel {
@@ -711,10 +719,83 @@ export function createRivetOtel(init: RivetOtelInit): RivetOtel {
   let llmSpan: Span | undefined;
   let runTraceId: string | undefined;
   let accumulated = "";
+  let reasoningAccumulated = "";
   let usage: AgentUsage | undefined;
   const events: AgentEvent[] = [];
   const toolSpans = new Map<string, { span: Span; name: string }>();
 
+  // Live emission. `record` is the single choke point for every event: it appends to the
+  // result log and, on the streaming path, flushes the event the moment it is built — so
+  // the live order is byte-identical to `events[]`. A sink failure never aborts the run.
+  const sink = init.emit;
+  function record(event: AgentEvent): void {
+    events.push(event); // always logged, streaming or not
+    if (!sink) return;
+    try {
+      sink(event);
+    } catch {
+      // Best effort: a failing downstream sink must never break the agent run.
+      // The event is already in `events[]`, so the result log stays complete.
+    }
+  }
+
+  // Text/reasoning block lifecycle (streaming path only). At most one block of each kind is
+  // open; each gets a stable, monotonic id. `*Emitted` tracks the total text delivered as
+  // deltas across the whole run (NOT per block) — `accumulated` is run-long, so the next
+  // delta is always its remainder. Block boundaries (a tool call between two text runs) only
+  // insert start/end markers; they must not reset `*Emitted`, or the second block would
+  // re-emit the first block's text.
+  let textBlockId: string | undefined;
+  let textEmitted = "";
+  let anyTextDelta = false;
+  let reasoningBlockId: string | undefined;
+  let reasoningEmitted = "";
+  let blockSeq = 0;
+  const nextId = (prefix: string): string => `${prefix}-${blockSeq++}`;
+
+  function closeText(): void {
+    if (textBlockId === undefined) return;
+    record({ type: "message_end", id: textBlockId });
+    textBlockId = undefined;
+  }
+
+  function closeReasoning(): void {
+    if (reasoningBlockId === undefined) return;
+    record({ type: "reasoning_end", id: reasoningBlockId });
+    reasoningBlockId = undefined;
+  }
+
+  /** Open (if needed) the assistant text block and emit the pure delta up to `target`. */
+  function streamText(target: string): void {
+    closeReasoning(); // text and reasoning blocks never overlap
+    // `target` normally extends what was already sent; otherwise all of it is new text.
+    const extendsSent = target.startsWith(textEmitted);
+    const delta = extendsSent ? target.slice(textEmitted.length) : target;
+    if (!delta) return;
+    if (textBlockId === undefined) {
+      textBlockId = nextId("msg");
+      record({ type: "message_start", id: textBlockId });
+    }
+    record({ type: "message_delta", id: textBlockId, delta });
+    textEmitted = extendsSent ? target : textEmitted + delta;
+    anyTextDelta = true;
+  }
+
+  /** Open (if needed) the reasoning block and emit the pure delta up to `target`. */
+  function streamReasoning(target: string): void {
+    closeText(); // reasoning and text blocks never overlap
+    // `target` normally extends what was already sent; otherwise all of it is new text.
+    const extendsSent = target.startsWith(reasoningEmitted);
+    const delta = extendsSent ? target.slice(reasoningEmitted.length) : target;
+    if (!delta) return;
+    if (reasoningBlockId === undefined) {
+      reasoningBlockId = nextId("reason");
+      record({ type: "reasoning_start", id: reasoningBlockId });
+    }
+    record({ type: "reasoning_delta", id: reasoningBlockId, delta });
+    reasoningEmitted = extendsSent ? target : reasoningEmitted + delta;
+  }
+
   function start(input: { prompt?: string; messages?: any[]; sessionId?: string }): void {
     // Span-less mode (harness self-instruments): only track the trace id so the run can
     // report it; the harness emits the spans under the propagated parent.
@@ -767,6 +848,19 @@ export function createRivetOtel(init: RivetOtelInit): RivetOtel {
       // Replace when a chunk is a superset of what we have, append otherwise.
       if (t.startsWith(accumulated)) accumulated = t;
       else accumulated += t;
+      // Live deltas run independent of span emission (text, not a span), so they flow even
+      // when the harness self-instruments (emitSpans=false). `accumulated` is the cumulative
+      // text, so the pure delta is its tail past what we already sent.
+      if (sink) streamText(accumulated);
+      return;
+    }
+
+    if (kind === "agent_thought_chunk") {
+      const t = acpBlockText(update.content);
+      if (!t) return;
+      if (t.startsWith(reasoningAccumulated)) reasoningAccumulated = t;
+      else reasoningAccumulated += t;
+      if (sink) streamReasoning(reasoningAccumulated);
       return;
     }
 
@@ -775,6 +869,10 @@ export function createRivetOtel(init: RivetOtelInit): RivetOtel {
     if (kind === "tool_call") {
       const id = update.toolCallId;
       if (!id || !turnCtx) return;
+      // A tool call ends any open text/reasoning block (keeps streamed block boundaries
+      // clean across text -> tool -> text interleaving). No-op on the one-shot path.
+      closeText();
+      closeReasoning();
       const name = update.title || update.kind || "tool";
       const span = tracer.startSpan(`execute_tool ${name}`, undefined, turnCtx);
       span.setAttribute("openinference.span.kind", "TOOL");
@@ -784,7 +882,7 @@ export function createRivetOtel(init: RivetOtelInit): RivetOtel {
       if (update.rawInput != null)
         setInputs(span, update.rawInput as Record<string, unknown>, capture);
       toolSpans.set(id, { span, name: String(name) });
-      events.push({ type: "tool_call", id: String(id), name: String(name), input: update.rawInput });
+      record({ type: "tool_call", id: String(id), name: String(name), input: update.rawInput });
       // A tool_call can arrive already completed (status set up front).
       maybeCloseTool(id, update);
       return;
@@ -807,7 +905,7 @@ export function createRivetOtel(init: RivetOtelInit): RivetOtel {
         total: typeof total === "number" ? total : usage?.total ?? 0,
         cost: typeof cost === "number" ? cost : usage?.cost ?? 0,
       };
-      events.push({ type: "usage", ...usage });
+      record({ type: "usage", ...usage });
     }
   }
 
@@ -823,15 +921,32 @@ export function createRivetOtel(init: RivetOtelInit): RivetOtel {
     if (status === "failed") entry.span.setStatus({ code: SpanStatusCode.ERROR });
     entry.span.end();
     toolSpans.delete(id);
-    events.push({ type: "tool_result", id, output: out, isError: status === "failed" });
+    record({ type: "tool_result", id, output: out, isError: status === "failed" });
   }
 
   function finish(): string {
     const text = stripStartupBanner(accumulated.trim());
-    // The event log is independent of span emission, so build its tail either way: the
-    // final assistant message, then the terminal done marker.
-    if (text) events.push({ type: "message", text });
-    events.push({ type: "done" });
+    // The event log is independent of span emission, so build its tail either way.
+    closeText();
+    closeReasoning();
+    if (sink) {
+      // Streaming path: the block deltas were already flushed, so do NOT re-emit the
+      // coalesced message (that would double it). If the harness produced no token deltas
+      // at all but there is text, synthesize a minimal start/delta/end so the consumer
+      // always sees one uniform block shape regardless of harness streaming support.
+      if (text && !anyTextDelta) {
+        const id = nextId("msg");
+        record({ type: "message_start", id });
+        record({ type: "message_delta", id, delta: text });
+        record({ type: "message_end", id });
+      }
+    } else {
+      // One-shot path: coalesced events only (no per-token granularity to recover).
+      if (text) record({ type: "message", text });
+      const reasoning = reasoningAccumulated.trim();
+      if (reasoning) record({ type: "thought", text: reasoning });
+    }
+    record({ type: "done" });
     if (!emitSpans) return text;
     if (llmSpan) {
       emitMessages(
diff --git a/services/agent/test/stream-events.test.ts b/services/agent/test/stream-events.test.ts
new file mode 100644
index 0000000000..9e6c905cfe
--- /dev/null
+++ b/services/agent/test/stream-events.test.ts
@@ -0,0 +1,124 @@
+/**
+ * Unit test for the createRivetOtel delta/lifecycle state machine.
+ *
+ * Drives `handleUpdate` with a hand-built ACP `session/update` sequence (Claude-style
+ * cumulative text snapshots, a tool call between two text runs, a reasoning run) and asserts
+ * the streaming and one-shot event shapes. No harness, no network: spans are built offline
+ * and never flushed.
+ *
+ * Run: pnpm exec tsx test/stream-events.test.ts
+ */
+import assert from "node:assert/strict";
+
+import { createRivetOtel } from "../src/tracing/otel.ts";
+import type { AgentEvent } from "../src/protocol.ts";
+
+const textChunk = (text: string) => ({
+  sessionUpdate: "agent_message_chunk",
+  content: { type: "text", text },
+});
+const thoughtChunk = (text: string) => ({
+  sessionUpdate: "agent_thought_chunk",
+  content: { type: "text", text },
+});
+const toolCall = (id: string, title: string, rawInput: unknown) => ({
+  sessionUpdate: "tool_call",
+  toolCallId: id,
+  title,
+  rawInput,
+});
+const toolDone = (id: string, text: string) => ({
+  sessionUpdate: "tool_call_update",
+  toolCallId: id,
+  status: "completed",
+  content: [{ content: { type: "text", text } }],
+});
+const usage = () => ({ sessionUpdate: "usage_update", used: 100, cost: { amount: 0.01 } });
+
+// The same ACP sequence drives both modes: two text runs around a tool call, then reasoning.
+function drive(run: ReturnType<typeof createRivetOtel>): void {
+  run.start({ prompt: "weather in Paris?" });
+  run.handleUpdate(textChunk("Hello ")); // pure delta
+  run.handleUpdate(textChunk("Hello world")); // cumulative snapshot (Claude-style)
+  run.handleUpdate(toolCall("call_1", "getWeather", { city: "Paris" }));
+  run.handleUpdate(toolDone("call_1", "sunny"));
+  run.handleUpdate(textChunk("Hello world It is sunny.")); // resumes after the tool
+  run.handleUpdate(thoughtChunk("thinking..."));
+  run.handleUpdate(usage());
+}
+
+const types = (events: AgentEvent[]) => events.map((e) => e.type);
+const ofType = <T extends AgentEvent["type"]>(events: AgentEvent[], t: T) =>
+  events.filter((e) => e.type === t) as Extract<AgentEvent, { type: T }>[];
+
+// --- Scenario 1: streaming (emit set) ---------------------------------------
+{
+  const emitted: AgentEvent[] = [];
+  const run = createRivetOtel({ harness: "claude", model: "anthropic/x", emit: (e) => emitted.push(e) });
+  drive(run);
+  const finalText = run.finish();
+
+  // No coalesced text events on the streaming path.
+  assert.equal(ofType(emitted, "message").length, 0, "no coalesced message when streaming");
+  assert.equal(ofType(emitted, "thought").length, 0, "no coalesced thought when streaming");
+
+  // Exactly one terminal done.
+  assert.equal(ofType(emitted, "done").length, 1, "exactly one done");
+
+  // Two text blocks (split by the tool call), one reasoning block, balanced start/end.
+  const mStart = ofType(emitted, "message_start");
+  const mEnd = ofType(emitted, "message_end");
+  assert.equal(mStart.length, 2, "two message_start");
+  assert.equal(mEnd.length, 2, "two message_end");
+  assert.deepEqual(mStart.map((e) => e.id), ["msg-0", "msg-1"], "stable monotonic text ids");
+  const rStart = ofType(emitted, "reasoning_start");
+  const rEnd = ofType(emitted, "reasoning_end");
+  assert.equal(rStart.length, 1, "one reasoning_start");
+  assert.equal(rEnd.length, 1, "one reasoning_end");
+
+  // Deltas are pure and reconstruct the full text, with no overlap/repeat.
+  const text = ofType(emitted, "message_delta").map((e) => e.delta).join("");
+  assert.equal(text, "Hello world It is sunny.", "concatenated deltas == full text");
+  assert.equal(text, finalText, "deltas match finish() output");
+  const reasoning = ofType(emitted, "reasoning_delta").map((e) => e.delta).join("");
+  assert.equal(reasoning, "thinking...", "concatenated reasoning deltas");
+
+  // Ordering invariant: each block's start precedes its deltas, which precede its end; the
+  // tool result lands before the second text block opens.
+  const seq = types(emitted);
+  assert.ok(seq.indexOf("message_end") < seq.indexOf("tool_call"), "first text block closes before the tool call");
+  assert.ok(seq.indexOf("tool_result") < seq.lastIndexOf("message_start"), "tool result precedes the second text block");
+  for (const id of ["msg-0", "msg-1", "reason-2"]) {
+    const idxs = emitted
+      .map((e, i) => ((e as any).id === id ? { i, t: e.type } : null))
+      .filter(Boolean) as { i: number; t: string }[];
+    assert.ok(idxs[0].t.endsWith("_start"), `${id} starts with *_start`);
+    assert.ok(idxs[idxs.length - 1].t.endsWith("_end"), `${id} ends with *_end`);
+  }
+}
+
+// --- Scenario 2: one-shot (no emit) -----------------------------------------
+{
+  const run = createRivetOtel({ harness: "claude", model: "anthropic/x" });
+  drive(run);
+  const finalText = run.finish();
+  const events = run.events();
+
+  // Coalesced text/thought, no delta lifecycle events.
+  const messages = ofType(events, "message");
+  assert.equal(messages.length, 1, "one coalesced message");
+  assert.equal(messages[0].text, "Hello world It is sunny.", "coalesced text == final");
+  assert.equal(messages[0].text, finalText);
+  assert.equal(ofType(events, "thought").length, 1, "one coalesced thought");
+  for (const t of ["message_start", "message_delta", "message_end", "reasoning_start", "reasoning_delta", "reasoning_end"]) {
+    assert.equal(events.filter((e) => e.type === t).length, 0, `no ${t} on the one-shot path`);
+  }
+
+  // The structured tool/usage events are still present, with exactly one done.
+  assert.equal(ofType(events, "tool_call").length, 1, "tool_call present");
+  assert.equal(ofType(events, "tool_result").length, 1, "tool_result present");
+  assert.equal(ofType(events, "usage").length, 1, "usage present");
+  assert.equal(ofType(events, "done").length, 1, "exactly one done");
+}
+
+console.log("stream-events.test.ts: all assertions passed");
diff --git a/services/oss/src/agent/__init__.py b/services/oss/src/agent/__init__.py
index 1701c8f1bd..8a1b875183 100644
--- a/services/oss/src/agent/__init__.py
+++ b/services/oss/src/agent/__init__.py
@@ -1,10 +1,11 @@
 """The Agenta agent workflow app and its glue.
 
-The handler and harness wiring are in ``app``; request parsing in ``inputs``; tool
-resolution in ``tools``; provider secrets in ``secrets``; trace/usage glue in ``tracing``;
-the ``/inspect`` schemas in ``schemas``; the file-backed defaults in ``config``. The
-engine-agnostic runtime (the harness/environment seams and adapters) lives in
-``oss.src.harness``.
+The handler and backend wiring are in ``app``; tool resolution in ``tools``; provider
+secrets in ``secrets``; trace/usage glue in ``tracing``; the ``/inspect`` schemas in
+``schemas``; the file-backed defaults in ``config``. The engine-agnostic runtime (the
+backend/environment/harness ports and their adapters) lives in the SDK at
+``agenta.sdk.agents``; this package is the thin Agenta integration that feeds it resolved
+tools, vault secrets, and a trace context.
 """
 
 from oss.src.agent.app import agent_v0_app, create_agent_app
diff --git a/services/oss/src/agent/app.py b/services/oss/src/agent/app.py
index aee738fbb1..bcb0594aa4 100644
--- a/services/oss/src/agent/app.py
+++ b/services/oss/src/agent/app.py
@@ -1,14 +1,15 @@
-"""Agent workflow app: the ``/invoke`` handler and how it wires a harness run.
+"""Agent workflow app: the ``/invoke`` handler, wired onto the SDK agent runtime.
 
 Mirrors the chat/completion services: an Agenta app exposing ``/invoke`` and ``/inspect``
 through ``ag.create_app`` + ``ag.workflow`` + ``ag.route``. The handler parses the request
-(``inputs``), resolves tools (``tools``) and provider secrets (``secrets``), threads the
-trace context (``tracing``), runs one turn through an :class:`AgentSession` on the
-engine-agnostic runtime (``oss.src.harness``), and records the run's usage.
-
-The engine (rivet over ACP vs the legacy in-process Pi path) and the transport (HTTP
-sidecar vs subprocess) are deployment choices; the harness, sandbox, and permission policy
-are editable playground config.
+into a neutral ``AgentConfig`` + ``RunSelection`` (``agenta.sdk.agents``), resolves tools
+(``tools``) and provider secrets (``secrets``) server-side, threads the trace context
+(``tracing``), then runs one turn through a :class:`Harness` over a backend it picks from
+the selection, and records the run's usage.
+
+The backend (rivet over ACP vs the in-process Pi path) and the transport (HTTP sidecar vs
+subprocess) are deployment choices; the harness, sandbox, and permission policy are editable
+playground config.
 """
 
 import os
@@ -16,50 +17,58 @@
 
 import agenta as ag
 
+from agenta.sdk.agents import (
+    AgentConfig,
+    Backend,
+    Environment,
+    InProcessPiBackend,
+    RivetBackend,
+    RunSelection,
+    SessionConfig,
+    make_harness,
+    to_messages,
+)
+
 from oss.src.agent.config import load_config, wrapper_dir
-from oss.src.agent.inputs import resolve_agent_config, to_messages
 from oss.src.agent.schemas import AGENT_SCHEMAS
 from oss.src.agent.secrets import resolve_harness_secrets
 from oss.src.agent.tools import resolve_tools
 from oss.src.agent.tracing import record_usage, trace_context
-from oss.src.harness import (
-    Harness,
-    HttpHarness,
-    LocalEnvironment,
-    SessionConfig,
-    SubprocessHarness,
-)
 
 
-def select_backend(harness_id: str, sandbox_id: str) -> str:
-    """Choose the engine (``rivet`` or ``pi``) for a run.
-
-    ``rivet`` drives a harness over ACP via a rivet daemon; ``pi`` is the legacy in-process
-    Pi path. The legacy path only runs the ``pi`` harness locally, so any other harness or
-    sandbox forces ``rivet`` rather than silently dropping the selection.
-    ``AGENTA_AGENT_RUNTIME=rivet`` forces rivet for everything.
-    """
-    runtime = os.getenv("AGENTA_AGENT_RUNTIME", "pi").lower()
-    if runtime == "rivet" or harness_id != "pi" or sandbox_id != "local":
-        return "rivet"
-    return "pi"
+def _default_agent_config() -> AgentConfig:
+    """The service's file defaults (AGENTS.md, model, tools) as a neutral AgentConfig."""
+    file_cfg = load_config()
+    return AgentConfig(
+        instructions=file_cfg.agents_md,
+        model=file_cfg.model,
+        tools=file_cfg.tools,
+    )
 
 
-def build_harness(backend: str) -> Harness:
-    """Pick the transport to the TypeScript runner for the current deployment.
+def select_backend(selection: RunSelection) -> Backend:
+    """Pick the backend for a run.
 
-    ``AGENTA_AGENT_PI_URL`` set (docker) -> call the sidecar over HTTP; unset (local) ->
-    spawn the runner as a subprocess. ``backend`` (the engine) is chosen by
-    :func:`select_backend`.
+    The in-process Pi backend runs Pi locally, and the Agenta harness is Pi with an opinion,
+    so both ``pi`` and ``agenta`` stay on it. Any other harness, a non-local sandbox, or
+    ``AGENTA_AGENT_RUNTIME=rivet`` selects the rivet backend instead of silently dropping the
+    choice (``agenta`` is not yet supported on the rivet path, so ``agenta`` + a non-local
+    sandbox raises ``UnsupportedHarnessError`` rather than running the wrong thing). The
+    transport to the TypeScript runner is a deployment detail each backend takes:
+    ``AGENTA_AGENT_PI_URL`` set (docker) -> HTTP to the sidecar; unset (local checkout) ->
+    spawn the runner CLI from the wrapper dir.
     """
-    pi_url = os.getenv("AGENTA_AGENT_PI_URL")
-    if pi_url:
-        return HttpHarness(pi_url, backend=backend)
-    return SubprocessHarness(
-        LocalEnvironment(),
-        wrapper_dir=str(wrapper_dir()),
-        backend=backend,
+    runtime = os.getenv("AGENTA_AGENT_RUNTIME", "pi").lower()
+    url = os.getenv("AGENTA_AGENT_PI_URL")
+    cwd = str(wrapper_dir())
+    use_rivet = (
+        runtime == "rivet"
+        or selection.harness not in ("pi", "agenta")
+        or selection.sandbox != "local"
     )
+    if use_rivet:
+        return RivetBackend(sandbox=selection.sandbox, url=url, cwd=cwd)
+    return InProcessPiBackend(url=url, cwd=cwd)
 
 
 async def _agent(
@@ -68,34 +77,36 @@ async def _agent(
     parameters: Optional[Dict] = None,
 ):
     params = parameters or {}
-    cfg = resolve_agent_config(params, load_config())
+
+    agent_config = AgentConfig.from_params(params, defaults=_default_agent_config())
+    selection = RunSelection.from_params(
+        params,
+        default_harness=os.getenv("AGENTA_AGENT_HARNESS", "pi"),
+        default_sandbox=os.getenv("AGENTA_AGENT_SANDBOX", "local"),
+    )
 
     msgs = to_messages(messages or (inputs or {}).get("messages") or [])
-    builtins, custom_tools, tool_callback = await resolve_tools(cfg.tools)
+    builtins, custom_tools, tool_callback = await resolve_tools(agent_config.tools)
 
     session_config = SessionConfig(
-        instructions=cfg.instructions,
-        model=cfg.model,
-        harness=cfg.harness,
-        sandbox=cfg.sandbox,
+        agent=agent_config,
         secrets=await resolve_harness_secrets(),
+        permission_policy=selection.permission_policy,
+        trace=trace_context(),
         builtin_tools=builtins,
         custom_tools=custom_tools,
         tool_callback=tool_callback,
-        permission_policy=cfg.permission_policy,
-        trace=trace_context(),
     )
 
-    # The engine follows the selected harness/sandbox: a claude harness or a daytona
-    # sandbox needs rivet, so the legacy pi path never silently swallows the selection.
-    harness = build_harness(select_backend(cfg.harness, cfg.sandbox))
+    # The harness validates that the chosen backend can drive it. select_backend routes
+    # claude or a non-local sandbox to rivet, but agenta on rivet is rejected (see its
+    # docstring). setup/cleanup own the backend lifecycle; prompt runs one cold turn.
+    harness = make_harness(selection.harness, Environment(select_backend(selection)))
     await harness.setup()
     try:
-        session = harness.create_session(session_config)
-        result = await session.prompt(msgs)
-        await session.destroy()
+        result = await harness.prompt(session_config, msgs)
     finally:
-        await harness.shutdown()
+        await harness.cleanup()
 
     record_usage(result.usage)
     return {"role": "assistant", "content": result.output}
diff --git a/services/oss/src/agent/inputs.py b/services/oss/src/agent/inputs.py
deleted file mode 100644
index 3ba0728fd9..0000000000
--- a/services/oss/src/agent/inputs.py
+++ /dev/null
@@ -1,128 +0,0 @@
-"""Parse the playground/API request into a model, instructions, tools, and messages."""
-
-import os
-from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional, Tuple
-
-from oss.src.agent.config import AgentConfig
-from oss.src.harness.ports import Message
-
-
-@dataclass
-class RunConfig:
-    """The agent config for one run, resolved from the request and the file defaults."""
-
-    instructions: str
-    model: str
-    tools: List[Any] = field(default_factory=list)
-    harness: str = "pi"
-    sandbox: str = "local"
-    permission_policy: str = "auto"
-
-
-def _as_list(raw: Any) -> List[Any]:
-    """Coerce a tools value (a dict, a list, or nothing) into a list."""
-    if isinstance(raw, dict):
-        return [raw]
-    if isinstance(raw, list):
-        return raw
-    return []
-
-
-def resolve_agent_config(params: Dict[str, Any], config: AgentConfig) -> RunConfig:
-    """Resolve the full agent run config from the request parameters.
-
-    Prefers the dedicated ``agent`` config element (the ``agent_config`` control). Falls
-    back to the legacy shape (a ``prompt`` prompt-template plus loose ``harness`` /
-    ``sandbox`` / ``permission_policy`` params) so existing revisions keep working.
-    Unset harness/sandbox fall back to the env defaults.
-    """
-    agent = params.get("agent")
-    if isinstance(agent, dict):
-        return RunConfig(
-            instructions=agent.get("instructions") or config.agents_md,
-            model=agent.get("model") or config.model,
-            tools=_as_list(agent.get("tools")),
-            harness=(
-                agent.get("harness") or os.getenv("AGENTA_AGENT_HARNESS", "pi")
-            ).lower(),
-            sandbox=(
-                agent.get("sandbox") or os.getenv("AGENTA_AGENT_SANDBOX", "local")
-            ).lower(),
-            permission_policy=(agent.get("permission_policy") or "auto").lower(),
-        )
-
-    model, instructions, raw_tools = resolve_run_config(params, config)
-    return RunConfig(
-        instructions=instructions,
-        model=model,
-        tools=_as_list(raw_tools),
-        harness=(
-            params.get("harness") or os.getenv("AGENTA_AGENT_HARNESS", "pi")
-        ).lower(),
-        sandbox=(
-            params.get("sandbox") or os.getenv("AGENTA_AGENT_SANDBOX", "local")
-        ).lower(),
-        permission_policy=(params.get("permission_policy") or "auto").lower(),
-    )
-
-
-def _system_text(messages: Optional[List[Any]]) -> str:
-    """Join the system-message content of a prompt-template into AGENTS.md text."""
-    parts: List[str] = []
-    for message in messages or []:
-        if not isinstance(message, dict) or message.get("role") != "system":
-            continue
-        content = message.get("content")
-        if isinstance(content, str):
-            parts.append(content)
-        elif isinstance(content, list):
-            parts.extend(
-                block.get("text", "")
-                for block in content
-                if isinstance(block, dict) and block.get("type") == "text"
-            )
-    return "\n\n".join(part for part in parts if part)
-
-
-def resolve_run_config(
-    params: Dict[str, Any],
-    config: AgentConfig,
-) -> Tuple[str, str, Any]:
-    """Pull model, instructions, and raw tools from the request parameters.
-
-    Accepts both shapes: the playground's ``prompt`` (a ``prompt-template`` whose system
-    message is the AGENTS.md and whose ``llm_config`` carries model + picker tools) and the
-    flat ``{model, agents_md, tools}`` an API caller may send. Falls back to the service
-    file config for any unset field.
-    """
-    prompt_cfg = params.get("prompt")
-    if isinstance(prompt_cfg, dict):
-        llm_config = prompt_cfg.get("llm_config") or {}
-        model = llm_config.get("model") or config.model
-        agents_md = _system_text(prompt_cfg.get("messages")) or config.agents_md
-        raw_tools = llm_config.get("tools")
-        if raw_tools is None:
-            raw_tools = prompt_cfg.get("tools")
-    else:
-        model = params.get("model") or config.model
-        agents_md = params.get("agents_md") or config.agents_md
-        raw_tools = params.get("tools")
-
-    if raw_tools is None:
-        raw_tools = config.tools
-    return model, agents_md, raw_tools
-
-
-def to_messages(raw: Optional[List[Any]]) -> List[Message]:
-    """Coerce the playground's loose message dicts into :class:`Message` objects.
-
-    The runner picks the latest user turn and replays the rest as context, so we hand it
-    the whole conversation rather than pre-extracting a single prompt.
-    """
-    messages: List[Message] = []
-    for item in raw or []:
-        message = Message.from_raw(item)
-        if message is not None:
-            messages.append(message)
-    return messages
diff --git a/services/oss/src/agent/schemas.py b/services/oss/src/agent/schemas.py
index a5e65de2ee..d165938ba2 100644
--- a/services/oss/src/agent/schemas.py
+++ b/services/oss/src/agent/schemas.py
@@ -81,9 +81,12 @@
         "harness": {
             "type": "string",
             "title": "Harness",
-            "enum": ["pi", "claude"],
+            "enum": ["pi", "claude", "agenta"],
             "default": "pi",
-            "description": "Coding agent engine to drive over ACP (pi or claude).",
+            "description": (
+                "Coding agent to drive: pi, claude, or agenta (pi with Agenta's forced "
+                "skills, tools, and base instructions)."
+            ),
         },
         "sandbox": {
             "type": "string",
diff --git a/services/oss/src/agent/tools.py b/services/oss/src/agent/tools.py
index 5c84581b27..fee9172f1f 100644
--- a/services/oss/src/agent/tools.py
+++ b/services/oss/src/agent/tools.py
@@ -15,7 +15,7 @@
     agenta_api_base,
     request_authorization,
 )
-from oss.src.harness.ports import ToolCallback
+from agenta.sdk.agents import ToolCallback
 
 
 def _parse_gateway_slug(slug: Any) -> Optional[Dict[str, Any]]:
diff --git a/services/oss/src/agent/tracing.py b/services/oss/src/agent/tracing.py
index a64c3c5fab..7069381a35 100644
--- a/services/oss/src/agent/tracing.py
+++ b/services/oss/src/agent/tracing.py
@@ -15,7 +15,7 @@
 from agenta.sdk.engines.tracing.propagation import inject
 from agenta.sdk.utils.logging import get_module_logger
 
-from oss.src.harness.ports import TraceContext
+from agenta.sdk.agents import TraceContext
 
 log = get_module_logger(__name__)
 
diff --git a/services/oss/src/harness/__init__.py b/services/oss/src/harness/__init__.py
deleted file mode 100644
index a6b553c33f..0000000000
--- a/services/oss/src/harness/__init__.py
+++ /dev/null
@@ -1,48 +0,0 @@
-"""Engine-agnostic agent runtime: the harness and environment seams, plus their adapters.
-
-Nothing here is Agenta-specific. The Agenta workflow integration (the ``/invoke`` handler,
-tool resolution, secrets, tracing) lives in ``oss.src.agent``. Two seams (see
-``docs/design/agent-workflows/ports-and-adapters.md``):
-
-- ``Harness``: the agent engine. ``SubprocessHarness`` and ``HttpHarness`` (``transports.py``)
-  reach the TypeScript runner over a subprocess or HTTP. The engine that runs behind them
-  (rivet over ACP, or the legacy in-process Pi path) is an env value, not a class.
-  ``create_session`` returns an :class:`AgentSession` (create / prompt / destroy).
-- ``Environment``: where the harness process runs. ``LocalEnvironment`` runs it as a local
-  subprocess; a sandbox environment is selected inside the rivet runner.
-"""
-
-from .environment import LocalEnvironment
-from .ports import (
-    AgentEvent,
-    AgentRequest,
-    AgentResult,
-    AgentSession,
-    ContentBlock,
-    Environment,
-    Harness,
-    HarnessCapabilities,
-    Message,
-    SessionConfig,
-    ToolCallback,
-    TraceContext,
-)
-from .transports import HttpHarness, SubprocessHarness
-
-__all__ = [
-    "AgentEvent",
-    "AgentRequest",
-    "AgentResult",
-    "AgentSession",
-    "ContentBlock",
-    "Environment",
-    "Harness",
-    "HarnessCapabilities",
-    "HttpHarness",
-    "LocalEnvironment",
-    "Message",
-    "SessionConfig",
-    "SubprocessHarness",
-    "ToolCallback",
-    "TraceContext",
-]
diff --git a/services/oss/src/harness/environment.py b/services/oss/src/harness/environment.py
deleted file mode 100644
index d66889c710..0000000000
--- a/services/oss/src/harness/environment.py
+++ /dev/null
@@ -1,57 +0,0 @@
-"""Local environment: run the harness as a subprocess on this host.
-
-This is the parity baseline. The Node process is the run environment. A sandbox
-environment (Daytona) is selected on the rivet path inside the TypeScript runner, so it
-does not need a separate Python ``Environment`` here.
-"""
-
-from __future__ import annotations
-
-import asyncio
-from typing import Dict, Optional, Sequence
-
-from agenta.sdk.utils.logging import get_module_logger
-
-from .ports import Environment, ExecResult
-
-log = get_module_logger(__name__)
-
-
-class LocalEnvironment(Environment):
-    """Run a command as a subprocess on this host, feeding it the request on stdin."""
-
-    async def exec(
-        self,
-        command: Sequence[str],
-        input_bytes: bytes,
-        *,
-        cwd: Optional[str] = None,
-        env: Optional[Dict[str, str]] = None,
-        timeout: Optional[float] = None,
-    ) -> ExecResult:
-        proc = await asyncio.create_subprocess_exec(
-            *command,
-            cwd=cwd,
-            env=env,
-            stdin=asyncio.subprocess.PIPE,
-            stdout=asyncio.subprocess.PIPE,
-            stderr=asyncio.subprocess.PIPE,
-        )
-
-        try:
-            stdout, stderr = await asyncio.wait_for(
-                proc.communicate(input=input_bytes),
-                timeout=timeout,
-            )
-        except asyncio.TimeoutError:
-            proc.kill()
-            await proc.wait()
-            raise RuntimeError(
-                f"Harness process timed out after {timeout}s: {' '.join(command)}"
-            )
-
-        return ExecResult(
-            code=proc.returncode if proc.returncode is not None else 0,
-            stdout=stdout.decode("utf-8", "replace"),
-            stderr=stderr.decode("utf-8", "replace"),
-        )
diff --git a/services/oss/src/harness/ports.py b/services/oss/src/harness/ports.py
deleted file mode 100644
index 40e57b0174..0000000000
--- a/services/oss/src/harness/ports.py
+++ /dev/null
@@ -1,418 +0,0 @@
-"""Ports for the agent service: the Environment seam and the Harness seam.
-
-These interfaces keep the service environment-agnostic and engine-agnostic. The shapes
-are borrowed from the rivet ``sandbox-agent`` SDK (see
-``docs/design/agent-workflows/ports-and-adapters.md``) but stay ours, so rivet is one
-adapter behind the seam and a non-rivet engine (the legacy in-process Pi path) fits the
-same port.
-
-Two seams:
-
-- ``Environment`` — where the harness process runs. ``LocalEnvironment`` runs it as a
-  subprocess on this host; a sandbox environment runs it elsewhere. This is the "runtime"
-  axis renamed; ``exec`` survives only as the subprocess transport's mechanism.
-- ``Harness`` — the agent engine. One ``invoke`` is one cold run. ``create_session``
-  returns an :class:`AgentSession`, the rivet-shaped abstraction on top: under cold +
-  replay it holds no warm daemon, so continuing a conversation replays the caller-held
-  history into a fresh run.
-
-The engine choice (legacy in-process Pi vs rivet over ACP) is not a Python class. It is an
-env value the transport hands the TypeScript runner, so Python has two transports
-(subprocess, HTTP), not three backend adapters.
-"""
-
-from __future__ import annotations
-
-from abc import ABC, abstractmethod
-from dataclasses import dataclass, field
-from typing import Any, Callable, Dict, List, Optional, Sequence, Union
-
-
-# ---------------------------------------------------------------------------
-# Capabilities
-# ---------------------------------------------------------------------------
-
-
-@dataclass
-class HarnessCapabilities:
-    """What a harness can do, probed by the runtime (rivet ``AgentCapabilities``).
-
-    The runner reports these in the result; the service uses them for observability and
-    for input shaping (for example, do not send image blocks to a harness without
-    ``images``). The branching that used to key off the harness name (``if pi``) now keys
-    off these flags in the TypeScript runner, where the live answer is.
-    """
-
-    text_messages: bool = True
-    images: bool = False
-    file_attachments: bool = False
-    mcp_tools: bool = False
-    tool_calls: bool = False
-    reasoning: bool = False
-    plan_mode: bool = False
-    permissions: bool = False
-    usage: bool = False
-    streaming_deltas: bool = False
-    session_lifecycle: bool = False
-
-    @classmethod
-    def from_wire(
-        cls, data: Optional[Dict[str, Any]]
-    ) -> Optional["HarnessCapabilities"]:
-        """Parse the camelCase capability object the runner returns. ``None`` passes through."""
-        if not isinstance(data, dict):
-            return None
-        return cls(
-            text_messages=bool(data.get("textMessages", True)),
-            images=bool(data.get("images", False)),
-            file_attachments=bool(data.get("fileAttachments", False)),
-            mcp_tools=bool(data.get("mcpTools", False)),
-            tool_calls=bool(data.get("toolCalls", False)),
-            reasoning=bool(data.get("reasoning", False)),
-            plan_mode=bool(data.get("planMode", False)),
-            permissions=bool(data.get("permissions", False)),
-            usage=bool(data.get("usage", False)),
-            streaming_deltas=bool(data.get("streamingDeltas", False)),
-            session_lifecycle=bool(data.get("sessionLifecycle", False)),
-        )
-
-
-# ---------------------------------------------------------------------------
-# Turn input: content blocks and messages
-# ---------------------------------------------------------------------------
-
-
-@dataclass
-class ContentBlock:
-    """One piece of a message, mirroring the ACP content-block kinds.
-
-    ``text`` is the only kind the playground sends today; ``image`` and ``resource`` are
-    plumbed so an image-capable harness can take them once the playground does. A bare
-    string content is normalized to a single ``text`` block on the wire.
-    """
-
-    type: str  # "text" | "image" | "resource"
-    text: Optional[str] = None
-    # image / resource payloads (base64 data or a uri), used when type != "text".
-    data: Optional[str] = None
-    mime_type: Optional[str] = None
-    uri: Optional[str] = None
-
-    def to_wire(self) -> Dict[str, Any]:
-        block: Dict[str, Any] = {"type": self.type}
-        if self.text is not None:
-            block["text"] = self.text
-        if self.data is not None:
-            block["data"] = self.data
-        if self.mime_type is not None:
-            block["mimeType"] = self.mime_type
-        if self.uri is not None:
-            block["uri"] = self.uri
-        return block
-
-    @classmethod
-    def from_raw(cls, raw: Any) -> "ContentBlock":
-        """Coerce a loose block (string or dict) into a ContentBlock."""
-        if isinstance(raw, ContentBlock):
-            return raw
-        if isinstance(raw, str):
-            return cls(type="text", text=raw)
-        if isinstance(raw, dict):
-            return cls(
-                type=str(raw.get("type", "text")),
-                text=raw.get("text"),
-                data=raw.get("data"),
-                mime_type=raw.get("mimeType") or raw.get("mime_type"),
-                uri=raw.get("uri"),
-            )
-        return cls(type="text", text=str(raw))
-
-
-# A message's content is either a plain string or a list of content blocks.
-MessageContent = Union[str, List[ContentBlock]]
-
-
-@dataclass
-class Message:
-    """A chat message in the conversation. ``content`` is text or content blocks."""
-
-    role: str
-    content: MessageContent = ""
-
-    def to_wire(self) -> Dict[str, Any]:
-        if isinstance(self.content, str):
-            content: Any = self.content
-        else:
-            # Tolerate both ContentBlock objects and the raw dicts a caller may pass.
-            content = [
-                block.to_wire() if isinstance(block, ContentBlock) else block
-                for block in self.content
-            ]
-        return {"role": self.role, "content": content}
-
-    @classmethod
-    def from_raw(cls, raw: Any) -> Optional["Message"]:
-        """Coerce a loose dict (the playground's message shape) into a Message.
-
-        List content (ACP-style content blocks) is normalized into ``ContentBlock``
-        objects so the typed-content invariant holds downstream.
-        """
-        if isinstance(raw, Message):
-            return raw
-        if not isinstance(raw, dict) or "role" not in raw:
-            return None
-        content = raw.get("content", "")
-        if isinstance(content, list):
-            content = [ContentBlock.from_raw(block) for block in content]
-        return cls(role=str(raw["role"]), content=content)
-
-
-# ---------------------------------------------------------------------------
-# Run events: the structured stream
-# ---------------------------------------------------------------------------
-
-
-@dataclass
-class AgentEvent:
-    """One structured event from a run, mapped from an ACP ``session/update``.
-
-    ``type`` is one of ``message``, ``thought``, ``tool_call``, ``tool_result``,
-    ``usage``, ``error``, ``done``. ``data`` carries the rest verbatim. The runner returns
-    these as a per-turn log; an ``on_event`` callback can also receive them live.
-    """
-
-    type: str
-    data: Dict[str, Any] = field(default_factory=dict)
-
-    @classmethod
-    def from_wire(cls, raw: Any) -> Optional["AgentEvent"]:
-        if not isinstance(raw, dict) or not raw.get("type"):
-            return None
-        return cls(type=str(raw["type"]), data=raw)
-
-
-# A live event sink. Synchronous: the transports invoke it as events arrive.
-EventSink = Callable[[AgentEvent], None]
-
-
-# ---------------------------------------------------------------------------
-# Trace context and tool callback (cross-boundary plumbing, unchanged shapes)
-# ---------------------------------------------------------------------------
-
-
-@dataclass
-class TraceContext:
-    """Agenta trace context threaded into the harness run.
-
-    Lets the harness nest its spans under the caller's workflow span (same ``trace_id``)
-    and ship them to the same Agenta backend with the same auth, so the agent's whole run
-    becomes part of the ``/invoke`` trace the way completion/chat nest their LLM spans.
-    All fields optional; with none set the harness traces standalone (or not at all).
-    """
-
-    traceparent: Optional[str] = None
-    baggage: Optional[str] = None
-    endpoint: Optional[str] = None  # OTLP traces URL
-    authorization: Optional[str] = None  # full Authorization header value
-    capture_content: bool = True
-
-    def to_wire(self) -> Dict[str, Any]:
-        return {
-            "traceparent": self.traceparent,
-            "baggage": self.baggage,
-            "endpoint": self.endpoint,
-            "authorization": self.authorization,
-            "captureContent": self.capture_content,
-        }
-
-
-@dataclass
-class ToolCallback:
-    """How the harness routes a tool call back through Agenta's ``/tools/call``.
-
-    The backend resolves runnable tool references into specs and hands the harness this
-    callback. The provider key and connection auth never enter the sandbox; they stay
-    behind ``/tools/call``. Same mechanism that threads the OTLP credential.
-    """
-
-    endpoint: str  # full ``/tools/call`` URL
-    authorization: Optional[str] = None  # full Authorization header value
-
-    def to_wire(self) -> Dict[str, Any]:
-        return {"endpoint": self.endpoint, "authorization": self.authorization}
-
-
-# ---------------------------------------------------------------------------
-# Session config, request, result
-# ---------------------------------------------------------------------------
-
-# Permission policy for harness tool use in a headless run. ``auto`` approves (tools are
-# backend-resolved and trusted, no human to prompt); ``deny`` rejects.
-PermissionPolicy = str  # "auto" | "deny"
-
-
-@dataclass
-class SessionConfig:
-    """The agent config bundle for a session: everything but the turn itself.
-
-    Mirrors the rivet session config. ``instructions`` becomes ``AGENTS.md``;
-    ``harness``/``sandbox`` are the two orthogonal swap axes; ``secrets`` are provider keys
-    injected as harness env, never written to the agent filesystem. Skills and hooks are
-    carried as workspace artifacts (not modeled as verbs); they are not built in this pass.
-    """
-
-    instructions: Optional[str] = None  # AGENTS.md text
-    model: Optional[str] = None
-    harness: str = "pi"
-    sandbox: str = "local"
-    session_id: Optional[str] = None
-    secrets: Dict[str, str] = field(default_factory=dict)
-    builtin_tools: List[str] = field(default_factory=list)
-    custom_tools: List[Dict[str, Any]] = field(default_factory=list)
-    tool_callback: Optional[ToolCallback] = None
-    permission_policy: PermissionPolicy = "auto"
-    trace: Optional[TraceContext] = None
-
-
-@dataclass
-class AgentRequest:
-    """One transport call: the session config plus the conversation so far.
-
-    The runner picks the latest user turn and replays the prior turns as context (the
-    cold + replay model). ``messages`` is the full conversation the caller holds.
-    """
-
-    config: SessionConfig
-    messages: List[Message] = field(default_factory=list)
-
-
-@dataclass
-class AgentResult:
-    """The agent's reply plus structured run metadata.
-
-    ``output`` is the final assistant text (the playground renders this). ``messages`` and
-    ``events`` are the structured forms. ``usage`` rolls token/cost onto the workflow span
-    (the harness span tree ships in a separate OTLP batch, so the service stamps the totals
-    itself). ``capabilities`` is what the harness was probed to support this run.
-    """
-
-    output: str = ""
-    messages: List[Message] = field(default_factory=list)
-    events: List[AgentEvent] = field(default_factory=list)
-    usage: Optional[Dict[str, Any]] = None
-    stop_reason: Optional[str] = None
-    capabilities: Optional[HarnessCapabilities] = None
-    session_id: Optional[str] = None
-    model: Optional[str] = None
-    trace_id: Optional[str] = None
-
-
-# ---------------------------------------------------------------------------
-# Environment seam (where the harness process runs)
-# ---------------------------------------------------------------------------
-
-
-@dataclass
-class ExecResult:
-    """Result of running a command through an Environment."""
-
-    code: int
-    stdout: str
-    stderr: str
-
-
-class Environment(ABC):
-    """Where and how the harness process runs.
-
-    ``LocalEnvironment`` runs it as a subprocess on this host. ``exec`` is the subprocess
-    transport's mechanism; the HTTP transport does not use it. ``start``/``dispose`` are
-    lifecycle hooks (no-ops for a local process).
-    """
-
-    async def start(self) -> None:
-        """Bring the environment up (no-op for a local process)."""
-        return None
-
-    async def dispose(self) -> None:
-        """Tear the environment down (no-op for a local process)."""
-        return None
-
-    @abstractmethod
-    async def exec(
-        self,
-        command: Sequence[str],
-        input_bytes: bytes,
-        *,
-        cwd: Optional[str] = None,
-        env: Optional[Dict[str, str]] = None,
-        timeout: Optional[float] = None,
-    ) -> ExecResult:
-        """Run ``command`` in the environment, feeding ``input_bytes`` to stdin."""
-
-
-# ---------------------------------------------------------------------------
-# Harness seam (the agent engine) and the session abstraction
-# ---------------------------------------------------------------------------
-
-
-class Harness(ABC):
-    """The agent engine behind one transport. Rivet and the legacy Pi path are adapters."""
-
-    async def setup(self) -> None:
-        """Prepare the harness for a run (no-op by default)."""
-        return None
-
-    async def shutdown(self) -> None:
-        """Release harness resources (no-op by default)."""
-        return None
-
-    @abstractmethod
-    async def invoke(
-        self,
-        request: AgentRequest,
-        *,
-        on_event: Optional[EventSink] = None,
-    ) -> AgentResult:
-        """Run one cold turn and return the structured result."""
-
-    async def destroy_session(self, session_id: Optional[str]) -> None:
-        """Drop a session's resources. A no-op under cold + replay (nothing is kept warm)."""
-        return None
-
-    def create_session(self, config: SessionConfig) -> "AgentSession":
-        """Open a session for this config. The session is the rivet-shaped abstraction."""
-        return AgentSession(self, config)
-
-
-class AgentSession:
-    """A first-class session over a :class:`Harness`.
-
-    ``create_session(config)`` then ``session.prompt(messages)``. Under cold + replay the
-    session keeps no warm daemon: each ``prompt`` is a fresh ``invoke`` that replays the
-    supplied history. The abstraction is real (and where a future server-side history
-    store slots in); the cold lifecycle is an adapter detail.
-    """
-
-    def __init__(self, harness: Harness, config: SessionConfig) -> None:
-        self._harness = harness
-        self._config = config
-
-    @property
-    def id(self) -> Optional[str]:
-        return self._config.session_id
-
-    async def prompt(
-        self,
-        messages: Sequence[Message],
-        *,
-        on_event: Optional[EventSink] = None,
-    ) -> AgentResult:
-        request = AgentRequest(config=self._config, messages=list(messages))
-        result = await self._harness.invoke(request, on_event=on_event)
-        # Carry the engine's session id forward so a follow-up prompt resumes it.
-        if result.session_id:
-            self._config.session_id = result.session_id
-        return result
-
-    async def destroy(self) -> None:
-        await self._harness.destroy_session(self._config.session_id)
diff --git a/services/oss/src/harness/transports.py b/services/oss/src/harness/transports.py
deleted file mode 100644
index 45c79b0b98..0000000000
--- a/services/oss/src/harness/transports.py
+++ /dev/null
@@ -1,145 +0,0 @@
-"""The two harness transports: subprocess and HTTP.
-
-Both speak the same ``/run`` wire contract (see ``wire.py``) and differ only in how they
-reach the TypeScript runner:
-
-- ``SubprocessHarness`` spawns the TS CLI through an :class:`Environment`, handing it the
-  request on stdin. It sets ``AGENT_BACKEND`` to pick the engine (``rivet`` for the ACP
-  path, ``pi`` for the legacy in-process Pi path).
-- ``HttpHarness`` POSTs to the wrapper running as a sidecar. The sidecar auto-routes to the
-  engine by request shape (a rivet request carries ``harness``/``sandbox``), so the
-  transport itself stays engine-agnostic.
-
-The engine is therefore config, not a Python class. This is what collapsed the old
-``PiHarness`` / ``PiHttpHarness`` / ``RivetHarness`` trio into two transports.
-"""
-
-from __future__ import annotations
-
-import json
-import os
-from typing import List, Optional, Sequence
-
-import httpx
-
-from agenta.sdk.utils.logging import get_module_logger
-
-from .ports import AgentRequest, AgentResult, Environment, EventSink, Harness
-from .wire import request_to_wire, result_from_wire
-
-log = get_module_logger(__name__)
-
-_DEFAULT_TIMEOUT = float(os.getenv("AGENTA_AGENT_TIMEOUT", "180"))
-_DEFAULT_COMMAND = ["pnpm", "exec", "tsx", "src/cli.ts"]
-
-
-def _emit_events(result: AgentResult, on_event: Optional[EventSink]) -> None:
-    """Replay the result's event log to a live sink.
-
-    The one-shot transports receive the whole run at once, so events arrive as a batch
-    rather than live. Firing them here keeps the ``on_event`` API working; true streaming
-    (NDJSON over ``/run``) is a documented follow-on.
-    """
-    if not on_event:
-        return
-    for event in result.events:
-        try:
-            on_event(event)
-        except Exception:  # pylint: disable=broad-except
-            log.warning("agent: on_event sink raised", exc_info=True)
-
-
-class SubprocessHarness(Harness):
-    """Drive the TS runner as a subprocess on this host, request on stdin.
-
-    ``backend`` selects the engine via ``AGENT_BACKEND`` (``rivet`` or ``pi``).
-    """
-
-    def __init__(
-        self,
-        environment: Environment,
-        *,
-        wrapper_dir: str,
-        backend: str = "rivet",
-        command: Optional[Sequence[str]] = None,
-        timeout: float = _DEFAULT_TIMEOUT,
-    ) -> None:
-        self._environment = environment
-        self._wrapper_dir = wrapper_dir
-        self._backend = backend
-        self._command: List[str] = list(command or _DEFAULT_COMMAND)
-        self._timeout = timeout
-
-    async def setup(self) -> None:
-        await self._environment.start()
-
-    async def shutdown(self) -> None:
-        await self._environment.dispose()
-
-    async def invoke(
-        self,
-        request: AgentRequest,
-        *,
-        on_event: Optional[EventSink] = None,
-    ) -> AgentResult:
-        wire = request_to_wire(request)
-        wire["backend"] = self._backend
-        payload = json.dumps(wire).encode("utf-8")
-        exec_result = await self._environment.exec(
-            self._command,
-            payload,
-            cwd=self._wrapper_dir,
-            env={**os.environ, "AGENT_BACKEND": self._backend},
-            timeout=self._timeout,
-        )
-
-        if not exec_result.stdout.strip():
-            raise RuntimeError(
-                "Agent runner returned no output. "
-                f"exit={exec_result.code} stderr={exec_result.stderr[-2000:]}"
-            )
-        try:
-            data = json.loads(exec_result.stdout)
-        except json.JSONDecodeError as exc:
-            raise RuntimeError(
-                "Agent runner returned invalid JSON. "
-                f"stdout={exec_result.stdout[:500]} stderr={exec_result.stderr[-1000:]}"
-            ) from exc
-
-        result = result_from_wire(data)
-        _emit_events(result, on_event)
-        return result
-
-
-class HttpHarness(Harness):
-    """Drive the TS runner over HTTP (the sidecar). The sidecar picks the engine."""
-
-    def __init__(
-        self,
-        base_url: str,
-        *,
-        backend: str = "rivet",
-        timeout: float = _DEFAULT_TIMEOUT,
-    ) -> None:
-        self._base_url = base_url.rstrip("/")
-        self._backend = backend
-        self._timeout = timeout
-
-    async def invoke(
-        self,
-        request: AgentRequest,
-        *,
-        on_event: Optional[EventSink] = None,
-    ) -> AgentResult:
-        payload = request_to_wire(request)
-        payload["backend"] = self._backend
-        async with httpx.AsyncClient(timeout=self._timeout) as client:
-            response = await client.post(f"{self._base_url}/run", json=payload)
-        if response.status_code >= 500:
-            raise RuntimeError(
-                f"Agent runner HTTP {response.status_code}: {response.text[:1000]}"
-            )
-
-        result = result_from_wire(response.json())
-        _emit_events(result, on_event)
-        return result
diff --git a/services/oss/src/harness/wire.py b/services/oss/src/harness/wire.py
deleted file mode 100644
index 3fddee5324..0000000000
--- a/services/oss/src/harness/wire.py
+++ /dev/null
@@ -1,73 +0,0 @@
-"""The ``/run`` wire contract, in one place.
-
-Every transport (subprocess, HTTP) sends the same camelCase JSON to the TypeScript runner
-and parses the same result back, so the wire shape lives here rather than being rebuilt in
-each adapter. The TypeScript side mirrors these names in ``services/agent/src/protocol.ts``.
-"""
-
-from __future__ import annotations
-
-from typing import Any, Dict, List
-
-from .ports import (
-    AgentEvent,
-    AgentRequest,
-    AgentResult,
-    HarnessCapabilities,
-    Message,
-)
-
-
-def request_to_wire(request: AgentRequest) -> Dict[str, Any]:
-    """Serialize an :class:`AgentRequest` to the ``/run`` request JSON."""
-    config = request.config
-    return {
-        "harness": config.harness,
-        "sandbox": config.sandbox,
-        "sessionId": config.session_id,
-        "agentsMd": config.instructions,
-        "model": config.model,
-        "messages": [message.to_wire() for message in request.messages],
-        "secrets": config.secrets or {},
-        "tools": config.builtin_tools,
-        "customTools": config.custom_tools,
-        "toolCallback": config.tool_callback.to_wire()
-        if config.tool_callback
-        else None,
-        "permissionPolicy": config.permission_policy,
-        "trace": config.trace.to_wire() if config.trace else None,
-    }
-
-
-def result_from_wire(data: Dict[str, Any]) -> AgentResult:
-    """Parse a ``/run`` result JSON into an :class:`AgentResult`.
-
-    Raises ``RuntimeError`` when the runner reported a failure, so the invoke surfaces a
-    clear message rather than handing the model an empty reply.
-    """
-    if not data.get("ok"):
-        raise RuntimeError(f"Agent run failed: {data.get('error')}")
-
-    messages: List[Message] = []
-    for raw in data.get("messages") or []:
-        message = Message.from_raw(raw)
-        if message is not None:
-            messages.append(message)
-
-    events: List[AgentEvent] = []
-    for raw in data.get("events") or []:
-        event = AgentEvent.from_wire(raw)
-        if event is not None:
-            events.append(event)
-
-    return AgentResult(
-        output=data.get("output", "") or "",
-        messages=messages,
-        events=events,
-        usage=data.get("usage"),
-        stop_reason=data.get("stopReason"),
-        capabilities=HarnessCapabilities.from_wire(data.get("capabilities")),
-        session_id=data.get("sessionId"),
-        model=data.get("model"),
-        trace_id=data.get("traceId"),
-    )
diff --git a/services/oss/tests/pytest/unit/agent/__init__.py b/services/oss/tests/pytest/unit/agent/__init__.py
new file mode 100644
index 0000000000..5da5a3fd3b
--- /dev/null
+++ b/services/oss/tests/pytest/unit/agent/__init__.py
@@ -0,0 +1 @@
+# Unit tests for the agent workflow service (oss.src.agent).
diff --git a/services/oss/tests/pytest/unit/agent/conftest.py b/services/oss/tests/pytest/unit/agent/conftest.py
new file mode 100644
index 0000000000..7664bf9c90
--- /dev/null
+++ b/services/oss/tests/pytest/unit/agent/conftest.py
@@ -0,0 +1,99 @@
+"""Fakes for the agent service unit tests.
+
+A local, minimal ``FakeBackend`` (≈ the SDK's) so the ``/invoke`` handler can run end-to-end
+in-process with no runner, no LLM, and no network. It implements the real ``Backend`` /
+``Sandbox`` / ``Session`` ports, so the port contract keeps it honest across the two suites.
+
+This conftest is scoped to ``unit/agent/`` so the handler tests do not pull the acceptance
+suite's account / live-API fixtures from the services root conftest.
+"""
+
+from __future__ import annotations
+
+from typing import Dict, Mapping, Optional, Sequence
+
+import pytest
+
+from agenta.sdk.agents import AgentResult, HarnessType
+from agenta.sdk.agents.interfaces import Backend, Sandbox, Session
+from agenta.sdk.agents.streaming import AgentRun
+
+
+class _FakeSandbox(Sandbox):
+    def __init__(self) -> None:
+        self.files: Dict[str, bytes] = {}
+        self.destroyed = False
+
+    async def add_files(self, files: Mapping[str, bytes]) -> None:
+        self.files.update(files)
+
+    async def destroy(self) -> None:
+        self.destroyed = True
+
+
+class _FakeSession(Session):
+    def __init__(self, result: AgentResult) -> None:
+        self._result = result
+        self.destroyed = False
+
+    @property
+    def id(self) -> Optional[str]:
+        return self._result.session_id
+
+    async def prompt(self, messages, *, on_event=None) -> AgentResult:
+        return self._result
+
+    def stream(self, messages) -> AgentRun:
+        result = self._result
+
+        async def _records():
+            yield {
+                "kind": "result",
+                "result": {"ok": True, "output": result.output},
+            }
+
+        return AgentRun(_records())
+
+    async def destroy(self) -> None:
+        self.destroyed = True
+
+
+class FakeBackend(Backend):
+    """Echoes a fixed result, regardless of harness. Records lifecycle for assertions."""
+
+    def __init__(
+        self,
+        *,
+        result: Optional[AgentResult] = None,
+        supported: Sequence[HarnessType] = (
+            HarnessType.PI,
+            HarnessType.CLAUDE,
+            HarnessType.AGENTA,
+        ),
+    ) -> None:
+        self.supported_harnesses = frozenset(supported)
+        self._result = result if result is not None else AgentResult(output="echo")
+        self.setup_calls = 0
+        self.shutdown_calls = 0
+
+    async def setup(self) -> None:
+        self.setup_calls += 1
+
+    async def shutdown(self) -> None:
+        self.shutdown_calls += 1
+
+    async def create_sandbox(self) -> _FakeSandbox:
+        return _FakeSandbox()
+
+    async def create_session(
+        self, sandbox, config, *, harness, secrets=None, trace=None, session_id=None
+    ) -> _FakeSession:
+        return _FakeSession(self._result)
+
+
+@pytest.fixture
+def fake_backend():
+    def _make(**kwargs) -> FakeBackend:
+        return FakeBackend(**kwargs)
+
+    return _make
diff --git a/services/oss/tests/pytest/unit/agent/test_invoke_handler.py b/services/oss/tests/pytest/unit/agent/test_invoke_handler.py
new file mode 100644
index 0000000000..d32386cd6b
--- /dev/null
+++ b/services/oss/tests/pytest/unit/agent/test_invoke_handler.py
@@ -0,0 +1,71 @@
+"""The ``/invoke`` handler (`_agent`) end-to-end in-process.
+
+Runs the real parse -> resolve -> harness -> record path with a ``FakeBackend`` and the
+network-touching helpers stubbed. No runner, no LLM, no HTTP. This is where the cross-harness
+"byte-identical response body" guarantee is locked at the Python layer.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from agenta.sdk.agents import AgentConfig, AgentResult
+
+from oss.src.agent import app
+
+
+@pytest.fixture
+def patched(monkeypatch, fake_backend):
+    backend = fake_backend(result=AgentResult(output="echo", usage={"total": 15}))
+    recorded = {}
+
+    async def _no_tools(_tools):
+        return [], [], None
+
+    async def _no_secrets():
+        return {}
+
+    monkeypatch.setattr(app, "resolve_tools", _no_tools)
+    monkeypatch.setattr(app, "resolve_harness_secrets", _no_secrets)
+    monkeypatch.setattr(app, "trace_context", lambda: None)
+    monkeypatch.setattr(
+        app, "record_usage", lambda usage: recorded.__setitem__("usage", usage)
+    )
+    monkeypatch.setattr(app, "select_backend", lambda selection: backend)
+    monkeypatch.setattr(
+        app, "_default_agent_config", lambda: AgentConfig(instructions="x", model="m")
+    )
+    return backend, recorded
+
+
+async def _invoke(harness="pi"):
+    return await app._agent(
+        messages=[{"role": "user", "content": "hi"}],
+        parameters={"agent": {"harness": harness}},
+    )
+
+
+async def test_invoke_returns_assistant_message(patched):
+    assert await _invoke("pi") == {"role": "assistant", "content": "echo"}
+
+
+async def test_invoke_records_usage(patched):
+    _, recorded = patched
+    await _invoke("pi")
+    assert recorded["usage"] == {"total": 15}
+
+
+async def test_invoke_runs_backend_lifecycle(patched):
+    backend, _ = patched
+    await _invoke("pi")
+    assert backend.setup_calls == 1
+    assert backend.shutdown_calls == 1  # cleanup() tears the backend down
+
+
+async def test_invoke_body_is_identical_across_harnesses(patched):
+    # The same turn against the same (echoing) backend must produce a byte-identical body
+    # whether routed as pi, agenta, or claude. This is the design's cross-harness guarantee.
+    pi = await _invoke("pi")
+    agenta = await _invoke("agenta")
+    claude = await _invoke("claude")
+    assert pi == agenta == claude
diff --git a/services/oss/tests/pytest/unit/agent/test_secrets_mapping.py b/services/oss/tests/pytest/unit/agent/test_secrets_mapping.py
new file mode 100644
index 0000000000..54104e3bb0
--- /dev/null
+++ b/services/oss/tests/pytest/unit/agent/test_secrets_mapping.py
@@ -0,0 +1,24 @@
+"""Provider-key -> harness env-var mapping.
+
+The harness authenticates with the project's vault provider keys, injected as the env vars
+each provider's SDK reads. If a name here drifts from what the harness expects, auth fails
+silently and the run falls back to login/OAuth, so the table is worth a guard.
+"""
+
+from __future__ import annotations
+
+from oss.src.agent.secrets import _PROVIDER_ENV_VARS
+
+
+def test_standard_providers_map_to_expected_env_vars():
+    assert _PROVIDER_ENV_VARS["openai"] == "OPENAI_API_KEY"
+    assert _PROVIDER_ENV_VARS["anthropic"] == "ANTHROPIC_API_KEY"
+    assert _PROVIDER_ENV_VARS["gemini"] == "GEMINI_API_KEY"
+    assert _PROVIDER_ENV_VARS["groq"] == "GROQ_API_KEY"
+    assert _PROVIDER_ENV_VARS["together_ai"] == "TOGETHERAI_API_KEY"
+    assert _PROVIDER_ENV_VARS["openrouter"] == "OPENROUTER_API_KEY"
+
+
+def test_both_mistral_spellings_share_one_env_var():
+    assert _PROVIDER_ENV_VARS["mistral"] == "MISTRAL_API_KEY"
+    assert _PROVIDER_ENV_VARS["mistralai"] == "MISTRAL_API_KEY"
diff --git a/services/oss/tests/pytest/unit/agent/test_select_backend.py b/services/oss/tests/pytest/unit/agent/test_select_backend.py
new file mode 100644
index 0000000000..e998428d99
--- /dev/null
+++ b/services/oss/tests/pytest/unit/agent/test_select_backend.py
@@ -0,0 +1,61 @@
+"""``select_backend``: the engine-routing decision.
+
+The harness and sandbox are orthogonal playground choices; this locks how they (plus the
+``AGENTA_AGENT_RUNTIME`` deployment override) map to an engine. ``pi`` and ``agenta`` stay on
+the in-process Pi backend locally; anything else routes to rivet. The transport (HTTP sidecar
+vs subprocess) follows ``AGENTA_AGENT_PI_URL``.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from agenta.sdk.agents import InProcessPiBackend, RivetBackend, RunSelection
+
+from oss.src.agent.app import select_backend
+
+
+@pytest.fixture(autouse=True)
+def _clean_env(monkeypatch):
+    # Start every case from a known-empty deployment environment.
+    monkeypatch.delenv("AGENTA_AGENT_RUNTIME", raising=False)
+    monkeypatch.delenv("AGENTA_AGENT_PI_URL", raising=False)
+
+
+def _sel(harness="pi", sandbox="local"):
+    return RunSelection(harness=harness, sandbox=sandbox)
+
+
+def test_pi_local_uses_in_process():
+    assert isinstance(select_backend(_sel("pi", "local")), InProcessPiBackend)
+
+
+def test_agenta_local_uses_in_process():
+    # Agenta is Pi with an opinion, so it stays on the in-process Pi backend.
+    assert isinstance(select_backend(_sel("agenta", "local")), InProcessPiBackend)
+
+
+def test_claude_routes_to_rivet():
+    assert isinstance(select_backend(_sel("claude", "local")), RivetBackend)
+
+
+def test_non_local_sandbox_routes_to_rivet():
+    backend = select_backend(_sel("pi", "daytona"))
+    assert isinstance(backend, RivetBackend)
+    assert backend._sandbox == "daytona"  # the sandbox axis is threaded through
+
+
+def test_runtime_override_forces_rivet(monkeypatch):
+    monkeypatch.setenv("AGENTA_AGENT_RUNTIME", "rivet")
+    assert isinstance(select_backend(_sel("pi", "local")), RivetBackend)
+
+
+def test_pi_url_selects_http_transport(monkeypatch):
+    monkeypatch.setenv("AGENTA_AGENT_PI_URL", "http://agent-pi:8765")
+    backend = select_backend(_sel("pi", "local"))
+    assert backend._url == "http://agent-pi:8765"
+
+
+def test_no_pi_url_uses_subprocess_transport():
+    # Unset URL means the backend will spawn the runner CLI rather than POST to a sidecar.
+    assert select_backend(_sel("pi", "local"))._url is None
diff --git a/services/oss/tests/pytest/unit/agent/test_tool_refs.py b/services/oss/tests/pytest/unit/agent/test_tool_refs.py
new file mode 100644
index 0000000000..4f51399c19
--- /dev/null
+++ b/services/oss/tests/pytest/unit/agent/test_tool_refs.py
@@ -0,0 +1,75 @@
+"""Tool-reference normalization: the playground's loose tool entries -> resolver refs.
+
+``resolve_tools`` posts these refs to the backend resolver. Getting the discrimination wrong
+(a Composio action read as a built-in, or vice versa) silently drops or misroutes a tool, so
+these pure parsers are worth pinning.
+"""
+
+from __future__ import annotations
+
+from oss.src.agent.tools import _normalize_tool_ref, _parse_gateway_slug
+
+_SLUG = "tools__composio__github__GET_THE_AUTHENTICATED_USER__github-tvn"
+
+
+def test_parse_gateway_slug_underscore_form():
+    assert _parse_gateway_slug(_SLUG) == {
+        "type": "composio",
+        "integration": "github",
+        "action": "GET_THE_AUTHENTICATED_USER",
+        "connection": "github-tvn",
+    }
+
+
+def test_parse_gateway_slug_dot_form():
+    assert _parse_gateway_slug("tools.composio.slack.SEND_MESSAGE.conn-1") == {
+        "type": "composio",
+        "integration": "slack",
+        "action": "SEND_MESSAGE",
+        "connection": "conn-1",
+    }
+
+
+def test_parse_gateway_slug_rejects_non_matching():
+    assert _parse_gateway_slug("tools__composio__too__few") is None  # 4 segments
+    assert _parse_gateway_slug("tools__other__a__b__c") is None  # not composio
+    assert _parse_gateway_slug(123) is None  # not a string
+    assert _parse_gateway_slug(None) is None
+
+
+def test_normalize_bare_string_is_builtin():
+    assert _normalize_tool_ref("read") == {"type": "builtin", "name": "read"}
+
+
+def test_normalize_typed_dict_passes_through():
+    composio = {
+        "type": "composio",
+        "integration": "x",
+        "action": "y",
+        "connection": "z",
+    }
+    assert _normalize_tool_ref(composio) is composio
+    builtin = {"type": "builtin", "name": "read"}
+    assert _normalize_tool_ref(builtin) is builtin
+
+
+def test_normalize_picker_gateway_entry_becomes_composio():
+    ref = {"function": {"name": _SLUG}}
+    assert _normalize_tool_ref(ref) == {
+        "type": "composio",
+        "integration": "github",
+        "action": "GET_THE_AUTHENTICATED_USER",
+        "connection": "github-tvn",
+    }
+
+
+def test_normalize_untyped_name_is_builtin_unless_it_is_a_slug():
+    assert _normalize_tool_ref({"name": "grep"}) == {"type": "builtin", "name": "grep"}
+    # A name that is itself a gateway slug resolves to composio.
+    assert _normalize_tool_ref({"name": _SLUG})["type"] == "composio"
+
+
+def test_normalize_unsupported_entries_are_dropped():
+    assert _normalize_tool_ref({"foo": "bar"}) is None  # no type, no usable name
+    assert _normalize_tool_ref(123) is None
+    assert _normalize_tool_ref(None) is None