diff --git a/.gitignore b/.gitignore index 48e7b32614..6c91758e28 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,8 @@ **/*dont_commit_me* web/packages/agenta-api-client/dist/ web/tsconfig.tsbuildinfo +# Agent Pi extension bundle, built by `pnpm run build:extension` and in the Docker image. +services/agent/dist/ __pycache__/ **/__pycache__/ diff --git a/api/oss/src/apis/fastapi/tools/models.py b/api/oss/src/apis/fastapi/tools/models.py index 891b276c22..768574f23c 100644 --- a/api/oss/src/apis/fastapi/tools/models.py +++ b/api/oss/src/apis/fastapi/tools/models.py @@ -15,6 +15,9 @@ ToolConnectionCreate, # Tool Calls ToolResult, + # Agent tools + AgentToolReference, + ResolvedAgentTool, ) @@ -87,3 +90,18 @@ class ToolConnectionsResponse(BaseModel): class ToolCallResponse(BaseModel): call: ToolResult + + +# --------------------------------------------------------------------------- +# Agent tool resolution +# --------------------------------------------------------------------------- + + +class ToolResolveRequest(BaseModel): + tools: List[AgentToolReference] = [] + + +class ToolResolveResponse(BaseModel): + count: int = 0 + builtins: List[str] = [] + custom: List[ResolvedAgentTool] = [] diff --git a/api/oss/src/apis/fastapi/tools/router.py b/api/oss/src/apis/fastapi/tools/router.py index 043d114fa7..3cc689a055 100644 --- a/api/oss/src/apis/fastapi/tools/router.py +++ b/api/oss/src/apis/fastapi/tools/router.py @@ -29,6 +29,9 @@ ToolConnectionsResponse, # ToolCallResponse, + # + ToolResolveRequest, + ToolResolveResponse, ) from oss.src.core.shared.dtos import Status @@ -42,10 +45,12 @@ ToolResultData, ) from oss.src.core.tools.exceptions import ( + ActionNotFoundError, AdapterError, ConnectionInactiveError, ConnectionInvalidError, ConnectionNotFoundError, + ToolSlugInvalidError, ) from oss.src.core.tools.service import ( ToolsService, @@ -208,6 +213,14 @@ def __init__( ) # --- Tool operations --- + self.router.add_api_route( + "/resolve", + self.resolve_tools, + 
methods=["POST"], + operation_id="resolve_agent_tools", + response_model=ToolResolveResponse, + response_model_exclude_none=True, + ) self.router.add_api_route( "/call", self.call_tool, @@ -886,6 +899,51 @@ async def callback_connection( # Tool Calls # ----------------------------------------------------------------------- + @intercept_exceptions() + @handle_adapter_exceptions() + async def resolve_tools( + self, + request: Request, + *, + body: ToolResolveRequest, + ) -> ToolResolveResponse: + """Resolve an agent's tool references into model-ready specs. + + Validates Composio connections up front and enriches each action from the + catalog, so a running agent (e.g. Pi) gets ``customTools`` whose ``execute`` + routes back through ``POST /tools/call`` — provider keys stay server-side. + """ + if is_ee(): + has_permission = await check_action_access( + user_uid=request.state.user_id, + project_id=request.state.project_id, + permission=Permission.VIEW_TOOLS, + ) + if not has_permission: + raise FORBIDDEN_EXCEPTION + + try: + resolution = await self.tools_service.resolve_agent_tools( + project_id=UUID(request.state.project_id), + tools=body.tools, + ) + except ConnectionNotFoundError as e: + raise HTTPException(status_code=404, detail=e.message) from e + except ConnectionInactiveError as e: + raise HTTPException(status_code=400, detail=e.message) from e + except ConnectionInvalidError as e: + raise HTTPException(status_code=400, detail=e.message) from e + except ToolSlugInvalidError as e: + raise HTTPException(status_code=400, detail=e.message) from e + except ActionNotFoundError as e: + raise HTTPException(status_code=404, detail=e.message) from e + + return ToolResolveResponse( + count=len(resolution.builtins) + len(resolution.custom), + builtins=resolution.builtins, + custom=resolution.custom, + ) + @intercept_exceptions() @handle_adapter_exceptions() async def call_tool( @@ -931,39 +989,12 @@ async def call_tool( connection_slug = slug_parts[4] try: - connections 
= await self.tools_service.query_connections( + connection = await self.tools_service.resolve_connection_by_slug( project_id=UUID(request.state.project_id), provider_key=provider_key, integration_key=integration_key, + connection_slug=connection_slug, ) - - connection = next( - (c for c in connections if c.slug == connection_slug), None - ) - - if not connection: - raise ConnectionNotFoundError( - connection_slug=connection_slug, - provider_key=provider_key, - integration_key=integration_key, - ) - - if not connection.is_active: - raise ConnectionInactiveError(connection_id=connection_slug) - - if not connection.is_valid: - raise ConnectionInvalidError( - connection_slug=connection_slug, - detail="Please refresh the connection.", - ) - - if not connection.provider_connection_id: - raise ConnectionNotFoundError( - connection_slug=connection_slug, - provider_key=provider_key, - integration_key=integration_key, - ) - except ConnectionNotFoundError as e: raise HTTPException(status_code=404, detail=e.message) from e except ConnectionInactiveError as e: diff --git a/api/oss/src/core/tools/dtos.py b/api/oss/src/core/tools/dtos.py index a588965f61..3c3f0ec53e 100644 --- a/api/oss/src/core/tools/dtos.py +++ b/api/oss/src/core/tools/dtos.py @@ -1,8 +1,8 @@ from enum import Enum -from typing import Any, Dict, List, Optional +from typing import Annotated, Any, Dict, List, Literal, Optional, Union from agenta.sdk.models.workflows import JsonSchemas -from pydantic import BaseModel +from pydantic import BaseModel, Field from oss.src.core.shared.dtos import ( Header, @@ -238,3 +238,61 @@ class ToolExecutionResponse(BaseModel): data: Optional[Json] = None error: Optional[str] = None successful: bool = False + + +# --------------------------------------------------------------------------- +# Agent tools (config references + resolution) +# --------------------------------------------------------------------------- + +# A provider-agnostic list of tool references lives under an agent 
revision's +# ``parameters["tools"]``. Each entry is a discriminated union on ``type``: config +# holds references and display metadata only, never secrets. The backend resolves +# them into model-ready specs at invoke time (see ToolsService.resolve_agent_tools). + + +class AgentBuiltinTool(BaseModel): + """A Pi built-in tool, referenced by name (e.g. ``read``, ``bash``).""" + + type: Literal["builtin"] = "builtin" + name: str + + +class AgentComposioTool(BaseModel): + """A Composio action, carrying the slug segments ``/tools/call`` parses.""" + + type: Literal["composio"] = "composio" + integration: str + action: str + connection: str + # Function name shown to the model. Defaults to ``{integration}__{action}``. + name: Optional[str] = None + + +AgentToolReference = Annotated[ + Union[AgentBuiltinTool, AgentComposioTool], + Field(discriminator="type"), +] + + +class ResolvedAgentTool(BaseModel): + """A runnable reference resolved into a model-ready tool spec. + + ``call_ref`` is the ``tools.{provider}.{integration}.{action}.{connection}`` slug + the execution bridge sends back to ``POST /tools/call``. + """ + + name: str + description: Optional[str] = None + input_schema: Optional[Dict[str, Any]] = None + call_ref: str + + +class AgentToolsResolution(BaseModel): + """Outcome of resolving an agent's ``tools`` list. + + ``builtins`` pass straight into Pi's ``tools: string[]``; ``custom`` become Pi + ``customTools`` whose ``execute`` routes through ``/tools/call``. 
+ """ + + builtins: List[str] = [] + custom: List[ResolvedAgentTool] = [] diff --git a/api/oss/src/core/tools/exceptions.py b/api/oss/src/core/tools/exceptions.py index f46c08b6cd..e9dbd54f3f 100644 --- a/api/oss/src/core/tools/exceptions.py +++ b/api/oss/src/core/tools/exceptions.py @@ -40,6 +40,24 @@ def __init__( super().__init__(msg) +class ActionNotFoundError(ToolsError): + """Raised when a catalog action cannot be found for an integration.""" + + def __init__( + self, + *, + provider_key: str, + integration_key: str, + action_key: str, + ): + self.provider_key = provider_key + self.integration_key = integration_key + self.action_key = action_key + super().__init__( + f"Action not found: {provider_key}/{integration_key}/{action_key}" + ) + + class ConnectionSlugConflictError(ToolsError): """Raised when a connection slug already exists for the integration.""" diff --git a/api/oss/src/core/tools/service.py b/api/oss/src/core/tools/service.py index f603bc4d42..a9e1e4c779 100644 --- a/api/oss/src/core/tools/service.py +++ b/api/oss/src/core/tools/service.py @@ -1,3 +1,4 @@ +import re from typing import Any, Dict, List, Optional, Tuple from uuid import UUID @@ -6,6 +7,11 @@ from oss.src.core.tools.utils import make_oauth_state from oss.src.core.tools.dtos import ( + AgentBuiltinTool, + AgentComposioTool, + AgentToolReference, + AgentToolsResolution, + ResolvedAgentTool, ToolCatalogAction, ToolCatalogActionDetails, ToolCatalogIntegration, @@ -15,17 +21,27 @@ ToolConnectionRequest, ToolExecutionRequest, ToolExecutionResponse, + ToolProviderKind, ) from oss.src.core.tools.interfaces import ( ToolsDAOInterface, ) from oss.src.core.tools.registry import ToolsGatewayRegistry from oss.src.core.tools.exceptions import ( + ActionNotFoundError, ConnectionInactiveError, + ConnectionInvalidError, ConnectionNotFoundError, + ToolSlugInvalidError, ) +# A slug segment is safe for the ``tools.{provider}.{integration}.{action}.{connection}`` +# call-ref. 
``__`` is forbidden because ``/tools/call`` round-trips ``__`` <-> ``.`` when +# parsing function names, so a ``__`` inside a segment would corrupt the split. +_SLUG_SEGMENT_RE = re.compile(r"^[a-zA-Z0-9-]+(?:_[a-zA-Z0-9-]+)*$") + + log = get_module_logger(__name__) @@ -408,3 +424,146 @@ async def execute_tool( arguments=arguments, ), ) + + # ----------------------------------------------------------------------- + # Connection resolution (shared by the call endpoint and the agent resolver) + # ----------------------------------------------------------------------- + + async def resolve_connection_by_slug( + self, + *, + project_id: UUID, + provider_key: str, + integration_key: str, + connection_slug: str, + ) -> ToolConnection: + """Resolve a project-scoped connection slug to a usable connection row. + + Raises a domain exception when the connection is missing, inactive, invalid, + or never finished its provider handshake. Shared by ``call_tool`` (execution) + and ``resolve_agent_tools`` (up-front validation). + """ + # Query all (not active-only) so an inactive connection yields a precise + # "inactive" error instead of an indistinguishable "not found". 
+ connections = await self.query_connections( + project_id=project_id, + provider_key=provider_key, + integration_key=integration_key, + is_active=None, + ) + + connection = next( + (c for c in connections if c.slug == connection_slug), + None, + ) + + if not connection: + raise ConnectionNotFoundError( + provider_key=provider_key, + integration_key=integration_key, + connection_slug=connection_slug, + ) + + if not connection.is_active: + raise ConnectionInactiveError(connection_id=connection_slug) + + if not connection.is_valid: + raise ConnectionInvalidError( + connection_slug=connection_slug, + detail="Please refresh the connection.", + ) + + if not connection.provider_connection_id: + raise ConnectionNotFoundError( + provider_key=provider_key, + integration_key=integration_key, + connection_slug=connection_slug, + ) + + return connection + + # ----------------------------------------------------------------------- + # Agent tool resolution + # ----------------------------------------------------------------------- + + async def resolve_agent_tools( + self, + *, + project_id: UUID, + tools: List[AgentToolReference], + ) -> AgentToolsResolution: + """Resolve an agent's tool references into model-ready specs. + + ``builtin`` references pass through as names. ``composio`` references are + validated against the project's connections up front and enriched from the + catalog (description + input schema), so the model never sees a stale schema + and the invoke fails fast on a missing/invalid connection rather than mid-loop. 
+ """ + builtins: List[str] = [] + custom: List[ResolvedAgentTool] = [] + + for ref in tools: + if isinstance(ref, AgentBuiltinTool): + if ref.name: + builtins.append(ref.name) + continue + + if isinstance(ref, AgentComposioTool): + custom.append( + await self._resolve_composio_tool( + project_id=project_id, + ref=ref, + ) + ) + + return AgentToolsResolution(builtins=builtins, custom=custom) + + async def _resolve_composio_tool( + self, + *, + project_id: UUID, + ref: AgentComposioTool, + ) -> ResolvedAgentTool: + provider_key = ToolProviderKind.COMPOSIO.value + + for segment in (ref.integration, ref.action, ref.connection): + if not _SLUG_SEGMENT_RE.match(segment): + raise ToolSlugInvalidError( + slug=f"{provider_key}.{ref.integration}.{ref.action}.{ref.connection}", + detail=f"Invalid slug segment: {segment!r}", + ) + + # Fail fast if the connection is missing/inactive/invalid for this project. + await self.resolve_connection_by_slug( + project_id=project_id, + provider_key=provider_key, + integration_key=ref.integration, + connection_slug=ref.connection, + ) + + action = await self.get_action( + provider_key=provider_key, + integration_key=ref.integration, + action_key=ref.action, + ) + if not action: + raise ActionNotFoundError( + provider_key=provider_key, + integration_key=ref.integration, + action_key=ref.action, + ) + + input_schema = ( + action.schemas.inputs if action.schemas and action.schemas.inputs else None + ) + name = ref.name or f"{ref.integration}__{ref.action}" + call_ref = ( + f"tools.{provider_key}.{ref.integration}.{ref.action}.{ref.connection}" + ) + + return ResolvedAgentTool( + name=name, + description=action.description, + input_schema=input_schema, + call_ref=call_ref, + ) diff --git a/docs/design/agent-workflows/README.md b/docs/design/agent-workflows/README.md new file mode 100644 index 0000000000..90a12c2b2c --- /dev/null +++ b/docs/design/agent-workflows/README.md @@ -0,0 +1,53 @@ +# Agent workflows + +This folder documents a proof of 
concept: running a coding agent as an Agenta workflow. + +Agenta runs prompt workflows today (completion, chat, the LLM judge). Each calls a model +once and returns one answer. An agent is different. It runs a loop, calls tools across many +turns, and returns a final answer. This PoC adds the agent as a new workflow type behind the +same `/invoke` contract, traced into the same spans, configured from the same playground. + +It proves one specific claim: that the **agent** and the **place it runs** are both config, +not code. You change a dropdown to swap Pi for Claude Code, or local for a Daytona cloud +sandbox, and nothing above the seam changes. + +## Read in this order + +1. **[Architecture](architecture.md)**. How a request flows from the playground to the model + and back: the relay of programs, the two containers, and the vocabulary. Start here. +2. **[Ports and adapters](ports-and-adapters.md)**. The ports that keep the relay swappable: + the backend, environment, and harness layers, where they live in the SDK, the wire + contract, and how the service picks a backend. +3. **[Sessions](sessions.md)**. How a multi-turn conversation holds together today (cold + replay), and the two paths open to us tomorrow. +4. **[The Pi adapter](adapters/pi.md)**. The default harness, which traces itself and takes + tools natively through a Pi extension. +5. **[The Claude Code adapter](adapters/claude-code.md)**. The second harness, which proves + the swap and is the template for any MCP-capable agent. +6. **[The Agenta harness](adapters/agenta.md)**. Pi with an opinion: forced skills, forced + tools, and a base AGENTS.md preamble the author's instructions are appended to. + +## What this PoC includes and defers + +It includes the agent workflow behind `/invoke`, two harnesses (Pi and Claude Code), two +sandboxes (local and Daytona), backend-resolved tools that keep credentials server-side, and +tracing that nests the agent's run under the caller's span. 
+ +It defers the things a production rollout will need: a warm daemon and server-owned session +storage (see [Sessions](sessions.md)), live streaming to the client over the HTTP edge, the +multi-tenant filesystem jail for a shared daemon, and registering the agent as a first-class +backend workflow type with its own builtin URI. Each is called out where it belongs. + +The first two of those, streaming and server-owned sessions, have a proposed design: +[Streaming and sessions](streaming-and-sessions.md) for the rationale and trade-offs, and +the [Agent protocol RFC](agent-protocol-rfc.md) for the normative spec of the endpoints and +the wire format. They add a new `POST /messages` endpoint (Vercel-AI-format SSE stream, an +optional `session_id`, and `UIMessage` inputs) plus a `load-session` endpoint, sitting next +to the existing `/invoke`, which is unchanged. + +## The `scratch/` folder + +`scratch/` holds the raw working material from the build: the original work-package folders +(WP-1 through WP-8), the port redesign notes, the research write-ups, and the proof-of-concept +spikes. The pages above supersede it. It stays for history and for the running POC code, and +it is not meant to be read as the design. diff --git a/docs/design/agent-workflows/adapters/agenta.md b/docs/design/agent-workflows/adapters/agenta.md new file mode 100644 index 0000000000..ca9fd4ea77 --- /dev/null +++ b/docs/design/agent-workflows/adapters/agenta.md @@ -0,0 +1,64 @@ +# The Agenta harness + +`AgentaHarness` is Pi with an opinion. It runs on the same engine as the [Pi +adapter](pi.md) and produces a Pi-shaped config, so it inherits everything Pi does (native +tools, the system-prompt layers, tracing). What it adds is a fixed set of Agenta-shipped +extras that the agent author cannot turn off: + +- **Forced tools** — always unioned into the agent's resolved tools. 
At minimum `read` + (Pi only renders the skills section when `read` is enabled) and `bash` (so skills can run + their helper scripts). +- **Forced skills** — Agenta-shipped Pi skills loaded on every run. +- **A base AGENTS.md preamble** — the author's `instructions` are appended after it. +- **A base persona** — forced onto Pi's `append_system`, with any author-supplied + `append_system` appended after it. + +Read the [architecture](../architecture.md), [ports and adapters](../ports-and-adapters.md), +and [Pi adapter](pi.md) pages first. This page assumes them. + +## Where the forced bits live + +The forced *policy* lives in the SDK harness layer, in one editable module: +`sdks/python/agenta/sdk/agents/adapters/agenta_builtins.py` (`AGENTA_PREAMBLE`, +`AGENTA_FORCED_APPEND_SYSTEM`, `AGENTA_FORCED_TOOLS`, `AGENTA_FORCED_SKILLS`). `AgentaHarness` +(`adapters/harnesses.py`) reads them in `_to_harness_config` and layers them onto the neutral +`SessionConfig`, exactly where `PiHarness` and `ClaudeHarness` do their own translation. + +The forced skill *files* live with the runner that runs Pi, under +`services/agent/skills/<name>/` (each a directory with a `SKILL.md`). Skills are real files on +disk because they reference relative scripts and assets, so they cannot ride the wire as +text. The contract between the two halves is the skill **name**: `AGENTA_FORCED_SKILLS` lists +names, and each must match a committed directory under the runner's skills root. + +## How a skill reaches the model + +1. `AgentaHarness._to_harness_config` puts the forced skill names on the `skills` field of + the `/run` request (`AgentaAgentConfig.wire_tools`). +2. The in-process Pi engine (`engines/pi.ts`) resolves each name against its bundled + `skills/` root (override with `AGENTA_AGENT_SKILLS_DIR`) and passes the directories to Pi's + `DefaultResourceLoader` as `additionalSkillPaths`, with `noSkills: true` so only the + bundled skills load (the run stays hermetic, like `noContextFiles`). +3. 
Pi loads them, and because the forced `read` tool is enabled, surfaces them in the system + prompt. The model reads a skill's `SKILL.md` on demand (progressive disclosure). + +## Two prompt layers, kept distinct + +This follows Pi's own split (see `PiAgentConfig`): the **persona** ("who the agent is") +belongs in `append_system`, and **project conventions** belong in `AGENTS.md`. So the Agenta +persona is a forced `append_system`, while the Agenta base preamble plus the author's +instructions are the `AGENTS.md`. An author's own `system` / `append_system` (via +`AgentConfig.harness_options["pi"]`) still apply, layered after the forced persona. + +## Selecting it + +`agenta` is a harness option alongside `pi` and `claude` (the playground dropdown, the +`harness` field). It runs on the in-process Pi backend (`InProcessPiBackend` now lists +`HarnessType.AGENTA` as supported), so `select_backend` keeps `agenta` on the local Pi path. + +## Deferred + +Only the in-process Pi (local) path is wired. The ACP/rivet path (and therefore the Daytona +sandbox) does not yet deliver the forced skills — it would teach `runRivet` to read the +`skills` field and lay the bundled skill directories into the sandbox via the existing +bundled-file provisioning. Until then, `agenta` with a non-local sandbox raises +`UnsupportedHarnessError` rather than silently running without its skills. diff --git a/docs/design/agent-workflows/adapters/claude-code.md b/docs/design/agent-workflows/adapters/claude-code.md new file mode 100644 index 0000000000..64a4e3c96c --- /dev/null +++ b/docs/design/agent-workflows/adapters/claude-code.md @@ -0,0 +1,95 @@ +# The Claude Code adapter + +Claude Code is the second harness. It proves the central claim of this PoC: that swapping +the agent is one config value. Where the [Pi adapter](pi.md) does much of its work inside Pi +through an extension, Claude does its work through standard ACP. 
That makes Claude the +template for any MCP-capable harness rivet can drive. + +Read the [architecture](../architecture.md) and [ports and adapters](../ports-and-adapters.md) +pages first. + +## Running Claude + +The daemon resolves the harness id `claude` to the `claude-agent-acp` adapter, which starts +the `claude` CLI. One operational detail is worth calling out, because it caused a real bug. +The daemon does not ship the `claude` CLI. It downloads it over HTTPS the first time a run +asks for Claude. The sidecar image is a slim Node image with no root certificates, so that +HTTPS download failed until we added `ca-certificates` to the image. With the certs in +place, the download verifies and Claude runs. + +Auth is config, like everything else. Claude authenticates with `ANTHROPIC_API_KEY` from the +project vault when present, or with an OAuth token (`CLAUDE_CODE_OAUTH_TOKEN`) otherwise. The +runner turns the common failures into one clear line, so a user sees "add the project's +Anthropic key" rather than a stack trace. + +## Tools over MCP + +Claude advertises the `mcpTools` capability, so the runner delivers tools to Claude the +standard ACP way, over MCP. This is the branch that the [capability probe](../ports-and-adapters.md) +chooses: deliver over MCP when the harness reports `mcpTools`, not when the harness name is +something in particular. + +The mechanism is a small stdio MCP server (`tools/mcp-server.ts`) that the daemon launches +and attaches to the session. Its tool bodies POST back to Agenta's `/tools/call` with the +same WP-7 envelope the Pi path uses. The resolved specs and the callback endpoint reach the +MCP server through its environment, so nothing tool-specific is written to a file the agent +can read. The safety property is identical to Pi's: the provider key and the connection auth +stay server-side, and the agent only ever asks Agenta to run a named tool. + +## Permissions + +Claude gates tool use behind a permission prompt. 
In an Agenta run there is no human at the +keyboard to answer it, so the runner answers for it. By default it auto-approves, because the +tools are backend-resolved and trusted. The per-run permission policy (or an env override) +can flip this to deny, which rejects tool use instead. This is handled on +`session.onPermissionRequest`, a hook Pi does not need because Pi does not gate tools this +way. + +## Tracing from the event stream + +Claude does not self-instrument the way Pi does, because we do not load an Agenta extension +into Claude. So the runner builds the trace itself, from the ACP event stream. It subscribes +to the session's `session/update` notifications and turns them into the same span tree Pi +produces: + +``` +invoke_agent (AGENT) + turn 0 (CHAIN) + chat (LLM) + execute_tool (TOOL) one per ACP tool_call +``` + +This is the general path. Any harness rivet drives that does not bring its own +instrumentation gets traced this way. Pi is the exception that traces itself; Claude is the +rule. + +## Usage and output + +Claude reports usage in two places, so the runner reads both. The per-call input and output +token split rides on the ACP `PromptResponse`, and the cost rides on the `usage_update` +event. The runner combines them into the run total, which then rolls onto the workflow span +the same way Pi's writeback total does. + +Output needs one small piece of care. Claude streams text deltas and also periodically +streams a full cumulative snapshot of the message so far. If the runner naively appended +everything, the answer would double. The runner detects a snapshot (a chunk that is a +superset of what it already has) and replaces rather than appends, so the final text is +correct whether a chunk is a delta or a snapshot. + +## Models + +Claude ignores a model id meant for another provider. Ask it for `gpt-5.5` and it keeps its +own default. 
The runner handles this honestly: when the harness does not accept the requested +model, the chat span is labelled `chat` rather than falsely claiming a model the run did not +use. + +## What Claude demonstrates + +Claude is the proof that the seam works. Adding it took a `ClaudeHarness` (which holds its +Pi-versus-Claude config mapping) and no change to the workflow handler above the ports; the +same `RivetBackend` drives it. It also exercises the capability-driven branches the design is +built on: tools over MCP because it reports `mcpTools`, a permission answer because it gates +tools, and event-stream tracing because it does not self-instrument. A future harness that +rivet can drive would reuse this exact path. A future harness that rivet cannot drive would +instead get its own backend beside `RivetBackend` and `InProcessPiBackend`, behind the same +`/run` contract. diff --git a/docs/design/agent-workflows/adapters/pi.md b/docs/design/agent-workflows/adapters/pi.md new file mode 100644 index 0000000000..abcd9ced87 --- /dev/null +++ b/docs/design/agent-workflows/adapters/pi.md @@ -0,0 +1,167 @@ +# The Pi adapter + +Pi is the default harness. This page explains how we run it, how it gets its tools, and how +it traces itself. Pi is the richer of the two adapters because Pi has an extension API we +can use, so much of the work happens inside Pi rather than around it. + +Read the [architecture](../architecture.md) and [ports and adapters](../ports-and-adapters.md) +pages first. This page assumes the relay and the wire contract. + +## Two ways Pi runs + +Pi runs through one of two engines, both behind the same port: + +- **Over ACP, through rivet** (`engines/rivet.ts` with `harness: pi`). This is the main + path and the one the rest of this page describes. The rivet daemon starts the `pi-acp` + adapter, which starts the `pi` CLI. +- **In-process** (`engines/pi.ts`). This drives the Pi SDK directly inside the sidecar, with + no daemon, no adapter, and no ACP. 
It is the simplest local path and a fallback. The last + section covers it. + +## The ACP path: pi-acp plus a bundled extension + +On the ACP path, the daemon resolves the harness id `pi` to the `pi-acp` adapter. One detail +matters: `pi-acp` does not bundle Pi. It spawns the `pi` CLI from `PATH`, so the runner +points it at our pinned `pi` binary (`PI_ACP_PI_COMMAND`) and puts our `node_modules/.bin` +on the daemon's `PATH`. + +The interesting part is what we load into Pi. We ship a single **Pi extension** +(`extensions/agenta.ts`, bundled to `dist/extensions/agenta.js`) and install it into Pi's +agent directory. Pi loads it on every run. This one extension does two jobs: it delivers our +tools the Pi-native way, and it traces the run. Both are driven entirely by environment +variables, so the extension stays inert when none are set and is safe to install globally. + +## Tools, the Pi-native way + +Pi 0.79.4 does not support MCP. So we do not deliver tools over MCP to Pi. Instead the +extension reads the resolved tool specs from `AGENTA_TOOL_SPECS` and registers each one with +Pi directly through `pi.registerTool`. Pi then sees them as native tools and runs the loop. + +Each registered tool's body does one thing: it POSTs the call back to Agenta's `/tools/call` +with the tool's `callRef` (the WP-7 envelope). The model picks the tool and supplies the +arguments; Agenta runs the actual tool server-side. This is the key safety property: the +Composio key and the connection auth never enter the sandbox. The agent only ever asks +Agenta to run a named tool. + +On Daytona the in-sandbox process cannot reach Agenta directly, so the extension writes each +tool request to a file (`AGENTA_TOOL_RELAY_DIR`) and the runner, which can reach Agenta, +relays it to `/tools/call` and writes the answer back. Same envelope, different delivery. 
+ +## System prompt: AGENTS.md, SYSTEM, and APPEND_SYSTEM + +Pi builds its system prompt from three separate inputs, and they stack rather than compete: + +- **`AGENTS.md`** is project context. Pi wraps it in a project-context block and appends + it after the base prompt. It loads with no trust gate, and it is what `instructions` on the + neutral `AgentConfig` becomes. This is the right home for project conventions, commands, + and preferences. +- **`APPEND_SYSTEM`** adds to Pi's built-in base prompt without replacing it. Reach for this + when you only want to add framing on top of Pi's default coding-assistant prompt. +- **`SYSTEM`** replaces the base prompt outright. Pi throws away its default + "you are a coding assistant" persona, the tool list, and the built-in guidelines, and uses + your text instead. Use it only when a workflow needs a fundamentally different agent. + +The key fact: these are not either/or with `AGENTS.md`. Even when `SYSTEM` replaces the base +prompt, Pi still appends the `AGENTS.md` context after it. So `AGENTS.md` stays the project +layer, and `SYSTEM` / `APPEND_SYSTEM` only change Pi's base persona. For almost every agent, +`AGENTS.md` alone is enough; the other two are a deliberate opt-in. + +### How to set them + +`SYSTEM` and `APPEND_SYSTEM` are Pi-specific, so they ride the neutral config's per-harness +escape hatch, `AgentConfig.harness_options`. It is a bag keyed by harness name; each Harness +adapter reads only its own slice: + +```python +AgentConfig( + instructions="Project: a SQL analytics tool. Run `make lint` before finishing.", # AGENTS.md + harness_options={ + "pi": { + "system": "You are a SQL expert. Only answer with queries.", # replaces base prompt + "append_system": "Always explain each query in one line.", # adds to base prompt + } + }, +) +``` + +`PiHarness` lifts the `pi` slice onto `PiAgentConfig.system` / `append_system`, which emit +`systemPrompt` / `appendSystemPrompt` on the `/run` wire. 
An empty or whitespace value is +dropped, so it never reaches the runner as a real override. + +### Delivery status + +The **in-process Pi engine** honors both. It feeds them through the resource loader's +`systemPromptOverride` / `appendSystemPromptOverride`, so the run stays hermetic: only what +the request carries applies, never a `SYSTEM.md` or `APPEND_SYSTEM.md` left on disk. + +The **ACP (rivet) path does not deliver them yet**. It drives Pi through `pi-acp`, which gives +us no per-run hook to set the prompt: a project `.pi/SYSTEM.md` is trust-gated, and the CLI +`--system-prompt` flag cannot be set per session through the adapter. The engine logs a +warning when these fields are set on that path so the gap is visible, not silent. `AGENTS.md` +still applies there, because Pi loads context files regardless of trust. Wiring the ACP path +(via project trust plus `.pi/SYSTEM.md`, or per-session CLI flags) is the remaining work. + +## Tracing: Pi instruments itself + +Pi emits lifecycle events on an in-process event bus (`pi.on(...)`). The extension hooks +those events and turns them into OpenTelemetry spans, the same span tree completion and chat +already produce: + +``` +invoke_agent (AGENT) + turn N (CHAIN) + chat (LLM) real token usage from the provider call + execute_tool (TOOL) one per tool the turn ran +``` + +The runner passes the caller's `traceparent` to the extension as `AGENTA_TRACEPARENT`. The +extension starts `invoke_agent` as a child of that span, so the whole Pi run joins the same +trace as the `/invoke` request. Because Pi self-instruments with real provider data, its +spans carry true per-call token counts, not estimates. + +This is why the rivet engine does not also build spans for Pi. It would double them. The +engine emits its own spans only for harnesses that do not self-instrument (see the +[Claude Code adapter](claude-code.md)). + +## Usage writeback: the one extra hop + +Pi reports no token usage over ACP. 
It only has the numbers in-process. And the Pi spans and +the workflow span ship to Agenta in separate batches, so Agenta cannot roll Pi's per-call +tokens up onto the workflow span on its own. + +The fix is a small handoff. On `agent_end`, the extension writes the run's token and cost +totals to a file (`AGENTA_USAGE_OUT`). The runner reads that file after the prompt finishes +and returns the totals on the `/run` result. The Python service then stamps them on the live +workflow span. The result is that `_agent` shows the agent's real tokens and cost even +though the two traces shipped separately. + +## Models and output + +Pi exposes provider-prefixed model ids, like `openai-codex/gpt-5.5`. The runner normalizes a +requested id to Pi's own id: it tries the value as given, and on rejection it matches by the +part after the provider prefix. If nothing matches, Pi keeps its default and the run still +answers. + +For output, Pi streams pure text deltas over ACP (`agent_message_chunk`). The runner +appends them in order to build the final answer. + +## Daytona notes + +Two things differ on Daytona. The rivet `-full` image ships the `pi-acp` adapter but not the +`pi` CLI, so the runner either installs `pi` into the sandbox at session time or runs from a +pre-baked snapshot that already has it (the snapshot path avoids a slow per-run install). +And auth comes from the provider key in the sandbox env when present, or from an uploaded +`auth.json` (the developer's OAuth login) when no key is set. + +## The in-process engine + +The in-process Pi engine (`engines/pi.ts`, selected by the `InProcessPiBackend`) skips rivet +entirely. It drives Pi's `createAgentSession` directly, with everything in memory: AGENTS.md +injected through the resource loader, the session and settings managers in memory, and a +throwaway working directory. 
It registers the same tools as Pi `customTools` (the same +POST-back-to-`/tools/call` body) and traces with the same extension logic, just wired in +process rather than loaded from disk. + +It returns the same `/run` result as the rivet path, which is the whole point of the ports: +the workflow author cannot tell which engine ran. It exists for the simplest local case and +as a path that does not depend on the rivet daemon being present. diff --git a/docs/design/agent-workflows/agent-protocol-rfc.md b/docs/design/agent-workflows/agent-protocol-rfc.md new file mode 100644 index 0000000000..4a517d5a72 --- /dev/null +++ b/docs/design/agent-workflows/agent-protocol-rfc.md @@ -0,0 +1,526 @@ +# RFC: Agenta Agent Protocol (`POST /messages`, Sessions and Streaming) + +| | | +| --- | --- | +| **Status** | Draft | +| **Version** | 0.1 | +| **Layer** | Frontend to backend, over HTTP/1.1 | +| **Defines** | `POST /messages`, `POST /load-session` | +| **Reuses** | The workflow response envelope (`WorkflowServiceResponse`) and revision resolution (`references`) | +| **Companion** | [streaming-and-sessions.md](streaming-and-sessions.md) (design rationale and trade-offs) | + +## Abstract + +This document specifies the wire protocol between an Agenta client (typically a browser +running the Vercel AI SDK `useChat` hook) and the Agenta backend for running an **agent** +workflow. It defines a new endpoint, `POST /messages`, for stateful, streaming chat. The +endpoint carries a session identifier in the request and response bodies, offers two response +modes (a single JSON response and a Server-Sent Events stream in the Vercel UI Message Stream +format), and takes the agent's inputs as a conversation (`messages`) plus named input +variables (`inputs`). A second endpoint, `POST /load-session`, returns a conversation's +history. + +`/messages` is a sibling of the existing workflow `/invoke`, not a change to it. The generic, +stateless `/invoke` is untouched. 
`/messages` exists because the chat contract differs: the +conversation is a first-class top-level member in the Vercel `UIMessage` shape, the response +can stream, and a turn belongs to a session. + +## 1. Conventions and terminology + +The key words **MUST**, **MUST NOT**, **REQUIRED**, **SHALL**, **SHALL NOT**, **SHOULD**, +**SHOULD NOT**, **RECOMMENDED**, **MAY**, and **OPTIONAL** in this document are to be +interpreted as described in RFC 2119 and RFC 8174 when, and only when, they appear in all +capitals. + +JSON is defined in RFC 8259. Server-Sent Events (SSE) follow the WHATWG HTML +`text/event-stream` definition. All request and response bodies are UTF-8 encoded JSON unless a streaming +content type is negotiated. + +| Term | Definition | +| --- | --- | +| **Agent** | A workflow that runs a multi-step loop (model, tool, model, ...) and emits a stream of events before producing a final answer. | +| **Turn** | One request to `/messages`. A turn supplies new input and produces one assistant response (streamed or whole). | +| **Session** | A server-named conversation that groups turns. Identified by a `session_id`. | +| **`session_id`** | An opaque string that identifies a session within a project. Carried in the request and response bodies. | +| **`UIMessage`** | A message in Vercel AI SDK v5/v6 form: `{ id, role, parts[] }`. See Appendix B. | +| **Part** | One element of the UI Message Stream (for example `text-delta`, `tool-input-available`). See Section 6.2. | +| **`inputs`** | The agent's inputs for a turn: the conversation `messages` plus named input variables. See Section 5. | +| **Streaming edge** | The backend component that encodes the agent's internal `AgentEvent` stream into the UI Message Stream. | + +## 2. Protocol overview + +The protocol defines two endpoints: + +| Method | Path | Purpose | +| --- | --- | --- | +| `POST` | `/messages` | Run one agent turn. Returns one JSON response or an SSE stream, by content negotiation. 
| +| `POST` | `/load-session` | Return the history of a session. | + +A turn carries an OPTIONAL `session_id`. The server resolves it per Section 4. A turn's +response mode is selected by the `Accept` request header per Section 6. + +The agent's input for a turn is the conversation `data.messages` in `UIMessage` form, plus the +named input variables in `data.inputs`. The agent configuration travels as on `/invoke`, +either inline in `data.parameters` or resolved from `references` (Section 5). + +``` + ┌─────────────────────────── client (useChat) ───────────────────────────┐ + │ │ + POST /messages (Accept: text/event-stream) POST /load-session │ + │ │ │ + ▼ ▼ │ + ┌──────────────────┐ AgentEvent stream ┌───────────────────────────────────┐ │ + │ agent run │ ─────────────────────▶│ streaming edge → UI Message Stream │──┘ + │ (harness loop) │ └───────────────────────────────────┘ + └──────────────────┘ persists per turn + │ │ + └──────────────── trace store (ag.session.id) ◀───────┘ load-session reads here +``` + +## 3. Relationship to `/invoke` + +`/messages` is a new endpoint. It does not change `/invoke`. The generic, stateless workflow +invoke keeps its exact request and response, and a client that does not run a chat agent never +touches `/messages`. + +`/messages` reuses two things from the workflow contract so the backend does not fork: the +response envelope (`WorkflowServiceResponse`, with the answer in `data.outputs`) and revision +resolution (`references`). It diverges from `/invoke` in three ways, which is why it is its own +endpoint: + +1. The conversation is a first-class member, `data.messages`, in the `UIMessage` shape, rather + than nested in `data.inputs.messages` as `{role, content}`. +2. The response can stream as a UI Message Stream (Section 6.2). +3. A turn belongs to a session (`session_id`, Section 4). 
+ +A server **SHOULD** map a `/messages` request onto the same internal agent invocation that +`/invoke` uses, after lifting `data.messages` and `data.inputs` into the handler's `messages` +and `inputs` arguments. + +## 4. Session model + +### 4.1 Identity + +A `session_id` is an opaque string scoped to a project. The pair `(project_id, session_id)` +**MUST** be unique. A bare `session_id` is not a global identifier. + +A client-supplied `session_id`: + +- **MUST** be treated as an opaque token. A server **MUST NOT** interpolate it into a storage + path, a query, or a trace attribute without escaping. +- **SHOULD** be constrained by the server to a bounded length and a restricted character set. + A server **MAY** reject an id outside those bounds with `400 Bad Request`. + +### 4.2 Resolution + +On receiving a turn, the server resolves the session as follows: + +1. If the request omits `session_id`, the server **MUST** mint a new unique id, associate the + turn with it, and return that id (Section 6). +2. If the request supplies a `session_id` that does not exist for the caller's project, the + server **MUST** create a session with that id and associate the turn with it. +3. If the request supplies a `session_id` that exists for the caller's project, the server + **MUST** associate the turn with that existing session. +4. If the request supplies a `session_id` that exists under a **different** project, the + server **MUST NOT** resume it. The server **MUST** treat it as case 2 within the caller's + own project, or reject the turn. A server **MUST NOT** disclose the existence of a session + the caller does not own. + +Rule 4 is the ownership boundary. "Resume if it exists" means "resume if it exists and +belongs to the caller." + +### 4.3 Continuation semantics for this version + +In this version, associating a turn with a session records the turn under that session for +tracing and later retrieval. 
The conversation context the model sees is supplied by the +`messages` in the request (Section 5.2), not reconstructed from the server's record. + +A future version MAY make the server's record authoritative, at which point a turn carries +only the new message and the server supplies the prior history. The request field is +unchanged by that evolution. See [streaming-and-sessions.md](streaming-and-sessions.md). + +### 4.4 Concurrency + +Two turns that create the same new `(project_id, session_id)` concurrently **MUST** resolve +to a single session. A server **SHOULD** enforce this with a unique constraint and treat the +losing creation as a resume (case 3). + +## 5. Request format (`POST /messages`) + +### 5.1 Envelope + +```jsonc +{ + "session_id": "sess_123", // OPTIONAL (Section 4) + "references": { ... }, // OPTIONAL: selects the workflow revision (as on /invoke) + "data": { + "messages": [ /* UIMessage[] */ ], // REQUIRED: the conversation (Section 5.2) + "inputs": { "<name>": <value> }, // OPTIONAL: named input variables (Section 5.3) + "parameters": { /* agent config */ } // OPTIONAL (Section 5.4) + } +} +``` + +`session_id` sits at the envelope top level, alongside the existing `trace_id` and `span_id`. +It **MUST NOT** be required in a request header. + +`data.messages`, `data.inputs`, and `data.parameters` are siblings. They map onto the agent +handler's `messages`, `inputs`, and `parameters` arguments. On `/invoke` the conversation is +nested at `data.inputs.messages`; on `/messages` it is lifted out to `data.messages`, because +the conversation is the primary input of this endpoint. + +### 5.2 `data.messages` + +`data.messages` is the conversation as an array of `UIMessage` objects (Appendix B). It is +REQUIRED. The last element is the new user turn. + +In this version the client **MUST** send the full conversation in `data.messages`. Each +element uses the parts-based `UIMessage` shape (Appendix B), not the `{role, content}` shape +of `/invoke`. 
+ +### 5.3 `data.inputs` + +`data.inputs` carries the agent's named input variables for the turn: the workflow's declared +inputs and any per-turn context the caller supplies (for example a retrieved document or a +record id). Keys are input names; values are arbitrary JSON. This is the same `inputs` as the +workflow contract, with the conversation no longer nested inside it. + +`data.inputs` is OPTIONAL and MAY be sent on every turn, since its values can change between +turns. + +### 5.4 `data.parameters` and `references` + +The agent configuration (instructions, model, tools, harness, sandbox, permission policy) +travels as on `/invoke`: inline in `data.parameters.agent`, or resolved by the platform from +`references` when the request targets a stored revision. This protocol does not change that +resolution. + +### 5.5 Content negotiation + +The response mode is selected by the `Accept` request header: + +| `Accept` | Response | +| --- | --- | +| `application/json` (or absent) | Single JSON response (Section 6.1) | +| `text/event-stream` | UI Message Stream over SSE (Section 6.2) | + +A server that cannot satisfy the `Accept` header **MUST** respond `406 Not Acceptable`. + +## 6. Response formats + +### 6.1 Single JSON response + +For `Accept: application/json`, the server returns `200 OK` with a body extending +`WorkflowServiceResponse`: + +```jsonc +{ + "trace_id": "...", + "span_id": "...", + "session_id": "sess_123", // the resolved id (minted or echoed) + "status": { "code": 200 }, + "data": { "outputs": { "role": "assistant", "content": "Berlin." } } +} +``` + +The response **MUST** include `session_id`, set to the resolved session (Section 4). The +assistant answer rides in `data.outputs` as today. Token usage is not in the body; it is +recorded on the trace. + +### 6.2 UI Message Stream (SSE) + +For `Accept: text/event-stream`, the server returns `200 OK` and streams the run in the +Vercel UI Message Stream format (AI SDK v5/v6). 
+ +#### 6.2.1 Response headers + +The response **MUST** set: + +``` +content-type: text/event-stream +x-vercel-ai-ui-message-stream: v1 +``` + +and **SHOULD** set: + +``` +cache-control: no-cache +connection: keep-alive +x-accel-buffering: no +``` + +`x-accel-buffering: no` disables proxy buffering so parts flush immediately. + +#### 6.2.2 Framing + +Each part is one SSE event: the literal bytes `data: `, followed by the part as compact JSON +(no insignificant whitespace), followed by `\n\n`. + +``` +data: {"type":"text-delta","id":"t1","delta":"Hello"}\n\n +``` + +The stream **MUST** terminate with the literal line `data: [DONE]\n\n`. + +#### 6.2.3 Part registry + +The parts a server emits, with their REQUIRED fields. Fields not listed are OPTIONAL and MAY +be omitted. + +| `type` | Required fields | Meaning | +| --- | --- | --- | +| `start` | none | Begin a message. Carries `messageId` and `messageMetadata` (Section 6.2.4). | +| `start-step` | none | Begin a step of the agent loop. | +| `finish-step` | none | End the current step. | +| `finish` | none | End the message. Carries `finishReason`, `messageMetadata`. | +| `text-start` | `id` | Begin a text block. | +| `text-delta` | `id`, `delta` | Append `delta` to the text block `id`. | +| `text-end` | `id` | End the text block. | +| `reasoning-start` | `id` | Begin a reasoning block. | +| `reasoning-delta` | `id`, `delta` | Append to the reasoning block. | +| `reasoning-end` | `id` | End the reasoning block. | +| `tool-input-start` | `toolCallId`, `toolName` | A tool call begins. | +| `tool-input-delta` | `toolCallId`, `inputTextDelta` | Append a fragment of the tool arguments (note: `inputTextDelta`, not `delta`). | +| `tool-input-available` | `toolCallId`, `toolName`, `input` | The full tool arguments are known. | +| `tool-output-available` | `toolCallId`, `output` | The tool result. | +| `tool-output-error` | `toolCallId`, `errorText` | The tool failed. | +| `file` | `url`, `mediaType` | A file or image. 
`url` MAY be an `https:` or `data:` URL. | +| `data-<name>` | `data` | An application-defined part (generative UI). MAY carry `id` and `transient`. | +| `error` | `errorText` | A stream-level error (Section 8.2). | + +A server **MUST** order parts so that for any `id` or `toolCallId`, a `*-start` precedes its +deltas, which precede its `*-end` or `*-available`. Text and reasoning deltas are +concatenated by `id`. Tool parts are keyed by `toolCallId`. + +#### 6.2.4 Session id in the stream + +The server **MUST** convey the resolved `session_id` as `messageMetadata.sessionId` on the +`start` part, which is the first part of the stream: + +``` +data: {"type":"start","messageId":"msg_1","messageMetadata":{"sessionId":"sess_123"}} +``` + +A server **MAY** additionally mirror `session_id` to a response header. The body remains the +normative source. + +#### 6.2.5 Mapping from agent events + +The streaming edge consumes the agent's internal `AgentEvent` stream +(`services/agent/src/protocol.ts:74`) and emits parts as follows: + +| `AgentEvent` | Parts | +| --- | --- | +| run start (synthesized) | `start` (with `messageId`, `messageMetadata.sessionId`), then `start-step` | +| `message` | `text-start`, one or more `text-delta`, `text-end` | +| `thought` | `reasoning-start`, `reasoning-delta`, `reasoning-end` | +| `tool_call` | `tool-input-start`, then `tool-input-available` | +| `tool_result` with `isError=false` | `tool-output-available` | +| `tool_result` with `isError=true` | `tool-output-error` | +| `usage` | `messageMetadata` on the `finish` part | +| `error` | `error` (Section 8.2) | +| `done` | `finish-step`, then `finish` (`finishReason` = `stopReason`), then `[DONE]` | + +A harness that reports `capabilities.streamingDeltas` produces token-level `text-delta` +parts. A harness that does not report it produces a single `text-delta` carrying the whole text. The wire +shape is identical, so the client does not distinguish them. + +The protocol streams deltas only. 
There is no full-message snapshot part. The client +assembles the final `UIMessage` from the parts. The server **SHOULD** record the assembled +turn on the trace (`ag.session.id`), which is the source `load-session` reads. + +## 7. The `load-session` endpoint (`POST /load-session`) + +Returns the history of a session so a client can rebuild a conversation it does not hold +locally. + +### 7.1 Request + +```jsonc +{ "session_id": "sess_123" } +``` + +`session_id` is REQUIRED. The server **MUST** apply the ownership rule of Section 4.2: if the +session does not exist for the caller's project, the server **MUST** respond `404 Not Found` +and **MUST NOT** reveal a session owned by another project. + +### 7.2 Response (default, `Accept: application/json`) + +The server returns `200 OK` with the conversation as `UIMessage` objects, the shape `useChat` +accepts as its initial `messages`: + +```jsonc +{ + "session_id": "sess_123", + "messages": [ + { "id": "m1", "role": "user", "parts": [ { "type": "text", "text": "capital of France?" } ] }, + { "id": "m2", "role": "assistant", "parts": [ { "type": "text", "text": "Paris." } ] } + ] +} +``` + +### 7.3 Response (negotiated replay, `Accept: text/event-stream`) + +A server **MAY** support a delta replay of the stored history under +`Accept: text/event-stream`, re-emitting the session as a UI Message Stream (Section 6.2). +This is OPTIONAL. Whether the folded form or the replay is the primary form is left open by +this draft; a conformant client **SHOULD** request `application/json` for rebuilding a static +view. + +## 8. Error handling + +### 8.1 Request and endpoint errors (JSON) + +Before a stream begins, the server reports errors with an HTTP status and the existing +`status` envelope (`WorkflowServiceStatus`: `code`, `message`, `type`, `stacktrace`): + +| Status | Condition | +| --- | --- | +| `400 Bad Request` | Malformed body, or a `session_id` that violates Section 4.1. 
| +| `401 Unauthorized` / `403 Forbidden` | Missing or invalid credentials. | +| `404 Not Found` | `load-session` on a session the caller does not own. | +| `406 Not Acceptable` | The `Accept` header cannot be satisfied. | +| `5xx` | Server failure before streaming starts. | + +### 8.2 In-stream errors + +A failure after the stream has started **MUST** be reported as an `error` part: + +``` +data: {"type":"error","errorText":"the agent run failed: ..."} +``` + +After emitting an `error` part, the server **SHOULD** terminate the stream. It **MAY** omit +the `finish` part. It **SHOULD** still emit `[DONE]` to close the SSE channel cleanly. The +client surfaces the error to the user. + +## 9. Security considerations + +- **Session ownership.** Section 4.2 rule 4 is a security requirement, not a convenience. + Because a client may supply a `session_id` that does not exist yet (case 2), a server that keys + sessions on `session_id` alone would let a caller read or extend another tenant's + conversation. Servers **MUST** key on `(project_id, session_id)` and scope every resume, + every `load-session`, and every existence check to the caller's project. +- **Opaque ids.** A client-supplied `session_id` is untrusted input. See Section 4.1. +- **Secrets.** Provider keys and tool credentials travel and resolve as in the current + contract. This protocol adds no new secret-bearing field. `inputs` is caller-supplied + input and **MUST NOT** be used to smuggle credentials in place of the existing `secrets` + and signed-credential mechanisms. +- **Content negotiation and buffering.** A streaming response disables proxy buffering + (Section 6.2.1). Operators **MUST** ensure intermediaries do not re-buffer + `text/event-stream` responses, or streaming degrades to a single delayed flush. + +## 10. Interaction sequences + +### 10.1 New session, streaming turn + +``` +client server + │ POST /messages │ + │ Accept: text/event-stream │ + │ { data:{ messages:[...] 
} } │ (no session_id) + │───────────────────────────────────────▶│ + │ │ mint sess_123 + │ 200 text/event-stream │ + │ data: {"type":"start", │ + │ "messageMetadata": │ + │ {"sessionId":"sess_123"}} │ + │◀───────────────────────────────────────│ + │ data: {"type":"start-step"} ... │ + │ ... tool / text parts ... │ + │ data: {"type":"finish"} │ + │ data: [DONE] │ + │◀───────────────────────────────────────│ + │ (client stores sess_123 for next turn) │ +``` + +### 10.2 Returning to a known session + +``` +client server + │ POST /load-session │ + │ { "session_id": "sess_123" } │ + │───────────────────────────────────────▶│ check ownership + │ 200 { messages: [ UIMessage, ... ] } │ + │◀───────────────────────────────────────│ + │ (render history; hold it) │ + │ │ + │ POST /messages │ + │ Accept: text/event-stream │ + │ { session_id:"sess_123", │ + │ data:{ messages:[...full] } } │ + │───────────────────────────────────────▶│ resolve existing sess_123 + │ 200 text/event-stream → parts → [DONE] │ + │◀───────────────────────────────────────│ +``` + +## Appendix A: Full stream transcript + +One turn: the agent calls a weather tool, reads the result, and answers. Every `data:` line +in order, each followed by a blank line. 
+ +``` +data: {"type":"start","messageId":"msg_1","messageMetadata":{"sessionId":"sess_123"}} + +data: {"type":"start-step"} + +data: {"type":"tool-input-start","toolCallId":"call_1","toolName":"getWeather"} + +data: {"type":"tool-input-available","toolCallId":"call_1","toolName":"getWeather","input":{"city":"Paris"}} + +data: {"type":"tool-output-available","toolCallId":"call_1","output":{"weather":"sunny","temp":24}} + +data: {"type":"finish-step"} + +data: {"type":"start-step"} + +data: {"type":"text-start","id":"t1"} + +data: {"type":"text-delta","id":"t1","delta":"It is sunny "} + +data: {"type":"text-delta","id":"t1","delta":"and 24°C in Paris."} + +data: {"type":"text-end","id":"t1"} + +data: {"type":"finish-step"} + +data: {"type":"finish","messageMetadata":{"usage":{"input":820,"output":36,"cost":0.004}}} + +data: [DONE] +``` + +## Appendix B: `UIMessage` schema + +A message as accumulated by the client, sent in `data.messages`, and returned by `load-session`: + +```jsonc +{ + "id": "m2", + "role": "user | assistant | system", + "parts": [ + { "type": "text", "text": "..." }, + { "type": "reasoning", "text": "..." }, + { "type": "tool-<name>", "toolCallId": "...", "state": "output-available", "input": {}, "output": {} }, + { "type": "file", "url": "...", "mediaType": "image/png" }, + { "type": "data-<name>", "data": { } }, + { "type": "step-start" } + ], + "metadata": { } +} +``` + +A `UIMessage` carries no top-level `content` string in v5/v6. All content lives in `parts`. + +## Appendix C: References + +- RFC 2119, RFC 8174: requirement keywords. +- RFC 8259: JSON. +- WHATWG HTML, Server-Sent Events: `text/event-stream`. +- Vercel AI SDK UI Message Stream (v5/v6): https://ai-sdk.dev, and the chunk schema at + https://github.com/vercel/ai/blob/main/packages/ai/src/ui-message-stream/ui-message-chunks.ts +- Current contract: `sdks/python/agenta/sdk/models/workflows.py`, + `sdks/python/agenta/sdk/decorators/routing.py` (Accept negotiation at `:236`). 
+- Agent events and session id: `services/agent/src/protocol.ts:74`, + `sdks/python/agenta/sdk/agents/dtos.py`, `services/oss/src/agent/app.py`. +- Design rationale and trade-offs: [streaming-and-sessions.md](streaming-and-sessions.md). +``` diff --git a/docs/design/agent-workflows/architecture.md b/docs/design/agent-workflows/architecture.md new file mode 100644 index 0000000000..069a4ba7fa --- /dev/null +++ b/docs/design/agent-workflows/architecture.md @@ -0,0 +1,183 @@ +# Architecture + +This page explains how an agent runs inside Agenta, from the moment a request arrives +to the moment the answer comes back. Read it first. The other pages go deeper into the +[ports and adapters](ports-and-adapters.md), [sessions](sessions.md), and the two +shipped adapters ([Pi](adapters/pi.md), [Claude Code](adapters/claude-code.md)). + +## What an agent workflow is + +Agenta already runs prompt workflows: completion, chat, and the LLM judge. Each one calls +a model once and returns one answer. An agent is different. It runs a loop. It reads its +instructions, calls a model, runs a tool, reads the result, and calls the model again. It +keeps going until the task is done, then returns the final answer. + +This PoC adds the agent as a new kind of workflow. It sits behind the same `/invoke` +endpoint every other workflow uses, traces into the same spans, and reads its config from +the same playground. + +The loop itself is not the hard part. Open-source coding agents already run the loop well. +The hard part is running one of those agents *as an Agenta workflow*: behind the standard +contract, traced into the standard spans, with the agent and the place it runs both +swappable by config. That is the problem this architecture solves. + +## The core idea: a relay of programs + +The system is a relay. Each program starts the next one and passes work down the line. The +prompt travels down the relay, and the answer travels back up. 
+ +Here is the whole relay for a normal local run: + +``` + browser / playground + │ POST /invoke + ▼ + ┌─────────────────────────────────────────────────── + │ CONTAINER 1: "services" (Python / FastAPI) + │ the Agenta backend. Parses the request, + │ gathers config, and calls the runner. + └─────────────────────────────────────────────────── + │ POST http://agent-pi:8765/run + ▼ + ┌─────────────────────────────────────────────────── + │ CONTAINER 2: "agent-pi" (Node / TypeScript) + │ the sidecar. server.ts → engines/rivet.ts + │ + │ rivet daemon (subprocess) + │ └── ACP adapter: pi-acp (subprocess) + │ └── pi (subprocess) ← the harness + └─────────────────────────────────────────────────── + │ HTTPS + ▼ + OpenAI / Anthropic (the model) +``` + +Two containers carry the request. Inside the second one, a small tree of processes does +the work. Each box has a clear job, and the next sections name them. + +## The two containers + +The deployment runs two containers that matter here. Both stay up all the time. You can +see both in `hosting/docker-compose/ee/docker-compose.dev.yml`. + +The **`services`** container runs the Python backend. Every Agenta workflow lives here, +including the agent. When you run an agent in the playground, the request lands in this +container. The handler reads the config (which agent, which model, the instructions, the +tools, the provider keys), builds one request, and calls the runner over HTTP. + +The **`agent-pi`** container is the sidecar. It runs a small Node web server on port 8765. +Its only job is to receive a `POST /run`, drive the agent, and return the result. The +`services` container reaches it on the internal network at `http://agent-pi:8765`. + +"Sidecar" just names a small helper container that runs next to a main one. Two reasons +justify the split. The agent code is TypeScript and the backend is Python, so they want +different runtimes. 
And the sidecar deliberately holds none of the stack's secrets (it has +no `env_file`), so a sandboxed agent cannot read the platform's Stripe or Composio keys. + +## Inside the sidecar: the process tree + +The sidecar does not run the agent itself. When a `/run` request arrives, its TypeScript +starts a chain of child processes, and each one starts the next. + +1. **The rivet daemon** (`sandbox-agent server`). Our code spawns it as a child process. + It is a binary from the open-source [`rivet-dev/sandbox-agent`](https://github.com/rivet-dev/sandbox-agent) + project (Apache-2.0). Think of it as a manager. You tell it "run agent `pi` with this + prompt," and it handles the work of launching the agent and streaming results back. + +2. **The ACP adapter** (`pi-acp`, or `claude-agent-acp` for Claude). The daemon spawns it + as a child process. It is a translator. It speaks ACP on the side facing the daemon and + the agent's own protocol on the side facing the agent. + +3. **The harness** (`pi`, or the `claude` CLI). The adapter spawns it as a child process. + This is the real coding agent. It reads the instructions, calls the model, runs tools, + and loops until the task is done. + +All three run as processes inside the `agent-pi` container. They are not separate +containers. They form a parent-child-grandchild tree. + +## The vocabulary, defined once + +| Term | What it is | +| --- | --- | +| **Harness** | The coding agent program. Pi, Claude Code, and Codex are harnesses. Each is a CLI that takes instructions, calls a model, runs tools, and loops. "Harness" is our umbrella word for "the agent engine." | +| **ACP** (Agent Client Protocol) | A shared language for talking to any coding agent. Without it, each agent has its own API and you write custom glue per agent. With it, you speak one protocol and the agent on the far end is swappable. This is why one config value flips `pi` to `claude`. 
| +| **ACP adapter** | The translator that makes one specific agent speak ACP. Pi does not speak ACP on its own, so `pi-acp` wraps it. Claude has `claude-agent-acp`. | +| **rivet daemon** | The manager that starts the adapter and harness, hides *where* they run, and streams their events back over ACP. We use it; we did not write it. | +| **Sandbox** | *Where* the agent's process tree runs. `local` means processes inside the sidecar. `daytona` means a throwaway cloud machine. | +| **Sidecar** | The always-on helper container (`agent-pi`) that drives runs. Not the sandbox. The sidecar starts the sandbox. | + +## Two axes you can change: harness and sandbox + +The whole point of the relay is that two pieces swap independently, by config, with no code +change. The playground exposes both as dropdowns. + +- **Harness** chooses *which* agent runs: `pi` or `claude`. It becomes the rivet `agent` + value, which selects the ACP adapter. +- **Sandbox** chooses *where* the agent's process tree runs: `local` or `daytona`. + +The two are orthogonal. You can run `pi` locally, `claude` locally, or `pi` on Daytona, and +each is one dropdown change. The request also carries a **permission policy** (`auto` or +`deny`) that decides how a permission-gating harness like Claude handles tool prompts in a +run with no human watching. + +## Local versus Daytona: the same tree, a different place + +The relay above is `sandbox: local`. The daemon, adapter, and harness all run as processes +inside the `agent-pi` container, on our own server. + +Switch to `sandbox: daytona` and one thing changes. That same tree runs in a Daytona cloud +sandbox instead. Daytona starts a throwaway remote machine, the daemon and adapter and +harness run there, and the sidecar talks to them over HTTP. Everything else is identical. + +So the sidecar is not the sandbox. The sidecar is the always-on driver. 
The sandbox is the +place the agent runs, which is either "processes inside the sidecar" (`local`) or "a cloud +machine the sidecar talks to" (`daytona`). + +## The lifecycle: cold per run + +Nothing in the process tree stays alive between runs. Only the two containers stay up. +Every invoke starts a fresh daemon, which starts a fresh adapter, which starts a fresh +harness. The run does its work, returns its answer, and then the runner tears the whole +tree down (`destroySandbox` and `dispose` in a `finally` block). The next invoke builds the +tree again from scratch. + +This is the **cold** model. It is simple and well isolated, and it has one consequence +worth stating up front: because no session is held between turns, a multi-turn conversation +replays its history on every turn. [Sessions](sessions.md) covers what that means today and +how a warm model could change it tomorrow. + +## The other engine: in-process Pi + +The relay above describes the **rivet engine**, the default in the deployed stack and the +path the rest of these docs assume. The runner also ships a second engine: **in-process +Pi**. It drives the Pi SDK directly inside the sidecar, with no daemon, adapter, or ACP in +between. It exists for the simplest local case and as a fallback that does not depend on the +rivet daemon. + +The two engines are the two backends behind the same SDK ports: `RivetBackend` and +`InProcessPiBackend`. Both serve the same `/run` contract, so which one runs is a deployment +detail, not a difference the workflow author sees. The +[ports and adapters](ports-and-adapters.md) page explains the ports and the backends. + +## How a request flows, end to end + +Putting it together, a single agent run on `pi` / `local` goes like this: + +1. The playground sends `POST /invoke` to the `services` container. +2. 
The Python handler (`agent/app.py`) reads the config, resolves the tools and provider + keys, and builds a neutral `AgentConfig` and `SessionConfig` from the SDK runtime + (`agenta.sdk.agents`). +3. It picks a backend (`RivetBackend` here) from the harness and sandbox, wraps it in an + `Environment` and a `Harness`, and the harness sends one `POST /run` over the backend's + transport (HTTP to the sidecar). +4. The sidecar's rivet engine starts the daemon, which starts `pi-acp`, which starts `pi`. +5. `pi` reads the instructions, calls the model, runs any tools, and streams events back up + the relay. Those events become trace spans nested under the `/invoke` span (the + [Pi adapter](adapters/pi.md) page explains who emits them). +6. The harness finishes. The runner reads the final text and the token usage, tears the + tree down, and returns one `/run` result. +7. The Python handler records the usage on the workflow span and returns the assistant + message as the `/invoke` response. + +The next pages explain the seam that makes step 3 engine-agnostic, the session model behind +steps 4 to 6, and exactly how each adapter implements step 5. diff --git a/docs/design/agent-workflows/ports-and-adapters.md b/docs/design/agent-workflows/ports-and-adapters.md new file mode 100644 index 0000000000..a86002c1a8 --- /dev/null +++ b/docs/design/agent-workflows/ports-and-adapters.md @@ -0,0 +1,196 @@ +# Ports and adapters + +The [architecture](architecture.md) page showed the relay of programs. This page shows the +seam that keeps that relay swappable: the ports, where they live, and the adapters behind +them. + +## Where the runtime lives + +The neutral runtime is part of the published Python SDK, at +`sdks/python/agenta/sdk/agents/`. An SDK user gets it as `agenta.sdk.agents` (with the main +types re-exported as `ag.AgentConfig`, `ag.RivetBackend`, and so on). 
The Agenta service +(`services/oss/src/agent/`) is a thin consumer of it: it resolves tools and secrets +server-side, threads a trace context, and runs a turn through the same ports. Nothing in the +SDK runtime calls the Agenta API, so the same code runs an agent standalone, with no Agenta +backend at all. + +The package follows Agenta's hexagonal vocabulary, the same words the `api/` domains use: + +| Layer | File | What it holds | +| --- | --- | --- | +| DTOs | `dtos.py` | data contracts (Pydantic): `AgentConfig`, `SessionConfig`, `Message`, events, capabilities, the per-harness configs | +| Ports | `interfaces.py` | the abstract contracts: `Backend`, `Environment`, `Sandbox`, `Session`, `Harness` | +| Adapters | `adapters/` | the implementations: the backends and the harnesses | +| Utils | `utils/` | shared plumbing for the runner-backed adapters (the `/run` wire and the transports) | + +## The three layers + +The runtime is three ports stacked, lowest to highest. + +### Backend (the engine) + +A `Backend` is the engine. It declares which harnesses it can drive, owns the sandbox and +session lifecycle, and is pure plumbing: it takes an already-harness-shaped config and +launches it. It carries no "how this harness works" logic. + +```python +class Backend(ABC): + supported_harnesses: ClassVar[FrozenSet[HarnessType]] = frozenset() + def supports(self, harness) -> bool: ... + async def create_sandbox(self) -> Sandbox: ... + async def create_session(self, sandbox, config, *, harness, secrets, trace, session_id) -> Session: ... +``` + +Each backend is its own class and hard-codes what makes it that engine. There is no shared +base beyond the ABC. Three exist: + +- **`RivetBackend`** drives a harness over ACP through the TypeScript rivet runner. It + supports Pi and Claude. Its `sandbox` axis (`local` or `daytona`) is a constructor + argument, because it is a real runtime choice. 
+- **`InProcessPiBackend`** drives Pi in-process through the runner, with no rivet daemon. Pi + only, local only. It was the first backend and stays as the simplest one, the reference to + read when writing a new backend. +- **`LocalBackend`** runs a harness on the user's own machine for standalone SDK use (Pi via + a bundled JS runner, Claude via the Python `claude-agent-sdk`). See + [`scratch/sdk-local-backend/status.md`](scratch/sdk-local-backend/status.md) for its build + state. + +`RivetBackend` and `InProcessPiBackend` are different engines that happen to share the +`utils` wire and transport helpers; neither subclasses the other. + +### Environment (where it runs) + +An `Environment` wraps a backend and owns the sandbox policy: by default a fresh sandbox per +session (the cold model, strong isolation). Share one `Environment` across harnesses to +share its sandbox, or use one per harness to isolate them. The workflow handler builds an +`Environment(backend)` and never touches the backend's sandbox calls directly. + +### Harness (the conversation, per harness type) + +A `Harness` wraps an `Environment` for one harness type (`PiHarness`, `ClaudeHarness`). It +does two jobs. First, it validates at construction that the environment's backend can drive +it; if not, it raises `UnsupportedHarnessError` immediately: + +```python +ClaudeHarness(Environment(InProcessPiBackend())) +# UnsupportedHarnessError: InProcessPiBackend cannot drive harness 'claude'; it supports: pi +``` + +Second, it holds the per-harness adaptation logic, the part that used to live in the +TypeScript runner. `Harness._to_harness_config` maps the neutral `SessionConfig` into the +harness's own config, and the two harnesses genuinely differ: + +- **`PiHarness`** keeps built-in tool names, delivers resolved tools natively (Pi has no + MCP), and forces the permission policy to `auto` because Pi does not gate tool use. 
+- **`ClaudeHarness`** drops Pi built-ins (Claude has none), delivers tools over MCP, and + honors the permission policy because Claude gates tool use. + +Both normalize the resolved tool specs (a name, a description, a JSON-Schema `inputSchema`, +the `callRef`). The backend below stays pure plumbing; this layer owns the harness knowledge. + +A `make_harness(harness_type, environment)` factory maps the playground's harness string to +the right class. + +The workflow handler runs a turn through these ports: + +```python +backend = select_backend(selection) # RivetBackend or InProcessPiBackend +harness = make_harness(selection.harness, Environment(backend)) +await harness.setup() +result = await harness.prompt(session_config, messages) +await harness.cleanup() +``` + +## The configs + +`AgentConfig` is the one neutral config the platform and playground speak: instructions +(written as `AGENTS.md`), model, and provider-agnostic tool references. +`AgentConfig.from_params` parses a downloaded config dict (the `agent` element, a `prompt` +prompt-template, or a flat shape) so a standalone user runs exactly what the playground +stores. `RunSelection` carries the run-time choices stored alongside it (harness, sandbox, +permission policy); the caller reads it to pick a backend and a harness class. + +`SessionConfig` bundles everything one run needs except where it runs: the `AgentConfig`, +the provider secrets, the permission policy, the trace context, and the resolved tool +delivery (built-in names, custom specs, the `/tools/call` callback). Sandbox is deliberately +not in it; that is a backend and environment concern. + +The per-harness configs (`PiAgentConfig`, `ClaudeAgentConfig`) are what a backend plumbs. +Each shapes its own tool and permission fields for the wire, so the difference between Pi's +native tools and Claude's MCP tools lives in the config types, not in a runtime branch. 
+ +## How the service picks a backend + +The handler chooses on every request, in `services/oss/src/agent/app.py`. `select_backend` +returns a backend instance: `InProcessPiBackend` for Pi running locally, and `RivetBackend` +otherwise (any other harness, a non-local sandbox, or `AGENTA_AGENT_RUNTIME=rivet`). The +in-process Pi engine only knows how to run Pi locally, so anything else routes to rivet +rather than silently dropping the choice. + +The transport to the runner is a deployment detail each backend takes as a constructor +argument: `AGENTA_AGENT_PI_URL` set (the Docker deployment) means HTTP to the sidecar; unset +(a local checkout) means spawn the runner CLI from the wrapper directory. + +## The wire contract: one `/run` shape + +Both transports send the same camelCase JSON to the TypeScript runner and parse the same +result back. The shape lives once in `utils/wire.py` on the Python side and `protocol.ts` on +the TypeScript side. This contract is the actual boundary of the system. 
+ +**Request** (the harness-shaped config plus the conversation): + +| Field | Meaning | +| --- | --- | +| `backend` | The engine the runner uses (`rivet` or `pi`), set by the backend | +| `harness`, `sandbox` | The two swap axes | +| `sessionId` | Continue a prior run by replaying its history | +| `agentsMd` | The agent's instructions, written as `AGENTS.md` | +| `model` | The requested model id | +| `messages` | The conversation so far; the runner sends the latest turn and replays the rest | +| `secrets` | Provider API keys as env vars, resolved from the project vault | +| `tools`, `customTools`, `toolCallback` | The resolved runnable tools and where they call back | +| `permissionPolicy` | `auto` or `deny` for a permission-gating harness | +| `trace` | The Agenta trace context, so the run nests under the `/invoke` span | + +**Result** (the reply plus structured run metadata): + +| Field | Meaning | +| --- | --- | +| `output` | The final assistant text (what the playground renders) | +| `messages` | The structured assistant messages | +| `events` | The structured event log for the turn (see below) | +| `usage` | Token and cost totals, rolled onto the workflow span | +| `stopReason` | Why the turn ended | +| `capabilities` | What the harness was probed to support this run | +| `sessionId`, `model`, `traceId` | Identifiers for the run | + +## The shared vocabulary: capabilities, content blocks, events + +Three neutral types travel on that wire. They are ours, not any one engine's, so a non-rivet +adapter implements them too. + +**Capabilities** describe what a harness can do: `mcp_tools`, `images`, `usage`, +`streaming_deltas`, `permissions`, and the rest. The rivet runner probes them live from the +daemon and returns them in the result. This is what removed the brittle `if harness == "pi"` +branches in the runner: it now branches on a flag, where the live answer is. For example, it +delivers tools over MCP only when the harness reports `mcp_tools`. 
+ +**Content blocks** mirror ACP: a message's content is either a plain string or a list of +`text` / `image` / `resource` blocks. Today the playground sends only text. The image and +resource kinds are plumbed through the types so an image-capable harness can take them. + +**Events** are the structured stream. Each event is one of `message`, `thought`, +`tool_call`, `tool_result`, `usage`, `error`, or `done`. The runner builds this log from the +harness as the run proceeds and returns it on the result. An `on_event` sink can also +receive them. Today the transports deliver the whole log at once after the run, since `/run` +is request-and-response; live streaming over the HTTP edge is a documented follow-on. + +## Why this shape + +The port mirrors rivet's vocabulary but keeps the types ours, so rivet is one adapter behind +the seam, not the seam itself. The same ports carry two working engines (rivet over ACP, +in-process Pi) and have room for a standalone local engine. Making the engine a real +`Backend` class, rather than a string the transport carries, is what lets a backend hard-code +its own identity and lets a standalone SDK user construct one directly. The cost of the +flexibility is one extra hop and one wire contract to keep in sync across two languages, which +the `utils/wire.py` and `protocol.ts` pairing contains in one place each. diff --git a/docs/design/agent-workflows/scratch/harness-port-redesign/README.md b/docs/design/agent-workflows/scratch/harness-port-redesign/README.md new file mode 100644 index 0000000000..3ba4dbd2c8 --- /dev/null +++ b/docs/design/agent-workflows/scratch/harness-port-redesign/README.md @@ -0,0 +1,69 @@ +> Superseded by the as-built design in [the design pages](../../README.md) and [scratch/sdk-local-backend/status.md](../sdk-local-backend/status.md). Kept for history. + +# Harness + Runtime port redesign + +Status: research and proposal, scope approved (full A to E arc, cold per invoke). Not +implemented. 
Read this, then [`research.md`](research.md) (the side by side), then +[`proposal.md`](proposal.md) (the recommended port shape), then [`plan.md`](plan.md) (the +phased build), with [`status.md`](status.md) holding decisions and open questions. + +## Why this exists + +WP-8 adopted [`rivet-dev/sandbox-agent`](https://github.com/rivet-dev/sandbox-agent) +unmodified and kept our `Harness` and `Runtime` ports unchanged on purpose (see +[`../wp-8-rivet-acp-runtime/`](../wp-8-rivet-acp-runtime/README.md)). That shipped, but +it also exposed how thin our ports are next to rivet's SDK. Our `Harness.invoke()` +takes a request and returns one string. Rivet's SDK models sessions, a live structured +event stream, per harness capabilities, multimodal input, permissions, and an explicit +lifecycle. + +This folder compares the two interfaces and proposes how to evolve our ports so they +borrow rivet's vocabulary without giving up the neutral seam (rivet stays one adapter +behind the port, so the legacy Pi path and a future non-rivet harness still fit). + +## The one screen summary + +Rivet splits the surface into three planes. The split is the main lesson. + +| Plane | Rivet owns it via | Belongs in our port? | +| --- | --- | --- | +| Runtime / sandbox (where the daemon runs, lifecycle) | `SandboxAgent` + providers (`local`, `daytona`, `e2b`, `docker`, ...) | Yes, as the environment seam | +| Agent session (prompt, config, events, permissions) | `Session` (`prompt`, `onEvent`, `setModel`, ...) | Yes, this is the heart of the port | +| System (filesystem, process, desktop) | `SandboxAgent.readFsFile` / `runProcess` / `clickDesktop` ... | No. Provisioning only, never exposed to the config author | + +Our current `Harness` port collapses the first two planes into a single blocking +`invoke()` and ignores most of what the session plane offers. 
+ +## Verdicts on the proposed scope + +The starting hypothesis was: sessions, skills, tools, hooks, and attachments belong in +the port; system (filesystem) does not; streaming and session destroy are worth adopting. +Mostly right. The verdicts, with corrections where needed: + +- **Sessions** — adopt. Make a session a first-class object with create, continue, + destroy, and a pluggable persistence driver, the way rivet does. Today a session is + just a `session_id` string and the history is replayed as prompt text. +- **Skills** — adopt, but as config artifacts laid into the workspace, not a new verb. + Rivet exposes `setSkillsConfig(directory, ...)`; the harness reads them from disk. +- **Tools** — adopt and generalize. WP-7 already passes tools as `custom_tools` plus a + callback. Make delivery capability-gated (MCP vs native) instead of `if harness == "pi"`. +- **Hooks** — **correction.** Rivet has no hook API. Hooks are a harness-level concept + (Pi and Claude read them from their own config dirs). Model them as part of the agent + config bundle laid into the workspace, not as a port method rivet would host. +- **Attachments** — adopt. Rivet prompts take ACP content blocks (text, image, audio, + resource, resource_link). Our prompt is a bare string, so images and files cannot pass. +- **System (filesystem etc.)** — correct, keep it out of the `Harness` port. It is part + of the runtime/sandbox provider surface and we already use `writeFsFile`/`mkdirFs` only + to provision (upload AGENTS.md, auth, the extension) on Daytona. +- **Communication / streaming** — adopt. Replace the one-shot string return with a + structured event stream plus a final result, so tracing, multi-message output, and + client streaming all read from one source. +- **Destroy / lifecycle** — adopt. Rivet has `destroySession`, `destroySandbox`, + `pauseSandbox`, `killSandbox`, `dispose`. Our `Runtime.pause` is a no-op stub. + +## What this does not propose + +A rewrite.
The recommendation is a phased evolution (see [`proposal.md`](proposal.md)) +that keeps `/invoke` and `/inspect` working at every step and leaves rivet behind the +port. The folder jail, multi tenant isolation, and the warm shared daemon stay deferred +to [`../wp-8-rivet-acp-runtime/isolation-and-fork.md`](../wp-8-rivet-acp-runtime/isolation-and-fork.md). diff --git a/docs/design/agent-workflows/scratch/harness-port-redesign/implementation.md b/docs/design/agent-workflows/scratch/harness-port-redesign/implementation.md new file mode 100644 index 0000000000..a465409557 --- /dev/null +++ b/docs/design/agent-workflows/scratch/harness-port-redesign/implementation.md @@ -0,0 +1,187 @@ +> Superseded by the as-built design in [the design pages](../../README.md) and [scratch/sdk-local-backend/status.md](../sdk-local-backend/status.md). Kept for history. + +# Implementation notes + +How the approved A to E arc was expected to land in code, with the cold + replay +constraint. This was the working note for that effort. The as-built design later diverged: +the neutral runtime moved into the SDK at `agenta.sdk.agents`, the engine became a `Backend` +class rather than a wire string, and the old `Harness`/`AgentSession` shape was replaced by +the `Backend` / `Environment` / `Harness` / `Session` ports. See the +[design pages](../../README.md) for what shipped. + +## Module layout + +### Python — two packages (the plan at the time) + +The plan was to split the engine-agnostic runtime from the Agenta workflow integration, so +nothing in the runtime was Agenta-specific and the god-module was gone. The as-built design +went further: the runtime moved out of the service entirely and into the SDK at +`sdks/python/agenta/sdk/agents/` (`dtos.py`, `interfaces.py`, `errors.py`, `adapters/`, +`utils/`). The package and file names below are the superseded plan, not what shipped. 
+ +The planned `services/oss/src/harness/` runtime package: + +| File | Holds | +| --- | --- | +| `ports.py` | The neutral types and the two seams. Types: `HarnessCapabilities`, `ContentBlock`, `Message`, `AgentEvent`, `TraceContext`, `ToolCallback`, `SessionConfig`, `AgentRequest`, `AgentResult`. Seams: `Environment` (where it runs) and `Harness` (the agent), plus the concrete `AgentSession`. | +| `transports.py` | The two transports: `SubprocessHarness` (spawn the TS CLI) and `HttpHarness` (POST to the sidecar). Both share `wire.py`. | +| `environment.py` | `LocalEnvironment` (subprocess on this host). | +| `wire.py` | Serializes an `AgentRequest` to the camelCase `/run` JSON and parses an `AgentResult` back. The wire shape lives once. | + +`services/oss/src/agent/` — the Agenta workflow app (was the single `agent.py` god-module): + +| File | Holds | +| --- | --- | +| `app.py` | The `/invoke` handler plus backend selection. Thin: it orchestrates the modules below. | +| `tools.py` | Tool resolution through `/tools/resolve` (and slug parsing). | +| `secrets.py` | Provider keys from the project vault. | +| `tracing.py` | `trace_context` and `record_usage` (the OTel glue). | +| `client.py` | Shared Agenta-backend access (base URL + caller credential). | +| `schemas.py` | The `/inspect` schemas. Gains the permission-policy parameter. | +| `config.py` | The file-backed `AgentConfig` and the TS runner path. | + +This plan modelled the backend engine as a wire string the transport carried, with two +transports rather than per-engine classes. The as-built design rejected that: the engine is +a `Backend` class (`RivetBackend`, `InProcessPiBackend`, `LocalBackend`) that hard-codes its +own engine id and supported harnesses, and the HTTP vs subprocess delivery is a transport +helper each backend takes as a constructor argument. Request parsing also moved onto the +DTOs (`AgentConfig.from_params`, `RunSelection.from_params`) instead of a separate +`inputs.py`. 
+ +### TypeScript (`services/agent/src/`) — grouped by role + +| File | Holds | +| --- | --- | +| `cli.ts`, `server.ts` | The two entrypoints (stdio subprocess, HTTP sidecar). Route to an engine by the request's `backend`. | +| `protocol.ts` | Shared wire types: `AgentRunRequest`, `AgentRunResult`, `AgentEvent`, `ContentBlock`, `HarnessCapabilities`. Both engines import from here. | +| `engines/pi.ts` | Legacy engine: drive the Pi SDK in-process. Returns the enriched result. | +| `engines/rivet.ts` | Rivet engine: drive a harness over ACP. Probes `getAgent(harness).capabilities` and branches on capability flags, not on the harness name. Returns the enriched result, with usage for both Pi and Claude. | +| `tracing/otel.ts` | The Pi-extension tracer and the ACP-event tracer; accumulates the structured event log. | +| `tools/client.ts` | The one `/tools/call` HTTP client. | +| `tools/mcp-bridge.ts`, `tools/mcp-server.ts` | Tool delivery over MCP for non-Pi harnesses. | +| `extensions/agenta.ts` | The Pi extension (tracing + tools), bundled to `dist/extensions/agenta.js`. | + +The folder grouping (entrypoints + contract at the top, `engines/`, `tracing/`, `tools/`, +`extensions/`) replaced a flat `src/` of ten files that had grown one work package at a +time. No behavior change. + +## The seams (the planned shape) + +This was the seam shape this effort planned. The as-built design replaced it: there is no +`invoke` transport verb and no `AgentSession` class. Instead a `Backend` owns the sandbox +and session lifecycle, a `Harness` adapter (`PiHarness`, `ClaudeHarness`) holds the +per-harness mapping over an `Environment`, and a `Session` port is the conversation +(`prompt`, `destroy`). See [ports and adapters](../../ports-and-adapters.md). + +```python +class Harness(ABC): + async def setup(self) -> None: ... + async def shutdown(self) -> None: ... + async def invoke(self, request: AgentRequest, *, on_event=None) -> AgentResult: ... 
+ async def destroy_session(self, session_id: str | None) -> None: ... # cold: no-op + def create_session(self, config: SessionConfig) -> AgentSession: ... + +class AgentSession: # sugar over invoke; the first-class session abstraction + async def prompt(self, messages, *, on_event=None) -> AgentResult: ... + async def destroy(self) -> None: ... +``` + +In this plan `invoke` was the single transport call (one cold run) and `AgentSession` was +the rivet-shaped abstraction on top: `create_session(config)` then `session.prompt(messages)`. +Under cold + replay the session holds no warm daemon; continuation replays the caller-supplied +history into a fresh run, exactly as WP-8 does today. Server-side persisted history is the +deferred Phase C bit (see Deferred below). + +## Capabilities: probed in TS, reported in the result + +A separate capability probe would cost a whole daemon spin-up under the cold model. So the +rivet runner probes `getAgent(harness).capabilities` while its daemon is already up, drives +tool delivery and tracing off the flags (`mcpTools`, `usage`, `streamingDeltas`, ...), and +returns the capabilities in the result. Python keeps a small static table only for input +shaping (for example, do not send image blocks to a harness without `images`). This is +what removes the `if harness == "pi"` branching: the decision moves to where the live +answer is, the TS runner. + +## Wire contract (`/run`) + +Request (camelCase), superset of today: `harness`, `sandbox`, `sessionId`, `agentsMd`, +`model`, `messages` (each `content` is a string or a `ContentBlock[]`), `prompt`, +`secrets`, `tools`, `customTools`, `toolCallback`, `permissionPolicy` (`auto` | `deny`), +`trace`. + +Result: `ok`, `output` (final text), `messages` (structured assistant messages), `events` +(the `AgentEvent` log for the turn), `usage` (`{input, output, total, cost}`, now for the +rivet path too), `stopReason`, `capabilities`, `sessionId`, `model`, `traceId`, `error`. 
+ +## What each phase delivers here + +- **A** capabilities + structured result: `HarnessCapabilities`, the enriched `AgentResult` + (messages, usage, stopReason, capabilities), and capability-driven branching in `runRivet`. +- **B** event stream through the port: `AgentEvent` log on the result, plus an optional + `on_event` callback on `invoke`/`prompt`. The HTTP edge (`/invoke`) stays request and + response; live SSE to the playground is deferred (ties to WP-4). +- **C** first-class sessions: `AgentSession` create / prompt / destroy. Continuation stays + cold + replay with caller-held history. A server-side `SessionStore` is deferred. +- **D** content blocks, permissions, skills, hooks: `ContentBlock` on the turn (text now, + image-ready), `permissionPolicy`, and skills/hooks carried as workspace artifacts. +- **E** retire the exec port: `Runtime` becomes `Environment`; `exec` survives only as the + subprocess transport's mechanism. + +## Verification + +Local CLI runs against real models (2026-06-17), driving `services/agent/src/cli.ts`: + +| Combo | Result | Usage source | Live capabilities | +| --- | --- | --- | --- | +| `pi` (legacy in-process) | reply ok | Pi extension (`otel.usage()`) | mcpTools=false | +| `rivet` + `pi` + `local` | reply ok | extension usage file | probed: mcpTools=false, images=true | +| `rivet` + `claude` + `local` | reply ok | ACP `usage_update` | probed: mcpTools=true, permissions=true | + +The capability probe returns the harness's real flags (Pi and Claude differ), and tool +delivery routes off `mcpTools`. The structured result carries output, messages, events, +usage (token split + cost), stopReason, capabilities, sessionId, model, traceId. Python +compiles and passes `ruff`; TypeScript passes `tsc --strict --noEmit`. 
+ +### Review + +A high-effort recall review (8 finder angles, 36 candidates, single-vote verify) found 10 +issues, all fixed and re-verified. The main ones: + +- `usage_update` read non-existent `input`/`output` fields, so the Claude/Codex token split + was always 0. Fixed: read the split from `PromptResponse.usage` in `runRivet`. Verified + Claude now reports input/output (3327/6). +- `Message.to_wire()` crashed on list (content-block) content. Fixed: `Message.from_raw` + coerces blocks into `ContentBlock`; `to_wire` tolerates dicts. Verified a content-block + turn returns cleanly. +- `priorMessages` dropped every prior user turn equal to the prompt, not just the latest. + Fixed: only the latest one is dropped now. +- The legacy Pi engine silently swallowed a `claude`/`daytona` selection. Fixed: + `_select_backend` upgrades to rivet when the harness/sandbox needs it. +- The `/tools/call` client was triplicated across `runPi`, `piExtension`, and + `toolBridgeServer`. Fixed: one shared `toolClient.ts`. +- Dead code removed: the `RunCall` alias and a stale type re-export block. + +### Live verification (dev stack, 2026-06-17) + +Run on the dev box with the agent-pi sidecar and services container reloaded onto this +branch (both bind-mount the repo): + +- **Daytona**: `rivet+pi+daytona` through the live sidecar returned a correct answer in + ~14s with usage read back from the in-sandbox extension file. +- **Full playground run**: the agent app in the `pi-agents` project answered "Hello! The + capital of Germany is Berlin." with status Success, 6.54s, 1.2K tokens. The new + Harness/Sandbox config selectors render from `schemas.py`. +- **Trace nesting**: the trace shows `invoke_agent` nested directly under the `_agent` + workflow root span (same trace, usage propagated). The agent's run joins the `/invoke` + trace as required. + +Remaining manual check: a Composio tool end to end through the playground (the tool +routing is verified by capability; the WP-7 resolution path is unchanged).
+ +## Deferred (documented, not built in this pass) + +- Server-side persisted session history (the `SessionStore` / DB). Today the playground + holds history and replays it; the session abstraction is in place for when we add the store. +- Live SSE streaming to the playground client (the event stream is delivered through the + port as a log + callback; the HTTP edge stays request and response). +- Image content blocks end to end (the type is plumbed; the playground does not send images yet). +- `session/fork`, the folder jail, and the warm shared daemon (all out of scope per WP-8). diff --git a/docs/design/agent-workflows/scratch/harness-port-redesign/plan.md b/docs/design/agent-workflows/scratch/harness-port-redesign/plan.md new file mode 100644 index 0000000000..c23f0ce820 --- /dev/null +++ b/docs/design/agent-workflows/scratch/harness-port-redesign/plan.md @@ -0,0 +1,100 @@ +> Superseded by the as-built design in [the design pages](../../README.md) and [scratch/sdk-local-backend/status.md](../sdk-local-backend/status.md). Kept for history. + +# Build plan + +Scope set by the user (2026-06-17): full A to E arc, cold per invoke (no warm daemon). +See [`status.md`](status.md) for the decisions and [`proposal.md`](proposal.md) for the +target shape. Each phase ships independently and keeps `/invoke` and `/inspect` working. + +Reading key for the file column: `ports.py`, `rivet_harness.py`, `schemas.py`, +`agent.py`, `pi_harness.py`, `pi_http_harness.py` are under `services/oss/src/`; +`runRivet.ts`, `runPi.ts`, `server.ts`, `cli.ts`, `toolBridge*.ts`, `agenta-otel.ts` are +under `services/agent/src/`. + +## Phase A. Capabilities and a structured result + +Goal: kill the `if harness == "pi"` branches and stop flattening the run to one string. 
+ +| Task | Files | +| --- | --- | +| Add a `HarnessCapabilities` dataclass (the rivet `AgentCapabilities` flags we use: `mcp_tools`, `images`, `file_attachments`, `plan_mode`, `reasoning`, `permissions`, `usage`, `streaming_deltas`, `session_lifecycle`) | `ports.py` | +| Probe capabilities once per harness via the rivet SDK `getAgent(id)`; cache; pass to the result | `runRivet.ts` | +| Replace harness-name branches (tools native vs MCP, tracing `emitSpans`) with capability checks | `runRivet.ts` | +| Widen `HarnessResult` / `AgentRunResult` to carry `messages`, `usage`, `tool_calls`, `stop_reason`, `capabilities` (data already accumulates in the event handler) | `ports.py`, `runPi.ts`, `rivet_harness.py` | +| Keep `output` as the derived final string so `/invoke` is unchanged | `agent.py` | + +Done when: a Pi run and a Claude run both return a structured result; no code path reads +`harness == "pi"`; the `/invoke` response body is byte-identical for a simple turn. + +## Phase B. Event streaming through the port + +Goal: forward the rivet `session/update` stream through the port instead of consuming it +privately for tracing. 
+ +| Task | Files | +| --- | --- | +| Define an `AgentEvent` type (variants: `message`, `thought`, `tool_call`, `plan`, `usage`, `done`) mapped from ACP `session/update` | `ports.py`, `runPi.ts` | +| Add an event sink to `invoke` (callback or async generator); tracing reads from it rather than its own `session.onEvent` | `ports.py`, `rivet_harness.py`, `runRivet.ts`, `agenta-otel.ts` | +| Transport: stream events over the `/run` hop (NDJSON or SSE) for the HTTP sidecar; keep a final JSON result frame | `server.ts`, `cli.ts`, `rivet_harness.py` | +| Optional: expose a streaming surface from `agent.py` (feeds WP-4 multi message output); `/invoke` still returns the final message | `agent.py` | + +Done when: tracing is built from the forwarded event stream (no private subscription in +`runRivet.ts`); a caller can observe `message`/`tool_call`/`usage` events live; `/invoke` +still returns one final message. + +## Phase C. First class sessions (cold, replay backed) + +Goal: a real `AgentSession` object backed by persisted history. Continue a conversation by +replaying persisted events into a fresh cold sandbox, not by the caller passing transcript +text and not by a warm ACP `session/load`. 
+ +| Task | Files | +| --- | --- | +| Add `create_session(config) -> AgentSession`, `resume_session(id)`, `AgentSession.prompt(...)`, `AgentSession.destroy()` to the port | `ports.py` | +| Define a `SessionStore` analogue of rivet's `SessionPersistDriver` (`get_session`, `list_events`, `insert_event`); persist the `AgentEvent` stream from Phase B | new module under `services/oss/src/agent_pi/` | +| Implement continuation as replay: on `resume`, load persisted events, rebuild turn context, run in a fresh cold sandbox (replaces `buildTurnText` transcript replay) | `rivet_harness.py`, `runRivet.ts` | +| Wire the store: backend DB on the platform, file standalone (default assumption, open Q3) | `agent.py`, new module | +| Optional: model `session/fork` for "try N variations of a turn" (defer unless a caller exists, open Q5) | `ports.py`, `runRivet.ts` | + +Done when: a second turn against a `session_id` reconstructs context from the store (not +from caller-supplied `messages`); destroying a session drops its history; cold lifecycle +is unchanged (no warm daemon). + +## Phase D. Content blocks, permissions, skills, hooks + +Goal: richer input and the remaining config surface. 
+ +| Task | Files | +| --- | --- | +| Turn `prompt` into ACP content blocks (`text`, `image`, `audio`, `resource`, `resource_link`); gate images/files on `images`/`file_attachments` capability | `ports.py`, `runRivet.ts`, `runPi.ts` | +| Surface attachments in the workflow input schema so the playground can send them | `schemas.py` | +| Add a `permission_policy` to the session config (auto-allow, deny, delegate-to-callback); replace the hardcoded auto-approve | `ports.py`, `runRivet.ts` | +| Optional: surface permission requests as events for human in the loop | `ports.py`, `runRivet.ts`, `agent.py` | +| Add `skills` to the session config, resolved before the run and laid into `cwd` (or via rivet `setSkillsConfig`) | `ports.py`, `rivet_harness.py`, `runRivet.ts` | +| Add `hooks` as config artifacts laid into the workspace / agent dir (not a port verb; same shape as the Pi extension install) | `ports.py`, `runRivet.ts` | + +Done when: an image attachment reaches a capable harness; a deny policy blocks a tool; a +skill file and a hook artifact are present in the run and exercised. + +## Phase E. Retire the `Runtime` exec port + +Goal: fold "where it runs" fully into the environment seam backed by rivet providers. 
+ +| Task | Files | +| --- | --- | +| Rename/replace `Runtime` with an `Environment` seam (`start`, `dispose`, `destroy`, `pause`, provisioning `put_file`); back lifecycle with `destroySandbox`/`dispose`/`pauseSandbox` | `ports.py`, `rivet_harness.py`, `local_runtime.py` | +| Move provisioning (AGENTS.md, auth, extension upload) behind `Environment.put_file` | `runRivet.ts`, `rivet_harness.py` | +| Keep `exec` only while the legacy in-process Pi subprocess transport needs it; otherwise remove | `ports.py`, `pi_harness.py` | +| Update `_build_harness` to construct the environment from provider config, not an exec runtime | `agent.py` | + +Done when: the rivet path no longer depends on `Runtime.exec`; lifecycle calls map to +rivet provider lifecycle; the legacy Pi path still runs or is explicitly retired. + +## Cross cutting + +- **Legacy adapters.** `PiHarness` and `PiHttpHarness` must satisfy the widened port at + each phase, or be adapted behind a shim. Decide per phase whether to keep them. +- **Tracing.** The `createRivetOtel` event-stream tracer is the reference consumer of the + Phase B stream; keep its output stable so existing traces do not regress. +- **No regressions to `/invoke` / `/inspect`.** Verify after every phase with a live + playground run (the WP-8 verification path). diff --git a/docs/design/agent-workflows/scratch/harness-port-redesign/proposal.md b/docs/design/agent-workflows/scratch/harness-port-redesign/proposal.md new file mode 100644 index 0000000000..9d6c78f89a --- /dev/null +++ b/docs/design/agent-workflows/scratch/harness-port-redesign/proposal.md @@ -0,0 +1,171 @@ +> Superseded by the as-built design in [the design pages](../../README.md) and [scratch/sdk-local-backend/status.md](../sdk-local-backend/status.md). Kept for history. + +# Proposal: evolve the ports toward a session shaped seam + +## Principle + +Borrow rivet's vocabulary, keep the neutral seam. 
Rivet stays one adapter behind the +port so the legacy in process Pi path and any future non rivet harness still fit. We are +not adopting the rivet SDK as our public interface. We are reshaping our port so the rich +session that rivet already gives us stops getting flattened to a string at the boundary. + +Three moves carry most of the value: + +1. Split the port into an **Environment** seam (where it runs, its lifecycle) and an + **AgentSession** seam (the conversation), matching rivet's plane A and plane B. +2. Make the turn call **event shaped**: stream structured events, return a structured + result. Stop returning one string. +3. Make a **session a first class object** with create, continue, destroy, backed by a + persistence driver, so "continue" uses ACP `session/load` instead of replaying + transcript text. + +Everything else (capabilities, content blocks, permissions, skills, lifecycle) hangs off +those three. + +## Target shape (conceptual, Python) + +Not final signatures. The intent, so the phased plan has a destination. + +```python +# Plane A: where the agent runs and its lifecycle. Rivet providers live below this. +class Environment(ABC): + async def start(self) -> None: ... + async def dispose(self) -> None: ... + async def destroy(self) -> None: ... # tear the sandbox down + async def pause(self) -> None: ... # optional, provider dependent + # provisioning only, never exposed to the agent author: + async def put_file(self, path: str, body: bytes) -> None: ... + +# Capabilities the runtime probed from the harness (rivet AgentCapabilities). +@dataclass +class HarnessCapabilities: + mcp_tools: bool = False + images: bool = False + file_attachments: bool = False + plan_mode: bool = False + reasoning: bool = False + permissions: bool = False + usage: bool = False + session_lifecycle: bool = False + streaming_deltas: bool = False + # ... the rest of the 18 flags + +# Plane B: the agent conversation. 
+class AgentSession(ABC): + id: str + capabilities: HarnessCapabilities + + async def prompt(self, blocks: list[ContentBlock]) -> AsyncIterator[AgentEvent]: ... + async def destroy(self) -> None: ... + # config the harness honors (each is capability gated): + async def set_model(self, model: str) -> None: ... + async def set_mode(self, mode: str) -> None: ... + async def on_permission(self, request: PermissionRequest) -> PermissionReply: ... + +class Harness(ABC): + async def get_capabilities(self) -> HarnessCapabilities: ... + async def create_session(self, config: SessionConfig) -> AgentSession: ... + async def resume_session(self, session_id: str) -> AgentSession: ... +``` + +`SessionConfig` is the agent config bundle: `agents_md`, `model`, `skills`, `tools` +(definition plus body plus delivery), `mcp`, `hooks` (as artifacts), `harness`, +`permission_policy`. `ContentBlock` mirrors ACP: `text | image | audio | resource | +resource_link`. `AgentEvent` mirrors the `session/update` variants: +`message`, `thought`, `tool_call`, `plan`, `usage`, `done`. + +## Field by field: where today's fields go + +| Today (`HarnessRequest`) | Tomorrow | +| --- | --- | +| `agents_md` | `SessionConfig.agents_md` (still written as `AGENTS.md`) | +| `model` | `SessionConfig.model`, applied via `set_model` (capability gated) | +| `prompt` | a `text` content block in `prompt(blocks)` | +| `messages` | prior turns become `session/load` replay, not transcript text | +| `session_id` | `resume_session(id)` returning an `AgentSession` | +| `tools` / `custom_tools` / `tool_callback` | `SessionConfig.tools`, delivered by capability (MCP vs native) | +| `trace` | unchanged; still injected at the environment's birth | +| (new) attachments / images | `image` / `resource` content blocks | +| (new) per harness behavior | `HarnessCapabilities` instead of `if harness == "pi"` | + +`HarnessResult.output` becomes the terminal `done` event plus the accumulated `message` +events. 
The single string is still trivially derivable for `/invoke`'s current response. + +## How each piece maps to rivet + +- **Sessions** → `createSession` / `resumeSession` / `resumeOrCreateSession` / + `destroySession`, plus a `SessionPersistDriver`. Adopt the persist driver interface + shape so the platform backs it with Postgres and a standalone run backs it with a file, + exactly as rivet already splits in memory vs Postgres. +- **Streaming** → `session.onEvent`. `runRivet.ts` already subscribes for tracing + (`otel.handleUpdate`). The change is to forward those events through the port instead of + consuming them privately and returning a string. +- **Capabilities** → `getAgent().capabilities`. Probe once per harness, cache, branch on + flags. +- **Attachments** → ACP content blocks on `prompt`. Gate on `fileAttachments` / `images`. +- **Skills** → `setSkillsConfig(directory, ...)` or laid into `cwd` as files. Part of + `SessionConfig`, resolved before the run like AGENTS.md. +- **Tools** → keep WP-7's definition plus body plus callback. Deliver over MCP when + `mcpTools` is set, native when the harness wants native (today's Pi extension path). +- **Hooks** → **not a rivet call.** Lay them into the workspace or agent dir as artifacts, + the way we already install the Pi tracing extension. Model `hooks` as files in + `SessionConfig`, not a port verb. +- **Permissions** → `onPermissionRequest` / `respondPermission`. Replace the hardcoded + auto approve with a `permission_policy` on `SessionConfig` (auto allow, deny, or + delegate to a callback), and later surface requests as events for true human in the + loop. +- **Lifecycle / destroy** → `Environment.destroy` / `dispose` and `AgentSession.destroy`, + mapping to `destroySandbox` / `dispose` / `destroySession`. Retire the `Runtime.pause` + no-op or back it with `pauseSandbox` where the provider supports it. + +## What stays out of the port + +The system plane: filesystem, process, desktop. 
We use `writeFsFile` / `mkdirFs` only to +provision a Daytona sandbox (upload AGENTS.md, auth, the extension). Keep that inside the +`Environment` adapter as provisioning. Never surface it to the agent config author. The +agent author sees AGENTS.md, skills, tools, model, harness, attachments. Not a filesystem. + +## Phased path (each phase ships and keeps `/invoke` working) + +The phases are ordered by value over risk. Stop wherever the payoff flattens. + +- **Phase A. Capabilities and structured result.** Probe `getAgent().capabilities`, + thread a `HarnessCapabilities` object through, and replace the `harness == "pi"` + branches in `runRivet.ts` with capability checks. Widen `HarnessResult` to carry + `messages`, `usage`, `tool_calls`, `stop_reason` (the data is already in the event + stream). Low risk, immediately removes brittle harness name checks. + +- **Phase B. Event streaming through the port.** Add an event channel to `invoke` + (callback or async generator) carrying the `session/update` variants. Tracing reads from + it instead of a private subscription. `/invoke` still returns the final message, so the + HTTP contract is unchanged; client side streaming (WP-4) becomes a small add on. + +- **Phase C. First class sessions.** Introduce `create_session` / `resume_session` / + `destroy` and a `SessionPersistDriver` analogue. Continue a conversation with ACP + `session/load` instead of `buildTurnText` transcript replay. This needs the warm daemon + decision (see open questions) because cold per invoke sandboxes cannot hold a session + across turns without replay. + +- **Phase D. Content blocks, permissions, skills, hooks.** Turn `prompt` into content + blocks (attachments, images). Add `permission_policy`. Move skills and hooks into + `SessionConfig` as resolved artifacts. + +- **Phase E. Retire the `Runtime` exec port.** Fold "where it runs" fully into the + `Environment` seam backed by rivet providers. 
Keep `exec` only as long as the legacy + subprocess Pi transport needs it. + +## Risks and caveats + +- **Cold per invoke lifecycle fights first class sessions.** Phase C is the moment to + decide warm vs cold (the WP-8 status calls this out). First class sessions and ACP + `session/load` want a daemon that survives between turns, which reopens the per session + env and folder jail questions in + [`../wp-8-rivet-acp-runtime/isolation-and-fork.md`](../wp-8-rivet-acp-runtime/isolation-and-fork.md). +- **Harness capability gaps are real.** Pi 0.79.4 has no MCP, so `mcpTools` is false and + Pi tools still go native. The capability model makes that explicit instead of surprising. +- **Usage is harness dependent.** Pi emits no `usage_update` over ACP; Claude does. The + structured result must tolerate missing usage (the WP-8 tracing deviation already notes + this). +- **Neutral seam vs rivet coupling.** Mirroring rivet's names risks the port drifting into + a rivet wrapper. Keep the port types ours (content blocks, events, capabilities as our + dataclasses) and translate in the adapter, so a non rivet harness can still implement it. diff --git a/docs/design/agent-workflows/scratch/harness-port-redesign/research.md b/docs/design/agent-workflows/scratch/harness-port-redesign/research.md new file mode 100644 index 0000000000..7d12e19942 --- /dev/null +++ b/docs/design/agent-workflows/scratch/harness-port-redesign/research.md @@ -0,0 +1,198 @@ +> Superseded by the as-built design in [the design pages](../../README.md) and [scratch/sdk-local-backend/status.md](../sdk-local-backend/status.md). Kept for history. 
+ +# Research: our ports vs the rivet SDK + +Source verified June 2026 against the installed `sandbox-agent@0.4.2` SDK +(`services/agent/node_modules/.pnpm/sandbox-agent@0.4.2.../dist/index.d.ts`), the +`acp-http-client@0.4.2` client, the `@agentclientprotocol/sdk` schema, and our own code +(`services/oss/src/agent_pi/ports.py`, `services/agent/src/runRivet.ts`, +`services/oss/src/agent.py`). Method and type names below are copied from those files. + +## 1. Our current ports + +### `Harness` (`services/oss/src/agent_pi/ports.py`) + +```python +class Harness(ABC): + async def setup(self) -> None: ... + async def invoke(self, request: HarnessRequest) -> HarnessResult: ... + async def shutdown(self) -> None: ... +``` + +`HarnessRequest`: `agents_md`, `model`, `prompt`, `messages`, `session_id`, `tools`, +`custom_tools`, `tool_callback`, `trace`. +`HarnessResult`: `output` (one string), `session_id`, `model`. + +Properties of this port: + +- **One shot and blocking.** One turn in, one string out. No incremental events. +- **Session is a string.** `session_id` is threaded through; "continue" means replaying + prior turns as transcript text inside the prompt (`buildTurnText` in `runRivet.ts`), + not loading an ACP session. +- **No capability model.** The service branches on `harness == "pi"` to decide tools + delivery and tracing (see `runRivet.ts`). +- **Text only.** `prompt` is a string; `messages` are `{role, content: str}`. +- **No permissions, modes, thought level, plan, usage, tool call surfacing.** + +### `Runtime` (`services/oss/src/agent_pi/ports.py`) + +```python +class Runtime(ABC): + async def start(self) -> None: ... + async def shutdown(self) -> None: ... + async def pause(self) -> None: ... # no-op default + async def connect_volume(self, ...) -> None: ... # no-op default + async def exec(self, command, input_bytes, *, cwd, env, timeout) -> ExecResult: ... +``` + +This is a generic "run a subprocess and feed it stdin" port. It predates rivet.
The rivet +path only uses `exec` for the local subprocess transport; the real "where it runs" choice +(local vs daytona) now lives in `runRivet.ts` as the rivet provider. So this port is now +half vestigial. + +### The wire contract (`AgentRunRequest` / `AgentRunResult` in `runPi.ts`) + +Mirrors `HarnessRequest`/`HarnessResult` plus `harness`, `sandbox`, `traceId`. Also one +shot. `/run` returns the final result; no streaming endpoint exists. + +## 2. The rivet SDK surface + +Rivet splits cleanly into three planes. + +### Plane A. Runtime / sandbox: `SandboxAgent` + +The control plane and the environment. + +- Construct and connect: `SandboxAgent.start({ sandbox, persist, replayMaxEvents, replayMaxChars, token, signal })`, `SandboxAgent.connect({ baseUrl })`. +- **Lifecycle:** `dispose()`, `destroySandbox()`, `pauseSandbox()`, `killSandbox()`. +- **Session registry:** `createSession`, `resumeSession`, `resumeOrCreateSession`, + `destroySession`, `listSessions`, `getSession`, `getEvents`. +- **Capability discovery:** `listAgents`, `getAgent` (returns `AgentInfo` with + `capabilities`, `configOptions`, `installed`, `credentialsAvailable`), `installAgent`. +- **Config plane (per directory):** `getSkillsConfig`/`setSkillsConfig`/`deleteSkillsConfig` + and `getMcpConfig`/`setMcpConfig`/`deleteMcpConfig`. + +The sandbox is chosen by a provider passed to `start`: `local`, `daytona`, `e2b`, +`docker`, `vercel`, `cloudflare`, `modal`, `computesdk`, `sprites`. This is the real +environment seam, and it is richer than our `Runtime.exec`. + +### Plane B. Agent session: `Session` + +The agent conversation. This is the heart of what we should adopt. 
+ +```ts +class Session { + prompt(prompt: ContentBlock[]): Promise<PromptResponse>; + setModel(model): ...; setMode(modeId): ...; setThoughtLevel(level): ...; + setConfigOption(id, value): ...; + getConfigOptions(): ...; getModes(): ...; + onEvent(listener): () => void; + onPermissionRequest(listener): () => void; + respondPermission(permissionId, reply): ...; // reply: "once" | "always" | "reject" + rawSend(method, params): ...; // escape hatch +} +``` + +- **Multimodal input.** `prompt` takes ACP content blocks. The block `type` is one of + `text`, `image`, `audio`, `resource`, `resource_link`. Attachments and images ride here. +- **Live structured events.** `onEvent` delivers ACP `session/update` notifications. + The variants (verified in the ACP schema): + + | `sessionUpdate` | Meaning | + | --- | --- | + | `agent_message_chunk` | assistant text delta or snapshot | + | `agent_thought_chunk` | reasoning / thinking | + | `user_message_chunk` | echoed user content | + | `tool_call` / `tool_call_update` | a tool started / progressed / finished | + | `plan` | the agent's plan (plan mode) | + | `available_commands_update` | slash commands available | + | `config_option_update` / `current_mode_update` | config or mode changed mid run | + | `usage_update` | token usage | + | `session_info_update` | session metadata | + +- **Permissions / human in the loop.** `onPermissionRequest` + `respondPermission`. + Today `runRivet.ts` auto approves these; the policy is hardcoded, not expressed in the + port. + +### Plane C. System: filesystem, process, desktop + +`SandboxAgent` also exposes the sandbox internals: `readFsFile`, `writeFsFile`, `mkdirFs`, +`moveFs`, `uploadFsBatch`; `runProcess`, `createProcess`, `followProcessLogs`, +`connectProcessTerminal`; and a full desktop API (mouse, keyboard, screenshot, recording, +WebRTC stream). These are **not** part of the agent config contract. We use a few of them +(`writeFsFile`, `mkdirFs`) only to provision a Daytona sandbox in `runRivet.ts`.
They +belong to the runtime/sandbox adapter, never to the agent author. + +### Persistence and replay + +`SandboxAgent.start({ persist })` takes a `SessionPersistDriver`: + +```ts +interface SessionPersistDriver { + getSession(id): Promise<SessionRecord | undefined>; + listSessions(req?): Promise<ListPage<SessionRecord>>; + updateSession(session): Promise<void>; + listEvents(req): Promise<ListPage<SessionEvent>>; + insertEvent(sessionId, event): Promise<void>; +} +``` + +`InMemorySessionPersistDriver` ships; Postgres is wired in the daemon. A `SessionEvent` +carries `eventIndex`, `sender` ("client" | "agent"), and the ACP `payload`. Replay is +bounded by `replayMaxEvents` / `replayMaxChars`. `runRivet.ts` already constructs an +`InMemorySessionPersistDriver`, but because each invoke is a cold sandbox, it never spans +turns. The continue path falls back to transcript text instead. + +### Capability model: `AgentCapabilities` + +`getAgent(id)` returns capabilities the runtime probed from the harness: + +``` +commandExecution, errorEvents, fileAttachments, fileChanges, images, itemStarted, +mcpTools, permissions, planMode, questions, reasoning, sessionLifecycle, sharedProcess, +status, streamingDeltas, textMessages, toolCalls, toolResults +``` + +This is the clean answer to the `if harness == "pi"` branching we do today. The service +should ask "does this harness support `mcpTools` / `images` / `planMode` / `usage`" and +degrade, rather than hardcode harness names. + +### Session lifecycle in ACP (what the protocol allows) + +The ACP schema defines `session/new`, `session/load` (replay), `session/prompt`, +`session/cancel`, plus `ForkSessionRequest`/`ForkSessionResponse` and +`ResumeSessionRequest`/`ResumeSessionResponse`. **Fork is a first class ACP operation.** +That connects to [`../wp-8-rivet-acp-runtime/isolation-and-fork.md`](../wp-8-rivet-acp-runtime/isolation-and-fork.md): +a forked session is a cheap branch point for "try N variations of a turn", separate from +the filesystem jail discussed there.
+ +### Hooks: not in the SDK + +A grep for `hook` across `sandbox-agent/dist` and `acp-http-client` returns nothing. +Rivet has no hook concept. Hooks exist inside the harnesses (Pi loads extensions and +settings from `~/.pi/agent`; Claude reads its own hook config). So "set up hooks" is not a +rivet control plane call. It is an agent config artifact: files and settings laid into the +workspace or agent dir before the run. Our Pi tracing extension is exactly this shape +already (`installPiExtensionLocal` / `uploadPiExtensionToSandbox` in `runRivet.ts`). + +## 3. Side by side + +| Concern | Our `Harness` port today | Rivet SDK | +| --- | --- | --- | +| Turn call | `invoke(req) -> str` (blocking) | `session.prompt(blocks)` + `onEvent` stream + `PromptResponse` | +| Output | single string | structured events: text, thought, tool calls, plan, usage | +| Session | `session_id` string, transcript replay | `Session` object; create / load / resume / fork / destroy | +| Persistence | none (history held by caller) | `SessionPersistDriver` (in memory or Postgres), bounded replay | +| Input modality | text only | content blocks (text, image, audio, resource, resource_link) | +| Model / mode | `model` field | `setModel`, `setMode`, `setThoughtLevel`, `getConfigOptions` | +| Capabilities | `if harness == "pi"` | `getAgent().capabilities` (18 flags) | +| Tools | `custom_tools` + `tool_callback` | per directory MCP config + capability `mcpTools` | +| Skills | not in port | per directory `setSkillsConfig` (artifacts on disk) | +| Hooks | not in port | not in rivet either; harness config artifacts | +| Permissions | hardcoded auto approve in `runRivet.ts` | `onPermissionRequest` / `respondPermission` policy | +| Environment | `Runtime.exec(cmd, stdin)` | sandbox providers (local, daytona, e2b, docker, ...) 
| +| Lifecycle | `Runtime.pause` no-op stub | `destroySession`, `destroySandbox`, `pauseSandbox`, `killSandbox`, `dispose` | +| System (fs/proc/desktop) | absent (correct) | present on `SandboxAgent`, used only for provisioning | + +The gap is not that our port is wrong. It is that it stops at "send a turn, get text", +while rivet models the whole session as a first class, observable, resumable object. diff --git a/docs/design/agent-workflows/scratch/harness-port-redesign/status.md b/docs/design/agent-workflows/scratch/harness-port-redesign/status.md new file mode 100644 index 0000000000..1f19d84982 --- /dev/null +++ b/docs/design/agent-workflows/scratch/harness-port-redesign/status.md @@ -0,0 +1,76 @@ +> Superseded by the as-built design in [the design pages](../../README.md) and [scratch/sdk-local-backend/status.md](../sdk-local-backend/status.md). Kept for history. + +# Status + +Source of truth for this design effort. Keep it current. + +## Current state + +IMPLEMENTED, reviewed, and verified live (2026-06-17). Draft PR +[#4721](https://github.com/Agenta-AI/agenta/pull/4721), stacked on the WP-8 PR (#4718). +The as-built reference is [`implementation.md`](implementation.md); the comparison is in +[`research.md`](research.md); the recommended shape and phased path are in +[`proposal.md`](proposal.md) and [`plan.md`](plan.md). Builds on the shipped WP-8 runtime +([`../wp-8-rivet-acp-runtime/status.md`](../wp-8-rivet-acp-runtime/status.md)). + +The port this effort shipped (`Environment` + `Harness` + `AgentSession`, capabilities, +content blocks, structured events/result) drove both backends (rivet ACP, in-process Pi) +over a shared wire contract. Verified live: pi, rivet+pi+local, rivet+claude+local, +rivet+pi+daytona; a playground run nests `invoke_agent` under the `/invoke` span with usage. +A high-effort review found and fixed 10 issues. 
+ +The runtime later moved into the SDK at `agenta.sdk.agents` and the ports were reshaped +into `Backend` / `Environment` / `Harness` / `Session`, with the engine modelled as a +`Backend` class rather than a wire string. The names in this status file are from the +original effort. See the [design pages](../../README.md) for the as-built shape. + +## Recommendation in one line + +Evolve the ports in phases (A: capabilities + structured result, B: event streaming, +C: first class sessions, D: content blocks + permissions + skills + hooks, E: retire the +`Runtime.exec` port), keeping rivet behind the seam and `/invoke` working at every step. + +## Decisions taken + +| Decision | Rationale | +| --- | --- | +| Keep a neutral port; rivet stays one adapter behind it | Legacy Pi path and future non rivet harnesses still fit; avoids the port becoming a rivet wrapper | +| Split the port into Environment (plane A) and AgentSession (plane B) | Matches rivet's own split; our single `invoke` collapses both today | +| System plane (fs/process/desktop) stays out of the harness port | It is provisioning, used only by the Environment adapter; never exposed to the agent author | +| Hooks are config artifacts, not a port verb | Rivet has no hook API; hooks live inside the harnesses, read from disk | +| Adopt a capability model over `if harness == "pi"` | Rivet already probes `getAgent().capabilities`; removes brittle name checks | +| Structured result + event stream replace the single string | The data already flows through `runRivet.ts` for tracing; the port flattens it | + +## User decisions (2026-06-17) + +1. **Ambition: full A to E arc.** Plan all five phases, including first class sessions and + retiring the `Runtime.exec` port. See [`plan.md`](plan.md). +2. **Session model: stay cold and replay.** Keep WP-8's one daemon per invoke. Do not + stand up a warm daemon. This avoids the per session env channel and the folder jail. 
+ +### Reconciling "first class sessions" with "stay cold" + +A warm daemon is the usual way to get ACP `session/load`. We are not doing that. So Phase +C gives a first class `AgentSession` object in the **port** backed by a persisted history, +and the adapter implements "continue" by **replaying persisted events into a fresh cold +sandbox** each turn (the WP-8 model, but the history lives in a persistence driver instead +of being passed in by the caller). The session abstraction is real and stable; the +continuation mechanism stays replay. ACP `session/load` is reserved for a future warm +daemon and is explicitly out of scope. + +## Open questions (still need the user) + +3. **Persistence ownership.** Where does the event history live: the backend DB on the + platform, a file standalone, or rivet's own Postgres? Default assumption in + [`plan.md`](plan.md): backend DB on the platform, file standalone, mirroring how WP-8 + framed the history store. +4. **Streaming at the HTTP edge.** Phase B streams events through the port but keeps + `/invoke` request/response. A streaming endpoint (ties into WP-4 multi message output) + is planned as a Phase B option, not a hard requirement. Confirm if wanted now. +5. **Fork.** ACP exposes `session/fork`. Plan treats it as a Phase C optional add for "try + N variations of a turn". Defer unless there is a caller. + +## Next step + +Build plan is in [`plan.md`](plan.md). Phase A is the entry point. Open questions 3 to 5 +do not block Phase A or B; settle them before Phase C. diff --git a/docs/design/agent-workflows/scratch/research/auth-secrets.md b/docs/design/agent-workflows/scratch/research/auth-secrets.md new file mode 100644 index 0000000000..b90af4ace5 --- /dev/null +++ b/docs/design/agent-workflows/scratch/research/auth-secrets.md @@ -0,0 +1,441 @@ +# Research: Auth and Secrets for the pi.dev Agent Harness + +Status: research only. No code changes. 
This file answers the five auth/secrets +questions for the agent-workflows feature (see +[`../README.md`](../README.md)). Every claim is cited. Items I could not verify +from a primary source are marked **UNVERIFIED**. + +## Summary + +- **pi is a local CLI/SDK, not a hosted service.** "pi.dev" is the marketing and + docs site plus a package registry. There is no pi.dev account, no pi-issued API + key, and no pi-managed model gateway. You authenticate to *model providers*, not + to pi. ("Pi is a local coding agent. It runs with the permissions of the user + account that starts it." — `security.md`.) +- **Provider auth is bring-your-own-key (BYOK) or provider OAuth.** pi reaches + OpenAI/Anthropic/etc. with the user's own provider keys, or with a provider's + subscription OAuth (Claude Pro/Max, ChatGPT Plus/Pro (Codex), GitHub Copilot). + Keys live in env vars or `~/.pi/agent/auth.json`. There is no pi gateway in the + middle, though pi can be *pointed at* a gateway you run (Cloudflare AI Gateway, + OpenShell inference routing, a corporate proxy). +- **There is no first-class "secrets vault" in pi core.** pi has an *auth* + concept (provider credentials) and a flexible key-resolution syntax + (`$ENV`, `${ENV}`, `!shell-command`, literal). Anything beyond provider creds is + just environment variables / files the host process already has. The "named + secrets, scoped, agent-never-sees-the-value" feature surfaced in searches is a + set of **third-party community extensions** (e.g. `pi-secret-guard`, + `pi-secured-setup`, `pi-heimdall`, "Greywall"), not pi core. +- **The Codex secret has two shapes.** (a) Keep pi as the harness and use pi's + native `openai-codex-responses` API + the built-in "ChatGPT Plus/Pro (Codex)" + OAuth login — the credential is a pi `OAuthCredentials` object in + `~/.pi/agent/auth.json`. 
(b) Swap the harness to the real **OpenAI Codex CLI** + (`codex exec`), in which case the "codex secret" is either an `OPENAI_API_KEY` + /`CODEX_API_KEY` value or a ChatGPT access token, materialized into + `~/.codex/auth.json` (or `$CODEX_HOME/auth.json`) before the headless run. +- **For the Agenta feature: manage secrets in Agenta and inject them.** pi has no + vault to delegate to. Agenta should store secrets at rest (encrypted), then the + startup/secrets hook lays them into the sandbox as env vars and/or the right + auth file. pi's observability layer is already designed to keep keys/headers/ + payloads out of traces by default — lean on that and verify it. + +## 1. pi.dev auth model + +### Authenticating to pi itself + +There is nothing to authenticate to. pi is installed locally (npm/pnpm/bun/curl) +and runs as the local user. The only network calls pi makes on its own behalf are +version/telemetry pings to `pi.dev`, which are opt-out: + +- `enableInstallTelemetry` -> `https://pi.dev/api/report-install` +- version check -> `https://pi.dev/api/latest-version` +- `PI_OFFLINE=1` / `--offline` disables all startup network ops; + `PI_SKIP_VERSION_CHECK=1` disables the version check; `PI_TELEMETRY=0` disables + the ping. (Source: `settings.md`, `usage.md`.) + +So "auth to pi.dev" is not a concept we need to model. There is no pi account, +no pi org, no pi-issued token. (Source: `security.md`; `pi.dev` landing page.) + +### How pi authenticates to model providers + +Four credential sources, with a defined precedence (highest first). From `sdk.md` (AuthStorage) and +`providers.md`: + +1. CLI `--api-key <key>` flag (or SDK runtime override `setRuntimeApiKey`, not + persisted). +2. `~/.pi/agent/auth.json` entry (API key **or** OAuth tokens). Stored with `0600` + perms. Auth-file entries take priority over env vars. +3. Provider env var (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `GEMINI_API_KEY`, ...). +4. Fallback resolver for custom-provider keys from `models.json`.
+ +`auth.json` is a flat object keyed by provider name. API-key shape +(`providers.md`): + +```json +{ + "anthropic": { "type": "api_key", "key": "sk-ant-..." }, + "openai": { "type": "api_key", "key": "sk-..." } +} +``` + +Provider **OAuth / subscription login** is also first-class. `/login` (interactive) +supports Claude Pro/Max, **ChatGPT Plus/Pro (Codex)**, and GitHub Copilot. OAuth +tokens auto-refresh and persist in the same `auth.json` as an `OAuthCredentials` +object (`providers.md`, `custom-provider.md`): + +```ts +interface OAuthCredentials { + refresh: string; // refresh token + access: string; // access token (what getApiKey() returns) + expires: number; // ms epoch expiry +} +``` + +So the answer to "pass-through provider keys, a pi-managed gateway, or both?" is: +**pass-through only.** No pi-managed gateway exists. pi *can* be pointed at a +gateway you operate — Cloudflare AI Gateway as a unified-billing/observability +proxy ([issue #3850](https://github.com/earendil-works/pi/issues/3850)), a +corporate proxy via `pi.registerProvider("openai", { baseUrl, headers })` +(`custom-provider.md`), or OpenShell inference routing where the gateway injects +upstream provider creds and the sandbox only sees `https://inference.local` +(`containerization.md`). Those are *your* gateways, not pi's. + +## 2. Provider-key handling and the key-resolution syntax + +This matters because it is how a secret gets indirected instead of pasted as a +literal. `apiKey`, custom header values, and `auth.json` `key` values share one +resolution syntax (`providers.md`, `custom-provider.md`): + +- `!command` at the **start** of the value runs a shell command and uses its + output (e.g. `"!security find-generic-password -ws 'anthropic'"`, or + `"!op read 'op://vault/item/secret'"` for 1Password). +- `$ENV_VAR` and `${ENV_VAR}` interpolate environment variables. +- `$$` -> literal `$`; `$!` -> literal `!`. +- Otherwise the value is a literal. 
+ +Custom providers/proxies can carry secrets in headers using the same syntax: + +```ts +pi.registerProvider("google", { + baseUrl: "https://ai-gateway.corp.com/google", + headers: { "X-Corp-Auth": "$CORP_AUTH_TOKEN" } // env var or literal +}); +``` + +Implication for Agenta: we do **not** have to write raw secrets into pi config +files. We can inject env vars into the sandbox and reference them as `$VAR` in +pi's `auth.json`/provider config, or reference a secrets manager via `!command`. + +## 3. Secrets concept + injection + +### Is there a first-class "secrets" feature in pi core? No. + +pi core has an **auth** concept (provider credentials, above) and project +**trust** (an input-loading guard for `.pi/` resources, not a secret store — +`security.md`). It does **not** ship a named-secret/vault/scoped-secret feature. +The "secrets with a value + allowed host patterns, where the agent never sees the +real value" model that searches surface is from **third-party extensions**, not +Earendil: + +- `pi-secret-guard` — author **acarerdinc**, third-party. Scans `git commit`/ + `git push` via the `tool_call` event and blocks if secrets are detected; + regex + LLM review. (Source: `https://pi.dev/packages/pi-secret-guard` package + page.) This is a *leak-prevention* tool, not a secret *store*. +- `pi-secured-setup`, `pi-heimdall`, "Greywall" — third-party permission/redaction + layers (community blogs; **UNVERIFIED** beyond existence — treat as ecosystem + examples, not core). + +Conclusion: if Agenta wants named, scoped secrets, Agenta owns that. pi gives us +the *injection surface* (env vars, files, `$ENV`/`!cmd` references), not a vault. + +### How secrets reach a pi run and the tools inside it + +Because pi runs as the local user with the local environment, **every secret a +tool sees is whatever is in the process environment / filesystem of the pi +process**. There is no per-tool secret broker in core. 
Built-in tools +(`read`, `write`, `edit`, `bash`, `grep`, `find`, `ls`) and extension tools run +"with the permissions of the pi process" (`security.md`). So a `bash` tool can +read any env var or file the process can. Scope is the *process/sandbox boundary*, +not a pi ACL. + +This is exactly why the Agenta design runs pi in a **sandbox** (Daytona) and uses +**startup hooks** to lay down files then inject secrets — that sandbox *is* the +secret-scoping boundary. pi's own docs say the same: for unattended/untrusted +work, "run pi in a contained environment ... with only the files and credentials +required for the task" and "pass the minimum required API keys or use short-lived +credentials" (`security.md`, `containerization.md`). + +### Where to inject (three concrete options, all supported by pi) + +1. **Env vars in the sandbox** (simplest; matches pi's BYOK model). Set + `OPENAI_API_KEY` etc. in the sandbox env; pi resolves them via precedence rule + #3. The Docker example does exactly this: `docker run -e ANTHROPIC_API_KEY ...` + (`containerization.md`). +2. **`~/.pi/agent/auth.json` file** laid into the sandbox (precedence #2, beats + env). Either literal keys or `$ENV`/`!cmd` indirection. Note the doc warning: + "Mounting your host `~/.pi/agent` exposes host auth and session files to the + container." For a sandbox we generate a fresh `auth.json`, we do not mount the + host's. +3. **Gateway / inference routing** (strongest isolation): the sandbox calls + `https://inference.local` and a gateway injects the real provider key upstream, + so "OpenShell providers can keep raw model API keys outside the sandbox" + (`containerization.md`). This keeps the model key out of the sandbox entirely. + +### Scoping per-agent / per-session + +- **Per-agent**: each agent revision's secrets become that sandbox's env/auth + files. Different agent => different sandbox => different secret set. 
pi's + precedence model means a per-sandbox `auth.json` or per-sandbox env fully + determines what that agent can use. +- **Per-session**: the SDK exposes `authStorage.setRuntimeApiKey(provider, key)` + (runtime override, **not persisted**) and a "custom auth storage location" + (`sdk.md`). A session can be given a short-lived key in memory without writing + it to disk — useful for per-`session_id` credentials that should not outlive the + run. **UNVERIFIED**: exact API for a fully custom per-session AuthStorage path + beyond `setRuntimeApiKey` and the "custom auth storage location" mention. + +## 4. The Codex secret (the swappable-harness question) + +The README says the harness is swappable and could run OpenAI Codex instead of +pi's own loop. There are two genuinely different ways to do this, and the "codex +secret" means something different in each. + +### Option A — keep pi as the harness, talk to the Codex backend through pi + +pi already speaks Codex natively. `custom-provider.md` lists an API type +**`openai-codex-responses`** ("OpenAI Codex Responses API"), and `/login` offers +**"ChatGPT Plus/Pro (Codex)"** OAuth login ("Officially endorsed by OpenAI: Codex +for OSS", per `providers.md`). In this option: + +- The "codex secret" is just a pi credential: either an `OPENAI_API_KEY` (env or + `auth.json` `{"openai": {"type":"api_key","key":"..."}}`) for API-key access, or + a pi `OAuthCredentials` object for ChatGPT-subscription Codex access. +- Injection is identical to any other pi provider (section 3). No separate Codex + install needed. This is the lowest-friction path and stays inside pi's + instrumentation/observability. + +### Option B — swap in the real OpenAI Codex CLI as the harness + +Here pi is replaced (or wrapped) by the `codex` CLI, run headless with +`codex exec`. The "codex secret" is Codex's own credential. 
How Codex authenticates +(OpenAI Codex docs): + +- **ChatGPT login (default)** when no valid session exists — interactive, browser + or device flow. Not suitable headless unless you transplant a token. +- **API key** — recommended for "programmatic Codex CLI workflows, such as CI/CD + jobs" (`developers.openai.com/codex/auth`). +- **Access token** — ChatGPT-workspace token for "trusted, non-interactive + workflows" (`developers.openai.com/codex/enterprise/access-tokens`). + +Credential storage: `~/.codex/auth.json` (plaintext) or an OS keyring, controlled +by `cli_auth_credentials_store` = `file` | `keyring` | `auto`; the file lives +under `CODEX_HOME` (default `~/.codex`). Treat `auth.json` "like a password" +(`developers.openai.com/codex/auth`). + +Headless injection patterns: + +1. **Per-invocation API key (no persisted login):** + ```bash + CODEX_API_KEY=<your-api-key> codex exec --json "your task" + ``` + Set it only for the single invocation, not as a job-level env var, "in workflows + that execute untrusted code" (`developers.openai.com/codex/noninteractive`). +2. **Persisted API-key login (writes `auth.json`):** + ```bash + printenv OPENAI_API_KEY | codex login --with-api-key # reads key from stdin + codex login status # -> "Logged in using an API key - sk-proj-***ABCD1" + ``` + (`developers.openai.com/codex/auth`, simplified.guide.) Note: setting the + `OPENAI_API_KEY` env var **alone does not persist a login** — you must run a + login command or use `CODEX_API_KEY` per invocation. A request to honor + `OPENAI_API_KEY` without writing `auth.json` was closed "not planned" + ([issue #5212](https://github.com/openai/codex/issues/5212)); the documented + workaround is a custom `[model_providers.*]` with `env_key = "OPENAI_API_KEY"`. +3. **ChatGPT access token via stdin (subscription/workspace, headless):** + ```bash + printenv CODEX_ACCESS_TOKEN | codex login --with-access-token + ``` + (`developers.openai.com/codex/auth`.) +4. 
**Transplant a prepared `auth.json`** generated on a machine that did the + browser login, copied into `$CODEX_HOME/auth.json` in the sandbox (SSH/Docker + copy pattern; `developers.openai.com/codex/auth`). + +Custom-provider config (e.g. proxy/Azure) uses `config.toml` with `env_key` so the +secret is never checked into the dotfile (`developers.openai.com/codex/config-advanced`): + +```toml +model = "gpt-5.4" +model_provider = "proxy" + +[model_providers.proxy] +name = "OpenAI using LLM proxy" +base_url = "http://proxy.example.com" +env_key = "OPENAI_API_KEY" +``` + +Useful headless flags: `codex exec --json`, `--output-schema <path>`, +`--ephemeral` (don't persist session files), `--skip-git-repo-check`, +`--ignore-user-config`, `--sandbox <mode>` (`developers.openai.com/codex/noninteractive`, +`/codex/cli/reference`). + +**Gotcha to design around:** Codex's API-key-via-env sign-in is blocked while a +ChatGPT subscription login is active in the same `CODEX_HOME` +([issue #3286](https://github.com/openai/codex/issues/3286)). For deterministic +headless runs, give each agent run a clean `CODEX_HOME` and exactly one credential +mode. + +### Recommendation on the Codex secret + +Model a **harness-typed "codex secret"** in the agent config that can carry either +(i) an OpenAI API key or (ii) a ChatGPT access token, plus a target mode. The +startup/secrets hook then materializes it for whichever harness is selected: + +- pi harness, `openai-codex-responses` -> write to pi `auth.json` / env as the + `openai` credential. +- Codex CLI harness -> either export `CODEX_API_KEY` for the single `codex exec`, + or render a fresh `$CODEX_HOME/auth.json`, or pipe a token to + `codex login --with-access-token`. + +This keeps the secret abstraction harness-agnostic and matches the README's +"swappable harness" requirement. + +## 5. 
Security best practices + +### Keeping secrets out of logs / traces / instrumentation + +pi's observability design (`packages/agent/docs/observability.md`) already treats +this as a first-class concern. pi emits structured lifecycle events +(`pi.agent.prompt`, `pi.ai.provider.request`, `pi.agent.tool_call`, ...) that an +adapter turns into OTel/Sentry spans. The doc defines an explicit allow/deny list: + +- **Safe by default** (emitted): provider, model, API id, session id, entry type, + tool name, status code, stop reason, token counts, costs, durations. +- **Unsafe by default** (NOT emitted): prompts, completions, tool args, tool + results, shell output, file contents, provider request payloads, provider + response bodies, **API keys**, **headers**. "Content capture can be opt-in later + with explicit redaction hooks." + +So if Agenta maps pi observability events to its tracing/instrumentation, secrets +in keys/headers/payloads are excluded by default. **Action for Agenta:** verify our +adapter does not turn on content capture, and confirm we never log resolved +`auth.json` values or the sandbox env. Also: the `before_provider_request` / +`before_provider_payload` hooks can inspect/replace the outgoing payload, which is +the right place to add redaction if we ever capture content +(`packages/agent/docs/hooks.md`, `extensions.md`). + +Additional bleed paths to guard (pi-specific): + +- `!command` key resolution runs a shell; ensure the command itself does not echo + the secret to a place pi captures. +- pi tools include `bash`; agent-run shell output is large and can contain secrets. + pi keeps tool/shell output out of traces by default, but if we surface the + multi-message agent output to users, scrub it. +- Do not mount the host `~/.pi/agent` into the sandbox (would leak host + auth/sessions) — generate fresh files per sandbox (`containerization.md`). 
+ +### Storage at rest + +pi stores provider creds in `~/.pi/agent/auth.json` at `0600` (pi core does +not offer an OS keyring — that's Codex's `cli_auth_credentials_store`, not pi). +**For Agenta:** the agent config carries secrets that get versioned as a workflow +revision, so they must be **encrypted at rest in Agenta's store**, not persisted in +plaintext alongside the rest of the config, and decrypted only at injection time. +pi gives no at-rest encryption beyond file perms, so this is Agenta's +responsibility. Prefer short-lived/scoped credentials where the provider supports +them (pi docs explicitly recommend this for sandboxed runs). + +### How secrets reach the sandbox: env vs file vs API + +Ranked by isolation: + +1. **Gateway / inference routing (best):** raw provider key stays *outside* the + sandbox; sandbox calls `inference.local`; gateway injects upstream + (`containerization.md`). Use when we don't want the model key in the sandbox at + all. +2. **Mounted auth file** (`auth.json` / `$CODEX_HOME/auth.json`): file perms + `0600`, generated per run, removed on teardown. Can use `$ENV`/`!cmd` + indirection so the file holds a reference, not the literal. +3. **Env vars (simplest, matches pi BYOK):** fine inside a per-run sandbox; avoid + job-level env in any context that runs untrusted code (Codex doc warning). + +In all cases the **sandbox is the scope**: one agent/session -> one sandbox -> one +minimal credential set, torn down after the run. + +## Open questions + +- **Per-session custom AuthStorage in pi SDK.** `setRuntimeApiKey` (non-persisted) + and a "custom auth storage location" are documented in `sdk.md`, but the full + API for a per-`session_id` in-memory credential store is not spelled out. + Confirm against `@earendil-works/pi-agent-core` / `pi-coding-agent` types.
+- **Does Agenta want pi-harness Codex (`openai-codex-responses`) or the real Codex + CLI as the swappable harness?** They have different secret shapes and different + instrumentation stories (pi events vs Codex `--json` stream). Decide before + designing the "codex secret" type. +- **Daytona secret primitives.** This file covers pi + Codex. Whether Daytona has + its own secret/env-injection API that the startup hook should use (vs writing + files/env ourselves) is out of scope here — covered by the Daytona research + topic in the README. +- **Codex `CODEX_HOME` isolation per run.** Confirm we give each Codex-harness run + a clean `CODEX_HOME` to avoid the ChatGPT-vs-API-key conflict + ([issue #3286](https://github.com/openai/codex/issues/3286)). +- **Third-party secret extensions.** `pi-secured-setup` / `pi-heimdall` / + "Greywall" exist but are **UNVERIFIED** as to maintenance and fit; do not depend + on them. If we want redaction, build it on the core `before_provider_*` hooks. +- **pi's `enableAnalytics` / `trackingId`.** Opt-in analytics exists + (`PI_EXPERIMENTAL=1` setup). Confirm it is off in our sandbox image so nothing + leaves the box unexpectedly. 
+ +## Sources + +pi.dev (Earendil) — primary: + +- pi.dev landing page — product overview, providers, modes: https://pi.dev +- providers.md (auth.json, provider env vars, /login, OAuth, ChatGPT Plus/Pro + (Codex)): https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/providers.md +- custom-provider.md (registerProvider, apiKey/header syntax, + `openai-codex-responses` API type, OAuthCredentials, authHeader): + https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/custom-provider.md +- security.md (local trust boundary, no built-in sandbox, "minimum credentials"): + https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/security.md +- containerization.md (Docker `-e` keys, Gondolin, OpenShell inference routing): + https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/containerization.md +- settings.md (telemetry endpoints, PI_OFFLINE, analytics, sessionDir): + https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/settings.md +- usage.md (env vars, /login, --api-key): + https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/usage.md +- quickstart.md / index.md (subscription vs API-key first run): + https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/quickstart.md +- extensions.md (events: session_start, tool_call, before_provider_request): + https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md +- sdk.md (AuthStorage precedence, setRuntimeApiKey, custom auth storage): + https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md +- packages/agent/docs/observability.md (safe/unsafe-by-default trace fields): + https://github.com/earendil-works/pi/blob/main/packages/agent/docs/observability.md +- packages/agent/docs/hooks.md (before_provider_request/payload transform hooks): + https://github.com/earendil-works/pi/blob/main/packages/agent/docs/hooks.md +- Cloudflare AI Gateway 
request (gateway is user-operated): + https://github.com/earendil-works/pi/issues/3850 +- pi-secret-guard package page (third-party, author acarerdinc): + https://pi.dev/packages/pi-secret-guard + +OpenAI Codex — primary: + +- Codex authentication (ChatGPT vs API key, auth.json, CODEX_HOME, + cli_auth_credentials_store, --with-api-key, --with-access-token): + https://developers.openai.com/codex/auth +- Codex non-interactive (codex exec, CODEX_API_KEY, --ephemeral, --json, sandbox): + https://developers.openai.com/codex/noninteractive +- Codex CLI reference (flags): https://developers.openai.com/codex/cli/reference +- Codex advanced config (model_providers, env_key): + https://developers.openai.com/codex/config-advanced +- Codex enterprise access tokens: + https://developers.openai.com/codex/enterprise/access-tokens +- Issue #5212 (OPENAI_API_KEY without writing auth.json — closed not planned): + https://github.com/openai/codex/issues/5212 +- Issue #3286 (env API-key sign-in blocked when ChatGPT login active): + https://github.com/openai/codex/issues/3286 + +Secondary / corroborating (not load-bearing): + +- simplified.guide Codex API-key login (codex login --with-api-key, login status): + https://www.simplified.guide/codex/api-key-login +- Mario Zechner (pi author) build notes: https://mariozechner.at/posts/2025-11-30-pi-coding-agent/ diff --git a/docs/design/agent-workflows/scratch/research/daytona-sandbox.md b/docs/design/agent-workflows/scratch/research/daytona-sandbox.md new file mode 100644 index 0000000000..df794d25c8 --- /dev/null +++ b/docs/design/agent-workflows/scratch/research/daytona-sandbox.md @@ -0,0 +1,482 @@ +# Daytona sandbox integration for agent workflows + +Research only. This file documents how the backend would programmatically create a +Daytona sandbox, install and run the pi.dev harness inside it, lay down files, inject +secrets, run the agent, stream output, and tear down. Every claim is cited. 
Items I could +not confirm from a primary source are marked UNVERIFIED. + +Context: see [`../README.md`](../README.md). Agents run on a pi.dev harness inside a +Daytona sandbox ("or any provider that works with our port"). Startup hooks lay down +config files, then inject secrets. + +## Summary + +- Daytona is an open-source (AGPL 3.0) "secure and elastic infrastructure for running + AI-generated code." Sandboxes are isolated machines with their own kernel, filesystem, + and network. It advertises sandbox start "under 90ms from code to execution." + [README](https://github.com/daytonaio/daytona), [docs](https://www.daytona.io/docs/en/). +- There is a first-class **Python SDK** (`pip install daytona`, package `daytona`, with + both sync `Daytona` and async `AsyncDaytona` clients), plus TypeScript, Go, Ruby, and + Java SDKs, a REST API, and a CLI. + [Python SDK](https://www.daytona.io/docs/en/python-sdk/), + [docs landing](https://www.daytona.io/docs/en/). +- Lifecycle: `daytona.create(...)` → `sandbox.process.exec(...)` / sessions → + `sandbox.stop()` / `sandbox.delete()`. States are creating/started/stopping/stopped/ + archiving/archived/deleting/deleted/error. Auto-stop (default 15 min), auto-archive + (default 7 days), and auto-delete (off by default) timers manage idle sandboxes. + [Sandboxes](https://www.daytona.io/docs/en/sandboxes/), + [SDK reference](https://www.daytona.io/docs/python-sdk/sync/daytona/). +- **Installing pi**: best fit is to bake pi into a custom **snapshot** (reusable image + template) so cold start does not pay an `npm install`. Build the snapshot from a base + image plus install commands using the **declarative Image builder** or a Dockerfile, or + install pi at runtime via `npm i -g @earendil-works/pi-coding-agent` / + `curl -fsSL https://pi.dev/install.sh | sh`. pi runs headless in print/JSON/RPC modes. 
+ [Snapshots](https://www.daytona.io/docs/en/snapshots/), + [Declarative builder](https://www.daytona.io/docs/en/declarative-builder/), + [pi README](https://github.com/earendil-works/pi/blob/main/packages/coding-agent/README.md). +- **Files**: `sandbox.fs.upload_file` / `upload_files` (in-memory bytes → remote path), + plus `git` clone and mounted **volumes**. **Secrets/env**: `env_vars={...}` at create + time, `env={...}` per `exec`, baked `.env` in the image, or write a `.env`-style file + via the filesystem API. [File system](https://www.daytona.io/docs/en/file-system-operations/), + [SDK reference](https://www.daytona.io/docs/python-sdk/sync/daytona/). +- **Streaming**: run the agent in a **session** with `run_async=True`, then stream + stdout/stderr through `get_session_command_logs_async(session_id, cmd_id, on_stdout, + on_stderr)`. This maps cleanly onto pi's multi-message output if pi runs in JSON/RPC + mode (each emitted JSON line is one log chunk). [Process execution](https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/process-code-execution.mdx). +- **Ports / "works with our port"**: `sandbox.get_preview_link(port)` returns a public URL + `https://{port}-{sandboxId}.proxy.daytona.work` plus an auth `token` (sent as + `x-daytona-preview-token`). Any HTTP port 1–65535 can be previewed. This is the + provider-agnostic "port contract" the design alludes to. + [Preview](https://www.daytona.io/docs/en/preview/). +- **Self-host**: yes, AGPL, via docker-compose (local) or a domain deployment behind + Caddy. Auth is API keys (`DAYTONA_API_KEY`, `X-Daytona-Organization-ID` for JWT) backed + by Dex/Auth0 OIDC. [OSS deployment](https://www.daytona.io/docs/en/oss-deployment/), + [API keys](https://www.daytona.io/docs/en/api-keys/). 
+ +## Daytona SDK and lifecycle (Python, with code) + +### Install and client + +```bash +pip install daytona # package name: "daytona"; module import: "daytona" +``` + +```python +from daytona import Daytona, DaytonaConfig + +# From env vars: DAYTONA_API_KEY, DAYTONA_API_URL, DAYTONA_TARGET +daytona = Daytona() + +# Or explicit config +daytona = Daytona(DaytonaConfig( + api_key="YOUR_API_KEY", + api_url="https://app.daytona.io/api", # point at self-hosted URL for own infra + target="us", +)) +``` + +Async client (recommended for a FastAPI backend): + +```python +from daytona import AsyncDaytona + +async with AsyncDaytona() as daytona: + sandbox = await daytona.create() +``` + +Source: [Python SDK](https://www.daytona.io/docs/en/python-sdk/), +[API keys](https://www.daytona.io/docs/en/api-keys/). + +### Create / exec / stop / delete + +```python +# Create (defaults: python language, 1 vCPU / 1GB RAM / 3GiB disk) +sandbox = daytona.create() + +# Run a command +resp = sandbox.process.exec("echo 'Hello, World!'") +print(resp.result) + +# Stop, then delete (method names per SDK reference and sandboxes doc) +sandbox.stop() +sandbox.delete() +``` + +`Daytona.create()` signatures (note the default 60s creation timeout): + +```python +create(params: CreateSandboxFromSnapshotParams | None = None, + *, timeout: float = 60) -> Sandbox + +create(params: CreateSandboxFromImageParams | None = None, + *, timeout: float = 60, + on_snapshot_create_logs: Callable[[str], None] | None = None) -> Sandbox +``` + +`Sandbox` exposes submodules: `process`, `fs` / `file_system`, `git`, `object_storage`, +`volume`. Source: [SDK reference](https://www.daytona.io/docs/python-sdk/sync/daytona/), +[Sandboxes](https://www.daytona.io/docs/en/sandboxes/). 
+ +### Creation params (the important fields) + +`CreateSandboxFromSnapshotParams` and `CreateSandboxFromImageParams` both inherit +`CreateSandboxBaseParams`: + +- `snapshot: str` (snapshot params) or `image: str | Image` (image params) +- `resources: Resources | None` — only on the image params variant +- `name`, `language` (default `"python"`), `os_user` +- `env_vars: dict[str, str] | None` — **environment variables in the sandbox** +- `labels: dict[str, str] | None` +- `public: bool | None` +- `timeout: float | None` +- `auto_stop_interval: int | None` — minutes; default 15; `0` disables +- `auto_archive_interval: int | None` — minutes; default 7 days; `0` = max +- `auto_delete_interval: int | None` — minutes; off by default; `0` deletes immediately +- `volumes: list[VolumeMount] | None` +- `network_block_all: bool | None`, `network_allow_list: str | None` (CIDRs) +- `ephemeral: bool | None` — sets `auto_delete_interval=0` when True +- `linked_sandbox: str | None` + +Source: [SDK reference](https://www.daytona.io/docs/python-sdk/sync/daytona/). + +## Installing pi (image / snapshot strategy) + +pi.dev (the "pi coding agent") is a minimal, swappable agent harness. Install options +([pi README](https://github.com/earendil-works/pi/blob/main/packages/coding-agent/README.md)): + +```bash +npm install -g --ignore-scripts @earendil-works/pi-coding-agent +# or +curl -fsSL https://pi.dev/install.sh | sh +``` + +Three baking strategies, in order of recommendation for the agent loop: + +### 1. Prebuilt snapshot (recommended) + +A **snapshot** is a reusable sandbox template built from a Docker/OCI image. Bake pi (and +Node) into it once, reuse for every run, and you avoid paying `npm install` on each cold +start. [Snapshots](https://www.daytona.io/docs/en/snapshots/). 
+ +```python +from daytona import Daytona, CreateSnapshotParams, Image, Resources + +daytona = Daytona() + +image = ( + Image.base("node:22-bookworm") + .run_commands("npm install -g --ignore-scripts @earendil-works/pi-coding-agent") + .workdir("/home/daytona") +) + +daytona.snapshot.create( + CreateSnapshotParams( + name="agenta-pi-harness", + image=image, + resources=Resources(cpu=2, memory=4, disk=8), + ), + on_logs=print, # build logs +) +``` + +Then create sandboxes from it (fast path): + +```python +from daytona import CreateSandboxFromSnapshotParams + +sandbox = daytona.create( + CreateSandboxFromSnapshotParams(snapshot="agenta-pi-harness") +) +``` + +CLI equivalents: `daytona snapshot create --image <image>`, +`daytona snapshot create --dockerfile ./Dockerfile`, +`daytona snapshot push --name <name>`, `daytona snapshot list|activate|delete`. + +### 2. Declarative Image built on demand + +Pass an `Image` object straight to `create()` and Daytona builds it on the fly. Good for +iteration, slower than a prebuilt snapshot on first use. +[Declarative builder](https://www.daytona.io/docs/en/declarative-builder/). 
+ +```python +from daytona import CreateSandboxFromImageParams, Image + +image = ( + Image.debian_slim("3.12") + .run_commands( + "apt-get update && apt-get install -y curl", + "curl -fsSL https://pi.dev/install.sh | sh", + ) + .add_local_file("AGENTS.md", "/home/daytona/AGENTS.md") # config files + .env({"PI_HOME": "/home/daytona/.pi"}) + .workdir("/home/daytona") +) + +sandbox = daytona.create( + CreateSandboxFromImageParams(image=image), + timeout=0, # 0 = no timeout while the image builds + on_snapshot_create_logs=print, # stream build logs +) +``` + +Builder methods available: `Image.debian_slim(py_ver)`, `Image.base(ref)`, +`Image.from_dockerfile(path)`, `.pip_install([...])`, +`.pip_install_from_requirements(path)`, `.pip_install_from_pyproject(path, ...)`, +`.run_commands(...)`, `.env({...})`, `.workdir(path)`, `.add_local_file(src, dst)`, +`.add_local_dir(src, dst)`, `.dockerfile_commands([...])`. + +### 3. Install at runtime + +Create a plain sandbox, then `sandbox.process.exec("npm i -g @earendil-works/pi-coding-agent")`. +Simplest but pays install latency on every run; only sensible for prototyping. + +Note on local parity (design requirement): the same `@earendil-works/pi-coding-agent` +package and `AGENTS.md` / skills layout work identically on a developer machine, so a +config pulled from the server runs the same locally. pi resolves `AGENTS.md` from +`~/.pi/agent/agent.md` (global), parent dirs, and cwd; skills live in +`~/.pi/agent/skills/`, `.pi/skills/`, or project dirs. +[pi README](https://github.com/earendil-works/pi/blob/main/packages/coding-agent/README.md). + +## Files + secrets injection + +Order matches the design's startup hooks: files first, secrets second. 
+ +### Files into the sandbox + +In-memory upload (no local temp file needed — good for config blobs pulled from the DB): + +```python +# Single file: source bytes -> remote path +sandbox.fs.upload_file(agents_md_bytes, "/home/daytona/AGENTS.md") + +# Bulk +from daytona import FileUpload +sandbox.fs.upload_files([ + FileUpload(source=agents_md_bytes, destination="/home/daytona/AGENTS.md"), + FileUpload(source=skill_bytes, destination="/home/daytona/.pi/agent/skills/x/SKILL.md"), +]) + +sandbox.fs.create_folder("/home/daytona/.pi/agent/skills", "755") +sandbox.fs.set_file_permissions("/home/daytona/AGENTS.md", "644") +``` + +Source: [File system operations](https://www.daytona.io/docs/en/file-system-operations/). + +Other ways to get files in: `sandbox.git` clone; mounted **volumes** (`VolumeMount`, +shared persistent storage); baking files into the image with `.add_local_file` / +`.add_local_dir`. [Volumes](https://www.daytona.io/docs/en/volumes/) (UNVERIFIED on exact +volume API surface; listed in SDK submodules and snapshots doc). + +### Secrets / env vars + +Several layers, pick by sensitivity and lifetime: + +```python +# A) Whole-sandbox env at creation +sandbox = daytona.create(CreateSandboxFromSnapshotParams( + snapshot="agenta-pi-harness", + env_vars={"OPENAI_API_KEY": "sk-...", "ANTHROPIC_API_KEY": "sk-ant-..."}, +)) + +# B) Per-command env (scoped to one exec) +sandbox.process.exec("echo $CUSTOM_SECRET", env={"CUSTOM_SECRET": "DAYTONA"}) + +# C) Write a .env file via the filesystem API, then have pi/harness read it +sandbox.fs.upload_file(b"ANTHROPIC_API_KEY=sk-ant-...\n", "/home/daytona/.env") +``` + +`env_vars` is a field on `CreateSandboxBaseParams` +([SDK reference](https://www.daytona.io/docs/python-sdk/sync/daytona/)); per-exec `env` +is shown in [process execution](https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/process-code-execution.mdx). 
+pi reads provider keys from standard env vars (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, +etc.), so `env_vars` at create time is the cleanest secret injection path +([pi README](https://github.com/earendil-works/pi/blob/main/packages/coding-agent/README.md)). +The OpenClaw guide confirms the same pattern: extra keys (e.g. `ANTHROPIC_API_KEY`) added +to `.env.sandbox` are loaded into the sandbox +([OpenClaw guide](https://www.daytona.io/docs/en/guides/openclaw/openclaw-sdk-sandbox/)). + +Daytona also has a server-side **secrets** concept (scoped secret injection) referenced in +its security program, but I did not find a dedicated public SDK method for an +organization secret vault; treat that as UNVERIFIED and prefer `env_vars` for now. +[SECURITY.md](https://github.com/daytonaio/daytona/blob/main/SECURITY.md). + +## Process exec + streaming + ports + +### One-shot exec + +```python +resp = sandbox.process.exec("pi -p 'analyze repo'", cwd="/home/daytona", timeout=600) +print(resp.result) # buffered stdout; returned after the command finishes +``` + +`exec` supports `cwd`, `env`, and `timeout`. +[process execution](https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/process-code-execution.mdx). 
+ +### Long-running agent + live stdout/stderr streaming (the agent loop) + +Run the harness async inside a **session** and stream both streams via callbacks: + +```python +import asyncio +from daytona import SessionExecuteRequest + +session_id = "agent-run-" +sandbox.process.create_session(session_id) + +command = sandbox.process.execute_session_command( + session_id, + SessionExecuteRequest( + command="pi --mode json -p 'do the task'", + run_async=True, + ), +) + +logs_task = asyncio.create_task( + sandbox.process.get_session_command_logs_async( + session_id, + command.cmd_id, + lambda chunk: handle_stdout(chunk), # each chunk = pi JSON line(s) + lambda chunk: handle_stderr(chunk), + ) +) + +# Optional interactive input back into the process +sandbox.process.send_session_command_input(session_id, command.cmd_id, "y") + +await logs_task +``` + +This is the recommended shape for the multi-message agent output: run pi in +`--mode json` (or `--mode rpc`), and each emitted JSON line becomes a streamed log chunk +the backend forwards to the client. pi's JSON/RPC event stream emits typed events +(`agent_start`, `message_update` with `text_delta`, `tool_execution_start/update/end`, +`agent_end`), so the backend can map each event to an agent message / tool span for +tracing. RPC framing is strict LF-delimited JSONL — split on `\n` only. +Sources: [process execution](https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/process-code-execution.mdx), +[pi RPC](https://github.com/badlogic/pi-mono/blob/main/packages/coding-agent/docs/rpc.md), +[pi README](https://github.com/earendil-works/pi/blob/main/packages/coding-agent/README.md). + +pi mode summary for headless use: +- `pi -p ""` — print mode, runs once and exits (buffered text). +- `pi --mode json` — same as print but emits all events as JSON lines (best for parsing). 
+- `pi --mode rpc` — bidirectional JSONL over stdin/stdout; send + `{"type":"prompt","message":"..."}`, receive `response` + streamed events; supports + `steer` / `followUp` mid-run, `get_state`, `fork`, `switch_session`. +- Flags: `--provider`, `--model` (or `--model anthropic/claude-opus`), `--name`, + `--no-session`. +[pi RPC](https://github.com/badlogic/pi-mono/blob/main/packages/coding-agent/docs/rpc.md). + +### Ports / preview ("works with our port") + +If the harness or a tool serves HTTP, expose it with a preview link: + +```python +preview = sandbox.get_preview_link(3000) +print(preview.url) # https://3000-.proxy.daytona.work +print(preview.token) # send as header: x-daytona-preview-token +``` + +Any HTTP port 1–65535 is previewable; the port opens automatically if closed. For private +sandboxes the `token` is required (header `x-daytona-preview-token`), and the token resets +when the sandbox restarts, so re-fetch after a restart. This preview/port mechanism is the +provider-agnostic "port contract" the design refers to. A self-hosted deployment serves +the equivalent under `*.proxy.`. +[Preview](https://www.daytona.io/docs/en/preview/), +[Preview & auth](https://www.daytona.io/docs/en/preview-and-authentication/). + +## Cold start, lifecycle states, timeouts, limits + +- **Cold start:** advertised "under 90ms from code to execution" + ([README](https://github.com/daytonaio/daytona)). UNVERIFIED how that interacts with + on-demand image builds; a *prebuilt snapshot* should hit the fast path, whereas building + a declarative `Image` on first `create()` is a separate, slower one-time build. +- **States:** creating, started, stopping, stopped, archiving, archived, deleting, + deleted, error. Archived preserves state cheaply (on object storage); restarting from + archived is slower than from stopped. [Sandboxes](https://www.daytona.io/docs/en/sandboxes/). 
+- **Timeouts / timers:** + - `create(..., timeout=60)` default 60s creation timeout (use `timeout=0` for builds). + - `auto_stop_interval`: default **15 min** of inactivity → stop; `0` disables. + - `auto_archive_interval`: default **7 days** stopped → archive; `0` = max (30 days). + - `auto_delete_interval`: **disabled by default**; `0` = delete immediately on stop; + `-1` disables. `ephemeral=True` sets it to 0. + [SDK reference](https://www.daytona.io/docs/python-sdk/sync/daytona/), + [Sandboxes](https://www.daytona.io/docs/en/sandboxes/). +- **Resources:** default **1 vCPU / 1GB RAM / 3GiB disk**; per-sandbox org max + **4 vCPU / 8GB RAM / 10GB disk**. Set via `Resources(cpu=2, memory=4, disk=8)` on the + from-image path. [Sandboxes](https://www.daytona.io/docs/en/sandboxes/). + +Implication for an agent loop: a long agent run will hit the 15-min auto-stop unless you +raise `auto_stop_interval` or keep the session active; set it explicitly for runs expected +to exceed 15 minutes, and `delete()`/`ephemeral=True` to guarantee teardown. + +## Self-host + auth + +- **Self-hostable:** yes. AGPL 3.0; "free to deploy and run in any environment," + community-supported. If you modify it and expose over a network, AGPL requires releasing + your modifications. [OSS deployment](https://www.daytona.io/docs/en/oss-deployment/). +- **Deploy modes:** local docker-compose, or a domain deployment behind Caddy (TLS, DNS + provider token, ports 80/443/2222, 4GB+ RAM). Components: API (3000, dashboard + REST), + Proxy (4000, preview routing), SSH Gateway (2222), PostgreSQL, Redis, Dex (OIDC), + Registry, MinIO (S3-compatible storage). + ```bash + git clone https://github.com/daytonaio/daytona + docker compose -f docker/docker-compose.yaml up -d # http://localhost:3000 + # or: ./scripts/setup-domain-oss-deployment.sh # guided domain + TLS setup + ``` + Local default login: `dev@daytona.io` / `password` (Dex). 
Domain setup generates + `ENCRYPTION_KEY`, `ENCRYPTION_SALT`, `PROXY_API_KEY`, `RUNNER_API_KEY`, + `SSH_GATEWAY_API_KEY`. Auth0 OIDC is an optional alternative. + [OSS deployment](https://www.daytona.io/docs/en/oss-deployment/). +- **Auth model (API):** API keys created in the Dashboard or via the API; SDK/CLI read + `DAYTONA_API_KEY` (and `DAYTONA_API_URL` to point at self-hosted). JWT-authenticated + requests additionally need `X-Daytona-Organization-ID`. For self-host, set + `api_url` / `DAYTONA_API_URL` to your deployment. + [API keys](https://www.daytona.io/docs/en/api-keys/). + +## Open questions + +- **Snapshot build pipeline ownership.** Who builds/owns the `agenta-pi-harness` snapshot + and how is it pinned/versioned per agent revision? Building a declarative `Image` on the + hot path is slow; we likely need a prebuild step in CI or at config-publish time. +- **Cold start with custom image.** The "<90ms" figure is for sandbox start; the + first-time build of a custom image/snapshot is separate and unmeasured here. UNVERIFIED: + start time from a *prebuilt* pi snapshot vs. the default image. +- **pi output → Agenta tracing mapping.** Which pi events (`message_update`, + `tool_execution_*`) map to Agenta's multi-message output and pi-instruments tracing, and + whether RPC mode (bidirectional, supports steering) or JSON print mode is the better fit + for our streaming endpoint. RPC's "bash output appears in context on the *next* prompt" + semantics needs design attention. +- **Secrets vault.** Whether Daytona exposes a real scoped-secret API beyond `env_vars` + (referenced in SECURITY.md but no public SDK method found). For now `env_vars` at + create time. UNVERIFIED. +- **Provider abstraction.** The design says "any provider that works with our port." The + Daytona preview-URL/port + token model is concrete; a sandbox-provider interface would + need to abstract create/exec/stream/preview across providers (e.g. E2B, Modal). 
Out of + scope here but the port + streaming-logs contract is the seam. +- **Volume API surface.** Exact `VolumeMount` / `daytona.volume` Python API not fully + confirmed here. UNVERIFIED. +- **Long-run auto-stop.** Confirm whether an actively streaming session resets the + `auto_stop_interval` idle timer or whether we must raise it explicitly. UNVERIFIED. + +## Sources + +- Daytona docs landing — https://www.daytona.io/docs/en/ +- Daytona GitHub (README, license, "<90ms") — https://github.com/daytonaio/daytona +- Python SDK overview — https://www.daytona.io/docs/en/python-sdk/ +- Python SDK reference (params, fields, create signatures) — https://www.daytona.io/docs/python-sdk/sync/daytona/ +- Sandboxes (lifecycle, states, resources, timers) — https://www.daytona.io/docs/en/sandboxes/ +- Snapshots (custom images, CLI) — https://www.daytona.io/docs/en/snapshots/ +- Declarative builder (Image API) — https://www.daytona.io/docs/en/declarative-builder/ +- Process & code execution (exec, sessions, async log streaming) — https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/process-code-execution.mdx +- File system operations (upload/download/permissions) — https://www.daytona.io/docs/en/file-system-operations/ +- Preview / ports / token — https://www.daytona.io/docs/en/preview/ +- Preview & authentication — https://www.daytona.io/docs/en/preview-and-authentication/ +- OSS deployment (self-host, components, auth) — https://www.daytona.io/docs/en/oss-deployment/ +- API keys (auth model) — https://www.daytona.io/docs/en/api-keys/ +- SECURITY.md (secrets management mention) — https://github.com/daytonaio/daytona/blob/main/SECURITY.md +- OpenClaw-in-sandbox guide (agent + secrets + preview pattern) — https://www.daytona.io/docs/en/guides/openclaw/openclaw-sdk-sandbox/ +- pi.dev landing — https://pi.dev , https://pi.dev/docs/latest +- pi coding-agent README (install, modes, AGENTS.md, skills) — 
https://github.com/earendil-works/pi/blob/main/packages/coding-agent/README.md +- pi RPC protocol doc (JSONL events, streaming) — https://github.com/badlogic/pi-mono/blob/main/packages/coding-agent/docs/rpc.md +- pi npm package — https://www.npmjs.com/package/@earendil-works/pi-coding-agent diff --git a/docs/design/agent-workflows/scratch/research/diskless-in-memory-config.md b/docs/design/agent-workflows/scratch/research/diskless-in-memory-config.md new file mode 100644 index 0000000000..5eb0848e84 --- /dev/null +++ b/docs/design/agent-workflows/scratch/research/diskless-in-memory-config.md @@ -0,0 +1,461 @@ +> **Historical record.** This is a work-package note. It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md). +# Pi agent harness: diskless / in-memory config + +Research target: Pi coding agent (pi.dev, Earendil Inc.), npm +`@earendil-works/pi-coding-agent`, verified against version **0.79.4** (matches the +version installed by `npm view`). All signatures below are quoted from the published +package's TypeScript declaration files (`dist/**/*.d.ts`), the compiled JS +(`dist/**/*.js`), the bundled SDK examples (`examples/sdk/*.ts`), and the dependency +`@earendil-works/pi-ai@0.79.4`. Source URLs are in the Sources section. + +## Summary / net answer + +**Yes — Pi can run fully diskless with all invocation-specific data in process memory.** +Every invocation-specific input we care about has a confirmed in-memory path: + +- **System prompt / AGENTS.md**: pass as in-memory strings via `DefaultResourceLoader` + (`systemPrompt` / `systemPromptOverride`, `appendSystemPrompt` / + `appendSystemPromptOverride`, `agentsFilesOverride`). No file required. 
+- **Skills**: register in-memory `Skill` objects via `skillsOverride`, or point at an + arbitrary directory via `additionalSkillPaths`. No fixed disk convention required. +- **Provider auth**: `AuthStorage.inMemory()` + `setRuntimeApiKey(provider, key)` (not + persisted), or per-provider env vars. Both confirmed disk-free. +- **Custom tools**: defined in-process via `customTools: ToolDefinition[]` / + `defineTool(...)` or `pi.registerTool(...)` in an inline `extensionFactories` function. + No file. +- **Sessions/state**: `SessionManager.inMemory()` writes nothing. + `SettingsManager.inMemory()` and `ModelRegistry.inMemory()` likewise avoid disk. + +The one thing that is **not** purely in-memory is bash/tool **output spillover**: when a +bash command (or a tool using the output accumulator) exceeds an in-memory byte +threshold, Pi spills the tail to a temp file under `os.tmpdir()`. This is the only +unavoidable write in a headless run that uses the bash/grep/find tools. Point `TMPDIR` +at a tmpfs (or make `/tmp` tmpfs) and it never touches a persistent volume. + +If you drive Pi via the **SDK** (`createAgentSession`) rather than the CLI, you also avoid +startup migrations and the CLI's `agentDir` touches entirely. If you drive it via +`pi --mode rpc`/`--print` (the `main()` CLI entrypoint), redirect `agentDir` and +`sessionDir` to tmpfs and pass `--no-session`. + +--- + +## Per-question findings + +### 1. System prompt / AGENTS.md in memory — CONFIRMED in-memory + +The system prompt and AGENTS.md content are supplied through the `ResourceLoader`, not +through top-level `createAgentSession` options. `DefaultResourceLoaderOptions` exposes +both direct values and override callbacks (quoted from +`dist/core/resource-loader.d.ts`): + +```typescript +export interface DefaultResourceLoaderOptions { + cwd: string; + agentDir: string; + ... 
+ noContextFiles?: boolean; // disable AGENTS.md discovery from disk + systemPrompt?: string; // in-memory base system prompt + appendSystemPrompt?: string[]; // in-memory appended instructions + ... + agentsFilesOverride?: (base: { + agentsFiles: Array<{ path: string; content: string }>; + }) => { agentsFiles: Array<{ path: string; content: string }> }; + systemPromptOverride?: (base: string | undefined) => string | undefined; + appendSystemPromptOverride?: (base: string[]) => string[]; +} +``` + +The `ResourceLoader` interface returns these to the session via +`getSystemPrompt(): string | undefined`, `getAppendSystemPrompt(): string[]`, and +`getAgentsFiles(): { agentsFiles: Array<{ path: string; content: string }> }`. + +**Replace the entire system prompt (in memory)** — from `examples/sdk/03-custom-prompt.ts`: + +```typescript +const loader1 = new DefaultResourceLoader({ + cwd, agentDir, + systemPromptOverride: () => `You are a helpful assistant that speaks like a pirate. +Always end responses with "Arrr!"`, + // Needed to avoid DefaultResourceLoader appending APPEND_SYSTEM.md from ~/.pi/agent or /.pi. + appendSystemPromptOverride: () => [], +}); +await loader1.reload(); +const { session } = await createAgentSession({ + resourceLoader: loader1, + sessionManager: SessionManager.inMemory(), +}); +``` + +**Inject AGENTS.md content in memory** — from `examples/sdk/07-context-files.ts`: + +```typescript +const loader = new DefaultResourceLoader({ + cwd: process.cwd(), agentDir: getAgentDir(), + agentsFilesOverride: (current) => ({ + agentsFiles: [ + ...current.agentsFiles, + { path: "/virtual/AGENTS.md", content: `# Project Guidelines ...` }, + ], + }), +}); +``` + +Note the file comment: "Disable context files entirely by returning an empty list in +`agentsFilesOverride`." (return `{ agentsFiles: [] }`), or set `noContextFiles: true`. 
+ +**Where Pi reads AGENTS.md from disk by default** (so it can be pointed at tmpfs or +disabled): `loadProjectContextFiles({ cwd, agentDir })` walks from `cwd` upward and reads +the `agentDir`. CLI flag to disable: `--no-context-files` (`Args.noContextFiles`). +The CLI also exposes `--system-prompt` and `--append-system-prompt` +(`Args.systemPrompt?: string`, `Args.appendSystemPrompt?: string[]` in +`dist/cli/args.d.ts`), so over RPC/print mode you can pass the prompt as a process arg +(in memory, no file). + +### 2. Skills in memory — CONFIRMED both in-memory registration and arbitrary path + +Skills are normally a **directory-of-files** convention. From `dist/core/skills.d.ts` +(`loadSkillsFromDir` doc comment): + +> Discovery rules: +> - if a directory contains SKILL.md, treat it as a skill root and do not recurse further +> - otherwise, load direct .md children in the root +> - recurse into subdirectories to find SKILL.md + +Default discovery locations (from the docs and `DefaultResourceLoader`): `.pi/skills/`, +`.agents/skills/` (walking up), `~/.agents/skills/`, `~/.pi/agent/skills/`. 
+ +A `Skill` is a plain object, so it can be created **in memory** with no file: + +```typescript +export interface Skill { + name: string; + description: string; + filePath: string; + baseDir: string; + sourceInfo: SourceInfo; + disableModelInvocation: boolean; +} +``` + +**Register an in-memory skill** — from `examples/sdk/04-skills.ts`: + +```typescript +const customSkill: Skill = { + name: "my-skill", + description: "Custom project instructions", + filePath: "/virtual/SKILL.md", + baseDir: "/virtual", + sourceInfo: createSyntheticSourceInfo("/virtual/SKILL.md", { source: "sdk" }), + disableModelInvocation: false, +}; +const loader = new DefaultResourceLoader({ + cwd: process.cwd(), agentDir: getAgentDir(), + skillsOverride: (current) => ({ + skills: [...current.skills, customSkill], + diagnostics: current.diagnostics, + }), +}); +``` + +**Point skills at an arbitrary path**: `DefaultResourceLoaderOptions.additionalSkillPaths?: +string[]` (and `noSkills?: boolean` to disable default discovery). CLI equivalents: +`--skills ` (`Args.skills?: string[]`) and `--no-skills` (`Args.noSkills`). +The lower-level `loadSkills({ cwd, agentDir, skillPaths, includeDefaults })` confirms +`skillPaths` is an explicit list and `includeDefaults` can be turned off. + +Caveat: the skill's `filePath`/`baseDir` only matter if the skill body is read lazily on +invocation. For a fully synthetic in-memory skill you must ensure the content is provided +up front; if Pi reads `filePath` on `/skill:name` invocation it would need that path to +exist. For pure "inject instructions into the system prompt" use, `formatSkillsForPrompt` +uses `name`/`description` and the prompt formatting only. UNVERIFIED whether explicit +`/skill:name` expansion re-reads `filePath` from disk for an SDK-injected synthetic skill; +to be safe, point synthetic skills at a tmpfs path or set +`disableModelInvocation`/use systemPrompt injection instead. + +### 3. 
Provider / LLM auth in memory — CONFIRMED (two disk-free paths; RPC has no credential message) + +**(a) Environment variables.** `@earendil-works/pi-ai@0.79.4` `dist/env-api-keys.js` +contains the canonical provider→env-var map (`getApiKeyEnvVars`). Exact names: + +- anthropic: `ANTHROPIC_OAUTH_TOKEN` (precedence) then `ANTHROPIC_API_KEY` +- openai: `OPENAI_API_KEY` +- google (Gemini): `GEMINI_API_KEY` +- google-vertex: `GOOGLE_CLOUD_API_KEY` (or ADC via `GOOGLE_APPLICATION_CREDENTIALS` + + `GOOGLE_CLOUD_PROJECT`/`GCLOUD_PROJECT` + `GOOGLE_CLOUD_LOCATION`) +- amazon-bedrock: `AWS_PROFILE` | `AWS_ACCESS_KEY_ID`+`AWS_SECRET_ACCESS_KEY` | + `AWS_BEARER_TOKEN_BEDROCK` | ECS/IRSA container creds +- azure-openai-responses: `AZURE_OPENAI_API_KEY` +- xai: `XAI_API_KEY`; groq: `GROQ_API_KEY`; cerebras: `CEREBRAS_API_KEY`; + deepseek: `DEEPSEEK_API_KEY`; mistral: `MISTRAL_API_KEY`; nvidia: `NVIDIA_API_KEY`; + openrouter: `OPENROUTER_API_KEY`; together: `TOGETHER_API_KEY`; + fireworks: `FIREWORKS_API_KEY`; vercel-ai-gateway: `AI_GATEWAY_API_KEY`; + github-copilot: `COPILOT_GITHUB_TOKEN`; huggingface: `HF_TOKEN`; + moonshotai / moonshotai-cn: `MOONSHOT_API_KEY`; kimi-coding: `KIMI_API_KEY`; + zai: `ZAI_API_KEY`; zai-coding-cn: `ZAI_CODING_CN_API_KEY`; + minimax: `MINIMAX_API_KEY`; minimax-cn: `MINIMAX_CN_API_KEY`; + opencode / opencode-go: `OPENCODE_API_KEY`; + cloudflare-workers-ai / cloudflare-ai-gateway: `CLOUDFLARE_API_KEY`; + xiaomi family: `XIAOMI_API_KEY`, `XIAOMI_TOKEN_PLAN_{CN,AMS,SGP}_API_KEY`; + ant-ling: `ANT_LING_API_KEY`. + +**(b) Runtime in-memory setter — CONFIRMED.** `dist/core/auth-storage.d.ts`: + +```typescript +export declare class AuthStorage { + static create(authPath?: string): AuthStorage; + static fromStorage(storage: AuthStorageBackend): AuthStorage; + static inMemory(data?: AuthStorageData): AuthStorage; + /** Set a runtime API key override (not persisted to disk). Used for CLI --api-key flag. 
*/ + setRuntimeApiKey(provider: string, apiKey: string): void; + removeRuntimeApiKey(provider: string): void; + setFallbackResolver(resolver: (provider: string) => string | undefined): void; + ... +} +export declare class InMemoryAuthStorageBackend implements AuthStorageBackend { ... } +``` + +So `setRuntimeApiKey(provider: string, apiKey: string): void` is real (UNVERIFIED in the +original brief — now CONFIRMED). Resolution priority in `getApiKey()`: +1. runtime override (`--api-key` / `setRuntimeApiKey`), 2. `auth.json` API key, +3. `auth.json` OAuth (auto-refreshed), 4. environment variable, 5. fallback resolver. + +`AuthStorage.inMemory()` plus `InMemoryAuthStorageBackend` give a fully in-memory store. +Verified in the compiled `dist/core/auth-storage.js`: every `writeFileSync`/`mkdirSync`/ +`chmodSync` call lives inside `FileAuthStorageBackend` (class starts line 17); the +`InMemoryAuthStorageBackend` class (line 127) performs no filesystem writes. + +From `examples/sdk/09-api-keys-and-oauth.ts`: + +```typescript +// Runtime API key override (not persisted to disk) +authStorage.setRuntimeApiKey("anthropic", "sk-my-temp-key"); +// No models.json - only built-in models +const simpleRegistry = ModelRegistry.inMemory(authStorage); +``` + +**(c) RPC protocol credential message — NOT PRESENT.** The full `RpcCommand` union in +`dist/modes/rpc/rpc-types.d.ts` has no `set_api_key` / `set_credential` / auth message +(commands are: prompt, steer, follow_up, abort, new_session, get_state, set_model, +cycle_model, get_available_models, set_thinking_level, cycle_thinking_level, +set_steering_mode, set_follow_up_mode, compact, set_auto_compaction, set_auto_retry, +abort_retry, bash, abort_bash, get_session_stats, export_html, switch_session, fork, +clone, get_fork_messages, get_last_assistant_text, set_session_name, get_messages, +get_commands). 
**Implication:** in RPC mode, credentials must be supplied at process spawn +— via env vars or the `--api-key`/`--provider` CLI flags (`Args.apiKey`, `Args.provider`). +You cannot inject a key over the JSONL channel after spawn. If you need post-spawn, +in-memory key injection without env vars, drive Pi via the **SDK** and pass a custom +`AuthStorage` instead of RPC mode. + +### 4. Tool auth / custom tools in memory — CONFIRMED in-process, no file + +Custom tools are pure in-process definitions. Two confirmed paths: + +**Via `customTools` on `createAgentSession`** (`dist/core/sdk.d.ts`): + +```typescript +export interface CreateAgentSessionOptions { + ... + /** Custom tools to register (in addition to built-in tools). */ + customTools?: ToolDefinition[]; + ... +} +``` + +A `ToolDefinition` (`dist/core/extensions/types.d.ts`) carries its own `execute(...)` +function — so any auth/config the tool needs is closed over in code, no on-disk config: + +```typescript +export interface ToolDefinition { + name: string; label: string; description: string; + parameters: TParams; // TypeBox schema + execute(toolCallId, params, signal, onUpdate, ctx): Promise>; + ... +} +export declare function defineTool<...>(tool: ToolDefinition<...>): ...; +``` + +**Via inline extension factory + `pi.registerTool`** (`examples/sdk/06-extensions.ts`): + +```typescript +const resourceLoader = new DefaultResourceLoader({ + cwd: process.cwd(), agentDir: getAgentDir(), + extensionFactories: [ + (pi) => { pi.on("agent_start", () => { ... }); }, + ], +}); +// inside an extension: pi.registerTool({ name: "my_tool", label: "My Tool", ... }) +``` + +`ExtensionRunner.registerTool<...>(tool: ToolDefinition<...>): void` is in the type +surface. Both paths require no file: the extension can be an inline function passed in +`extensionFactories`, and tool auth is whatever the closure references (e.g. an HTTP +client back to your backend). 
Built-in tool selection is also code-only via +`tools`/`excludeTools`/`noTools` on `createAgentSession`. + +### 5. Working directory / cwd and state files — what Pi writes, and how to redirect + +**Path knobs (from `dist/config.js`):** + +- `getAgentDir()` returns `process.env.PI_CODING_AGENT_DIR` (expanded) if set, else + `~/.pi/agent`. The env var name is built as + `` `${APP_NAME.toUpperCase()}_CODING_AGENT_DIR` `` with `APP_NAME = "pi"`, i.e. + **`PI_CODING_AGENT_DIR`**. +- Session dir env var **`PI_CODING_AGENT_SESSION_DIR`** (`ENV_SESSION_DIR`), read in + `main.js`. Resolution order in CLI: `--session-dir` flag → `PI_CODING_AGENT_SESSION_DIR` + → settings default. Default session dir: + `getDefaultSessionDir(cwd, agentDir)` = `<agentDir>/sessions/--<encoded-cwd>--/` + (it `mkdirSync`s the dir). +- All other config files hang off `agentDir`: `auth.json`, `models.json`, `settings.json`, + `tools/`, `bin/`, `prompts/`, `themes/`, `sessions/`, and the debug log + `<agentDir>/pi-debug.log`. Redirecting `PI_CODING_AGENT_DIR` moves all of them. + +**SDK-level in-memory replacements (no disk):** + +- `SessionManager.inMemory(cwd?)` — "Create an in-memory session (no file persistence)". + Verified: `SessionManager` only `writeFileSync`s when `this.persist` is true; `inMemory` + sets `persist=false`. +- `SettingsManager.inMemory(settings?)` — no `settings.json` read/write. +- `ModelRegistry.inMemory(authStorage)` — built-in models only, no `models.json`. +- `AuthStorage.inMemory()` / custom `AuthStorageBackend` — no `auth.json`. + +**What Pi writes on its own during a run (headless), and how to neutralize it:** + +| Writer (dist file) | Path | When | Redirect / avoid | +| --- | --- | --- | --- | +| `core/session-manager.js` | `<agentDir>/sessions/...*.jsonl` | every persisted session | `SessionManager.inMemory()` (SDK) or `--no-session` (CLI). Else `PI_CODING_AGENT_SESSION_DIR`→tmpfs. 
| +| `core/bash-executor.js` | `os.tmpdir()/pi-bash-.log` | only when bash output exceeds `DEFAULT_MAX_BYTES` (spillover) | set `TMPDIR` to tmpfs / make `/tmp` tmpfs | +| `core/tools/output-accumulator.js` | `os.tmpdir()/-.log` | tool output spillover above threshold | same (`TMPDIR`→tmpfs) | +| `core/settings-manager.js` | `<agentDir>/settings.json`, `<cwd>/.pi/settings.json` | only on settings change with persistence | `SettingsManager.inMemory()` | +| `core/auth-storage.js` (`FileAuthStorageBackend`) | `<agentDir>/auth.json` | only with file-backed AuthStorage | `AuthStorage.inMemory()` / `setRuntimeApiKey` | +| `core/trust-manager.js` | project trust file under `<cwd>/.pi` / agentDir | only when project-trust resolution runs | avoid project `.pi` resources; SDK path skips trust prompts | +| `core/package-manager.js` | `/tmp/extensions/` | only when installing/loading extension packages | use inline `extensionFactories` (no package install) | +| `core/agent-session-runtime.js` | `/` | only when attaching files + persistence | in-memory session; don't attach files | +| `core/agent-session.js` | export path | only on explicit `exportToHtml`/`exportToJsonl` | don't call exports | +| `utils/tools-manager.js` | `<agentDir>/bin/{rg,fd}` | only if `rg`/`fd` not found in PATH | pre-install ripgrep + fd in the sandbox image (it prefers system binaries in PATH) | +| `migrations.js` (CLI only) | `<agentDir>/auth.json`, `settings.json` | `main()` startup, only if legacy files present | SDK path doesn't call it; or point `PI_CODING_AGENT_DIR` at an empty tmpfs | + +The interactive TUI also writes `pi-debug.log` and reads more of `agentDir`, but those +code paths (`modes/interactive/*`) do not run in `--mode rpc`, `--print`, or the SDK. + +### 6. Net answer — concrete diskless recipe + +**Recommended: drive Pi via the SDK (`createAgentSession`), not the RPC CLI**, because the +SDK lets you inject `AuthStorage`, system prompt, skills, AGENTS.md, and custom tools as +in-memory objects, and skips CLI startup migrations. 
Run many sessions in one shared +sandbox, one `createAgentSession` per invocation, each with its own in-memory loader and +auth. + +Per invocation, in code (all in memory): + +```typescript +const auth = AuthStorage.inMemory(); +auth.setRuntimeApiKey("anthropic", perRunKey); // never persisted + +const loader = new DefaultResourceLoader({ + cwd: perRunWorkdir, // a per-run tmpfs subdir + agentDir: perRunAgentDir, // a per-run tmpfs subdir (or unused) + noContextFiles: true, // ignore on-disk AGENTS.md + systemPrompt: baseSystemPrompt, // in memory + appendSystemPromptOverride: () => [extraInstructions], + agentsFilesOverride: () => ({ agentsFiles: [{ path: "/virtual/AGENTS.md", content: agentsMd }] }), + skillsOverride: (cur) => ({ skills: [...inMemorySkills], diagnostics: cur.diagnostics }), + extensionFactories: [(pi) => { pi.registerTool(myProxyTool); }], +}); +await loader.reload(); + +const { session } = await createAgentSession({ + cwd: perRunWorkdir, + authStorage: auth, + modelRegistry: ModelRegistry.inMemory(auth), + settingsManager: SettingsManager.inMemory(), + sessionManager: SessionManager.inMemory(perRunWorkdir), + resourceLoader: loader, + model: getModel("anthropic", "claude-..."), + customTools: [/* or here instead of via extensionFactories */], +}); +``` + +Environment for the sandbox process: + +- `TMPDIR=/dev/shm/pi-tmp` (or any tmpfs) — captures bash/tool output spillover. +- Optionally `PI_CODING_AGENT_DIR=/dev/shm/pi-agent` and + `PI_CODING_AGENT_SESSION_DIR=/dev/shm/pi-sessions` as a belt-and-suspenders redirect for + any code path that still resolves `agentDir`/`sessionDir`. +- `PI_OFFLINE=1` to suppress version-check network/file activity (optional). +- Provider key via env var (e.g. `ANTHROPIC_API_KEY`) **only if** you use env-var auth + instead of `setRuntimeApiKey`. +- Pre-install `ripgrep` (`rg`) and `fd` in the sandbox image so the `grep`/`find` tools + never trigger a download to `<agentDir>/bin`. 
+ +**What must be a file (therefore tmpfs):** nothing strictly required for config. The only +forced writes are (a) bash/tool **output spillover** to `os.tmpdir()` (point `TMPDIR` at +tmpfs), and (b) any session/settings/auth persistence you opt into — all avoidable with +the `inMemory()` factories. If you instead use `pi --mode rpc`, sessions and `agentDir` +are file-based by default, so you must pass `--no-session` and redirect both env vars to +tmpfs, and you lose post-spawn in-memory key injection (RPC has no auth message). + +**Verdict:** fully diskless (process memory + a tmpfs `TMPDIR`) is achievable via the SDK. +No persistent-volume write is required for prompts, skills, AGENTS.md, auth, tools, or +session state. + +--- + +## Open questions / UNVERIFIED + +- **Synthetic skill body re-read.** Whether an SDK-injected `Skill` whose `filePath` points + at a non-existent `/virtual/SKILL.md` is safe when the model triggers `/skill:name` + expansion (which may re-read `filePath`). The system-prompt listing only needs + `name`/`description`, but explicit invocation might hit disk. Mitigation: put synthetic + skills' `filePath`/`baseDir` on tmpfs, or rely on systemPrompt injection. Confirm by + reading `_expandSkillCommand` in `dist/core/agent-session.js` or testing. +- **`os.tmpdir()` honoring `TMPDIR`.** Node's `os.tmpdir()` respects `TMPDIR` on Linux, so + setting `TMPDIR` to a tmpfs path redirects the spillover files. This is standard Node + behavior, not Pi-specific; verify the sandbox doesn't override `TMPDIR`. +- **OAuth refresh writes.** If you use OAuth credentials (not API keys), token refresh in + `FileAuthStorageBackend` writes back to `auth.json`. With `AuthStorage.inMemory()` / + `InMemoryAuthStorageBackend`, refreshed tokens stay in memory — confirm refresh path + uses the injected backend (it goes through `withLock`/`withLockAsync`, which the + in-memory backend implements). 
+- **`ModelRegistry` provider registration side effects.** `ModelRegistry.inMemory` avoids + `models.json`, but custom provider registration (Bedrock/Vertex) may read other on-disk + creds (`~/.aws`, ADC json). Out of scope if using API-key providers. +- Version drift: verified at 0.79.4. Re-check `rpc-types.d.ts` for an auth message and + `resource-loader.d.ts` option names if upgrading. + +--- + +## Sources + +Primary (package source / types — inspected from the published tarball; equivalent files +on GitHub): + +- `@earendil-works/pi-coding-agent@0.79.4` npm tarball, files: + `dist/core/sdk.d.ts` (`CreateAgentSessionOptions`, `customTools`, `createAgentSession`), + `dist/core/resource-loader.d.ts` (`DefaultResourceLoaderOptions`: `systemPrompt`, + `appendSystemPrompt`, `systemPromptOverride`, `agentsFilesOverride`, `skillsOverride`, + `additionalSkillPaths`, `noContextFiles`, `noSkills`), + `dist/core/auth-storage.d.ts` + `dist/core/auth-storage.js` (`AuthStorage`, + `setRuntimeApiKey`, `inMemory`, `InMemoryAuthStorageBackend`), + `dist/core/session-manager.d.ts` + `.js` (`SessionManager.inMemory`, `getDefaultSessionDir`), + `dist/core/settings-manager.js` (`inMemory`), `dist/core/model-registry.js` (`inMemory`), + `dist/core/skills.d.ts` (`Skill`, `loadSkills`, `loadSkillsFromDir`), + `dist/core/extensions/types.d.ts` (`ToolDefinition`, `defineTool`, `registerTool`), + `dist/config.js` (`getAgentDir`, `ENV_AGENT_DIR=PI_CODING_AGENT_DIR`, + `ENV_SESSION_DIR=PI_CODING_AGENT_SESSION_DIR`, session/auth/bin paths), + `dist/cli/args.d.ts` (`--api-key`, `--system-prompt`, `--append-system-prompt`, + `--no-session`, `--session-dir`, `--skills`, `--no-skills`, `--no-context-files`), + `dist/modes/rpc/rpc-types.d.ts` (full `RpcCommand` union — no auth message), + `dist/core/bash-executor.js` + `dist/core/tools/output-accumulator.js` (tmpdir spillover), + `dist/utils/tools-manager.js` (rg/fd download, prefers system PATH binaries), + `dist/main.js` (`runMigrations`, 
session-dir resolution), + `examples/sdk/03-custom-prompt.ts`, `04-skills.ts`, `05-tools.ts`, `06-extensions.ts`, + `07-context-files.ts`, `09-api-keys-and-oauth.ts`, `11-sessions.ts`. +- `@earendil-works/pi-ai@0.79.4` `dist/env-api-keys.js` — provider→env-var map + (`getApiKeyEnvVars`, `getEnvApiKey`). + +Docs / GitHub (corroborating): + +- SDK reference: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md +- npm: https://www.npmjs.com/package/@earendil-works/pi-coding-agent +- Docs site: https://pi.dev/docs/latest/sdk +- DeepWiki overview: https://deepwiki.com/earendil-works/pi/7.1-pi-coding-agent-sdk diff --git a/docs/design/agent-workflows/scratch/research/open-questions.md b/docs/design/agent-workflows/scratch/research/open-questions.md new file mode 100644 index 0000000000..f1883fd408 --- /dev/null +++ b/docs/design/agent-workflows/scratch/research/open-questions.md @@ -0,0 +1,313 @@ +> **Historical record.** This is a work-package note. It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md). +# Agent Workflows: Daytona and pi.dev due-diligence + +Status: research only. Broad due-diligence to surface what the focused research topics +(interaction API, OTel instrumentation, sandbox creation, auth/secrets, sandbox-sharing) +might miss. Every claim is cited. Items I could not verify from a primary source are +marked UNVERIFIED. Researched 2026-06-15. + +## Summary + +- **pi.dev** is a young but very active open-source (MIT) agent harness from Earendil Inc., + authored by Mario Zechner (GitHub `badlogic`, creator of libGDX). The npm package + `@earendil-works/pi-coding-agent` first published 2026-05-07 and is on **0.79.4** (released + the day of this research), shipping roughly weekly with frequent **breaking changes** in + the 0.x line. 
It runs locally as a CLI/SDK/RPC server; **it does not depend on Daytona**. +- **Daytona** is a mature, well-funded ($5M, Upfront Ventures), SOC-2 open-source (AGPL-3.0) + sandbox platform for running AI-generated code. Sub-90ms container starts, usage-based + pricing, $200 free credits, US/EU regions. The managed cloud is the same codebase as the + OSS repo and can be self-hosted via Docker Compose. +- **Biggest risks for this project:** (1) pi's 0.x velocity and breaking changes mean we + pin a version and budget for upgrade churn; the RPC/SDK contract is pi-specific and + **not** a portable cross-harness standard, so "configurable harness" is an abstraction + *we* own. (2) pi has **no first-party OpenTelemetry**; the only OTel path today is a + third-party community extension. (3) Daytona uses shared-kernel containers (not microVMs), + a weaker isolation story for hostile code; (4) default **15-min auto-stop** can kill + long-running agents mid-run; (5) network egress is restricted by default below Tier 3. + +## Maturity & risk + +**pi.dev** +- Open source, **MIT** license; monorepo `earendil-works/pi` (mirror/origin also seen as + `badlogic/pi-mono`). Packages: `pi-coding-agent` (CLI), `pi-agent-core` (runtime, tool + calling, state), `pi-ai` (unified multi-provider LLM API), `pi-tui` (terminal UI). A + separate `pi-chat` repo does Slack/chat workflows. + [README](https://github.com/earendil-works/pi/blob/main/README.md), + [npm](https://www.npmjs.com/package/@earendil-works/pi-coding-agent) +- Author: **Mario Zechner** (`badlogic`), an experienced OSS developer (libGDX). Earendil Inc. + is the company. + [HN](https://news.ycombinator.com/item?id=46629341), + [GitHub badlogic](https://github.com/badlogic) +- **Very young, very active.** npm package created **2026-05-07**, latest **0.79.4** on + **2026-06-15**. Release cadence is ~weekly (0.75.0 2026-05-17 through 0.79.4 2026-06-15 = + ~15 releases in a month). Still firmly **pre-1.0**. 
+ [npm metadata via `npm view`](https://www.npmjs.com/package/@earendil-works/pi-coding-agent), + [CHANGELOG](https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/CHANGELOG.md) +- **Breaking-change history is real and frequent** (0.x). Recent examples from the changelog: + 0.75.0 raised min Node to 22.19.0 and reworked tool selection from cwd-bound instances to + tool-name allowlists; 0.72.0 replaced `compat.reasoningEffortMap` with `thinkingLevelMap`; + 0.71.0 removed built-in Gemini/Antigravity providers; 0.69.0 migrated TypeBox and + invalidated captured session-bound extension objects. A `legacy-node20` dist-tag (0.74.2) + exists for older Node. + [CHANGELOG](https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/CHANGELOG.md) +- **Lock-in:** low at the model layer (15+ providers, MIT). But the integration surface + (RPC commands/events, extension API, session JSONL format) is **pi-specific** and changes + between minor versions, so coupling to pi is a real cost even though the code is open. +- Community size: hard to quantify; active HN presence, third-party extensions appearing + (otel, sandboxing, oh-my-pi fork). Smaller and newer than Claude Code / Codex ecosystems. + [HN](https://news.ycombinator.com/item?id=47634337) + +**Daytona** +- Open source, **AGPL-3.0**; repo `daytonaio/daytona` reports ~72k stars on the repo page + (other sources cite ~21k — figure is noisy, treat as "large, popular"). 200+ releases, + latest ~v0.187.0 (2026-06-11). Polyglot (TS/Go/Python/Ruby/Java SDKs). + [GitHub](https://github.com/daytonaio/daytona), + [stars/funding search](https://www.daytona.io/dotfiles/daytona-secures-5m-to-simplify-development-environments) +- Company: Ivan Burazin (CEO, ex-Codeanywhere/Infobip), raised **$5M** (Upfront Ventures, + 500 EE). **SOC-2** compliant. 
+ [PRNewswire](https://www.prnewswire.com/news-releases/daytona-secures-5m-to-simplify-development-environments-302181407.html) +- **AGPL note:** the AGPL-3.0 license is copyleft and network-triggered. We consume Daytona + as a hosted service or via SDK over the network (not by linking/modifying its source), so + AGPL obligations should not reach Agenta's own code, but legal should confirm before any + self-host-and-modify path. The cloud and OSS share a codebase, so self-hosting is a real + fallback (Docker Compose stack + customer-managed compute/BYOC). + [GitHub](https://github.com/daytonaio/daytona) + +## Pricing & limits + +**Daytona** (managed cloud, pay-as-you-go, no minimum/commitment): +- vCPU **$0.0504/h**; RAM **$0.0162/h per GiB**; storage **$0.000108/h per GiB** (first 5 GiB + free). Billed per second. GPU: H100 $3.95/h, RTX PRO 6000 $3.03/h. Windows/Android OS + add-ons extra. **$200 free credits** at signup (no card for trial); startups up to $50k. + [Pricing](https://www.daytona.io/pricing), + [pricing search](https://www.morphllm.com/comparisons/daytona-alternative) +- **Cost intuition:** a 1 vCPU / 2 GiB sandbox ≈ $0.0504 + 2×$0.0162 = **~$0.083/h** of + active compute (storage extra). 10 such sandboxes running continuously ≈ **$0.83/h** ≈ + ~$600/mo if never stopped; auto-stop after idle cuts this sharply since CPU/RAM stop + billing while stopped (storage persists). Costs scale with concurrency × active runtime, + not request count. (Derived from the per-hour rates above — arithmetic ours.) +- **Rate limits (per minute, by tier):** Tier1 10k general / 300 create / 10k lifecycle; + Tier2 20k/400/20k; Tier3 40k/500/40k; Tier4 50k/600/50k; Enterprise custom. +- **Resource quotas (per tier):** Tier1 10 vCPU / 20 GiB RAM / 30 GiB disk; Tier2 + 100/200/300; Tier3 250/500/2000; Tier4 500/1000/5000. Concurrency is gated by these + pooled quotas (how many sandboxes run at once depends on each one's size). 
+- **Tier gating:** Tier1 email-verified; Tier2 card + $25 top-up; Tier3 $500 top-up; Tier4 + $2000 top-up / 30 days; Enterprise contact. + [Limits](https://www.daytona.io/docs/en/limits/), + [DeepWiki quotas](https://deepwiki.com/daytonaio/daytona/6.3-resource-quotas-and-limits) + +**pi.dev** +- The harness itself is free/MIT. Cost is the **LLM provider tokens** (BYO key or OAuth to + Claude Pro/Max, ChatGPT/Codex, Copilot, plus API-key providers) plus whatever sandbox you + run it in. No pi-side metering. + [providers search](https://hochej.github.io/pi-mono/coding-agent/rpc/), + [pi.dev](https://pi.dev/) + +## Operational concerns + +**Daytona** +- **Cold start:** advertised sub-90ms sandbox creation (container-based). + [docs overview](https://www.daytona.io/docs), [vstorm](https://oss.vstorm.co/blog/daytona-sub-90ms-code-execution/) +- **Lifecycle/timeouts:** default **auto-stop after 15 min** of inactivity, **auto-archive + after 7 days** stopped; auto-delete configurable. Stopped = storage kept, CPU/RAM freed; + archived = no quota. **Sharp edge:** a long-running process (e.g. a >15-min agent run with + no external interaction) can be auto-stopped mid-run because the process itself does not + count as "activity" — set/extend auto-stop for long agents. + [lifecycle search](https://www.zenml.io/blog/e2b-vs-daytona), + [Northflank](https://northflank.com/blog/daytona-vs-modal) +- **Regions / residency:** shared regions **US** (`us`) and **EU** (`eu`); you can target a + region per sandbox. Custom Regions (BYO runners, full isolation, residency control) are + invite-only/experimental. Some sources note the **managed cloud is effectively single + primary region (us-east-1/iad1)** in practice — UNVERIFIED against official docs, treat + EU availability as "claimed, confirm before relying on it for residency". 
+ [Regions](https://www.daytona.io/docs/en/regions/), + [single-region claim](https://www.zenml.io/blog/e2b-vs-daytona) +- **Networking egress:** per-sandbox network stack with firewall. **Tier 1 & 2: restricted + egress by default; Tier 3 & 4: full internet by default.** Controls: `networkAllowList` + (CIDR, max 10 /32 entries) and `networkBlockAll`. Only Tier 3/4 can change firewall after + creation. All tiers get allowlisted access to npm/PyPI, Docker/k8s registries, + GitHub/GitLab, CDNs, and AI providers (Anthropic/OpenAI/Google). **Implication:** to inject + an arbitrary secret endpoint or call a non-allowlisted internal service, plan for Tier 3+. + [Network limits](https://www.daytona.io/docs/en/network-limits/), + [egress issue](https://github.com/daytonaio/daytona/issues/3357) +- **Isolation:** container-based, with claims of a dedicated kernel, but multiple comparisons note it + shares the host kernel (not Firecracker microVM) — weaker boundary for genuinely hostile + code than E2B/Fly. + [morphllm](https://www.morphllm.com/comparisons/daytona-alternative) + +**pi.dev** +- Runs as a local process; operational profile (cold start, scaling) is whatever sandbox/ + host we run it on. No managed pi runtime to scale or rate-limit. Reliability is a function + of (a) pi's own stability at 0.x and (b) the chosen LLM provider's limits. + +## Local parity + +- **Strong yes — pi is local-first and needs no Daytona.** pi is a CLI/SDK/RPC harness that + runs in any project directory. Four surfaces: interactive TUI, print/JSON event-stream + mode, **RPC mode** (JSONL over stdin/stdout), and a **Node SDK** (`AgentSession`). The same + binary/SDK runs locally or inside a sandbox. 
+ [docs index](https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/docs/index.md), + [RPC docs](https://pi.dev/docs/latest/rpc) +- This makes "pull config from server, run the same harness locally" realistic: the agent + config (AGENTS.md, skills, model, tools, files) maps onto pi's own context model + (AGENTS.md/SYSTEM.md, skills, tool allowlists, presets/extensions). + [overview](https://pi.dev/), [docs index](https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/docs/index.md) +- **What differs local vs sandboxed (the parity gaps we own):** + - **Sandbox/isolation layer.** Server runs pi inside Daytona; local runs pi on the host (or + pi's own local sandbox options: **Gondolin** QEMU micro-VM, plain Docker, OpenShell). + These are pi's *own* local isolation, not Daytona — so the file/secret startup hooks and + the FS/network surface differ between Daytona and a local run unless we replicate them. + [containerization search](https://github.com/pasky/pi-gondolin) + - **Secrets/auth injection.** Server injects secrets via startup hooks into the sandbox; + locally the user supplies keys/OAuth. Parity requires our wrapper to lay down the same + files/env both places. + - **Network egress.** Daytona's tiered firewall has no local equivalent; a tool that works + locally could be blocked in-sandbox below Tier 3. + - **Instrumentation.** OTel is an opt-in extension either way (see below); it is not on by + default, so parity depends on us loading the same extension/config in both modes. +- Net: pi gives genuine local parity for the *agent loop*; the *environment* (sandbox, + secrets, egress, telemetry) is the part Agenta must make identical across local and server. + +## Harness swappability + +- **Important framing:** in pi, "harness" means *the agent loop you customize within pi* + (tools, prompts, auth, event loop), not a pluggable adapter where you drop in Codex or + Claude Code behind a common interface. 
pi's own docs/talks define the harness as "the set + of abstractions which transforms [the] IO machine into an 'agent'" and emphasize + composition *within* pi, not interchangeable backends. + [harness-engineering slides](https://dmg-egg.github.io/slides-harness-engineering-with-pi/) +- pi supports many **models/providers** (Anthropic, OpenAI, Google, Bedrock, Mistral, xAI, + Groq, Cerebras, OpenRouter, Ollama, etc.) and **subscription OAuth** to Claude Pro/Max, + ChatGPT/Codex, and Copilot. But these are *models behind pi's loop*, not separate harnesses + like the Claude Code CLI or Codex CLI. + [providers/RPC search](https://hochej.github.io/pi-mono/coding-agent/rpc/) +- The RPC protocol is rich (85+ commands, ~12 event types incl. `agent_start/end`, + `turn_start/end`, `message_*`, `tool_execution_*`, plus `get_state` exposing `sessionId`, + and `agent_end` carrying **all messages from the run** = the multi-message output). But it + is **pi-specific and unversioned** (no documented stability/deprecation policy), and pi's + own docs say to prefer `AgentSession` directly over the subprocess RPC when embedding in + Node. So it is a good integration surface for pi, **not** a neutral cross-harness standard. + [RPC docs](https://pi.dev/docs/latest/rpc) +- **Conclusion for the design:** "configurable/swappable harness" is **an abstraction Agenta + must own.** If we ever want to run Codex CLI or Claude Code as alternative harnesses, we + define our own port (config in -> sandbox setup -> run -> normalized multi-message output + + session_id + traces out) and write per-harness adapters. pi will be the first and + best-fitting adapter because of its RPC/SDK, but it does not hand us a ready-made + multi-harness interface. + +## Gotchas / sharp edges + +- **pi 0.x churn.** Weekly releases with breaking changes (Node-version bumps, tool-selection + model changes, provider removals, session-object invalidation). Pin an exact version, test + upgrades, watch the changelog. 
+ [CHANGELOG](https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/CHANGELOG.md) +- **No first-party OTel in pi.** The only OpenTelemetry path is a **third-party community + extension** (`mprokopov/pi-otel-telemetry`), which emits one trace tree per prompt (turns, + LLM requests, tool calls) over OTLP. It is unofficial and unversioned against pi; the + instrumentation research topic should treat first-party telemetry as absent today. + [pi-otel repo](https://github.com/mprokopov/pi-otel-telemetry), + [pi-otel writeup](https://nikiforovall.blog/ai/productivity/2026/05/16/pi-otel.html) +- **pi has no built-in permission system / MCP / sub-agents / plan mode** by design — they + are extension territory. Anything we assume "the agent will ask before X" must be added. + [README](https://github.com/earendil-works/pi/blob/main/README.md), + [docs index](https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/docs/index.md) +- **JSONL framing is strict** in RPC mode: split on `\n` only; do not use Node `readline` + (it splits on Unicode separators too) or records corrupt. + [RPC search](https://hochej.github.io/pi-mono/coding-agent/rpc/) +- **Daytona 15-min auto-stop** can kill long agent runs mid-flight (process activity does not + reset the idle timer) — set auto-stop explicitly for agents. + [lifecycle search](https://www.zenml.io/blog/e2b-vs-daytona) +- **Daytona egress is tiered**; below Tier 3 you cannot freely reach arbitrary endpoints and + cannot change the firewall post-creation. Budget for Tier 3 if agents call internal/custom + services. + [Network limits](https://www.daytona.io/docs/en/network-limits/) +- **Daytona shared-kernel isolation** is weaker than microVM competitors for untrusted code. 
+ [morphllm](https://www.morphllm.com/comparisons/daytona-alternative) +- **pi.dev's own sandbox examples (Gondolin/Docker/OpenShell) are local/host-side**, with no + first-party Daytona integration — the pi <-> Daytona glue is ours to build. + [containerization search](https://github.com/pasky/pi-gondolin) + +## Alternatives (fallback landscape — one line each) + +Sandbox providers (alternatives to Daytona): +- **E2B** — Firecracker microVM with a dedicated kernel per sandbox; strongest isolation for + untrusted code. + [morphllm](https://www.morphllm.com/comparisons/daytona-alternative) +- **Modal** — native GPU sandboxes; the pick when agents need inference/GPU in-sandbox. + [morphllm](https://www.morphllm.com/comparisons/daytona-alternative) +- **Fly.io (Machines / "Sprites")** — full filesystem persistence across sessions so agents + resume without rebuilding; Firecracker-based. + [morphllm](https://www.morphllm.com/comparisons/daytona-alternative) +- **Morph** — VM branching/fork in <250ms for parallel exploration of multiple solution paths. + [morphllm](https://www.morphllm.com/comparisons/daytona-alternative) +- **Freestyle** — full root + nested virtualization (Docker-in-VM) for heavy/custom envs. + [morphllm](https://www.morphllm.com/comparisons/daytona-alternative) +- **Vercel Sandbox / Northflank / Cloudflare / microsandbox** — other credible options that + show up in 2026 comparisons; differentiators not deeply verified here. UNVERIFIED specifics. + [comparison](https://northflank.com/blog/ai-sandbox-pricing), + [comparison](https://betterstack.com/community/comparisons/best-sandbox-runners/) + +Harnesses (alternatives to pi.dev): +- **Claude Code** (Anthropic) — the de-facto reference coding agent; more opinionated, larger + ecosystem, less "minimal/composable" than pi. Often cited by pi users as the thing they + came from. 
+ [HN](https://news.ycombinator.com/item?id=47634337) +- **Codex CLI** (OpenAI) — OpenAI's agent CLI; pi can use Codex *as a provider via OAuth*, but + as a *harness* it's a separate tool with its own loop. + [providers search](https://hochej.github.io/pi-mono/coding-agent/rpc/) +- **oh-my-pi** — a community fork of pi adding subagents/LSP/browser/optimized tool harness; + signal that pi's design invites forks, and a possible drop-in if pi mainline diverges. + [oh-my-pi](https://github.com/can1357/oh-my-pi) + +## Open questions (for the focused topics / before committing) + +1. Pin strategy for pi version (exact pin + upgrade cadence) given weekly breaking 0.x + releases. Who owns watching the changelog? +2. Telemetry: do we adopt/fork `pi-otel-telemetry`, or write our own pi extension to emit the + spans Agenta tracing expects? (No first-party OTel exists.) → instrumentation topic. +3. Confirm Daytona EU region + data-residency guarantees against official docs/sales; the + "single-region us-east-1" claim needs verification before we promise EU residency. +4. Decide the default auto-stop / max-run-duration for agent sandboxes so long runs aren't + killed at 15 min. → sandbox-creation topic. +5. Which Daytona tier do we operate on? Egress + post-creation firewall + concurrency quotas + all hinge on Tier 3+. → auth/secrets + sandbox-creation topics. +6. Define Agenta's own harness port (config -> setup -> run -> normalized output + session_id + + traces) since pi gives no neutral multi-harness interface; validate it against pi first, + then sketch a Codex/Claude-Code adapter to prove the abstraction. → pi.dev harness topic. +7. Local-parity contract: which startup hooks (files, secrets, egress, telemetry) must be + replicated locally, and do we reuse pi's Gondolin/Docker locally or run bare on host? + → local-execution topic. +8. AGPL review for any self-hosted-and-modified Daytona path (network copyleft). 
+ +## Sources + +- pi.dev overview — https://pi.dev/ +- pi README — https://github.com/earendil-works/pi/blob/main/README.md +- pi docs index — https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/docs/index.md +- pi coding-agent CHANGELOG — https://raw.githubusercontent.com/earendil-works/pi/main/packages/coding-agent/CHANGELOG.md +- pi npm package — https://www.npmjs.com/package/@earendil-works/pi-coding-agent +- pi RPC docs — https://pi.dev/docs/latest/rpc +- pi RPC (mirror) — https://hochej.github.io/pi-mono/coding-agent/rpc/ +- Harness engineering with pi (slides) — https://dmg-egg.github.io/slides-harness-engineering-with-pi/ +- Mario Zechner GitHub — https://github.com/badlogic +- HN discussion on pi — https://news.ycombinator.com/item?id=47634337 and https://news.ycombinator.com/item?id=46629341 +- pi-otel telemetry extension — https://github.com/mprokopov/pi-otel-telemetry +- pi-otel writeup — https://nikiforovall.blog/ai/productivity/2026/05/16/pi-otel.html +- pi-gondolin sandbox extension — https://github.com/pasky/pi-gondolin +- oh-my-pi fork — https://github.com/can1357/oh-my-pi +- Daytona docs overview — https://www.daytona.io/docs +- Daytona limits — https://www.daytona.io/docs/en/limits/ +- Daytona resource quotas (DeepWiki) — https://deepwiki.com/daytonaio/daytona/6.3-resource-quotas-and-limits +- Daytona regions — https://www.daytona.io/docs/en/regions/ +- Daytona network limits — https://www.daytona.io/docs/en/network-limits/ +- Daytona dynamic egress issue — https://github.com/daytonaio/daytona/issues/3357 +- Daytona pricing — https://www.daytona.io/pricing +- Daytona GitHub — https://github.com/daytonaio/daytona +- Daytona funding (PRNewswire) — https://www.prnewswire.com/news-releases/daytona-secures-5m-to-simplify-development-environments-302181407.html +- Daytona funding (blog) — https://www.daytona.io/dotfiles/daytona-secures-5m-to-simplify-development-environments +- E2B vs Daytona — 
https://www.zenml.io/blog/e2b-vs-daytona +- Daytona vs Modal — https://northflank.com/blog/daytona-vs-modal +- AI sandbox pricing comparison — https://northflank.com/blog/ai-sandbox-pricing +- Daytona alternatives — https://www.morphllm.com/comparisons/daytona-alternative +- Sandbox runners comparison — https://betterstack.com/community/comparisons/best-sandbox-runners/ +- Daytona sub-90ms (vstorm) — https://oss.vstorm.co/blog/daytona-sub-90ms-code-execution/ diff --git a/docs/design/agent-workflows/scratch/research/otel-instrumentation.md b/docs/design/agent-workflows/scratch/research/otel-instrumentation.md new file mode 100644 index 0000000000..5f632e8ca6 --- /dev/null +++ b/docs/design/agent-workflows/scratch/research/otel-instrumentation.md @@ -0,0 +1,379 @@ +# OTel Instrumentation for the pi.dev Agent Harness + +Status: research only. No code changed. Research date: 2026-06-15. + +This file answers the five research questions in the agent-workflows brief: +how to instrument the pi.dev harness with OpenTelemetry (OTel), what already +exists, what span conventions to use, how spans get out of a sandbox, and how +all of that lands in Agenta's existing OTel ingestion. + +## Summary + +- **pi.dev is "Pi", a minimal agent harness by Earendil Inc.** (the company is + "earendil-works" on GitHub, repo `earendil-works/pi`). It is a coding-agent + toolkit: a unified multi-provider LLM API, an agent loop with tool calling, + a TUI, and a CLI. It ships as npm packages `@earendil-works/pi-ai`, + `@earendil-works/pi-agent-core`, `@earendil-works/pi-coding-agent`, + `@earendil-works/pi-tui`. MIT licensed. +- **"pi instruments" is not a built-in OTel exporter.** Pi has no native OTel + emitter in its docs. 
What it has is an **extension event system**: an + extension registers handlers with `pi.on(<eventName>, handler)` and gets + lifecycle events for the agent loop (session, agent_start/agent_end, + turn_start/turn_end, tool_execution_start/end, before_provider_request / + after_provider_response, message_start/message_end). "Instrumentation" = + writing (or installing) an extension that listens to those events and turns + them into OTel spans. There is no first-party Pi telemetry dashboard to + reuse. +- **Three community OTel extensions for Pi already exist** and all emit OTLP: + `maxmalkin/pi-OTEL`, `mprokopov/pi-otel-telemetry`, and the `pi-otel` covered + by the nikiforovall blog. They all use **OTel GenAI semantic conventions** + (`gen_ai.*`), not OpenInference. They are TypeScript Pi extensions. +- **Agenta already ingests exactly this.** Agenta exposes an OTLP/HTTP + protobuf endpoint at `POST /otlp/v1/traces` and normalizes incoming spans + through an adapter registry that already understands **OTel GenAI semconv**, + **OpenLLMetry (Traceloop)**, **OpenInference (Arize)**, **Logfire**, and + **Vercel AI**. A Pi extension that emits `gen_ai.*` spans over OTLP/HTTP to + Agenta's endpoint would flow through the existing pipeline with little or no + new backend code. +- **Recommended path:** emit OTel GenAI-semconv spans from a Pi extension + (fork/reuse one of the three), export OTLP/HTTP to Agenta's + `/otlp/v1/traces` with `Authorization: ApiKey <api-key>` and `?project_id=<project-id>`, + and let the existing GenAI-semconv adapter map them. Add a thin Agenta-side + adapter only if we want richer agent/turn structure than `gen_ai.*` carries. + +## What "pi instruments" is + +**Product.** pi.dev = "Pi", "a minimal agent harness" by Earendil Inc. Tagline +"Adapt Pi to your workflows, not the other way around." Four operating modes: +interactive TUI, print/JSON output, RPC (stdin/stdout JSONL), and an SDK for +embedding in Node.js. 
It deliberately omits MCP, sub-agents, permission popups, +and plan mode from the core, expecting you to add them via extensions. +Source: https://pi.dev/ , https://github.com/earendil-works/pi/blob/main/README.md + +**Packages** (npm, scope `@earendil-works`): +- `pi-ai` — unified multi-provider LLM API (OpenAI, Anthropic, Google, etc.) +- `pi-agent-core` — agent runtime: tool calling + state management +- `pi-coding-agent` — interactive coding-agent CLI +- `pi-tui` — terminal UI library +Source: https://github.com/earendil-works/pi/blob/main/README.md + +**The instrumentation mechanism is the extension event bus, not a built-in +exporter.** Pi's official docs have an "Extensions" page but **no telemetry / +OTel / observability page**. Extensions are TypeScript modules that subscribe +to lifecycle events: + +```ts +pi.on(eventName, async (event, ctx) => { + // ctx is an ExtensionContext: ctx.sessionManager (read-only session), + // ctx.signal (abort-aware), ctx.ui (interaction) +}); +``` + +Events relevant to telemetry (exact names from the Extensions doc): +- Session lifecycle: `session_start` (reasons: startup/reload/new/resume/fork), + `session_shutdown`, `project_trust`, `resources_discover`. +- Agent loop: `before_agent_start`, `agent_start` (once per user prompt), + `agent_end` (has `event.messages`), `turn_start`, `turn_end` (per LLM + response cycle). +- Messages: `message_start`, `message_update`, `message_end` (user, assistant, + tool-result messages). +- Tools: `tool_execution_start` (has `toolCallId`, `toolName`, `args`), + `tool_execution_update`, `tool_execution_end`; plus `tool_call` (pre-exec, + can block) and `tool_result` (post-exec, can modify). +- Provider/model: `before_provider_request` (built payload, before HTTP), + `after_provider_response` (HTTP status/headers, before stream consumed), + `model_select`, `thinking_level_select`. +- Input: `input`, `user_bash`. 
+Source: https://pi.dev/docs/latest/extensions + +So when the agent-workflows README says runs are "instrumented through pi +instruments," concretely that means: **a Pi extension hooks these events and +produces spans/metrics.** There is no proprietary "instruments" object to +adopt; it is the standard extension API. (UNVERIFIED: whether "pi instruments" +is an internal Agenta shorthand for a specific bundled extension vs. the +generic extension mechanism. The public Pi docs only expose `pi.on` + tools.) + +Installation pattern for an extension (from pi-OTEL): +`pi install git:github.com/<owner>/<repo>` (or `pi install npm:<package>`), then +`/reload`. Source: https://github.com/maxmalkin/pi-OTEL + +## Existing libraries + +### Pi-specific OTel extensions (closest fit — reuse candidates) + +All three are TypeScript Pi extensions emitting OTLP and using OTel GenAI +semconv. They differ mainly in span tree shape and whether they also emit +metrics. + +1. **`maxmalkin/pi-OTEL`** — "OpenTelemetry harness for the Pi coding agent." + - Span tree: `pi.session` -> `pi.agent_turn` -> (`gen_ai.chat <model>`, + `tool.<tool>`). + - Attributes follow OTel GenAI semconv. Honors standard OTLP env vars: + `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`, `OTEL_EXPORTER_OTLP_ENDPOINT` + (appends `/v1/traces`), `OTEL_EXPORTER_OTLP_HEADERS` (`k=v,k=v`), + `OTEL_SERVICE_NAME` (default `pi`), `OTEL_RESOURCE_ATTRIBUTES`. + Pi-specific: `PI_OTEL_DISABLED` (default `0`), + `PI_OTEL_CAPTURE_CONTENT` (default `0`, gates prompt/completion/tool I/O). + Same keys accepted in `settings.json` under `otel`. Falls back to + `http://localhost:4318/v1/traces` (OTLP/HTTP). + - Runtime commands: `/otel-status`, `/otel-flush`. + - Source: https://github.com/maxmalkin/pi-OTEL + +2. **`pi-otel` (nikiforovall)** — emits one trace tree per user prompt. + - Span tree: `pi.interaction` (root, per prompt) -> `pi.turn` -> + (`pi.llm_request`, `pi.tool.<tool>`).
Deliberately **does not** make the + session a span ("a pi session can run for hours; long-running root spans + are an OTel anti-pattern") — it correlates via `gen_ai.conversation.id`. + - Attributes: GenAI semconv — `gen_ai.system`, `gen_ai.request.model`, + `gen_ai.usage.input_tokens`, `gen_ai.usage.output_tokens`, finish reason, + tool call ids, `gen_ai.conversation.id`. + - Config: default endpoint `http://localhost:4317` (OTLP **gRPC**), + `settings.json` `otel` block `{enabled, endpoint, protocol:"grpc"}`, + `OTEL_*` env overrides, `PI_OTEL_DISABLED=1` to disable. Default backend + is a local .NET Aspire dashboard (auto-spawned via `/otel start`); any + OTLP backend works (Grafana LGTM, Jaeger, Honeycomb). + - Sources: https://nikiforovall.blog/ai/productivity/2026/05/16/pi-otel.html + +3. **`mprokopov/pi-otel-telemetry`** — traces **and metrics**. + - Span tree: `session` (root) -> `agent.prompt` (per user message) -> + `agent.turn` (LLM call + tool cycle) -> `tool.` (e.g. `tool.bash`, + `tool.read`, `tool.edit`). Span events: `llm.request`, `model.changed`, + `session.compacted`. + - Metrics: `pi.tokens.input`, `pi.tokens.output` (counters); `pi.tool.calls`, + `pi.tool.errors` (counters, labelled `tool.name`); `pi.tool.duration` + (histogram ms); `pi.turns`, `pi.prompts` (counters); + `pi.session.duration` (histogram s). + - Attributes: `session.id`, `session.cwd`, token counts, user identity; + turn spans `turn.index`, `llm.usage.input_tokens`, + `llm.usage.output_tokens`; tool spans `tool.name`, `tool.call_id`, + `tool.duration_ms`. + - Config: `OTEL_EXPORTER_OTLP_ENDPOINT` default `http://localhost:4318` + (OTLP/HTTP), `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` / + `..._METRICS_ENDPOINT` overrides, `PI_OTEL_DEBUG=true`. + - Source: https://github.com/mprokopov/pi-otel-telemetry + +**Takeaway:** there is no single canonical Pi OTel package; the three diverge on +span-tree shape and span names (`pi.session` vs `pi.interaction` vs `session`). 
+What they agree on is **GenAI semconv `gen_ai.*` attributes over OTLP**. For +Agenta we should pick/fork one and pin the span tree we want; don't assume a +stable upstream contract. + +### Framework instrumentations (not Pi-specific) + +- **OpenInference (Arize)** — OTel-based semantic conventions + auto-instrumentors + for LangChain, LlamaIndex, OpenAI SDK, etc. Defines 10 span kinds via the + required `openinference.span.kind` attribute: `LLM`, `EMBEDDING`, + `RETRIEVER`, `RERANKER`, `TOOL`, `CHAIN`, `AGENT`, `GUARDRAIL`, `EVALUATOR`, + `PROMPT`. It does **not** ship a Pi instrumentor — Pi isn't one of its + supported frameworks — so using OpenInference for Pi means writing the span + kinds by hand in a Pi extension. Fit: good vocabulary for agent/tool/chain + structure, but no off-the-shelf Pi support. + Sources: https://github.com/Arize-ai/openinference/blob/main/spec/semantic_conventions.md , + https://arize.com/docs/ax/observe/tracing-concepts/openinference-semantic-conventions + +- **OpenLLMetry (Traceloop)** — OTel SDK + instrumentations that emit `gen_ai.*` + (plus `traceloop.*`, `llm.*`) attributes. Auto-instruments LLM providers and + some frameworks. No Pi instrumentor; same story as OpenInference — you'd hand + off via a Pi extension or rely on its provider-level auto-instrumentation of + the underlying LLM HTTP client (possible but indirect, and Pi's `pi-ai` may + not match a provider Traceloop patches). + (UNVERIFIED whether Traceloop's provider instrumentation intercepts + `@earendil-works/pi-ai`'s HTTP calls automatically.) + +- **OTel GenAI semantic conventions (official)** — the upstream spec the Pi + extensions follow. Operation names: `create_agent`, `invoke_agent`, + `execute_tool`, plus the chat/inference spans. Span naming guidance: + `invoke_agent {gen_ai.agent.name}` (or just `invoke_agent`), and + `execute_tool {gen_ai.tool.name}` for tool calls (used for MCP tool calls + too). 
Key attributes: `gen_ai.operation.name`, `gen_ai.agent.name`, + `gen_ai.agent.id`, `gen_ai.conversation.id`, `gen_ai.tool.name`, + `gen_ai.tool.call.id`, `gen_ai.request.model`, `gen_ai.usage.input_tokens`, + `gen_ai.usage.output_tokens`. This is the most "standard" and the most + future-proof target. + Sources: https://opentelemetry.io/docs/specs/semconv/gen-ai/ , + https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-agent-spans/ + (NOTE: the gen-ai pages now redirect to the + `open-telemetry/semantic-conventions` repo; the agent-spans operation + names above come from the indexed spec text, lightly UNVERIFIED against the + latest repo revision.) + +## Span / attribute conventions and how well they map to agent runs + +A multi-turn agent run = one logical conversation -> N user prompts -> +per-prompt agent invocation -> M turns (each an LLM call) -> per-turn 0..K tool +calls. All three conventions can express this; they differ in vocabulary. + +| Layer in a Pi run | OTel GenAI semconv | OpenInference span kind | Pi extension span (varies) | +|---|---|---|---| +| Whole conversation | `gen_ai.conversation.id` (correlation, not a span) | `session.id` attr / CHAIN root | `pi.session` / `session` (or skipped) | +| Per-prompt agent invocation | `invoke_agent` op | `AGENT` | `pi.interaction` / `agent.prompt` / `pi.agent_turn` | +| Per-turn LLM call | chat/inference span, `gen_ai.request.model` | `LLM` | `gen_ai.chat ` / `pi.turn` / `pi.llm_request` | +| Tool call | `execute_tool`, `gen_ai.tool.name`, `gen_ai.tool.call.id` | `TOOL` | `tool.` | +| Glue/orchestration | (no dedicated kind) | `CHAIN` | n/a | +| Retrieval / rerank / embeddings | embeddings spans | `RETRIEVER` / `RERANKER` / `EMBEDDING` | n/a | + +Assessment: +- **GenAI semconv** maps cleanly to LLM calls and tool calls and has explicit + agent + tool operation names. 
Its weak spot is the multi-turn *tree*: it + leans on `gen_ai.conversation.id` for correlation rather than mandating a + session/turn span hierarchy, which is why the Pi extensions invent their own + parent spans (`pi.session`, `pi.interaction`, `pi.turn`). Good attribute + vocabulary; you still design the tree. +- **OpenInference span kinds** (AGENT/CHAIN/LLM/TOOL/RETRIEVER) map *very* + cleanly to a nested agent run and are what Agenta's UI already keys off (see + next section). The cost: no Pi auto-instrumentor, so you set + `openinference.span.kind` yourself. +- A pragmatic hybrid works: emit GenAI `gen_ai.*` attributes (what the Pi + extensions already produce) **and** set `openinference.span.kind` per span so + Agenta types the node correctly. Agenta's adapters read both. + +## Export-from-sandbox path + +Inside a Daytona (or other) sandbox the Pi extension runs the OTel SDK and +exports OTLP. To reach Agenta's collector across the sandbox boundary: + +1. **Endpoint.** Agenta accepts OTLP/HTTP **protobuf** at `POST /otlp/v1/traces` + (mounted in `api/entrypoints/routers.py` with prefix `/otlp/v1`). Binary + protobuf only (`Content-Type: application/x-protobuf`); JSON OTLP is **not** + accepted. Batch size limit default 10 MB (`AGENTA_OTLP_MAX_BATCH_BYTES`, + env `OTLPConfig.max_batch_bytes`); over-limit -> 413. (The router docstring + says "default 4 MB"; the actual env default in `env.py` is 10 MB — doc/code + drift worth noting.) + Files: `api/oss/src/apis/fastapi/otlp/router.py`, + `api/oss/src/utils/env.py` (`OTLPConfig`, line ~326), + `api/entrypoints/routers.py` (~line 770). + - So set `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=https://<agenta-host>/otlp/v1/traces` + and use the **OTLP/HTTP protobuf** exporter. The gRPC-default extension + (nikiforovall) would need reconfiguring to HTTP/protobuf, or a collector + sidecar to translate. +2.
**Auth + tenant scope.** Agenta's auth middleware expects + `Authorization: ApiKey <api-key>` (prefix `ApiKey `) and resolves + organization/workspace/project/user from it; `project_id` can also come + from a `?project_id=<project-id>` query param. So the exporter needs + `OTEL_EXPORTER_OTLP_HEADERS=Authorization=ApiKey <api-key>` and the project id + either in the key's scope or the URL query string. In EE the ingest path + also checks `EDIT_SPANS` permission and `TRACES_INGESTED` quota. + Files: `api/oss/src/middlewares/auth.py` (`_APIKEY_TOKEN_PREFIX = "ApiKey "`, + query `project_id` handling), `api/oss/src/apis/fastapi/otlp/router.py` + (EE permission + entitlement checks). +3. **Secret delivery.** The Agenta API key is a secret; per the agent-workflows + README, secrets are injected into the sandbox by the startup hook. The key + and the OTLP endpoint should be injected the same way (env vars consumed by + the OTel SDK), so the harness running locally vs server-side only differs in + endpoint/key values — preserving the local/server parity requirement. +4. **Trace-context propagation across the boundary.** Two cases: + - If the agent run is *initiated by* an Agenta backend request, propagate + W3C `traceparent`/`tracestate` into the sandbox (env or RPC metadata) so + the in-sandbox root span is a child of the backend span and the run shows + as one trace. (UNVERIFIED: whether Agenta currently sets/forwards + `traceparent` to invocations — needs a check of the invocation service.) + - If the run is standalone, the extension creates its own root and relies on + `gen_ai.conversation.id` / `session.id` for correlation; Agenta's + OpenInference + Logfire adapters map `session.id` / + `gen_ai.conversation.id` -> `ag.session.id`, which lines up with the + agent-workflows `session_id` concept. +5. **Network egress.** The sandbox must be allowed outbound HTTPS to the Agenta + host. With Daytona this is a sandbox network-policy concern (UNVERIFIED for + our port).
A collector/agent sidecar in the sandbox is an alternative that + also lets us batch, retry, and strip content centrally. + +## How it maps to Agenta's existing OTel ingestion + +Agenta already has the whole receive-and-normalize pipeline; a Pi agent is just +another OTLP producer. + +- **Ingest.** `OTLPRouter.otlp_ingest` parses the protobuf + (`parse_otlp_stream`), converts each OTel span to an internal DTO + (`parse_from_otel_span_dto`), runs an EE quota soft-check, then queues spans + on a Redis stream for async persistence (same path as native ingest). + File: `api/oss/src/apis/fastapi/otlp/router.py`. +- **Normalization via adapter registry.** `AdapterRegistry` runs, in order: + `OpenLLMmetryAdapter`, `OpenInferenceAdapter`, `LogfireAdapter`, + `VercelAIAdapter`, `DefaultAgentaAdapter`. Each maps its vendor attributes to + Agenta's canonical `ag.*` namespace. + File: `api/oss/src/apis/fastapi/otlp/extractors/adapter_registry.py`. +- **GenAI semconv is already mapped.** `api/.../otlp/opentelemetry/semconv.py` + and the OpenLLMetry adapter map `gen_ai.system`, `gen_ai.request.model`, + `gen_ai.usage.prompt_tokens|completion_tokens|total_tokens`, + `gen_ai.prompt.*`, `gen_ai.completion.*`, etc. -> `ag.meta.*` / + `ag.data.*` / `ag.metrics.unit.tokens.*`. **This is precisely what the Pi + OTel extensions emit**, so Pi `gen_ai.*` spans largely normalize today. + - Caveat: the existing map uses the older `gen_ai.usage.prompt_tokens` / + `completion_tokens` names. The Pi extensions emit the newer + `gen_ai.usage.input_tokens` / `output_tokens`. Those newer keys are **not** + in `semconv.py` yet, so token metrics from Pi would be dropped until we add + the two aliases. (Verified by reading `semconv.py` — only `prompt_tokens` / + `completion_tokens` / `total_tokens` are present.) 
+- **Span typing / agent structure.** `OpenInferenceAdapter` maps + `openinference.span.kind` -> `ag.type.node` with + `OPENINFERENCE_TO_AGENTA_SPAN_KIND_MAP`: `CHAIN->chain`, `RETRIEVER->query`, + `RERANKER->rerank`, `LLM->chat`, `EMBEDDING->embedding`, `AGENT->agent`, + `TOOL->tool`, `GUARDRAIL->task`, `EVALUATOR->task`. It also normalizes tool + definitions (`llm.tools.{i}.tool.json_schema`), tool calls, and + input/output messages into the canonical OpenAI shape Agenta's UI expects. + File: `api/oss/src/apis/fastapi/otlp/extractors/adapters/openinference_adapter.py`. +- **Session correlation.** `session.id` (OpenInference) and + `gen_ai.conversation.id` (Logfire adapter) both map to `ag.session.id`, + which aligns with the agent-workflows `session_id`. + +**Net:** the lowest-effort integration is a Pi extension emitting GenAI-semconv +spans **and** `openinference.span.kind` over OTLP/HTTP protobuf to +`/otlp/v1/traces`. To get full fidelity we'd add a small amount of backend +mapping (token-name aliases; optionally a dedicated "Pi/agent" adapter if we +want first-class agent/turn nodes instead of generic chat/tool). No new ingest +infrastructure is needed. + +## Open questions + +1. **Which span tree do we standardize on?** The three Pi extensions disagree + (`pi.session` vs `pi.interaction` vs `session`; whether the session is a + span at all). We must pin one to get a stable Agenta UI. The + "no long-running session root" argument (nikiforovall) matters if Pi + sessions can run for hours. +2. **Build vs fork.** Fork `maxmalkin/pi-OTEL` (OTLP/HTTP, content gate) or + `mprokopov/pi-otel-telemetry` (also metrics) vs write our own minimal + extension? Need to read their actual source for license/quality and to see + the exact `pi.on(...)` wiring (the READMEs describe spans, not code). +3. 
**Token attribute drift.** Add `gen_ai.usage.input_tokens` / + `output_tokens` (and `gen_ai.usage.*` newer keys) to Agenta's `semconv.py` + so Pi token metrics aren't silently dropped. Confirm against the live + GenAI semconv revision. +4. **Trace-context propagation.** Does Agenta forward W3C `traceparent` into an + invocation today? If we want the in-sandbox spans stitched under the + originating backend span, we need to propagate context across the + harness/sandbox boundary (env var or RPC metadata). Needs a code check of + the invocation/workflow run path. +5. **Content capture policy.** Pi extensions gate prompt/completion/tool I/O + behind `PI_OTEL_CAPTURE_CONTENT`. Decide default (privacy vs. eval + usefulness) and whether to enforce it server-side too. +6. **Transport mismatch.** Agenta is OTLP/HTTP **protobuf only**. The + gRPC-default extension and any JSON-OTLP setup need reconfiguration or a + collector sidecar in the sandbox. +7. **"pi instruments" terminology.** Confirm with whoever wrote the + agent-workflows README whether it refers to the generic `pi.on` extension + API or a specific Earendil/Agenta-internal "instruments" bundle. The public + Pi docs only expose `pi.on` + tool registration; no "instruments" object. +8. **Doc/code drift.** OTLP router docstring says 4 MB max batch; `env.py` + default is 10 MB. Worth fixing when this work lands. 
+ +## Sources + +- Pi product site: https://pi.dev/ +- Pi repo README: https://github.com/earendil-works/pi/blob/main/README.md +- Pi extensions doc (event system / `pi.on`): https://pi.dev/docs/latest/extensions +- Pi docs index: https://pi.dev/docs/latest +- pi-OTEL extension (maxmalkin): https://github.com/maxmalkin/pi-OTEL +- pi-otel-telemetry (mprokopov): https://github.com/mprokopov/pi-otel-telemetry +- pi-otel blog (nikiforovall): https://nikiforovall.blog/ai/productivity/2026/05/16/pi-otel.html +- Pi as customer-hosted agent runtime discussion: https://github.com/earendil-works/pi/discussions/3337 +- OTel GenAI semconv (index): https://opentelemetry.io/docs/specs/semconv/gen-ai/ +- OTel GenAI agent spans: https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-agent-spans/ +- OpenInference semantic conventions spec: https://github.com/Arize-ai/openinference/blob/main/spec/semantic_conventions.md +- OpenInference conventions (Arize docs): https://arize.com/docs/ax/observe/tracing-concepts/openinference-semantic-conventions +- Agenta OTLP ingest router: api/oss/src/apis/fastapi/otlp/router.py +- Agenta adapter registry: api/oss/src/apis/fastapi/otlp/extractors/adapter_registry.py +- Agenta GenAI/OpenLLMetry semconv map: api/oss/src/apis/fastapi/otlp/opentelemetry/semconv.py +- Agenta OpenInference adapter: api/oss/src/apis/fastapi/otlp/extractors/adapters/openinference_adapter.py +- Agenta auth middleware: api/oss/src/middlewares/auth.py +- Agenta OTLP config: api/oss/src/utils/env.py (OTLPConfig) +- Router mounting: api/entrypoints/routers.py diff --git a/docs/design/agent-workflows/scratch/research/pi-interaction.md b/docs/design/agent-workflows/scratch/research/pi-interaction.md new file mode 100644 index 0000000000..d982693113 --- /dev/null +++ b/docs/design/agent-workflows/scratch/research/pi-interaction.md @@ -0,0 +1,585 @@ +> **Historical record.** This is a work-package note. 
It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md). +# Research: Programmatically driving the pi.dev agent harness + +Status: research only. No code changed outside this file. +Scope: how the Agenta backend can drive a "pi.dev" harness for the new `agents` +workflow type. Answers questions 1-7 from the research brief, with sources. + +## Summary + +- **pi.dev is the Pi coding agent** by Earendil Inc.: "a minimal, extensible agent + harness." It is a TypeScript/Node monorepo, MIT-licensed, distributed on npm. + Latest published version at time of research: **0.79.4**. The CLI binary is `pi`. +- Three layers matter to us, smallest to largest: + - `@earendil-works/pi-ai` - unified multi-provider LLM API (`getModel`, `stream`, + `complete`, content blocks incl. images, image generation). + - `@earendil-works/pi-agent-core` - the agent loop: stateful `Agent` class, tool + calling, event stream, `sessionId`, before/after tool hooks, transport abstraction. + - `@earendil-works/pi-coding-agent` - the full harness + CLI: `createAgentSession`, + built-in tools (read/bash/edit/write/...), extensions/hooks, skills, AGENTS.md + loading, session persistence (JSONL), and four run surfaces (TUI, print/JSON, RPC, + SDK). +- **Four ways to drive it programmatically.** For a Python backend driving pi inside a + sandbox, the realistic options are (a) **RPC mode** (`pi --mode rpc`, JSONL over + stdin/stdout, bidirectional, supports follow-ups/steering/abort), or (b) **print/JSON + mode** (`pi --mode json "prompt"`, one-shot, JSON-lines events on stdout). The + **SDK** (`createAgentSession`) is the in-process TypeScript path and gives the richest + control; it is what you would use if any part of the harness is itself Node. 
+- **Multi-message output, sessions, streaming, hooks, tools, model selection** are all + first-class and map cleanly onto the design doc's requirements. The one soft spot is + **"pi instruments"**: pi itself ships no built-in "instruments" product. The + observability story is OpenTelemetry via the community `pi-otel` extension (built on + pi's hooks), plus an in-house extensions/hooks API you can instrument against. See + Question 3 and the Open questions section. +- **Swappable harness + local parity** are supported by design: the harness is the thing + behind a thin run surface (RPC/JSON/SDK), so a different harness (e.g. OpenAI Codex) + that speaks the same surface can be slotted in; and the same `pi` binary/SDK runs + locally and in the sandbox, which is exactly the parity the design wants. + +## What pi.dev is (with sources) + +"Pi is a minimal, extensible agent harness... Adapt Pi to your workflows, not the other +way around." It deliberately omits things like sub-agents and plan mode so you compose +them yourself via extensions. +Source: https://pi.dev/ and https://github.com/earendil-works/pi + +Packages (all MIT, all `0.79.4` at research time; confirmed via the npm registry API): +- `@earendil-works/pi-coding-agent` - "Coding agent CLI with read, bash, edit, write + tools and session management." Bin: `{"pi": "dist/cli.js"}`. Depends on + `pi-agent-core`, `pi-ai`, `pi-tui` (all `^0.79.4`), `typebox@1.x`, `undici`, etc. +- `@earendil-works/pi-agent-core` - "General-purpose agent with transport abstraction, + state management, and attachment support." +- `@earendil-works/pi-ai` - "Unified LLM API with automatic model discovery and provider + configuration." +Source: `https://registry.npmjs.org/@earendil-works/pi-coding-agent` (and `/pi-ai`, +`/pi-agent-core`), GitHub repo root README. 
+ +Repository layout (monorepo): +``` +packages/ + coding-agent/ # CLI + harness (SDK lives here) + agent/ # @earendil-works/pi-agent-core + ai/ # @earendil-works/pi-ai + tui/ # @earendil-works/pi-tui +``` +Key docs in-repo: `packages/coding-agent/docs/{sdk,extensions,json,rpc,models,settings, +containerization}.md`. +Source: https://github.com/earendil-works/pi/tree/main/packages + +Why this matches the design doc's "agent harness with tools, hooks, instruments, +sessions, runs in sandboxes": pi provides tools (built-in + custom via TypeBox), +25+ TypeScript hooks, JSONL sessions with a `sessionId`, a documented containerization +story, and a community OTel instrumentation extension. The name "pi.dev" in the design +doc is unambiguously this product. + +Install (host or inside sandbox image): +```bash +npm install @earendil-works/pi-coding-agent # SDK + CLI +# CLI is also installable via curl / PowerShell / pnpm / bun per pi.dev +``` +Source: https://github.com/earendil-works/pi, https://pi.dev/ + +--- + +## Question 1 - How do you programmatically interact with pi.dev (API/SDK/CLI surface)? + +**Language:** TypeScript/Node. There is no first-party Python SDK; a Python backend +drives pi over a process boundary (RPC or print/JSON mode) or shells out to the `pi` CLI. + +**Four run surfaces** (pi's own term): +1. **Interactive TUI** - `pi` (not relevant to us). +2. **Print / JSON mode** - `pi -p "query"` or `pi --mode json "query"`. One-shot; + emits results (text or JSON-lines events) to stdout. Good for stateless single runs. +3. **RPC mode** - `pi --mode rpc`. JSON protocol over stdin/stdout; bidirectional and + long-lived. This is the canonical "drive it from another process/language" surface. +4. **SDK** - `import { createAgentSession } from "@earendil-works/pi-coding-agent"`. + In-process, richest control. This is what you embed if your harness runner is Node. 
+Sources: https://pi.dev/, https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/rpc.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/json.md + +**SDK entrypoints** (from `docs/sdk.md`): +```typescript +import { + createAgentSession, + createAgentSessionRuntime, + SessionManager, + AuthStorage, + ModelRegistry, + DefaultResourceLoader, + defineTool, +} from "@earendil-works/pi-coding-agent"; + +const { session, extensionsResult, modelFallbackMessage } = + await createAgentSession({ + cwd: process.cwd(), + model: myModel, + thinkingLevel: "medium", + tools: ["read", "bash", "edit"], + sessionManager: SessionManager.inMemory(), + }); +``` +`createAgentSessionRuntime(factory, options)` is the multi-session variant +(`newSession()`, `switchSession()`, `fork()`, `importFromJsonl()`). + +The returned `AgentSession` interface (verbatim from docs): +```typescript +interface AgentSession { + prompt(text: string, options?: PromptOptions): Promise; + steer(text: string): Promise; + followUp(text: string): Promise; + subscribe(listener: (event: AgentSessionEvent) => void): () => void; + setModel(model: Model): Promise; + setThinkingLevel(level: ThinkingLevel): void; + cycleModel(): Promise; + navigateTree(targetId: string, options?: NavigateOptions): Promise; + compact(customInstructions?: string): Promise; + abort(): Promise; + dispose(): void; + sessionFile: string | undefined; + sessionId: string; // <-- session id, see Q7 + agent: Agent; + model: Model | undefined; + thinkingLevel: ThinkingLevel; + messages: AgentMessage[]; // <-- multi-message output, see Q4 + isStreaming: boolean; +} +``` +Source: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md + +**Low-level loop** (in `pi-agent-core`) if you want to drive turns yourself: +```typescript +import { agentLoop, agentLoopContinue } from 
"@earendil-works/pi-agent-core"; +for await (const event of agentLoop([userMessage], context, config)) { /* ... */ } +``` +Source: https://github.com/earendil-works/pi/blob/main/packages/agent/README.md + +**Recommendation for Agenta:** drive pi over **RPC mode** from the Python backend +process that owns the sandbox (long-lived, supports follow-ups/steering/abort and a +stable JSONL contract), and reserve print/JSON mode for stateless single-shot runs. Use +the SDK only if the in-sandbox runner is itself Node. RPC/JSON give the cleanest swappable +boundary for a non-pi harness (Codex) later (Question 7). + +--- + +## Question 2 - Sending messages and getting responses; streaming + +**SDK:** `await session.prompt(text, options?)` sends a user message and resolves when the +agent turn completes. Mid-stream you can `steer()` (replace current op) or `followUp()` +(queue after the turn). Streaming is via `subscribe()` callbacks (push-based observer, +not an async generator at the session level): +```typescript +const unsubscribe = session.subscribe((event) => { + switch (event.type) { + case "message_update": + if (event.assistantMessageEvent.type === "text_delta") { + process.stdout.write(event.assistantMessageEvent.delta); // streaming text + } + break; + case "tool_execution_start": /* event.toolName */ break; + case "tool_execution_end": /* event.isError */ break; + case "turn_end": /* event.message */ break; + case "agent_end": /* event.messages = full multi-message output */ break; + } +}); +``` +Full event set: `agent_start`, `agent_end`, `turn_start`, `turn_end`, `message_start`, +`message_update`, `message_end`, `tool_execution_start`, `tool_execution_update`, +`tool_execution_end`, `queue_update`, `compaction_start/end`, `auto_retry_start/end`. 
+Source: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md + +**pi-agent-core** is where the async-generator streaming lives: `agentLoop()` / +`agentLoopContinue()` are `for await` async generators; the `Agent` class wraps them with +`subscribe()`. The low-level `pi-ai` `stream()` emits `text_start/delta/end`, +`thinking_*`, `toolcall_*`, `done`, `error`. +Sources: https://github.com/earendil-works/pi/blob/main/packages/agent/README.md, +https://github.com/earendil-works/pi/blob/main/packages/ai/README.md + +**RPC mode (cross-process / cross-language):** JSONL over stdin/stdout. +- Framing: strict LF (`\n`)-delimited JSON. Strip a trailing `\r`. **Do not** use + Node `readline` or other readers that split on Unicode separators (e.g. `U+2028`), + because those characters appear inside JSON payloads. +- Send a prompt (client -> pi stdin): + ```json + {"id": "req-1", "type": "prompt", "message": "Hello"} + ``` + Ack (pi stdout): `{"id": "req-1", "type": "response", "command": "prompt", "success": true}` +- Other commands: `steer`, `follow_up`, `abort`, `new_session`, `set_model`, + `cycle_model`, `get_state`, `get_messages`, `set_thinking_level`, `bash`, + `get_session_stats`, `switch_session`, `fork`, `clone`, `compact`, etc. +- Events stream back as JSON lines **without** an `id` (same event names as the SDK): + ```json + {"type":"message_update","assistantMessageEvent":{"type":"text_delta","delta":"Hello"}} + {"type":"message_update","assistantMessageEvent":{"type":"text_end"}} + {"type":"agent_end","messages":[...]} + ``` +- The optional `id` on a command is echoed back on its `response` for correlation. There + is **no handshake** - the protocol starts immediately; the first client command begins + interaction. +- Extension UI is also over the wire: `extension_ui_request` (stdout) / + `extension_ui_response` (stdin) for `select`/`confirm`/`input`/`editor`, plus + fire-and-forget `notify`/`setStatus`/`setWidget`. 
+Source: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/rpc.md + +**Streaming summary:** SDK = observer callbacks; agent-core/pi-ai = async generators; +RPC/JSON modes = JSON-lines event stream over stdout. No SSE or websockets in pi itself; +if Agenta needs SSE to a frontend, the backend wraps the JSONL/observer stream and +re-emits SSE. + +--- + +## Question 3 - Startup hooks (file setup, secret injection, env prep) + +pi has a rich **extension hook system**, plus an **app-level startup ordering** for the +sandbox that Agenta controls itself. Two layers: + +### 3a. pi extension hooks (in-process, TypeScript) +Extensions are default-exported factory functions auto-discovered from: +- Global: `~/.pi/agent/extensions/*.ts` (or `.../*/index.ts`) +- Project: `.pi/extensions/*.ts` (or `.../*/index.ts`) +- CLI: `pi -e ./path.ts` +```typescript +import type { ExtensionAPI } from "@earendil-works/pi-coding-agent"; +export default function (pi: ExtensionAPI) { + pi.on("session_start", async (event, ctx) => { /* file setup / state restore */ }); + pi.registerTool({ /* ... */ }); +} +``` +Factory functions may be **async**, which is the supported way to do startup +initialization (e.g. fetch remote config) before the session begins. +Source: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md + +**Relevant hook points (25+ total) for startup/setup:** +- `project_trust` -> `{ trusted: "yes"|"no"|"undecided", remember? }` (gate before + loading dynamic configs). +- `session_start` -> reason `"startup"|"reload"|"new"|"resume"|"fork"`. The documented + place for one-time per-session setup and state restoration. This is the natural + **file-setup hook**. +- `session_shutdown` -> cleanup / persist state (`pi.appendEntry(...)`). +- `resources_discover` -> contribute `skillPaths`/`promptPaths`/`themePaths` (how skills + get injected). 
+- `before_agent_start` -> inject messages or modify the system prompt before the LLM turn. +- `context` / `before_provider_request` / `after_provider_response` -> mutate the + messages/payload around each LLM call (good instrumentation points). +- `tool_call` -> can **block** a tool (`{ block: true, reason }`); `tool_result` can + rewrite results. +Source: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md + +**Secret injection at the pi layer** is via provider registration with env interpolation: +```typescript +pi.registerProvider("provider-name", { + name: "Display Name", + baseUrl: "https://api.example.com", + apiKey: "$ENV_VAR", // "$VAR" / "${VAR}" interpolated; "$$" -> literal "$" + api: "anthropic-messages", + models: [/* ... */], +}); +``` +And/or `AuthStorage` (SDK): resolution order is runtime overrides -> `auth.json` -> +environment variables -> fallback resolver: +```typescript +const authStorage = AuthStorage.create(); +authStorage.setRuntimeApiKey("anthropic", process.env.MY_KEY); // not persisted +``` +Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md + +### 3b. App-level (sandbox) startup ordering - Agenta's own hooks +The design doc's "startup hooks set up files then secrets" is the **sandbox boot +sequence**, which Agenta owns, not a pi API. pi's containerization doc shows secrets are +injected as env vars at container start and files via bind mounts: +```bash +docker run --rm -it \ + -e ANTHROPIC_API_KEY \ + -v "$PWD:/workspace" \ + -v pi-agent-home:/root/.pi/agent \ + pi-sandbox +``` +Three documented isolation modes: **Gondolin** (local micro-VM, tools run in VM, auth +stays on host), **plain Docker** (whole pi process containerized), and **OpenShell** +(policy-controlled gateway that can inject provider creds upstream so raw keys never +enter the sandbox). 
For Agenta's Daytona target, the equivalent is: lay files into the +workspace, then set secret env vars / write `auth.json`, then start `pi --mode rpc`. +Source: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/containerization.md + +So "file setup then secrets" maps to: (1) sandbox provisioning lays config files +(AGENTS.md, skills, files) into the workspace and `~/.pi/agent`; (2) secrets are set as +env vars / `auth.json`; (3) pi boots and its own `session_start` extension hook can do any +remaining in-process setup. Note: pi's own hooks fire **inside** pi after it starts, so +they cannot themselves be the mechanism that installs pi's secrets before pi starts - +that ordering belongs to the sandbox layer (the `$ENV_VAR`/`auth.json` is read by pi at +boot). + +--- + +## Question 4 - Returns as TEXT + +- **Streaming:** `message_update` events carry `assistantMessageEvent.type === + "text_delta"` with `.delta`. Concatenate deltas for live text. (RPC/JSON modes emit the + same shape on stdout.) +- **Final / multi-message:** the run produces an array of messages, not one completion. + - SDK: `session.messages` (all) and the `agent_end` event's `messages` array; per-turn + text is on `turn_end`'s `message`. + - The `agent_end` event is the canonical "full multi-message output" the design doc + wants. Each assistant message's `content` is an array of content blocks; text blocks + are `{ type: "text", text }`. +- **print mode:** `pi -p "query"` prints assistant text to stdout directly (simplest text + path for a one-shot run). 
+- **JSON mode filtering example** (text via `message_end`): + ```bash + pi --mode json "List files" 2>/dev/null | jq -c 'select(.type == "message_end")' + ``` +Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/json.md, +https://github.com/earendil-works/pi/blob/main/packages/ai/README.md + +--- + +## Question 5 - Returns as IMAGES and other binary/file artifacts + +pi-ai content blocks include an explicit image block; images are base64 + MIME type: +```typescript +type ContentBlock = + | { type: 'text'; text: string } + | { type: 'image'; data: string; mimeType: string } // base64-encoded + | { type: 'toolCall'; id: string; name: string; arguments: Record<string, any> } + | { type: 'thinking'; thinking: string }; +``` +Tool results carry their own `content: ContentBlock[]`, so a tool can return an image +block: +```typescript +{ + role: 'toolResult'; + toolCallId: string; + toolName: string; + content: ContentBlock[]; // may include { type: 'image', data, mimeType } + isError: boolean; + timestamp: number; +} +``` +- **Input images** (multimodal prompts): SDK `prompt(text, { images: [...] })` with + `ImageContent` = `{ type: "image", source: { type: "base64", mediaType, data } }` + (SDK shape). pi-agent-core's `prompt()` also accepts + `[{ type: "image", data, mimeType }]`. +- **Generated images:** pi-ai exposes `getImageModel(provider, modelId)` and + `generateImages(model, input, options)` (one-shot image generation). +- **Binary/file artifacts:** there is no dedicated "artifact" return channel. The two + practical paths are (a) tools return an `image` content block (base64), or (b) the + agent writes files to the sandbox workspace (write/bash tools) and Agenta collects them + from the filesystem after the run. pi-agent-core's package description explicitly + mentions "attachment support," which is worth confirming in source for non-image + binaries.
+Sources: https://github.com/earendil-works/pi/blob/main/packages/ai/README.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md, +`https://registry.npmjs.org/@earendil-works/pi-agent-core` (description). The +attachment/binary specifics are **UNVERIFIED** beyond the image block - confirm in +`packages/agent` source / `packages/ai` source. + +--- + +## Question 6 - STRUCTURED OUTPUTS (JSON / schema-constrained) + +pi's idiomatic structured-output pattern is **a terminating tool**, not a provider-level +`response_format`/`json_schema`. You define a tool whose TypeBox parameters are your +output schema and return `terminate: true` so the agent stops without an extra LLM turn; +the validated arguments are your structured object. See +`packages/coding-agent/examples/extensions/structured-output.ts`: +```typescript +defineTool({ + name: "save_structured_output", + parameters: Type.Object({ + headline: Type.String({ description: "Short title for the result" }), + summary: Type.String({ description: "One-paragraph summary" }), + actionItems: Type.Array(Type.String(), { description: "Concrete next steps" }), + }), + async execute(_toolCallId, params) { + return { + content: [{ type: "text", text: `Saved structured output: ${params.headline}` }], + details: { // <-- machine-readable structured result + headline: params.headline, + summary: params.summary, + actionItems: params.actionItems, + } satisfies StructuredOutputDetails, + terminate: true, // <-- ends agent without follow-up turn + }; + }, +}); +``` +You then read the structured object from that tool call's arguments / the tool result's +`details`. TypeBox is the schema system throughout pi (`Type`, `Static`, `TSchema` are +re-exported from `@earendil-works/pi-ai`), and `validateToolCall(tools, toolCall)` +validates arguments against the schema before execution. 
+Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/examples/extensions/structured-output.ts, +https://github.com/earendil-works/pi/blob/main/packages/ai/README.md + +**UNVERIFIED:** whether `pi-ai`'s `complete()`/`stream()` accept a provider-native +`responseFormat`/`jsonSchema` option (OpenAI/xAI-style strict JSON schema). The README +did not document one; the documented, portable pattern is the terminating-tool approach +above. Confirm by reading `packages/ai` source (`complete`/`stream` option types). + +--- + +## Question 7 - Tools, model selection, and the session_id + +### Tools +**Built-in:** enable per session: `tools: ["read", "bash", "edit", "write", "grep", +"find", "ls"]`. Read-only mode = `["read","grep","find","ls"]`. `excludeTools: [...]` +removes specific ones. + +**Custom (SDK):** +```typescript +import { Type } from "typebox"; +import { defineTool } from "@earendil-works/pi-coding-agent"; +const myTool = defineTool({ + name: "my_tool", + label: "My Tool", + description: "Does something useful", + parameters: Type.Object({ input: Type.String({ description: "Input value" }) }), + execute: async (_toolCallId, params) => ({ + content: [{ type: "text", text: `Result: ${params.input}` }], + details: {}, + }), +}); +await createAgentSession({ customTools: [myTool], tools: ["read", "bash", "my_tool"] }); +``` +**Custom (extension):** `pi.registerTool({...})` with the same shape plus TUI hooks +(`renderCall`, `renderResult`), `promptSnippet`, `promptGuidelines`, and optional +`onUpdate` streaming. `pi.getAllTools()`, `pi.getActiveTools()`, `pi.setActiveTools()` +manage the active set at runtime. `tool_call` hooks can block tools; MCP is composed via +extensions (not core). 
+Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md + +### Model selection +```typescript +import { getModel } from "@earendil-works/pi-ai"; +const opus = getModel("anthropic", "claude-opus-4-5"); // built-in +const custom = modelRegistry.find("my-provider", "my-model"); // from models.json +const available = await modelRegistry.getAvailable(); // those with valid keys +await createAgentSession({ + model: opus, + thinkingLevel: "high", // off | minimal | low | medium | high | xhigh + scopedModels: [ { model: opus, thinkingLevel: "high" }, { model: haiku, thinkingLevel: "off" } ], + authStorage, modelRegistry, +}); +await session.setModel(newModel); // runtime switch +``` +If no model is provided: restore from session -> settings default -> first available. +15+ providers (Anthropic, OpenAI, Google, Bedrock, Ollama, ...). RPC equivalent: +`set_model`/`cycle_model`; CLI flags `--provider`, `--model`. Custom providers are added +via `pi.registerProvider(...)`. This is the swap point for "run on OpenAI/Codex models." +Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/models.md, +https://pi.dev/ + +### session_id +- **Creation:** a session has a `sessionId`. In JSON mode the run opens with a header + line: `{"type":"session","version":3,"id":"","timestamp":"...","cwd":"/path"}`. + The `id` is the session id (UUID). The SDK exposes it as `session.sessionId`; the + `Agent` constructor accepts an explicit `sessionId` (so Agenta can supply its own and + thread it through). +- **Threading:** sessions persist as JSONL files (`SessionManager.create(cwd)` for + on-disk, `SessionManager.inMemory()` for none). `createAgentSessionRuntime` supports + `newSession`/`switchSession`/`fork`/`importFromJsonl`, i.e. resume and branch by + session. 
In RPC mode, `new_session`/`switch_session`/`fork`/`clone` manage sessions; the + client correlates its own requests with the optional `id` field on each command. +- This matches the design doc's "carry a `session_id`... later have its state stored": + pi already persists session state to JSONL, and you can pass your own `sessionId`. +Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/json.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md, +https://github.com/earendil-works/pi/blob/main/packages/agent/README.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/rpc.md + +--- + +## Instrumentation ("pi instruments") - important nuance + +The design doc says runs are "instrumented with pi instruments." Findings: +- pi core ships **no product literally called "instruments."** Observability is delivered + through the **extension/hooks API** (you can instrument any of `context`, + `before_provider_request`, `after_provider_response`, `tool_call`, `tool_result`, + `agent_start/end`, `turn_start/end`, etc.). +- The mature path is **`pi-otel`**, a community OpenTelemetry extension: + - Install: `pi install npm:pi-otel`; activate `/otel start`. + - Span tree per prompt: `pi.interaction` -> `pi.turn` -> `pi.llm_request` / + `pi.tool.<name>`, with GenAI semantic-convention attributes (model, token counts, + finish reason). + - Metrics: histograms for LLM request latency, token usage (input/output/cache), tool + execution time. + - Structured log events: `pi.session.start`, `pi.session.end`, `pi.tool.error`. + - Config via standard OTel env vars (`OTEL_EXPORTER_OTLP_ENDPOINT`, + `OTEL_EXPORTER_OTLP_HEADERS`) or `.pi/settings.json` `{ "otel": { endpoint, protocol } }`; + `PI_OTEL_DISABLED=1` disables it. +- There is also a proposed (issue-stage) session usage stats sink via `PI_USAGE_DIR`.
+Sources: https://nikiforovall.blog/ai/productivity/2026/05/16/pi-otel.html, +https://github.com/earendil-works/pi/issues/2054, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md + +**Implication for Agenta:** "pi instruments" most likely means "instrument pi via its +hooks (OTel-style)," and Agenta's existing OTel-based tracing/observability can ingest +`pi-otel` OTLP output directly, or Agenta can write its own thin extension that emits +spans on the same hook points. Confirm with the design owner whether "pi instruments" +refers to `pi-otel`, a private Earendil "instruments" API, or just "instrumented via +hooks" - this wording is **UNVERIFIED**. + +--- + +## Local execution parity & swappable harness (design requirements) + +- **Parity:** the same `pi` binary / SDK that runs in the sandbox runs locally; pulling + the agent config (AGENTS.md, skills, model, tools, files, secrets) and starting pi + locally yields the same behavior. The four run surfaces are identical local vs sandbox. + Containerization doc shows host vs container are the same pi. +- **Swappable harness:** because the contract is a thin run surface (RPC JSONL / JSON + events / SDK events), a non-pi harness (e.g. OpenAI Codex) can be slotted behind the + same surface if Agenta defines its harness port against the RPC/event shapes. Within pi, + model/provider swapping (incl. OpenAI) is `getModel`/`registerProvider`/`set_model` - + but "swap the whole harness" is an Agenta-side abstraction over the run surface, not a + pi feature. +Sources: https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/containerization.md, +https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/rpc.md, https://pi.dev/ + +--- + +## Open questions / unknowns + +1. **"pi instruments" exact meaning** - is it `pi-otel`, a private Earendil API, or + "instrument via hooks"? UNVERIFIED. 
Resolve with the design owner; if OTel, wire + `pi-otel` OTLP into Agenta's existing tracing. +2. **Provider-native structured output** - does `pi-ai` `complete()`/`stream()` accept a + `responseFormat`/`jsonSchema` option, or is the terminating-tool pattern the only + supported route? UNVERIFIED; confirm in `packages/ai` source. +3. **Non-image binary artifacts** - `pi-agent-core` advertises "attachment support," but + only the `image` content block is documented. How are arbitrary file/binary artifacts + returned (vs. written to the workspace and collected from disk)? UNVERIFIED; confirm in + `packages/agent`/`packages/ai` source. +4. **Daytona specifically** - pi documents Gondolin / Docker / OpenShell, not Daytona. The + Daytona port is Agenta's to build (lay files -> set secrets -> `pi --mode rpc`); no pi + Daytona integration exists today. +5. **Skills config -> pi** - how Agenta's stored "skills" map to pi skills (loaded via + `resources_discover` skillPaths and `~/.pi/agent` layout) needs a concrete mapping; + read `docs/settings.md` and the skills section of the SDK/extensions docs. +6. **Exact `agent_end.messages` schema** for storing multi-message output - capture the + precise `AgentMessage`/content-block JSON (read `packages/agent` types) before + designing Agenta's storage shape. +7. **Version pinning** - researched against `0.79.4`. The API is pre-1.0 and moving (RPC + command names, event names, hook names may change between minors); pin a version and + re-verify against that tag's docs before implementing. 
+ +## Sources + +- https://pi.dev/ (and https://pi.dev/docs/latest) +- https://github.com/earendil-works/pi (repo root, package layout) +- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/sdk.md +- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/extensions.md +- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/rpc.md +- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/json.md +- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/models.md +- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/docs/containerization.md +- https://github.com/earendil-works/pi/blob/main/packages/coding-agent/examples/extensions/structured-output.ts +- https://github.com/earendil-works/pi/blob/main/packages/agent/README.md +- https://github.com/earendil-works/pi/blob/main/packages/ai/README.md +- https://registry.npmjs.org/@earendil-works/pi-coding-agent (and /pi-ai, /pi-agent-core) - version, license, bin, deps +- https://nikiforovall.blog/ai/productivity/2026/05/16/pi-otel.html (pi-otel OTel extension) +- https://github.com/earendil-works/pi/issues/2054 (PI_USAGE_DIR usage stats proposal) +- https://deepwiki.com/earendil-works/pi (and /7.1-pi-coding-agent-sdk, /6.3-extension-examples-and-patterns) diff --git a/docs/design/agent-workflows/scratch/research/sandbox-sharing.md b/docs/design/agent-workflows/scratch/research/sandbox-sharing.md new file mode 100644 index 0000000000..9c8ffbaded --- /dev/null +++ b/docs/design/agent-workflows/scratch/research/sandbox-sharing.md @@ -0,0 +1,359 @@ +# Sandbox sharing: one sandbox for all agents, or one per agent? + +Status: research. Source of the question: the product owner wants v1 to mirror today's +prompt-style workflows, which run against one shared runtime/service rather than one per +workflow. The proposed shortcut is "reuse the same sandbox but connect it to a different +volume at each execution." 
+ +This file answers: can we reuse one Daytona sandbox across many agent executions, can the +mounted volume change per execution, how do we isolate executions in a shared sandbox, +what is the concurrency model, how does pi.dev view sessions, and what should v1 actually do. + +## Summary + +- **Reusing one long-lived sandbox: yes, supported.** A Daytona sandbox is designed for + long-lived reuse across many tasks, and the Process API provides both stateless one-off + `exec()` / `code_run()` and stateful named **Sessions** (`create_session` / + `execute_session_command` / `delete_session`) for running many independent command + streams in one sandbox. [daytona-sandboxes][daytona-process] +- **Swapping a different volume per execution: NO.** Daytona volumes are mounted **only at + sandbox creation** via `CreateSandboxFromSnapshotParams(volumes=[...])`. They cannot be + attached, detached, or changed on a running sandbox. Changing the mount requires + recreating the sandbox. The canonical docs say so explicitly. So the literal + "reuse the sandbox, attach a different volume each run" idea is **not feasible in + Daytona today.** [daytona-volumes][daytona-volumes-src] +- **Closest workable equivalent to "a volume per execution" without recreating the + sandbox:** give each execution its own **working directory** (e.g. + `/runs/<session_id>/`) and lay its config/files/secrets there per run, optionally with a + per-run OS user. That is the per-exec isolation lever in a shared sandbox, not volumes. + If you genuinely need a persistent named volume per agent, that belongs to the + sandbox-per-agent model, where `subpath` on one shared volume gives per-agent isolation + at create time. [daytona-process][daytona-volumes] +- **Isolation in a shared sandbox is weak by default.** All sessions and execs in one + sandbox share one kernel, one filesystem, one process table, one network stack, and one + set of OS env vars.
Filesystem bleed, leftover processes, and secret bleed are real and + must be managed by convention (per-run dirs, per-command `env`, cleanup), not by the + platform. Daytona's own positioning is "isolated sandbox **per execution**" for safety. + [daytona-sandboxes][daytona-blog-best] +- **Concurrency is bounded and shares resources.** One sandbox defaults to 1 vCPU / 1 GiB + RAM (max 4 vCPU / 8 GiB), and an org's *total* active-sandbox budget is 4 vCPU / 8 GiB / + 10 GiB. Many agent runs can be launched as concurrent sessions in one sandbox, but they + contend for that single sandbox's CPU/RAM/disk and can step on each other's files. + Daytona has an open issue to add a Parallel Sandbox Execution API precisely because one + sandbox is not a clean unit for parallel independent workflows today. + [daytona-sandboxes][daytona-parallel-issue] +- **pi.dev does not need a dedicated machine per session, only a distinct session file and + working dir.** pi stores each session as a JSONL tree file; the SDK lets you point each + session at its own `cwd`, its own session file (`SessionManager.open(path)`), or its own + `agentDir`, and run in `--mode rpc --no-session`. So multiple pi sessions can coexist in + one environment as long as each gets its own directory/session file. This maps cleanly + onto "per-run working directory inside one shared sandbox." [pi-sdk][pi-docs] +- **Recommendation for v1:** one shared, long-lived sandbox for all agents, isolation by + **per-run working directory + per-command env + cleanup**, NOT by per-run volumes. + Treat the volume-per-execution idea as not feasible and substitute per-run dirs. + Serialize or cap concurrency on the shared sandbox. Keep the sandbox-provider port + abstraction so the migration to **sandbox-per-agent / sandbox-per-run** (with a + per-agent volume via `subpath` at create time) is a config swap, not a rewrite. 
+ +## Reusing one sandbox (sessions / exec model) + +Daytona explicitly designs sandboxes for long-lived reuse: they keep filesystem state +across stop/start, can be archived and restored, and resized without recreation. +[daytona-sandboxes] Agenta already has the integration scaffolding: `DaytonaConfig` in +`api/oss/src/utils/env.py` carries `DAYTONA_API_KEY`, `DAYTONA_API_URL`, +`DAYTONA_SNAPSHOT`, `DAYTONA_TARGET`, which tells us the plan is snapshot-based sandbox +creation. + +The Process API gives two execution modes inside one sandbox: + +- **One-off, stateless:** `exec(command, cwd=None, env=None, timeout=None)` and + `code_run(code, params=None, timeout=None)`. Each invocation starts fresh; good for + isolated commands. Both accept per-call `cwd` and `env`. [daytona-process] +- **Stateful Sessions:** named background sessions that persist state across commands. + [daytona-process] + +Python session example (verbatim shape from the docs): [daytona-process-src] + +```python +session_id = "interactive-session" +sandbox.process.create_session(session_id) + +command = sandbox.process.execute_session_command( + session_id, + SessionExecuteRequest( + command="pip uninstall requests", + run_async=True, + ), +) +# later +sandbox.process.get_session(session_id) # status + command history +sandbox.process.delete_session(session_id) # cleanup +``` + +`SessionExecuteRequest` fields: `command` and `run_async` (Python) / `runAsync` (TS). +[daytona-process-src] Sessions are the natural home for one agent run: create a session +per run keyed by `session_id`, fire the harness command, monitor it, delete the session +when done. Many sessions can live in one sandbox at once. + +**Keeping the shared sandbox alive.** A running sandbox auto-stops after +`autoStopInterval` (default 15 min). Critically, **internal/background processes do NOT +reset the timer** — only lifecycle changes, preview network requests, active SSH, and +Toolbox SDK calls do. 
For an always-on shared sandbox, set `autoStopInterval: 0` or call +`sandbox.refreshActivity()` periodically. [daytona-sandboxes] + +## Volumes — can they change per execution? + +**No.** This is the central finding and it kills the literal proposal. + +> "Once a volume is created, it can be mounted to a sandbox by specifying it in the +> `CreateSandboxFromSnapshotParams` object." [daytona-volumes-src] + +Volumes mount **only at sandbox creation**. There is no API to attach/detach or swap a +volume on a running sandbox; the docs describe mounting exclusively through the create +params, and contain no running-sandbox mount operation. Changing what is mounted requires +**recreating** the sandbox. [daytona-volumes][daytona-volumes-src] + +Mounting example (Python): [daytona-volumes] + +```python +from daytona import CreateSandboxFromSnapshotParams, Daytona, VolumeMount + +daytona = Daytona() +volume = daytona.volume.get("my-volume", create=True) + +params = CreateSandboxFromSnapshotParams( + language="python", + volumes=[ + VolumeMount( + volume_id=volume.id, + mount_path="/home/daytona/volume", + subpath="users/alice", # optional per-tenant prefix + ) + ], +) +sandbox = daytona.create(params) +``` + +`VolumeMount` fields: `volume_id`, `mount_path` (absolute, not `/`, not a system dir like +`/proc`, `/etc`, `/bin`...), and optional `subpath`. [daytona-volumes][daytona-volumes-src] + +Other volume facts that matter: + +- **Persistence:** "The volume will persist even after the sandbox is removed." Good for + producer/consumer state across sandbox lifecycles. [daytona-volumes-src] +- **`subpath` isolation:** a sandbox mounted at `users/alice` cannot reach `users/bob` via + `../bob`; isolation is at the FUSE mount boundary. This is the supported way to give each + *sandbox* (created per agent/run) its own slice of one shared volume — but again, only at + create time. 
[daytona-volumes][daytona-volumes-src] +- **FUSE limits:** volumes are FUSE mounts — slower than local disk, not usable for block + storage (e.g. DB files), and "not transactional": concurrent writes to the same path are + last-write-wins. [daytona-volumes-src] +- **FUSE permission bugs:** an open issue reports `mv`, repeated `touch`, `stat`, and + `shutil.copystat()` failing with permission errors inside FUSE volumes. This makes + volumes a poor surface for frequent per-run file manipulation even where they do apply. + [daytona-fuse-issue] + +**Conclusion for the question as posed:** "reuse one sandbox, connect a different volume +each execution" is not achievable in Daytona. Volumes are a create-time-only mount. + +### Alternatives to per-execution volumes (in one shared sandbox) + +1. **Per-run working directory (recommended).** Lay each run's config/files/secrets under + `/runs/<session_id>/` (or a temp dir) and run the harness with that as `cwd`. Clean it + up on completion. This is the direct in-sandbox analog of "a different volume per run" + and avoids the FUSE limits entirely. `exec`/`execute_session_command` already take + `cwd`. [daytona-process] +2. **Copy files in/out per run** via the filesystem/Toolbox API, scoped to the per-run dir. +3. **Per-run OS user** for stronger separation (file ownership, home dir) if root isn't + required by the harness. (Standard Linux; UNVERIFIED whether Daytona's default image + permits adding users without extra config.) +4. **Recreate-per-run with a volume** (this is sandbox-per-run, not sandbox-sharing): if a + *persistent* per-agent volume is a hard requirement, create a fresh sandbox per run with + `volumes=[VolumeMount(volume_id, mount_path, subpath="agents/<agent_id>")]`. This is the + migration target, not v1. + +## Isolation in a shared sandbox + +A single Daytona sandbox is "isolated" from *other sandboxes and the host* — it gets a +dedicated kernel, filesystem, network stack, and resource allocation.
[daytona-sandboxes] +But **within** one sandbox there is no isolation between executions. All sessions and execs +share: + +- **One filesystem** — files written by run A are visible to run B unless you scope each + run to its own directory and clean up. Filesystem bleed is the default. +- **One process table** — a leftover/background process from a prior run keeps running + (and does not even reset the auto-stop timer). You must track and kill per-run PIDs. + [daytona-sandboxes] +- **One set of OS environment variables** — sandbox-level env is global. Secret bleed is a + real risk if you `export` a secret. Mitigate by passing secrets per command via the `env` + parameter of `exec` / `execute_session_command` rather than setting them globally, and by + scoping secret files to the per-run dir. [daytona-process] +- **One network stack** — ports and outbound identity are shared. + +Practical isolation recipe for a shared sandbox: + +- Unique `session_id` per run; one Daytona Session per run. +- Per-run working dir `/runs/<session_id>/`; never write run state outside it. +- Pass secrets via per-command `env`, not global exports; keep secret files inside the + per-run dir with tight permissions; delete on completion. +- Explicit cleanup: kill the run's process group, remove the run dir, `delete_session`. +- Optional per-run OS user for ownership separation. + +Even with all of this, one sandbox is a **soft** isolation boundary (shared kernel, Docker +by default). For untrusted agent code or cross-tenant separation, this is weaker than +sandbox-per-run. Daytona's own marketing leans on "isolated sandbox **per execution**" for +exactly this reason, and notes the default Docker isolation is weaker than microVMs. +[daytona-blog-best] + +## Concurrency + +- **Resource budget.** One sandbox defaults to 1 vCPU / 1 GiB / 3 GiB disk, max + 4 vCPU / 8 GiB / 10 GiB. The whole org's active-sandbox budget is also 4 vCPU / 8 GiB / + 10 GiB.
So a single shared sandbox is a small box, and packing many concurrent agent runs + into it means they contend for that fixed slice. [daytona-sandboxes] +- **Mechanically parallel, practically contended.** You *can* open multiple sessions and + run them concurrently in one sandbox, but they share CPU/RAM/disk and the filesystem, so + heavy or untrusted runs can starve or corrupt each other. There is no per-session cgroup + isolation documented. (UNVERIFIED: no documented per-session CPU/memory quota.) +- **Daytona itself flags this gap.** Open issue "Design and Implement Parallel Sandbox + Execution API" states that "developers working on AI agents or multi-threaded workflows + face limitations when trying to run multiple tasks concurrently," and that the current + workaround is "running multiple independent sandboxes manually (inefficient and + resource-heavy)." The proposed fix is forking sandbox state (filesystem + memory) — i.e. + Daytona's answer to parallel independent runs is *more sandboxes*, not more sessions in + one. [daytona-parallel-issue] + +Realistic v1 concurrency model for a shared sandbox: **serialize, or cap to a small N** of +concurrent sessions, each in its own working dir, sized to fit the sandbox's CPU/RAM. If +throughput needs to scale, that is the trigger to move to sandbox-per-run. + +## pi.dev session / workspace model + +pi (by Earendil Inc.) is a minimal, extensible agent harness — the harness Agenta's agent +workflow defaults to. It runs as an interactive TUI, a print/JSON one-shot, an RPC process +(stdin/stdout JSONL), or embedded via a Node SDK. [pi-home][pi-docs] + +Key points for sharing one sandbox: + +- **Sessions are files, not machines.** pi stores each session as a JSONL tree file + (branchable history). It does not require a dedicated host per session. 
[pi-docs] +- **Per-session isolation is by path.** The SDK's `SessionManager` controls where state + lives: `SessionManager.create(cwd)` (new session in a directory), + `SessionManager.continueRecent(cwd)`, `SessionManager.open("/path/to/session.jsonl")` + (explicit file), and `SessionManager.inMemory()` (ephemeral). You can also point at a + different global config via `agentDir`. [pi-sdk] +- **Multiple pi sessions coexist** in one environment by giving each a distinct `cwd`, + distinct session file, and/or distinct `agentDir` — "each combination isolates session + state, credentials, and settings files." [pi-sdk] +- **Context comes from the working dir.** pi loads `AGENTS.md` / `SYSTEM.md` from + `~/.pi/agent/`, parent dirs, and the cwd, so the per-run working dir naturally carries + per-run agent config. [pi-home] +- **Non-interactive runs:** `pi --mode rpc --no-session` (or `runRpcMode(runtime)`) for a + programmatic, sessionless subprocess driven over JSON-RPC. [pi-sdk] + +Implication: pi's design is fully compatible with "one shared sandbox, many runs." Each +agent run = one pi process pointed at its own per-run `cwd` (carrying that run's +`AGENTS.md`, skills, files) and its own session file. pi gives Agenta the per-run state +isolation that Daytona volumes do **not**. Agenta's `session_id` should map to (a) the pi +session file name, (b) the per-run working directory, and (c) the Daytona Session id — +one id threading all three layers. + +## Recommendation for v1 + migration path + +### v1: one shared sandbox, isolation by directory (not by volume) + +1. **One long-lived shared Daytona sandbox** created from `DAYTONA_SNAPSHOT`, with + `autoStopInterval: 0` (or periodic `refreshActivity()`), reused across all agents. + Matches the PO's "one runtime for all" goal and the existing prompt-runtime shared model. +2.
**Per-run isolation by working directory, not volume.** For each run, create + `/runs/<session_id>/`, lay down that agent's config (`AGENTS.md`, skills, files) and + secrets there via startup hooks, and run pi with that dir as `cwd` and its own session + file. The "different volume per execution" intent is satisfied by a different *directory* + per execution. This sidesteps Daytona's create-time-only volume limit and the FUSE + permission/perf problems. [daytona-process][daytona-volumes][daytona-fuse-issue] +3. **One Daytona Session per run**, keyed by `session_id`; secrets passed via per-command + `env`, never global exports. [daytona-process] +4. **Mandatory cleanup** after each run: kill the run's process group, delete the run dir, + `delete_session`. This is what contains filesystem/process/secret bleed in a shared box. +5. **Bounded concurrency:** serialize, or cap to a small N sized to the sandbox's 1–4 vCPU. + [daytona-sandboxes] +6. **Keep the sandbox-provider port thin** so the unit of isolation (shared vs per-run) is + a config choice behind the same interface, as the design doc already anticipates. + +Honest framing for the PO: "one sandbox for all agents" is achievable, but **not by +swapping volumes** — by swapping working directories. The volume idea is the right +*instinct* (per-run isolated storage) attached to the wrong Daytona primitive. Use +directories in v1; use volumes only when you move to per-run/per-agent sandboxes. + +### Migration path to per-agent / per-run sandboxes + +When isolation, security (untrusted code), or concurrency throughput outgrow the shared +box: + +- Flip the provider port from "reuse shared sandbox" to "create sandbox per run." +- At creation, mount a per-agent persistent volume slice with + `VolumeMount(volume_id, mount_path, subpath="agents/<agent_id>")` — this is where the + "volume per agent" idea finally becomes native and correct. [daytona-volumes] +- Optionally enable stronger isolation (Kata/Sysbox) for untrusted code.
+ [daytona-blog-best] +- Lean on snapshot warm-starts to keep per-run create latency low. [daytona-sandboxes] + +Because pi already isolates by `cwd`/session file and `session_id` threads all layers, the +run-orchestration code barely changes between the two models; only the +"get-a-sandbox" step swaps. + +## Open questions + +- **Per-session resource quotas.** Can Daytona cap CPU/RAM/disk per Session (cgroups) + inside one sandbox, or is the only quota the whole-sandbox allocation? Not found in docs + — UNVERIFIED. If none, concurrent runs cannot be resource-isolated within one sandbox. +- **Default image users/permissions.** Does the snapshot image allow adding/switching OS + users per run without root issues? UNVERIFIED. +- **Toolbox filesystem API surface** for laying down per-run files/secrets and reading + outputs (upload/download/permissions) — needs confirmation against the Daytona Toolbox + SDK docs; sibling research on the sandbox port should pin this down. +- **pi `--no-session` vs Agenta `session_id`.** Agenta wants a `session_id` per run for + future state storage; pi can run sessionless (`--no-session`) or with an explicit session + file. Decide whether Agenta persists the pi JSONL session file (per the design doc's + "future session storage") or treats runs as sessionless and stores its own trace. The + design doc's session-storage goal points to keeping pi session files. +- **Concurrency ceiling.** Exact safe N of parallel pi runs in one 1–4 vCPU sandbox needs + empirical testing; treat as serialize-first until measured. +- **Daytona Parallel Sandbox Execution API status.** Issue #4001 is a proposal; if/when it + ships (fork filesystem+memory), it could change the cheapest path for parallel runs. 
+ [daytona-parallel-issue] + +## Sources + +- [daytona-sandboxes] Daytona — Sandboxes (lifecycle, states, auto-stop/archive/delete, + refreshActivity, resource limits, per-sandbox isolation): + https://www.daytona.io/docs/en/sandboxes/ +- [daytona-process] Daytona — Process and Code Execution (exec/code_run vs Sessions, cwd, + env, create/execute/get/delete session): https://www.daytona.io/docs/en/process-code-execution/ +- [daytona-process-src] Daytona docs source — process-code-execution.mdx (verbatim session + example, SessionExecuteRequest fields): + https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/process-code-execution.mdx +- [daytona-volumes] Daytona — Volumes (creation, VolumeMount, mount_path/subpath, FUSE, + mounting via CreateSandboxFromSnapshotParams): https://www.daytona.io/docs/en/volumes/ +- [daytona-volumes-src] Daytona docs source — volumes.mdx (verbatim "mounted at creation", + persistence, FUSE not transactional, last-write-wins): + https://github.com/daytonaio/daytona/blob/main/apps/docs/src/content/docs/en/volumes.mdx +- [daytona-fuse-issue] Daytona GitHub issue #3331 — FUSE volume permission limitations + (mv/touch/stat/copystat failures): https://github.com/daytonaio/daytona/issues/3331 +- [daytona-parallel-issue] Daytona GitHub issue #4001 — Design and Implement Parallel + Sandbox Execution API (fork filesystem+memory; current workaround = many sandboxes): + https://github.com/daytonaio/daytona/issues/4001 +- [daytona-blog-best] Northflank — "Best code execution sandbox for AI agents 2026" + (isolated sandbox per execution; Docker-default isolation weaker than microVMs): + https://northflank.com/blog/best-code-execution-sandbox-for-ai-agents +- [pi-home] pi.dev — product overview (harness, modes, AGENTS.md/SYSTEM.md context): + https://pi.dev +- [pi-docs] pi.dev — docs index (session tree, JSONL session format, RPC/SDK modes): + https://pi.dev/docs/latest +- [pi-sdk] pi.dev — SDK/RPC 
(SessionManager.create/continueRecent/open/inMemory, cwd, + agentDir, runRpcMode, `--mode rpc --no-session`): https://pi.dev/docs/latest/sdk +- Agenta repo — `api/oss/src/utils/env.py` `DaytonaConfig` (DAYTONA_API_KEY, + DAYTONA_API_URL, DAYTONA_SNAPSHOT, DAYTONA_TARGET). +- Agenta repo — `docs/design/agent-workflows/README.md` (agent workflow context, sandbox + + pi harness + session_id) and `docs/design/prompt-runtime-unification/README.md` (existing + shared prompt runtime model). diff --git a/docs/design/agent-workflows/scratch/sdk-local-backend/status.md b/docs/design/agent-workflows/scratch/sdk-local-backend/status.md new file mode 100644 index 0000000000..1f5506fbec --- /dev/null +++ b/docs/design/agent-workflows/scratch/sdk-local-backend/status.md @@ -0,0 +1,81 @@ +# Status: SDK-owned agent runtime + local backend + +Source of truth for this effort and the handoff for whoever continues it. This is the only +page in `docs/design/agent-workflows/` that describes things that do not fully exist yet; the +design pages describe only what is built. + +## What this effort is + +Two things, layered on the agreed three-layer port redesign (Backend / Environment / Harness +plus neutral and per-harness configs): + +1. Move the neutral agent runtime out of the service and into the published Python SDK, so an + SDK user can download an agent config and run it locally with no Agenta backend. +2. Add a `LocalBackend` that runs a harness on the user's own machine (Pi via a bundled JS + runner, Claude via the Python `claude-agent-sdk`). 
+ +## Current state (2026-06-18) + +### Done and verified (by import + wire-equivalence; live `/invoke` not re-run, see below) + +- **SDK runtime** at `sdks/python/agenta/sdk/agents/`, hexagonal layout: + - `dtos.py` — Pydantic data contracts: `AgentConfig` (+ `from_params`), `RunSelection`, + `SessionConfig`, `Message`, `ContentBlock`, `AgentEvent`, `AgentResult`, + `HarnessCapabilities`, `HarnessType`, `TraceContext`, `ToolCallback`, + `HarnessAgentConfig` + `PiAgentConfig` / `ClaudeAgentConfig`. + - `interfaces.py` — the ports (ABCs): `Backend`, `Environment`, `Sandbox`, `Session`, + `Harness`. + - `errors.py` — `UnsupportedHarnessError`. + - `adapters/rivet.py` — `RivetBackend` (engine hard-coded `rivet`; pi + claude; `sandbox` + kwarg) + `RivetSandbox` / `RivetSession`. + - `adapters/in_process.py` — `InProcessPiBackend` (engine hard-coded `pi`; pi only, local + only; the reference backend) + its sandbox/session. + - `adapters/local.py` — `LocalBackend`, STUB (raises `NotImplementedError`). + - `adapters/harnesses.py` — `PiHarness`, `ClaudeHarness`, `make_harness`; this holds the + per-harness adaptation (tool-spec normalization; Pi keeps built-ins and forces + `permissionPolicy=auto`; Claude drops built-ins and honors the policy). + - `utils/wire.py` — `request_to_wire` / `result_from_wire` (the `/run` shape). + - `utils/ts_runner.py` — `deliver_http` / `deliver_subprocess`. +- **Public surface**: `ag.AgentConfig`, `ag.SessionConfig`, `ag.RunSelection`, + `ag.Environment`, `ag.RivetBackend`, `ag.InProcessPiBackend`, `ag.LocalBackend`, + `ag.PiHarness`, `ag.ClaudeHarness`, `ag.make_harness`. `ag.Message` is deliberately the + prompt type (unchanged); import the agents `Message` from `agenta.sdk.agents`. +- **Service rewired**: `services/oss/src/agent/app.py` builds `AgentConfig.from_params` + + `RunSelection`, picks a backend via `select_backend`, runs through `Environment` + + `make_harness`. 
`tools.py` / `tracing.py` import the SDK `ToolCallback` / `TraceContext`. + `services/oss/src/agent/inputs.py` and the whole `services/oss/src/harness/` package were + deleted (their content now lives in the SDK). +- The full `_agent` handler emits a `/run` payload byte-identical to the previous one, so the + TypeScript runner (`services/agent/`) is unchanged. `ruff format` + `ruff check` pass. + +### Not done yet (take over here) + +- **`LocalBackend` (the new feature).** Two mechanisms, one per harness: + - Pi → bundled JS runner. Needs a `pnpm` build step that bundles the in-process Pi engine + to a single JS file shipped inside the `agenta` wheel, and `LocalBackend` invoking it + with `node`. (Decision: bundle prebuilt JS in the wheel.) + - Claude → the pure-Python `claude-agent-sdk`, in-process, no TS bridge. (Decision: use + `claude-agent-sdk`, not a TS engine.) + Both need build/dependency setup to verify, which is why they are not started. +- **Live verification.** Everything above is verified by import + wire-equivalence only. A + real `/invoke` run on the dev stack (pi+local, rivet+pi, rivet+claude, rivet+pi+daytona) + has NOT been re-run since the refactor. Do this before treating the rewrite as shipped; see + the `debug-local-deployment` skill. + +## Locked decisions + +- Vocabulary follows `api/`: `dtos.py` (data), `interfaces.py` (ports/ABCs), `adapters/` + (implementations). A port is an interface; an adapter is an implementation. +- Backends are NOT a class hierarchy. Each hard-codes its engine id and supported harnesses; + they share only the `utils` functions. `InProcessPiBackend` is the reference backend. +- DTOs are Pydantic. +- `Harness` (not the backend) owns the per-harness adaptation logic, especially tools. +- Sandbox is a backend/environment concern, not a `SessionConfig` field. +- The TS runner and the `/run` wire stay unchanged. + +## Dependency direction + +`service -> SDK`, never the reverse. The SDK runtime never calls the Agenta API. 
The service +resolves tools (`/tools/resolve`), vault secrets (`/secrets/`), and the trace context, and +hands the SDK already-resolved data on the `SessionConfig`. A standalone SDK user resolves +their own (env keys, their own tools, no tracing) and uses `LocalBackend`. diff --git a/docs/design/agent-workflows/scratch/wp-1-pi-tracing/README.md b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/README.md new file mode 100644 index 0000000000..0e0d1ee46a --- /dev/null +++ b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/README.md @@ -0,0 +1,73 @@ +# WP-1: Tracing Pi in Agenta + +Status: done. Working code in [`poc/`](poc/). To embed it in the agent runtime, follow +[`integrating-the-tracing-extension.md`](integrating-the-tracing-extension.md). + +## Goal + +Install Pi locally, run an agent, and get its telemetry into Agenta as a clean, structured +trace. Success looks like: a local Pi run shows up in Agenta observability as a sensible +span tree (session at the root, turns under it, LLM calls and tool calls as child spans) +with token usage and timings intact. + +## Scope + +In: + +- Run Pi locally (`@earendil-works/pi-coding-agent`), pin an exact version. +- A Pi extension on the `pi.on(...)` event bus that converts lifecycle events + (`session_start`, `turn_*`, `before_provider_request`/`after_provider_response`, + `tool_execution_*`, `message_*`) into OTel spans. +- Export OTLP/HTTP protobuf to Agenta's `POST /otlp/v1/traces`. +- Make the span tree read well in Agenta's UI. + +Out (later work packages): + +- Running inside Daytona. Local only here. +- The agent service itself (that is WP-2). This WP produces the tracing extension that + WP-2 later embeds. + +## Approach (grounded in research) + +See [`../research/otel-instrumentation.md`](../research/otel-instrumentation.md) and +[`../research/pi-interaction.md`](../research/pi-interaction.md). + +- Pi emits no OTel on its own. 
Either adopt/fork a community extension (`pi-otel*`) or write + our own on the event bus. Writing our own is likely cleaner since we control the span + shape. +- Emit OTel GenAI semantic conventions (`gen_ai.*`) plus `openinference.span.kind` + (AGENT / CHAIN / LLM / TOOL) so Agenta types the nodes correctly. Agenta's adapter + registry already understands both. +- Export over OTLP/HTTP protobuf with `Authorization: ApiKey <api_key>` and `?project_id=<project_id>`. + +## Known gotchas to handle + +- **Token attribute drift.** Pi-style extensions emit `gen_ai.usage.input_tokens` / + `output_tokens`, but Agenta's `semconv.py` maps the older + `prompt_tokens` / `completion_tokens` / `total_tokens`. Either normalize in the extension + or add aliases in Agenta; otherwise token metrics drop silently. +- **Transport.** Agenta accepts OTLP/HTTP protobuf only: not the gRPC default, not JSON-OTLP. + Configure the exporter accordingly. +- **Trace-context propagation.** Whether a W3C `traceparent` is threaded into the run so + in-sandbox spans nest under an originating backend span is UNVERIFIED. Confirm during this + WP. + +## Definition of done + +- A local Pi run produces one trace in Agenta with a coherent span tree. +- LLM and tool spans are typed correctly and carry model, latency, and token usage. +- No silently dropped attributes (token usage in particular is present). +- The exporter config (endpoint, auth, project) is injected, not hard-coded, so it carries + over to the sandboxed and service contexts later. + +## Open questions + +- Adopt a community `pi-otel` extension or write our own? Lean: write our own. +- Final span-tree shape to standardize on (session vs interaction root naming). +- Does Agenta forward `traceparent` into an invocation for nesting?
+ +## Links + +- [`../research/otel-instrumentation.md`](../research/otel-instrumentation.md) +- [`../research/pi-interaction.md`](../research/pi-interaction.md) +- [Project README](../README.md) diff --git a/docs/design/agent-workflows/scratch/wp-1-pi-tracing/integrating-the-tracing-extension.md b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/integrating-the-tracing-extension.md new file mode 100644 index 0000000000..dbb2c72a50 --- /dev/null +++ b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/integrating-the-tracing-extension.md @@ -0,0 +1,187 @@ +> **Historical record.** This is a work-package note. It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md). +# Integrating the Pi tracing extension into the agent runtime + +Status: ready to integrate. Audience: whoever builds the Dockerized Pi agent runtime +(WP-2 service, WP-3 sandbox). Source of the working code: [`poc/`](poc/). + +## What this gives you + +A Pi extension that turns Pi's `pi.on(...)` lifecycle events into OpenTelemetry spans and +ships them to Agenta over OTLP/HTTP protobuf. Once it is loaded, every agent run shows up +in Agenta observability as a clean span tree with inputs, outputs, token usage, cost, and +latency, and runs in the same session are grouped by `session.id`. + +It is one self-contained file, `poc/agenta-otel.ts`. Copy it into the runtime as is. It is +written to be embedded, not just demoed. `poc/run.ts` is only an example driver; you will +write your own runner, but you can copy its wiring. + +This was verified end to end against the dev box: complex multi-tool runs, parallel tool +calls, structured returns, and multi-prompt sessions all trace correctly, and the agent +root reports the correct whole-run token and cost totals. 
+ +## The span tree it produces + +``` +invoke_agent openinference.span.kind = AGENT (root, one per user prompt) + turn N CHAIN + chat LLM model, latency, token usage, finish reason, messages + execute_tool TOOL args in, result out +``` + +Agenta types nodes from `openinference.span.kind` (AGENT to agent, CHAIN to chain, LLM to +chat, TOOL to tool) and groups sessions from `session.id`. No backend change is needed. + +## How to wire it in + +The runtime is Node embedding Pi through the SDK, so use the SDK path. It is the one the +extension is built for, and it is the only path where session id and model name reach the +spans. + +```ts +import { + AuthStorage, createAgentSession, DefaultResourceLoader, + getAgentDir, ModelRegistry, SessionManager, +} from "@earendil-works/pi-coding-agent"; +import agentaOtel, { runConfig, shutdownTracing } from "./agenta-otel"; + +const loader = new DefaultResourceLoader({ + cwd, + agentDir: getAgentDir(), + extensionFactories: [agentaOtel], // <-- register the extension in-process +}); +await loader.reload(); + +const { session } = await createAgentSession({ + cwd, model, authStorage, modelRegistry, + tools: ["read", "bash", "edit", "write", "ls"], + sessionManager: SessionManager.inMemory(cwd), + resourceLoader: loader, +}); + +// Hand the session id and model to the extension so spans carry them. +runConfig.sessionId = session.sessionId; +runConfig.provider = model.provider; +runConfig.requestModel = model.id; + +await session.prompt(userPrompt); // run one or more prompts in the session +// ... +await shutdownTracing(); // flush before the process or container exits +``` + +If you instead run Pi from the CLI (`pi -e ./agenta-otel.ts ...`), the extension still +emits spans and flushes on `session_shutdown`, but `runConfig` is never set, so spans lose +`session.id` and the model name in the span title. Prefer the SDK path. + +## What you must not change, and why + +These five choices are load bearing. 
They were each found by reading how Agenta ingests +and normalizes spans. Changing them silently drops data. + +1. **Atomic, parent-first export per trace.** The extension uses a small custom + `TraceBatchProcessor`, not the OTel `BatchSpanProcessor`. It buffers a trace and exports + all of its spans in one OTLP request when the root span ends, ordered parent before + child. Agenta rolls token and cost totals up the tree by sorting spans on + millisecond-resolution `start_time` and attaching a span only once its parent is + present. The default batch processor splits long runs on its 5 second timer, and + same-millisecond siblings (`agent_start` and `turn_start` fire in the same millisecond) + tie and drop a subtree. Either one makes the agent root undercount, showing only the + last turn instead of the whole run. Keep the custom processor. + +2. **`ag.data.inputs` must be a JSON object.** Agenta moves any non-object input to + `ag.unsupported`. The agent and tool spans emit `input.value` as a JSON object. The chat + span emits OpenInference `llm.input_messages.*` and `llm.output_messages.*` so it renders + as a real message thread. Do not emit a raw string as `input.value`. + +3. **Both token naming conventions.** The extension writes token usage under the current + GenAI names (`gen_ai.usage.input_tokens` / `output_tokens`) and the legacy names + (`prompt_tokens` / `completion_tokens`). Agenta's default `semconv.py` only maps the + legacy names today. Emit both or token metrics drop. + +4. **`openinference.span.kind` on every span.** This is what types the node in the UI. + +5. **`session.id` and `gen_ai.conversation.id` on the root.** Both map to `ag.session.id`, + which groups runs into a session. Set them from the Pi `sessionId`. + +## Configuration + +All config is read from the environment at first use, so set it before the first run. + +| Env var | Meaning | +|---|---| +| `AGENTA_HOST` | Agenta base URL, for example `http://144.76.237.122:8280`. 
A trailing slash is stripped. | +| `AGENTA_API_KEY` | Agenta project API key. The project is resolved from the key, so no `project_id` is needed. | +| `PI_OTEL_CAPTURE_CONTENT` | Set to `0` to drop prompts, completions, and tool I/O from spans. Default is on. | +| `OTEL_SERVICE_NAME` | Resource `service.name`, default `pi-agent`. | + +The exporter posts to `${AGENTA_HOST}/api/otlp/v1/traces`. Note the `/api` prefix. The +transport is OTLP/HTTP protobuf only (`@opentelemetry/exporter-trace-otlp-proto`), with +header `Authorization: ApiKey ${AGENTA_API_KEY}`. JSON OTLP and gRPC are rejected. + +These are the same env vars whether the runtime runs locally or in a container, which keeps +local and server behavior identical. + +## Dockerized runtime notes + +- **Inject the two Agenta env vars** (`AGENTA_HOST`, `AGENTA_API_KEY`) into the container as + secrets at start. They are separate from the LLM provider credentials. +- **Allow outbound network** from the sandbox to the Agenta host over HTTP or HTTPS. +- **Flush before the container exits.** Call `shutdownTracing()` at the end of the run. The + per-trace processor already exports each trace when its root span ends, so a completed + trace is usually shipped mid-run, but a final flush guards the last trace. If the + container is killed before the flush, the last trace can be lost. If you cannot call + `shutdownTracing()`, make sure `SIGTERM` triggers Pi's `session_shutdown`, which the + extension also flushes on. +- **Node 22 or newer** is required by Pi 0.79.4. +- **LLM auth in the sandbox is your concern, not the tracing extension's.** The interactive ChatGPT + Codex login used in the POC is local only. In the container use a non-interactive + credential (an API key or a transplanted token). +- **Trace-context propagation across the boundary is already done in the WP-2 service.** The agent service + threads a W3C `traceparent` into the run and starts the agent span as a child of the + Agenta `/invoke` span, so the whole agent run is part of the response trace.
See + [tracing-in-the-agent-service.md](tracing-in-the-agent-service.md). Standalone runs (no + `traceparent`) still create their own root and correlate by `session.id`. + +## Dependencies + +Pin these in the runtime image (the OTel versions are a known-compatible set): + +``` +@earendil-works/pi-coding-agent 0.79.4 +@opentelemetry/api 1.9.0 +@opentelemetry/exporter-trace-otlp-proto 0.54.0 +@opentelemetry/resources 1.28.0 +@opentelemetry/sdk-trace-base 1.28.0 +@opentelemetry/sdk-trace-node 1.28.0 +@opentelemetry/semantic-conventions 1.28.0 +``` + +## How to verify it works + +1. On startup you should see `[agenta-otel] exporting spans to .../api/otlp/v1/traces`. +2. After a run, fetch the trace and check the tree and totals: + ``` + curl -s "${AGENTA_HOST}/api/spans/?trace_id=" -H "Authorization: ApiKey ${AGENTA_API_KEY}" + ``` + Expect `invoke_agent` (agent) over `turn N` (chain) over `chat` (chat) and + `execute_tool` (tool). Expect `ag.data.inputs` and `ag.data.outputs` on the agent, chat, + and tool spans, and nothing under `ag.unsupported`. Expect the agent root's + `ag.metrics.tokens.cumulative` to equal the sum of the chat spans' incrementals. +3. Or open Agenta observability and confirm the trace reads well and the root shows the + full-run token count and cost. 
+ +## Reference: attributes per span + +| Span | Key attributes the extension sets | +|---|---| +| `invoke_agent` (AGENT) | `openinference.span.kind=AGENT`, `gen_ai.operation.name=invoke_agent`, `session.id`, `gen_ai.conversation.id`, `input.value` as `{prompt}`, `output.value` final text | +| `turn N` (CHAIN) | `openinference.span.kind=CHAIN`, `pi.turn.index` | +| `chat ` (LLM) | `openinference.span.kind=LLM`, `gen_ai.system`, `gen_ai.request.model`, `gen_ai.response.model`, `gen_ai.response.finish_reasons`, `gen_ai.usage.{input,output,prompt,completion,total}_tokens`, `llm.input_messages.*`, `llm.output_messages.*` | +| `execute_tool ` (TOOL) | `openinference.span.kind=TOOL`, `gen_ai.tool.name`, `gen_ai.tool.call.id`, `input.value` as the args object, `output.value` the result | + +## One known gap, not on the agent side + +The Agenta Sessions tab groups our `session.id` correctly, and the per-session API +(`POST /api/traces/query` filtering `ag.session.id`) returns the right traces with costs, +but the Sessions table's aggregate columns render empty on the current dev build. The data +is correct and queryable. This is a frontend rendering gap, not something the instrumentation +or the runtime can fix. diff --git a/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/.env.example b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/.env.example new file mode 100644 index 0000000000..a1ca16a17b --- /dev/null +++ b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/.env.example @@ -0,0 +1,7 @@ +# Agenta collector (the runner also falls back to the repo-root .env.test.local). 
+AGENTA_HOST=http://144.76.237.122:8280/ +AGENTA_API_KEY=your-agenta-project-api-key + +# Optional: +# PI_OTEL_CAPTURE_CONTENT=0 # drop prompt/response/tool I/O from spans +# OTEL_SERVICE_NAME=pi-agent diff --git a/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/README.md b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/README.md new file mode 100644 index 0000000000..8d78fc4532 --- /dev/null +++ b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/README.md @@ -0,0 +1,86 @@ +# WP-1 POC: trace the Pi agent harness into Agenta + +Installs [Pi](https://pi.dev) locally, runs a small tool-using agent, and exports the +run to Agenta observability as a clean OpenTelemetry trace. + +## What's here + +- `agenta-otel.ts` — the deliverable: a Pi extension that turns `pi.on(...)` lifecycle + events into OTel spans and exports them (OTLP/HTTP protobuf) to Agenta. WP-2 embeds + this file as-is. +- `run.ts` — a runner that registers the extension in-process and drives one prompt. + +## Span tree + +``` +invoke_agent (openinference.span.kind = AGENT, carries session.id) + turn N (CHAIN) + chat (LLM — model, latency, token usage, finish reason) + execute_tool (TOOL — args + result) +``` + +Token usage is emitted under both the current (`input_tokens`/`output_tokens`) and +legacy (`prompt_tokens`/`completion_tokens`) GenAI names, so Agenta maps it regardless +of which adapter claims the span. + +## Setup + +```bash +pnpm install --ignore-workspace +``` + +### Authenticate Pi (one time) + +The runner uses `~/.pi/agent/auth.json`. Log in with your ChatGPT subscription — no API +key, no per-token billing: + +```bash +pnpm exec pi # opens the TUI +/login # choose "ChatGPT Plus/Pro (Codex)", finish the browser OAuth +# then quit the TUI +``` + +Alternatively, export `OPENAI_API_KEY` or `ANTHROPIC_API_KEY`. 
+ +### Credentials for Agenta + +The runner reads `AGENTA_HOST` / `AGENTA_API_KEY` from a local `.env` (see `.env.example`) +or, failing that, from the repo-root `.env.test.local`. + +## Run + +```bash +pnpm start # uses gpt-5.5 by default +PI_MODEL=gpt-5.4 pnpm start # pick another available model +``` + +The runner prints the `trace_id` and a `/api/spans/?trace_id=...` fetch URL on exit. +Then open Agenta observability and find the `invoke_agent` trace. + +> Note: `gpt-5.3-codex-spark` is **not** usable on a ChatGPT (Codex) login — it 400s. +> Use `gpt-5.5` / `gpt-5.4`. + +## Verified mapping (Agenta conventional semantics) + +A run produces a coherent tree that types and maps correctly: + +``` +invoke_agent (agent) ag.data.inputs={prompt}, ag.data.outputs=text, ag.session.id, cumulative tokens + turn N (chain) + chat (chat) ag.data.inputs.prompt[] + ag.data.outputs.completion[] (OpenInference + messages), ag.meta.request.model, incremental token usage + execute_tool (tool) ag.data.inputs={args}, ag.data.outputs=result +``` + +Two things make the data land in `ag.data` instead of `ag.unsupported`: +`ag.data.inputs` must be a **JSON object** (Agenta exiles non-dict inputs), so the agent and +tool spans emit `input.value` as JSON; the chat span emits OpenInference +`llm.input_messages.*` / `llm.output_messages.*` so it renders as a message thread. Token +usage is emitted under both the new (`input_tokens`) and legacy (`prompt_tokens`) names. + +A third thing makes the **agent-root token/cost totals correct**: Agenta rolls metrics up +its span tree by sorting on millisecond-resolution `start_time` and attaching a span only +once its parent is present. Same-millisecond siblings (e.g. `agent_start`/`turn_start`) +tie and can drop a subtree from the roll-up. So the extension buffers each trace and +exports it in one OTLP batch when the root span ends, ordered **parent-first** — without +this, a multi-turn agent root undercounts (shows only the last turn's tokens/cost). 
diff --git a/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/agenta-otel.ts b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/agenta-otel.ts new file mode 100644 index 0000000000..a11d959d36 --- /dev/null +++ b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/agenta-otel.ts @@ -0,0 +1,414 @@ +/** + * agenta-otel — a Pi extension that turns Pi's `pi.on(...)` lifecycle events into + * OpenTelemetry spans and exports them (OTLP/HTTP protobuf) to Agenta. + * + * Span tree (one per user prompt): + * invoke_agent (openinference.span.kind = AGENT) + * turn N (CHAIN) + * chat (LLM) — the provider request for that turn + * execute_tool (TOOL) — each tool the turn ran + * + * Agenta's OpenInference adapter types nodes off `openinference.span.kind` + * (AGENT->agent, CHAIN->chain, LLM->chat, TOOL->tool) and `session.id` -> + * `ag.session.id`. Token usage is emitted under BOTH the legacy + * (`prompt_tokens`/`completion_tokens`) and current + * (`input_tokens`/`output_tokens`) GenAI names so it maps regardless of which + * Agenta adapter claims the span. + * + * Works two ways with the same file: + * - SDK: pass the default export to DefaultResourceLoader.extensionFactories, + * then call shutdownTracing() after the run to flush (see run.ts). + * - CLI: `pi -e ./agenta-otel.ts`; the session_shutdown handler flushes on exit. 
+ * + * Config (read lazily so the runner can load .env first): + * AGENTA_HOST, AGENTA_API_KEY — exporter endpoint + auth (host defaults to https://cloud.agenta.ai; key required) + * PI_OTEL_CAPTURE_CONTENT=0 — disable prompt/response/tool I/O capture + * OTEL_SERVICE_NAME — resource service.name (default "pi-agent") + */ +import type { ExtensionAPI } from "@earendil-works/pi-coding-agent"; +import { + context, + trace, + SpanStatusCode, + type Context, + type Span, +} from "@opentelemetry/api"; +import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-proto"; +import { Resource } from "@opentelemetry/resources"; +import type { + ReadableSpan, + SpanExporter, + SpanProcessor, +} from "@opentelemetry/sdk-trace-base"; +import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; +import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions"; + +/** + * Buffer a trace's spans and export them in ONE OTLP batch when the root span + * ends. Agenta computes cumulative (rolled-up) token/cost metrics per ingest + * batch, so a trace split across batches (which BatchSpanProcessor does on its + * timer for long runs) loses the root aggregation — the agent node would show + * only the last turn's tokens/cost instead of the whole-run total. + */ +class TraceBatchProcessor implements SpanProcessor { + private readonly buffers = new Map<string, ReadableSpan[]>(); // traceId -> ended spans still waiting for their root + constructor(private readonly exporter: SpanExporter) {} + onStart(): void {} + onEnd(span: ReadableSpan): void { + const traceId = span.spanContext().traceId; + const spans = this.buffers.get(traceId) ?? []; + spans.push(span); + if (span.parentSpanId) { + this.buffers.set(traceId, spans); + } else { + // Root span ended: all descendants ended earlier, so the trace is complete.
+ this.buffers.delete(traceId); + this.exporter.export(orderParentFirst(spans), (result) => { if (result.code !== 0) console.warn("[agenta-otel] span export failed:", result.error?.message ?? "unknown error"); }); // 0 = ExportResultCode.SUCCESS; surface failures instead of dropping them silently + } + } + forceFlush(): Promise<void> { // Export whatever is still buffered (spans whose root has not ended). + const leftovers = [...this.buffers.values()].flat(); + this.buffers.clear(); + if (leftovers.length === 0) return Promise.resolve(); + return new Promise<void>((resolve) => + this.exporter.export(orderParentFirst(leftovers), (result) => { if (result.code !== 0) console.warn("[agenta-otel] span export failed:", result.error?.message ?? "unknown error"); resolve(); }), + ); + } + shutdown(): Promise<void> { // Flush leftovers first, then shut the exporter down. + return this.forceFlush().then(() => this.exporter.shutdown()); + } +} + +/** + * Order spans parent-before-child (preorder DFS). Agenta stores timestamps at + * millisecond resolution and builds its roll-up tree by sorting on start_time, + * attaching a span only if its parent is already seen. Sibling events fired in + * the same millisecond (agent_start/turn_start) would otherwise tie, and a + * child sorted before its parent gets dropped from the cumulative tree. A + * parent-first request order makes the backend's stable sort keep parents ahead + * of children on ties. + */ +function orderParentFirst(spans: ReadableSpan[]): ReadableSpan[] { + const byId = new Map(spans.map((s) => [s.spanContext().spanId, s])); + const childrenOf = new Map<string, ReadableSpan[]>(); // parent spanId -> children, in input order + const roots: ReadableSpan[] = []; + for (const s of spans) { + const parentId = s.parentSpanId; + if (parentId && byId.has(parentId)) { + const list = childrenOf.get(parentId) ?? []; + list.push(s); + childrenOf.set(parentId, list); + } else { + roots.push(s); + } + } + const ordered: ReadableSpan[] = []; + const visit = (s: ReadableSpan) => { + ordered.push(s); + for (const child of childrenOf.get(s.spanContext().spanId) ?? []) visit(child); + }; + roots.forEach(visit); + // Any spans not reached (defensive) get appended so nothing is dropped. + if (ordered.length !== spans.length) { + const seen = new Set(ordered); + for (const s of spans) if (!seen.has(s)) ordered.push(s); + } + return ordered; +} + +/** Set by the runner before prompting so spans can carry session + model.
*/ +export const runConfig: { + sessionId?: string; + provider?: string; + requestModel?: string; + /** Filled by the extension on agent_start so the runner can print/fetch the trace. */ + traceId?: string; +} = {}; + +let provider: NodeTracerProvider | undefined; +let captureContent = true; + +function initTracing(): void { + if (provider) return; + + const host = (process.env.AGENTA_HOST || "https://cloud.agenta.ai").replace( + /\/+$/, + "", + ); + const apiKey = process.env.AGENTA_API_KEY || ""; + const url = `${host}/api/otlp/v1/traces`; + captureContent = process.env.PI_OTEL_CAPTURE_CONTENT !== "0"; + + if (!apiKey) { + console.warn( + "[agenta-otel] AGENTA_API_KEY is not set — the collector will reject spans with 401.", + ); + } + console.log(`[agenta-otel] exporting spans to ${url} (content capture: ${captureContent})`); + + const exporter = new OTLPTraceExporter({ + url, + headers: { Authorization: `ApiKey ${apiKey}` }, + timeoutMillis: 10_000, + }); + + provider = new NodeTracerProvider({ + resource: new Resource({ + [ATTR_SERVICE_NAME]: process.env.OTEL_SERVICE_NAME || "pi-agent", + }), + }); + provider.addSpanProcessor(new TraceBatchProcessor(exporter)); + provider.register(); +} + +/** Flush and shut down the exporter. Call from the runner after a run completes. */ +export async function shutdownTracing(): Promise { + if (!provider) return; + try { + await provider.forceFlush(); + await provider.shutdown(); + } finally { + provider = undefined; + } +} + +const tracer = () => trace.getTracer("agenta-pi-otel", "0.1.0"); + +// --- per-run span state (the POC runs one prompt at a time) --- +let agentSpan: Span | undefined; +let agentCtx: Context | undefined; +let pendingPrompt: string | undefined; +let currentTurn: { span: Span; ctx: Context; index?: number } | undefined; +let llmSpan: Span | undefined; +let lastContextMessages: any[] | undefined; +const toolSpans = new Map(); + +/** A string output → ag.data.outputs (any type is valid there). 
*/ +function setOutput(span: Span, value: unknown): void { + if (!captureContent || value == null) return; + const text = typeof value === "string" ? value : JSON.stringify(value); + if (text.length > 0) span.setAttribute("output.value", text); +} + +/** + * ag.data.inputs must be a dict, so emit input.value as a JSON object string. + * A non-object (raw string) would be relocated to ag.unsupported by Agenta. + */ +function setInputs(span: Span, obj: Record): void { + if (!captureContent) return; + span.setAttribute("input.value", JSON.stringify(obj)); + span.setAttribute("input.mime_type", "application/json"); +} + +function oiRole(role: string): string { + return role === "toolResult" ? "tool" : role; // user | assistant | system | tool +} + +function messageText(msg: any): string { + const c = msg?.content; + if (typeof c === "string") return c; + if (Array.isArray(c)) { + return c + .filter((b: any) => b?.type === "text") + .map((b: any) => b.text) + .join(""); + } + return ""; +} + +/** + * Emit OpenInference structured messages so Agenta renders a proper message + * thread. `llm.input_messages.*` -> ag.data.inputs.prompt.*, + * `llm.output_messages.*` -> ag.data.outputs.completion.*. 
+ */ +function emitMessages(span: Span, prefix: string, messages: any[]): void { + if (!captureContent || !Array.isArray(messages)) return; + messages.forEach((m, i) => { + const base = `${prefix}.${i}.message`; + span.setAttribute(`${base}.role`, oiRole(m.role)); + const text = messageText(m); + if (text) span.setAttribute(`${base}.content`, text); + if (m.role === "toolResult" && m.toolCallId) + span.setAttribute(`${base}.tool_call_id`, m.toolCallId); + if (Array.isArray(m.content)) { + m.content + .filter((b: any) => b?.type === "toolCall") + .forEach((call: any, j: number) => { + const tc = `${base}.tool_calls.${j}.tool_call`; + if (call.id) span.setAttribute(`${tc}.id`, call.id); + span.setAttribute(`${tc}.function.name`, call.name); + span.setAttribute( + `${tc}.function.arguments`, + JSON.stringify(call.arguments ?? {}), + ); + }); + } + }); +} + +function toolResultText(result: any): string { + if (!result) return ""; + if (typeof result === "string") return result; + if (Array.isArray(result)) { + return result + .filter((c: any) => c?.type === "text") + .map((c: any) => c.text) + .join(""); + } + if (result.content) return toolResultText(result.content); + return JSON.stringify(result); +} + +function lastAssistantText(messages: any): string { + if (!Array.isArray(messages)) return ""; + for (let i = messages.length - 1; i >= 0; i--) { + if (messages[i]?.role === "assistant") return messageText(messages[i]); + } + return ""; +} + +/** Fill an LLM span from a finished assistant message (model, tokens, finish, output). */ +function applyAssistant(span: Span, msg: any): void { + if (msg.provider) span.setAttribute("gen_ai.system", msg.provider); + if (msg.model) span.setAttribute("gen_ai.request.model", msg.model); + if (msg.responseModel || msg.model) + span.setAttribute("gen_ai.response.model", msg.responseModel ?? 
msg.model); + if (msg.responseId) span.setAttribute("gen_ai.response.id", msg.responseId); + if (msg.stopReason) + span.setAttribute("gen_ai.response.finish_reasons", [String(msg.stopReason)]); + + const u = msg.usage; + if (u) { + // Current GenAI names (mapped by Agenta's logfire adapter) ... + span.setAttribute("gen_ai.usage.input_tokens", u.input ?? 0); + span.setAttribute("gen_ai.usage.output_tokens", u.output ?? 0); + // ... and legacy names (mapped by Agenta's semconv.py). Emit both so token + // usage is never silently dropped regardless of which adapter wins. + span.setAttribute("gen_ai.usage.prompt_tokens", u.input ?? 0); + span.setAttribute("gen_ai.usage.completion_tokens", u.output ?? 0); + span.setAttribute( + "gen_ai.usage.total_tokens", + u.totalTokens ?? (u.input ?? 0) + (u.output ?? 0), + ); + if (u.cacheRead) + span.setAttribute("gen_ai.usage.cache_read_input_tokens", u.cacheRead); + if (u.cacheWrite) + span.setAttribute("gen_ai.usage.cache_creation_input_tokens", u.cacheWrite); + if (u.cost?.total != null) span.setAttribute("gen_ai.usage.cost", u.cost.total); + } + + emitMessages(span, "llm.output_messages", [msg]); + if (msg.stopReason === "error" || msg.errorMessage) { + span.setStatus({ code: SpanStatusCode.ERROR, message: msg.errorMessage }); + } +} + +export default function agentaOtel(pi: ExtensionAPI): void { + initTracing(); + const t = tracer(); + + pi.on("before_agent_start", async (event: any) => { + pendingPrompt = event?.prompt; + }); + + pi.on("agent_start", async () => { + agentSpan = t.startSpan("invoke_agent"); + agentSpan.setAttribute("openinference.span.kind", "AGENT"); + agentSpan.setAttribute("gen_ai.operation.name", "invoke_agent"); + agentSpan.setAttribute("gen_ai.agent.name", "pi"); + if (runConfig.sessionId) { + agentSpan.setAttribute("session.id", runConfig.sessionId); + agentSpan.setAttribute("gen_ai.conversation.id", runConfig.sessionId); + } + setInputs(agentSpan, { prompt: pendingPrompt ?? 
"" }); + runConfig.traceId = agentSpan.spanContext().traceId; + agentCtx = trace.setSpan(context.active(), agentSpan); + }); + + // The messages handed to the next LLM call — the chat span's input. + pi.on("context", async (event: any) => { + if (Array.isArray(event?.messages)) lastContextMessages = event.messages; + }); + + pi.on("turn_start", async (event: any) => { + const parent = agentCtx ?? context.active(); + const name = event?.turnIndex != null ? `turn ${event.turnIndex}` : "turn"; + const span = t.startSpan(name, undefined, parent); + span.setAttribute("openinference.span.kind", "CHAIN"); + if (event?.turnIndex != null) span.setAttribute("pi.turn.index", event.turnIndex); + currentTurn = { span, ctx: trace.setSpan(parent, span), index: event?.turnIndex }; + }); + + pi.on("before_provider_request", async (_event: any, ctx: any) => { + const parent = currentTurn?.ctx ?? agentCtx ?? context.active(); + const modelId = runConfig.requestModel ?? ctx?.model?.id; + const providerName = runConfig.provider ?? ctx?.model?.provider; + llmSpan = t.startSpan(modelId ? `chat ${modelId}` : "chat", undefined, parent); + llmSpan.setAttribute("openinference.span.kind", "LLM"); + llmSpan.setAttribute("gen_ai.operation.name", "chat"); + if (providerName) llmSpan.setAttribute("gen_ai.system", providerName); + if (modelId) llmSpan.setAttribute("gen_ai.request.model", modelId); + if (lastContextMessages) emitMessages(llmSpan, "llm.input_messages", lastContextMessages); + }); + + pi.on("message_end", async (event: any) => { + const msg = event?.message; + if (!msg || msg.role !== "assistant" || !llmSpan) return; + applyAssistant(llmSpan, msg); + llmSpan.end(); + llmSpan = undefined; + }); + + pi.on("tool_execution_start", async (event: any) => { + const parent = currentTurn?.ctx ?? agentCtx ?? context.active(); + const name = event?.toolName ? 
`execute_tool ${event.toolName}` : "execute_tool"; + const span = t.startSpan(name, undefined, parent); + span.setAttribute("openinference.span.kind", "TOOL"); + span.setAttribute("gen_ai.operation.name", "execute_tool"); + if (event?.toolName) span.setAttribute("gen_ai.tool.name", event.toolName); + if (event?.toolCallId) span.setAttribute("gen_ai.tool.call.id", event.toolCallId); + setInputs(span, (event?.args as Record) ?? {}); + if (event?.toolCallId) toolSpans.set(event.toolCallId, span); + }); + + pi.on("tool_execution_end", async (event: any) => { + const span = event?.toolCallId ? toolSpans.get(event.toolCallId) : undefined; + if (!span) return; + setOutput(span, toolResultText(event?.result)); + if (event?.isError) span.setStatus({ code: SpanStatusCode.ERROR }); + span.end(); + toolSpans.delete(event.toolCallId); + }); + + pi.on("turn_end", async (event: any) => { + // Safety net: if the LLM span is still open (no assistant message_end seen), + // close it from the turn's assistant message. + if (llmSpan && event?.message) { + applyAssistant(llmSpan, event.message); + llmSpan.end(); + llmSpan = undefined; + } + if (currentTurn) { + currentTurn.span.end(); + currentTurn = undefined; + } + }); + + pi.on("agent_end", async (event: any) => { + if (!agentSpan) return; + setOutput(agentSpan, lastAssistantText(event?.messages)); + agentSpan.end(); + agentSpan = undefined; + agentCtx = undefined; + lastContextMessages = undefined; + }); + + // CLI (`pi -e`) flush path. The SDK runner additionally calls shutdownTracing(). 
+ pi.on("session_shutdown", async () => { + try { + await provider?.forceFlush(); + } catch { + /* best effort */ + } + }); +} diff --git a/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/package.json b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/package.json new file mode 100644 index 0000000000..e3d23ae603 --- /dev/null +++ b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/package.json @@ -0,0 +1,25 @@ +{ + "name": "wp-1-pi-tracing-poc", + "version": "0.0.0", + "private": true, + "type": "module", + "description": "WP-1 POC: trace the Pi agent harness into Agenta via an OTel extension.", + "scripts": { + "start": "tsx run.ts", + "login": "pi" + }, + "dependencies": { + "@earendil-works/pi-coding-agent": "0.79.4", + "@opentelemetry/api": "1.9.0", + "@opentelemetry/exporter-trace-otlp-proto": "0.54.0", + "@opentelemetry/resources": "1.28.0", + "@opentelemetry/sdk-trace-base": "1.28.0", + "@opentelemetry/sdk-trace-node": "1.28.0", + "@opentelemetry/semantic-conventions": "1.28.0", + "dotenv": "17.2.3" + }, + "devDependencies": { + "tsx": "4.19.2", + "@types/node": "22.10.2" + } +} diff --git a/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/pnpm-lock.yaml b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/pnpm-lock.yaml new file mode 100644 index 0000000000..54c94564b7 --- /dev/null +++ b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/pnpm-lock.yaml @@ -0,0 +1,1842 @@ +lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +importers: + + .: + dependencies: + '@earendil-works/pi-coding-agent': + specifier: 0.79.4 + version: 0.79.4(ws@8.21.0)(zod@4.4.3) + '@opentelemetry/api': + specifier: 1.9.0 + version: 1.9.0 + '@opentelemetry/exporter-trace-otlp-proto': + specifier: 0.54.0 + version: 0.54.0(@opentelemetry/api@1.9.0) + '@opentelemetry/resources': + specifier: 1.28.0 + version: 1.28.0(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-trace-base': + specifier: 1.28.0 + 
version: 1.28.0(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-trace-node': + specifier: 1.28.0 + version: 1.28.0(@opentelemetry/api@1.9.0) + '@opentelemetry/semantic-conventions': + specifier: 1.28.0 + version: 1.28.0 + dotenv: + specifier: 17.2.3 + version: 17.2.3 + devDependencies: + '@types/node': + specifier: 22.10.2 + version: 22.10.2 + tsx: + specifier: 4.19.2 + version: 4.19.2 + +packages: + + '@anthropic-ai/sdk@0.91.1': + resolution: {integrity: sha512-LAmu761tSN9r66ixvmciswUj/ZC+1Q4iAfpedTfSVLeswRwnY3n2Nb6Tsk+cLPP28aLOPWeMgIuTuCcMC6W/iw==} + hasBin: true + peerDependencies: + zod: ^3.25.0 || ^4.0.0 + peerDependenciesMeta: + zod: + optional: true + + '@aws-crypto/crc32@5.2.0': + resolution: {integrity: sha512-nLbCWqQNgUiwwtFsen1AdzAtvuLRsQS8rYgMuxCrdKf9kOssamGLuPwyTY9wyYblNr9+1XM8v6zoDTPPSIeANg==} + engines: {node: '>=16.0.0'} + + '@aws-crypto/sha256-browser@5.2.0': + resolution: {integrity: sha512-AXfN/lGotSQwu6HNcEsIASo7kWXZ5HYWvfOmSNKDsEqC4OashTp8alTmaz+F7TC2L083SFv5RdB+qU3Vs1kZqw==} + + '@aws-crypto/sha256-js@5.2.0': + resolution: {integrity: sha512-FFQQyu7edu4ufvIZ+OadFpHHOt+eSTBaYaki44c+akjg7qZg9oOQeLlk77F6tSYqjDAFClrHJk9tMf0HdVyOvA==} + engines: {node: '>=16.0.0'} + + '@aws-crypto/supports-web-crypto@5.2.0': + resolution: {integrity: sha512-iAvUotm021kM33eCdNfwIN//F77/IADDSs58i+MDaOqFrVjZo9bAal0NK7HurRuWLLpF1iLX7gbWrjHjeo+YFg==} + + '@aws-crypto/util@5.2.0': + resolution: {integrity: sha512-4RkU9EsI6ZpBve5fseQlGNUWKMa1RLPQ1dnjnQoe07ldfIzcsGb5hC5W0Dm7u423KWzawlrpbjXBrXCEv9zazQ==} + + '@aws-sdk/client-bedrock-runtime@3.1048.0': + resolution: {integrity: sha512-u+NT61JZEkRFtpL0CAw1N1dwxnaLgwVXQl/zjJxTGgLyS/jTIdg2SdoEoCTHxgDyCnqa1HEi9QOoE9/pYRNpOQ==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/core@3.974.20': + resolution: {integrity: sha512-7sDi2B2N3mc3nf1nz6FyEx/FCrJ1N1QnBmraHHQNabFaeAh2IaOOLml48/rHOD1bICHgTRkbBgNTvUzEr5Z35g==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/credential-provider-env@3.972.46': + resolution: {integrity: 
sha512-+GPXVS2srMOlH74S+SmC1gVuP2TvUZ0siuC0onKO93q+udP+M72dmY8wJfVQ5CX9z/9X5A1HHwz5yRIGBtskvQ==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/credential-provider-http@3.972.48': + resolution: {integrity: sha512-fA5loSdlocacRxyUXtpoHSMuk5rsIKRDzQYVMnMxjcmFeZshaJlJ8lymy/hYKji6sne/UmNGj5pxuEs6kq/Qcg==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/credential-provider-ini@3.972.53': + resolution: {integrity: sha512-ZfdhIOR41q8TcWEnUac+gCOb+O2LBWdHLmjedXpXz4IEFW2ppNuFcm6p0sMTavpM+zD5TYfpH5Gp7guRyqSgsQ==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/credential-provider-login@3.972.52': + resolution: {integrity: sha512-9hu2oR0qH7Fst5Tzdx+UWxm+w5zCXtErTLtOOW5hwwQc170CLwOeniRxyFY6s9mHfGEfC5zFukNBdKBwJR8mhQ==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/credential-provider-node@3.972.55': + resolution: {integrity: sha512-zMGLa/dhESVqmCD7mmIFFKSwSFrJGScvCXcjvBZEVOOMauFS5JRQvLTMukFpMEFWiV6dTAlsen2ATDBulLPtbg==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/credential-provider-process@3.972.46': + resolution: {integrity: sha512-VUoNFBIjWrUN8NbFiQiuxQEgFjvziAlBRPK+ddh27aj65gk0BYu6bLZnrdrNZwpW6vAihtSUtEMQ1PUJ32QRPA==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/credential-provider-sso@3.972.52': + resolution: {integrity: sha512-nb2/n4o/HQf+FVpVbZe9vCTFngmuDoIsltMgLAtjixaKzvzhB4J8WSDFyWgnErgLHk55ctWH+I4PU+LIHhyffg==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/credential-provider-web-identity@3.972.52': + resolution: {integrity: sha512-lKj6aRSGbqLmpYmM24bY7a1Xmfcq2vkE3hv8CSPYfc1yCu0BPu/XEJ1L4Fm61MsU6ULLNSG8UGsffNoFUBjESA==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/eventstream-handler-node@3.972.21': + resolution: {integrity: sha512-mVC0hOmwGJmNFezZ+wM8Sqfap/LjsMavEf2Evl0YWrLAcrdZOEdjnY8nRvgakVViWJSGm2eJxLuPVHGdeV06kA==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/middleware-eventstream@3.972.17': + resolution: {integrity: sha512-tdbnXbw73ww62ABWP0G0Z/euvFowEEvAoi/zG4NaZo7HJFpfGho/Z65HyVzkJLT1cMsUregr4pTyxljlarT0wA==} + engines: {node: '>=20.0.0'} + + 
'@aws-sdk/middleware-websocket@3.972.28': + resolution: {integrity: sha512-SCW06Zjugn86pq7+dxGnFcyWJuEWHT753HTU/Vj/OzVxP+NoShwdAr4ynxAcvWL883OgRVbSqW3ohnjIxwXjjw==} + engines: {node: '>= 14.0.0'} + + '@aws-sdk/nested-clients@3.997.20': + resolution: {integrity: sha512-IYJuLpXp2DEILVQpQOy0PMpkftv0AHEOCn52o0atyOaumA0CdWQ3klPyXdViGYLbNpESsVFMVybvHUeZAuiGxA==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/signature-v4-multi-region@3.996.34': + resolution: {integrity: sha512-mx1L5qlumSOt/nKM3BFaHE2HVkWwz0i4Bw0pyYO42FfX/FeLlo8YI6csC0gSPprEk6fTIqI+CZN9RwUwKd5krQ==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/token-providers@3.1048.0': + resolution: {integrity: sha512-k0y/GcuesuSfWyUM0WamrGyeZmltRYaPbHO82UDA6mZ/doB+FOHKutikPAtSXMn/hDz970cF+iRuuiYO9VEbAA==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/token-providers@3.1066.0': + resolution: {integrity: sha512-UqEUJq7dqa44hneLDUcX7UJy95cg8YqEWyakRpvIPnrNS3Mq+UlQHgCDGu5pvwAPtlIW4qcYbvW6reG6++FyvA==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/types@3.973.12': + resolution: {integrity: sha512-43ajd1NF0RMgX5k0hxCNUyEdrtFUsb2aHT2QvpktSC/2Eyb2Jr/JPVqdp0XIoaHWikZJq5tNWSLO6kB5q2eMCA==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/util-locate-window@3.965.7': + resolution: {integrity: sha512-M0D6oIpohdNHjc7udzTHEQyot0+0iuA36jc2I9Hps+f/GtKi2HO/pyijQnCnNcwZqLB5+rtn81z3eZK/GyjAmA==} + engines: {node: '>=20.0.0'} + + '@aws-sdk/xml-builder@3.972.29': + resolution: {integrity: sha512-fk0niuGFxfi8yIJuMVM4mhwObkiQSuwZFj3tAPrLVx64Pk3BkrEIpqjzHKY4hKoEBUD6Jg/S74Zj9jy+5F3DnQ==} + engines: {node: '>=20.0.0'} + + '@aws/lambda-invoke-store@0.2.4': + resolution: {integrity: sha512-iY8yvjE0y651BixKNPgmv1WrQc+GZ142sb0z4gYnChDDY2YqI4P/jsSopBWrKfAt7LOJAkOXt7rC/hms+WclQQ==} + engines: {node: '>=18.0.0'} + + '@babel/runtime@7.29.7': + resolution: {integrity: sha512-Nq8OhGWiZIZGV6hLHoyAKLLcJihP/xFeBMGJoUrxTX2psI8dCifzLhZISFb+VWS3wFMRDmCGw5R+dOySCqPLhw==} + engines: {node: '>=6.9.0'} + + '@earendil-works/pi-agent-core@0.79.4': + resolution: {integrity: 
sha512-xkaZ3yK2XbP9HYdHrrdj/6HqZPM0o/mwbjMSU4RTJyR3HjDG0ZrPz76Hg6s0W+G4u6PpJr1mGx/srCG+3eQA8A==} + engines: {node: '>=22.19.0'} + + '@earendil-works/pi-ai@0.79.4': + resolution: {integrity: sha512-Z1j+YP+6ZyPBKDUoc5m0GO/o1hPK17fWeErtDgegCTpm2dcKzuFvL/7GTqHeJkVkfpeXRwO37xOfgozQbK6EUw==} + engines: {node: '>=22.19.0'} + hasBin: true + + '@earendil-works/pi-coding-agent@0.79.4': + resolution: {integrity: sha512-PthzVzM5m4XH/hrU+2fVjuwuH5M4eMFWbd0NCRScH14XKpwlPc8/Fh6JDz0jQb5kTBT9oQT183YLTHVVulFL9A==} + engines: {node: '>=22.19.0'} + hasBin: true + + '@earendil-works/pi-tui@0.79.4': + resolution: {integrity: sha512-/ZhfFiHSBMH7AbDrBQIN+UWlJnl9tSEpLYICRGGMzmNfyCqX+30NYacIhyOEaD8R5rS6wJZysAOPU0yNwigbXw==} + engines: {node: '>=22.19.0'} + + '@esbuild/aix-ppc64@0.23.1': + resolution: {integrity: sha512-6VhYk1diRqrhBAqpJEdjASR/+WVRtfjpqKuNw11cLiaWpAT/Uu+nokB+UJnevzy/P9C/ty6AOe0dwueMrGh/iQ==} + engines: {node: '>=18'} + cpu: [ppc64] + os: [aix] + + '@esbuild/android-arm64@0.23.1': + resolution: {integrity: sha512-xw50ipykXcLstLeWH7WRdQuysJqejuAGPd30vd1i5zSyKK3WE+ijzHmLKxdiCMtH1pHz78rOg0BKSYOSB/2Khw==} + engines: {node: '>=18'} + cpu: [arm64] + os: [android] + + '@esbuild/android-arm@0.23.1': + resolution: {integrity: sha512-uz6/tEy2IFm9RYOyvKl88zdzZfwEfKZmnX9Cj1BHjeSGNuGLuMD1kR8y5bteYmwqKm1tj8m4cb/aKEorr6fHWQ==} + engines: {node: '>=18'} + cpu: [arm] + os: [android] + + '@esbuild/android-x64@0.23.1': + resolution: {integrity: sha512-nlN9B69St9BwUoB+jkyU090bru8L0NA3yFvAd7k8dNsVH8bi9a8cUAUSEcEEgTp2z3dbEDGJGfP6VUnkQnlReg==} + engines: {node: '>=18'} + cpu: [x64] + os: [android] + + '@esbuild/darwin-arm64@0.23.1': + resolution: {integrity: sha512-YsS2e3Wtgnw7Wq53XXBLcV6JhRsEq8hkfg91ESVadIrzr9wO6jJDMZnCQbHm1Guc5t/CdDiFSSfWP58FNuvT3Q==} + engines: {node: '>=18'} + cpu: [arm64] + os: [darwin] + + '@esbuild/darwin-x64@0.23.1': + resolution: {integrity: sha512-aClqdgTDVPSEGgoCS8QDG37Gu8yc9lTHNAQlsztQ6ENetKEO//b8y31MMu2ZaPbn4kVsIABzVLXYLhCGekGDqw==} + engines: {node: '>=18'} + cpu: 
[x64] + os: [darwin] + + '@esbuild/freebsd-arm64@0.23.1': + resolution: {integrity: sha512-h1k6yS8/pN/NHlMl5+v4XPfikhJulk4G+tKGFIOwURBSFzE8bixw1ebjluLOjfwtLqY0kewfjLSrO6tN2MgIhA==} + engines: {node: '>=18'} + cpu: [arm64] + os: [freebsd] + + '@esbuild/freebsd-x64@0.23.1': + resolution: {integrity: sha512-lK1eJeyk1ZX8UklqFd/3A60UuZ/6UVfGT2LuGo3Wp4/z7eRTRYY+0xOu2kpClP+vMTi9wKOfXi2vjUpO1Ro76g==} + engines: {node: '>=18'} + cpu: [x64] + os: [freebsd] + + '@esbuild/linux-arm64@0.23.1': + resolution: {integrity: sha512-/93bf2yxencYDnItMYV/v116zff6UyTjo4EtEQjUBeGiVpMmffDNUyD9UN2zV+V3LRV3/on4xdZ26NKzn6754g==} + engines: {node: '>=18'} + cpu: [arm64] + os: [linux] + + '@esbuild/linux-arm@0.23.1': + resolution: {integrity: sha512-CXXkzgn+dXAPs3WBwE+Kvnrf4WECwBdfjfeYHpMeVxWE0EceB6vhWGShs6wi0IYEqMSIzdOF1XjQ/Mkm5d7ZdQ==} + engines: {node: '>=18'} + cpu: [arm] + os: [linux] + + '@esbuild/linux-ia32@0.23.1': + resolution: {integrity: sha512-VTN4EuOHwXEkXzX5nTvVY4s7E/Krz7COC8xkftbbKRYAl96vPiUssGkeMELQMOnLOJ8k3BY1+ZY52tttZnHcXQ==} + engines: {node: '>=18'} + cpu: [ia32] + os: [linux] + + '@esbuild/linux-loong64@0.23.1': + resolution: {integrity: sha512-Vx09LzEoBa5zDnieH8LSMRToj7ir/Jeq0Gu6qJ/1GcBq9GkfoEAoXvLiW1U9J1qE/Y/Oyaq33w5p2ZWrNNHNEw==} + engines: {node: '>=18'} + cpu: [loong64] + os: [linux] + + '@esbuild/linux-mips64el@0.23.1': + resolution: {integrity: sha512-nrFzzMQ7W4WRLNUOU5dlWAqa6yVeI0P78WKGUo7lg2HShq/yx+UYkeNSE0SSfSure0SqgnsxPvmAUu/vu0E+3Q==} + engines: {node: '>=18'} + cpu: [mips64el] + os: [linux] + + '@esbuild/linux-ppc64@0.23.1': + resolution: {integrity: sha512-dKN8fgVqd0vUIjxuJI6P/9SSSe/mB9rvA98CSH2sJnlZ/OCZWO1DJvxj8jvKTfYUdGfcq2dDxoKaC6bHuTlgcw==} + engines: {node: '>=18'} + cpu: [ppc64] + os: [linux] + + '@esbuild/linux-riscv64@0.23.1': + resolution: {integrity: sha512-5AV4Pzp80fhHL83JM6LoA6pTQVWgB1HovMBsLQ9OZWLDqVY8MVobBXNSmAJi//Csh6tcY7e7Lny2Hg1tElMjIA==} + engines: {node: '>=18'} + cpu: [riscv64] + os: [linux] + + '@esbuild/linux-s390x@0.23.1': + resolution: 
{integrity: sha512-9ygs73tuFCe6f6m/Tb+9LtYxWR4c9yg7zjt2cYkjDbDpV/xVn+68cQxMXCjUpYwEkze2RcU/rMnfIXNRFmSoDw==} + engines: {node: '>=18'} + cpu: [s390x] + os: [linux] + + '@esbuild/linux-x64@0.23.1': + resolution: {integrity: sha512-EV6+ovTsEXCPAp58g2dD68LxoP/wK5pRvgy0J/HxPGB009omFPv3Yet0HiaqvrIrgPTBuC6wCH1LTOY91EO5hQ==} + engines: {node: '>=18'} + cpu: [x64] + os: [linux] + + '@esbuild/netbsd-x64@0.23.1': + resolution: {integrity: sha512-aevEkCNu7KlPRpYLjwmdcuNz6bDFiE7Z8XC4CPqExjTvrHugh28QzUXVOZtiYghciKUacNktqxdpymplil1beA==} + engines: {node: '>=18'} + cpu: [x64] + os: [netbsd] + + '@esbuild/openbsd-arm64@0.23.1': + resolution: {integrity: sha512-3x37szhLexNA4bXhLrCC/LImN/YtWis6WXr1VESlfVtVeoFJBRINPJ3f0a/6LV8zpikqoUg4hyXw0sFBt5Cr+Q==} + engines: {node: '>=18'} + cpu: [arm64] + os: [openbsd] + + '@esbuild/openbsd-x64@0.23.1': + resolution: {integrity: sha512-aY2gMmKmPhxfU+0EdnN+XNtGbjfQgwZj43k8G3fyrDM/UdZww6xrWxmDkuz2eCZchqVeABjV5BpildOrUbBTqA==} + engines: {node: '>=18'} + cpu: [x64] + os: [openbsd] + + '@esbuild/sunos-x64@0.23.1': + resolution: {integrity: sha512-RBRT2gqEl0IKQABT4XTj78tpk9v7ehp+mazn2HbUeZl1YMdaGAQqhapjGTCe7uw7y0frDi4gS0uHzhvpFuI1sA==} + engines: {node: '>=18'} + cpu: [x64] + os: [sunos] + + '@esbuild/win32-arm64@0.23.1': + resolution: {integrity: sha512-4O+gPR5rEBe2FpKOVyiJ7wNDPA8nGzDuJ6gN4okSA1gEOYZ67N8JPk58tkWtdtPeLz7lBnY6I5L3jdsr3S+A6A==} + engines: {node: '>=18'} + cpu: [arm64] + os: [win32] + + '@esbuild/win32-ia32@0.23.1': + resolution: {integrity: sha512-BcaL0Vn6QwCwre3Y717nVHZbAa4UBEigzFm6VdsVdT/MbZ38xoj1X9HPkZhbmaBGUD1W8vxAfffbDe8bA6AKnQ==} + engines: {node: '>=18'} + cpu: [ia32] + os: [win32] + + '@esbuild/win32-x64@0.23.1': + resolution: {integrity: sha512-BHpFFeslkWrXWyUPnbKm+xYYVYruCinGcftSBaa8zoF9hZO4BcSCFUvHVTtzpIY6YzUnYtuEhZ+C9iEXjxnasg==} + engines: {node: '>=18'} + cpu: [x64] + os: [win32] + + '@google/genai@1.52.0': + resolution: {integrity: 
sha512-gwSvbpiN/17O9TbsqSsE/OzZcpv5Fo4RQjdngGgogtuB9RsyJ8ZHhX5KjHj1bp5N9snN2eK8LDGXSaWW2hof8Q==} + engines: {node: '>=20.0.0'} + peerDependencies: + '@modelcontextprotocol/sdk': ^1.25.2 + peerDependenciesMeta: + '@modelcontextprotocol/sdk': + optional: true + + '@mariozechner/clipboard-darwin-arm64@0.3.9': + resolution: {integrity: sha512-BfgV7vCEWZwJwZJw03r6bP5+tf0iI/ANuQYCxi9RNn7FrWB3yzGuMKCrNLRl6V761vXRdL8+OqZ0wd4TqlsNOQ==} + engines: {node: '>= 10'} + cpu: [arm64] + os: [darwin] + + '@mariozechner/clipboard-darwin-universal@0.3.9': + resolution: {integrity: sha512-BGGR4iA9Z2shAjI65eI5xtyb3LYNlDW9X3gxKxDbqtbnREohsrqznov6zpKoIrsRWpzlYVEdKphS7ksJ0/ndSQ==} + engines: {node: '>= 10'} + os: [darwin] + + '@mariozechner/clipboard-darwin-x64@0.3.9': + resolution: {integrity: sha512-4kURmCbS6nt8uYhtmWpUcJWyPHfmAr5dTpXD1nO3pIfa+TSQ9DbrGOYCKH+aEFW47XhQ4Vp8ZTszie+wfFvDKg==} + engines: {node: '>= 10'} + cpu: [x64] + os: [darwin] + + '@mariozechner/clipboard-linux-arm64-gnu@0.3.9': + resolution: {integrity: sha512-g59OkUGP2DDfCOIKypHeYgv2M55u/cKvXa5dSxFbEJ34XvIQMdcVmpKCkGUro3ZgefXiGVdwguvTMQGpHWzIXw==} + engines: {node: '>= 10'} + cpu: [arm64] + os: [linux] + libc: [glibc] + + '@mariozechner/clipboard-linux-arm64-musl@0.3.9': + resolution: {integrity: sha512-AGuJdgKsmJdm4Pych7kv3sqe591ERRaAHW3xjLooiFzn8J+PxUyof++7YZrB5Y5tpnTO+K18Og3taj2NpluCRQ==} + engines: {node: '>= 10'} + cpu: [arm64] + os: [linux] + libc: [musl] + + '@mariozechner/clipboard-linux-riscv64-gnu@0.3.9': + resolution: {integrity: sha512-DXBEAiuMpk7dhS1a9NzNxVAFi1vaKoPu7rQNgY8LIDLGrK3lnIp3nT10DUum+PKVJoJppIP+NAA8IZe4DMNDPw==} + engines: {node: '>= 10'} + cpu: [riscv64] + os: [linux] + libc: [glibc] + + '@mariozechner/clipboard-linux-x64-gnu@0.3.9': + resolution: {integrity: sha512-WORrMLd6EpElEME7JRKfSaY34nW1P5LbdgK5YNCS1ncG2LqmITsSMEJ8nh2mpvxb3TxqbOOKgY7k9eMJYlW9Mw==} + engines: {node: '>= 10'} + cpu: [x64] + os: [linux] + libc: [glibc] + + '@mariozechner/clipboard-linux-x64-musl@0.3.9': + resolution: 
{integrity: sha512-/DHn+1DrfL6oRaPPWXaOKvonFFrni666fxd+zFqiQEfvBH0tsHVWjq9iqBk0oDp0qaPA72lIMy5BptxISBEhZQ==} + engines: {node: '>= 10'} + cpu: [x64] + os: [linux] + libc: [musl] + + '@mariozechner/clipboard-win32-arm64-msvc@0.3.9': + resolution: {integrity: sha512-O5FHD3ErkMwMhNzAfu3ggy0ug4z7btZuoQgwwxlzPrwV2bxlD6WDpqBY4NCgICAgZdDKdp+loUEKVAVt8aYnhQ==} + engines: {node: '>= 10'} + cpu: [arm64] + os: [win32] + + '@mariozechner/clipboard-win32-x64-msvc@0.3.9': + resolution: {integrity: sha512-ihQC3EufqEY81vhXBgVBtK4prL+wc62zJsSvxrgz7K1hsdt6OObz6v9p3Rn1OG3GJksTTKMJF0u/guMISHPhSA==} + engines: {node: '>= 10'} + cpu: [x64] + os: [win32] + + '@mariozechner/clipboard@0.3.9': + resolution: {integrity: sha512-ABnA53mdfkGZwOFUdZNv2S0CWGO/EIuPj8Vv9xmBFmSYg/qFc7ihO6q5FcQjvoE67kZpWkEc4AhD6B/os04yuA==} + engines: {node: '>= 10'} + + '@mistralai/mistralai@2.2.1': + resolution: {integrity: sha512-uKU8CZmL2RzYKmplsU01hii4p3pe4HqJefpWNRWXm1Tcm0Sm4xXfwSLIy4k7ZCPlbETCGcp69E7hZs+WOJ5itQ==} + + '@nodable/entities@2.2.0': + resolution: {integrity: sha512-9uGyhaQavEUMC8AIddIjau4NsnsXhou+j5sBAGojCM1oxmQpVKTWR/9JxABD6UAv12vpIms55fPZKFQEhG6uBg==} + + '@opentelemetry/api-logs@0.54.0': + resolution: {integrity: sha512-9HhEh5GqFrassUndqJsyW7a0PzfyWr2eV2xwzHLIS+wX3125+9HE9FMRAKmJRwxZhgZGwH3HNQQjoMGZqmOeVA==} + engines: {node: '>=14'} + + '@opentelemetry/api@1.9.0': + resolution: {integrity: sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==} + engines: {node: '>=8.0.0'} + + '@opentelemetry/context-async-hooks@1.28.0': + resolution: {integrity: sha512-igcl4Ve+F1N2063PJUkesk/GkYyuGIWinYkSyAFTnIj3gzrOgvOA4k747XNdL47HRRL1w/qh7UW8NDuxOLvKFA==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/core@1.27.0': + resolution: {integrity: sha512-yQPKnK5e+76XuiqUH/gKyS8wv/7qITd5ln56QkBTf3uggr0VkXOXfcaAuG330UfdYu83wsyoBwqwxigpIG+Jkg==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': 
'>=1.0.0 <1.10.0' + + '@opentelemetry/core@1.28.0': + resolution: {integrity: sha512-ZLwRMV+fNDpVmF2WYUdBHlq0eOWtEaUJSusrzjGnBt7iSRvfjFE3RXYUZJrqou/wIDWV0DwQ5KIfYe9WXg9Xqw==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/exporter-trace-otlp-proto@0.54.0': + resolution: {integrity: sha512-cpDQj5wl7G8pLu3lW94SnMpn0C85A9Ehe7+JBow2IL5DGPWXTkynFngMtCC3PpQzQgzlyOVe0MVZfoBB3M5ECA==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': ^1.3.0 + + '@opentelemetry/otlp-exporter-base@0.54.0': + resolution: {integrity: sha512-g+H7+QleVF/9lz4zhaR9Dt4VwApjqG5WWupy5CTMpWJfHB/nLxBbX73GBZDgdiNfh08nO3rNa6AS7fK8OhgF5g==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': ^1.3.0 + + '@opentelemetry/otlp-transformer@0.54.0': + resolution: {integrity: sha512-jRexIASQQzdK4AjfNIBfn94itAq4Q8EXR9d3b/OVbhd3kKQKvMr7GkxYDjbeTbY7hHCOLcLfJ3dpYQYGOe8qOQ==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': ^1.3.0 + + '@opentelemetry/propagator-b3@1.28.0': + resolution: {integrity: sha512-Q7HVDIMwhN5RxL4bECMT4BdbyYSAKkC6U/RGn4NpO/cbqP6ZRg+BS7fPo/pGZi2w8AHfpIGQFXQmE8d2PC5xxQ==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/propagator-jaeger@1.28.0': + resolution: {integrity: sha512-wKJ94+s8467CnIRgoSRh0yXm/te0QMOwTq9J01PfG/RzYZvlvN8aRisN2oZ9SznB45dDGnMj3BhUlchSA9cEKA==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/resources@1.27.0': + resolution: {integrity: sha512-jOwt2VJ/lUD5BLc+PMNymDrUCpm5PKi1E9oSVYAvz01U/VdndGmrtV3DU1pG4AwlYhJRHbHfOUIlpBeXCPw6QQ==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/resources@1.28.0': + resolution: {integrity: sha512-cIyXSVJjGeTICENN40YSvLDAq4Y2502hGK3iN7tfdynQLKWb3XWZQEkPc+eSx47kiy11YeFAlYkEfXwR1w8kfw==} + engines: {node: '>=14'} + peerDependencies: + 
'@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/sdk-logs@0.54.0': + resolution: {integrity: sha512-HeWvOPiWhEw6lWvg+lCIi1WhJnIPbI4/OFZgHq9tKfpwF3LX6/kk3+GR8sGUGAEZfbjPElkkngzvd2s03zbD7Q==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.4.0 <1.10.0' + + '@opentelemetry/sdk-metrics@1.27.0': + resolution: {integrity: sha512-JzWgzlutoXCydhHWIbLg+r76m+m3ncqvkCcsswXAQ4gqKS+LOHKhq+t6fx1zNytvLuaOUBur7EvWxECc4jPQKg==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.3.0 <1.10.0' + + '@opentelemetry/sdk-trace-base@1.27.0': + resolution: {integrity: sha512-btz6XTQzwsyJjombpeqCX6LhiMQYpzt2pIYNPnw0IPO/3AhT6yjnf8Mnv3ZC2A4eRYOjqrg+bfaXg9XHDRJDWQ==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/sdk-trace-base@1.28.0': + resolution: {integrity: sha512-ceUVWuCpIao7Y5xE02Xs3nQi0tOGmMea17ecBdwtCvdo9ekmO+ijc9RFDgfifMl7XCBf41zne/1POM3LqSTZDA==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/sdk-trace-node@1.28.0': + resolution: {integrity: sha512-N0sYfYXvHpP0FNIyc+UfhLnLSTOuZLytV0qQVrDWIlABeD/DWJIGttS7nYeR14gQLXch0M1DW8zm3VeN6Opwtg==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/semantic-conventions@1.27.0': + resolution: {integrity: sha512-sAay1RrB+ONOem0OZanAR1ZI/k7yDpnOQSQmTMuGImUQb2y8EbSaCJ94FQluM74xoU03vlb2d2U90hZluL6nQg==} + engines: {node: '>=14'} + + '@opentelemetry/semantic-conventions@1.28.0': + resolution: {integrity: sha512-lp4qAiMTD4sNWW4DbKLBkfiMZ4jbAboJIGOQr5DvciMRI494OapieI9qiODpOt0XBr1LjIDy1xAGAnVs5supTA==} + engines: {node: '>=14'} + + '@protobufjs/aspromise@1.1.2': + resolution: {integrity: sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==} + + '@protobufjs/base64@1.1.2': + resolution: {integrity: 
sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==} + + '@protobufjs/codegen@2.0.5': + resolution: {integrity: sha512-zgXFLzW3Ap33e6d0Wlj4MGIm6Ce8O89n/apUaGNB/jx+hw+ruWEp7EwGUshdLKVRCxZW12fp9r40E1mQrf/34g==} + + '@protobufjs/eventemitter@1.1.1': + resolution: {integrity: sha512-vW1GmwMZNnL+gMRaovlh9yZX74kc+TTU3FObkkurpMaRtBfLP3ldjS9KQWlwZgraRE0+dheEEoAxdzcJQ8eXZg==} + + '@protobufjs/fetch@1.1.1': + resolution: {integrity: sha512-GpptLrs57adMSuHi3VNj0mAF8dwh36LMaYF6XyJ6JMWlVsc+t42tm1HSEDmOs3A8fC9yyeisgLhsTVQokOZ0zw==} + + '@protobufjs/float@1.0.2': + resolution: {integrity: sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==} + + '@protobufjs/path@1.1.2': + resolution: {integrity: sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==} + + '@protobufjs/pool@1.1.0': + resolution: {integrity: sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==} + + '@protobufjs/utf8@1.1.1': + resolution: {integrity: sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg==} + + '@silvia-odwyer/photon-node@0.3.4': + resolution: {integrity: sha512-bnly4BKB3KDTFxrUIcgCLbaeVVS8lrAkri1pEzskpmxu9MdfGQTy8b8EgcD83ywD3RPMsIulY8xJH5Awa+t9fA==} + + '@smithy/core@3.24.7': + resolution: {integrity: sha512-KoUi4M1f3BG6kzN1FnCwL7oyFptTbyBJKjR6yhSib+JHRdUmM1o+VwsFtJ66NZCkCzVfJMWRHJNo0R0jznp0Pg==} + engines: {node: '>=18.0.0'} + + '@smithy/credential-provider-imds@4.3.9': + resolution: {integrity: sha512-ZlfJ/4Fa3jYb+3eaohPfG9utX9HmdhFNcFtpoGAhUhdynAOmGXtmigbi7eEiONKM+ykHw8RwKuDEb85Lx7t7fA==} + engines: {node: '>=18.0.0'} + + '@smithy/fetch-http-handler@5.4.7': + resolution: {integrity: sha512-NslaM2ir0N2hisDmzXLstPaVINZheh8SokyOC++kzFPloZucL2R7Y7bS57mSzx/1Fc/fqmn7twjkeezTTrV0EA==} + engines: {node: '>=18.0.0'} + + '@smithy/is-array-buffer@2.2.0': + resolution: {integrity: 
sha512-GGP3O9QFD24uGeAXYUjwSTXARoqpZykHadOmA8G5vfJPK0/DC67qa//0qvqrJzL1xc8WQWX7/yc7fwudjPHPhA==} + engines: {node: '>=14.0.0'} + + '@smithy/node-http-handler@4.7.3': + resolution: {integrity: sha512-/jPhevcTFPMVl6KNjbaI47iOg1zxC7IsnX4PQDGVZKMFceOXtB8IEYaB7a9VvkP/3oC60WzTeKocvSI7vLT0vA==} + engines: {node: '>=18.0.0'} + + '@smithy/node-http-handler@4.7.8': + resolution: {integrity: sha512-f+DbsWUwSbtMu1a/j8Y93KiU1SRg9nyzfjereqn1BJ33QOTUXxdlYvVXMhAYl1vuR1Kmna5aIJe09KSIfyFNYw==} + engines: {node: '>=18.0.0'} + + '@smithy/signature-v4@5.4.7': + resolution: {integrity: sha512-LwQZazFayImv+IOm0S0enoLeUJwmAlhGC5O6YCcLWezyu08dF46GOxPOq35OpBIHkgd7OvNvBStIFwVNyrvoBw==} + engines: {node: '>=18.0.0'} + + '@smithy/types@4.14.4': + resolution: {integrity: sha512-B2S9+UGm1+/pHkcx3ZoLVX1a+pmSk8rqxRR+ZsNqZaJ5q9FWX9AFGQVM4qG5+OBeQUZVy99HY8HqW8gK/wgXzQ==} + engines: {node: '>=18.0.0'} + + '@smithy/util-buffer-from@2.2.0': + resolution: {integrity: sha512-IJdWBbTcMQ6DA0gdNhh/BwrLkDR+ADW5Kr1aZmd4k3DIF6ezMV4R2NIAmT08wQJ3yUK82thHWmC/TnK/wpMMIA==} + engines: {node: '>=14.0.0'} + + '@smithy/util-utf8@2.3.0': + resolution: {integrity: sha512-R8Rdn8Hy72KKcebgLiv8jQcQkXoLMOGGv5uI1/k0l+snqkOzQ1R0ChUBCxWMlBsFMekWjq0wRudIweFs7sKT5A==} + engines: {node: '>=14.0.0'} + + '@types/node@22.10.2': + resolution: {integrity: sha512-Xxr6BBRCAOQixvonOye19wnzyDiUtTeqldOOmj3CkeblonbccA12PFwlufvRdrpjXxqnmUaeiU5EOA+7s5diUQ==} + + '@types/retry@0.12.0': + resolution: {integrity: sha512-wWKOClTTiizcZhXnPY4wikVAwmdYHp8q6DmC+EJUzAMsycb7HB32Kh9RN4+0gExjmPmZSAQjgURXIGATPegAvA==} + + agent-base@7.1.4: + resolution: {integrity: sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==} + engines: {node: '>= 14'} + + anynum@1.0.0: + resolution: {integrity: sha512-xjR9/zBVnUOP6ztMIIgShjsxui80nQUQH+5xJnvrYLs+90bF25/KJqaAi8mk+B4RDtX1Nspi6fmp4YTEts8SfA==} + + balanced-match@4.0.4: + resolution: {integrity: 
sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA==} + engines: {node: 18 || 20 || >=22} + + base64-js@1.5.1: + resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==} + + bignumber.js@9.3.1: + resolution: {integrity: sha512-Ko0uX15oIUS7wJ3Rb30Fs6SkVbLmPBAKdlm7q9+ak9bbIeFf0MwuBsQV6z7+X768/cHsfg+WlysDWJcmthjsjQ==} + + bowser@2.14.1: + resolution: {integrity: sha512-tzPjzCxygAKWFOJP011oxFHs57HzIhOEracIgAePE4pqB3LikALKnSzUyU4MGs9/iCEUuHlAJTjTc5M+u7YEGg==} + + brace-expansion@5.0.6: + resolution: {integrity: sha512-kLpxurY4Z4r9sgMsyG0Z9uzsBlgiU/EFKhj/h91/8yHu0edo7XuixOIH3VcJ8kkxs6/jPzoI6U9Vj3WqbMQ94g==} + engines: {node: 18 || 20 || >=22} + + buffer-equal-constant-time@1.0.1: + resolution: {integrity: sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==} + + chalk@5.6.2: + resolution: {integrity: sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA==} + engines: {node: ^12.17.0 || ^14.13 || >=16.0.0} + + cross-spawn@7.0.6: + resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==} + engines: {node: '>= 8'} + + data-uri-to-buffer@4.0.1: + resolution: {integrity: sha512-0R9ikRb668HB7QDxT1vkpuUBtqc53YyAwMwGeUFKRojY/NWKvdZ+9UYtRfGmhqNbRkTSVpMbmyhXipFFv2cb/A==} + engines: {node: '>= 12'} + + debug@4.4.3: + resolution: {integrity: sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==} + engines: {node: '>=6.0'} + peerDependencies: + supports-color: '*' + peerDependenciesMeta: + supports-color: + optional: true + + diff@8.0.4: + resolution: {integrity: sha512-DPi0FmjiSU5EvQV0++GFDOJ9ASQUVFh5kD+OzOnYdi7n3Wpm9hWWGfB/O2blfHcMVTL5WkQXSnRiK9makhrcnw==} + engines: {node: '>=0.3.1'} + + dotenv@17.2.3: + resolution: {integrity: 
sha512-JVUnt+DUIzu87TABbhPmNfVdBDt18BLOWjMUFJMSi/Qqg7NTYtabbvSNJGOJ7afbRuv9D/lngizHtP7QyLQ+9w==} + engines: {node: '>=12'} + + ecdsa-sig-formatter@1.0.11: + resolution: {integrity: sha512-nagl3RYrbNv6kQkeJIpt6NJZy8twLB/2vtz6yN9Z4vRKHN4/QZJIEbqohALSgwKdnksuY3k5Addp5lg8sVoVcQ==} + + esbuild@0.23.1: + resolution: {integrity: sha512-VVNz/9Sa0bs5SELtn3f7qhJCDPCF5oMEl5cO9/SSinpE9hbPVvxbd572HH5AKiP7WD8INO53GgfDDhRjkylHEg==} + engines: {node: '>=18'} + hasBin: true + + extend@3.0.2: + resolution: {integrity: sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==} + + fast-xml-builder@1.2.0: + resolution: {integrity: sha512-00aAWieqff+ZJhsXA4g1g7M8k+7AYoMUUHF+/zFb5U6Uv/P0Vl4QZo84/IcufzYalLuEj9928bXN9PbbFzMF0Q==} + + fast-xml-parser@5.7.3: + resolution: {integrity: sha512-C0AaNuC+mscy6vrAQKAc/rMq+zAPHodfHGZu4sGVehvAQt/JLG1O5zEcYcXSY5zSqr4YVgxsB+pHXTq0i7eDlg==} + hasBin: true + + fetch-blob@3.2.0: + resolution: {integrity: sha512-7yAQpD2UMJzLi1Dqv7qFYnPbaPx7ZfFK6PiIxQ4PfkGPyNyl2Ugx+a/umUonmKqjhM4DnfbMvdX6otXq83soQQ==} + engines: {node: ^12.20 || >= 14.13} + + formdata-polyfill@4.0.10: + resolution: {integrity: sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g==} + engines: {node: '>=12.20.0'} + + fsevents@2.3.3: + resolution: {integrity: sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==} + engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0} + os: [darwin] + + gaxios@7.1.5: + resolution: {integrity: sha512-5FZy72Rh8LhtjmvDrKkI+lVhrsQrVKVsItxMoDm5mNQE+xR0WVIIs+jzPSJgBvKVsLi24fZhXJIsNI0bihDzFg==} + engines: {node: '>=18'} + + gcp-metadata@8.1.2: + resolution: {integrity: sha512-zV/5HKTfCeKWnxG0Dmrw51hEWFGfcF2xiXqcA3+J90WDuP0SvoiSO5ORvcBsifmx/FoIjgQN3oNOGaQ5PhLFkg==} + engines: {node: '>=18'} + + get-east-asian-width@1.6.0: + resolution: {integrity: sha512-QRbvDIbx6YklUe6RxeTeleMR0yv3cYH6PsPZHcnVn7xv7zO1BHN8r0XETu8n6Ye3Q+ahtSarc3WgtNWmehIBfA==} + engines: 
{node: '>=18'} + + get-tsconfig@4.14.0: + resolution: {integrity: sha512-yTb+8DXzDREzgvYmh6s9vHsSVCHeC0G3PI5bEXNBHtmshPnO+S5O7qgLEOn0I5QvMy6kpZN8K1NKGyilLb93wA==} + + glob@13.0.6: + resolution: {integrity: sha512-Wjlyrolmm8uDpm/ogGyXZXb1Z+Ca2B8NbJwqBVg0axK9GbBeoS7yGV6vjXnYdGm6X53iehEuxxbyiKp8QmN4Vw==} + engines: {node: 18 || 20 || >=22} + + google-auth-library@10.7.0: + resolution: {integrity: sha512-QpTAbNJ36TliZLx3TTtahR8HG0hN9RllL1e3FymOvQSIKK8JmgV58H924ub2wa2DsS3ANjjP1Aw1N+Ramc8hqQ==} + engines: {node: '>=18'} + + google-logging-utils@1.1.3: + resolution: {integrity: sha512-eAmLkjDjAFCVXg7A1unxHsLf961m6y17QFqXqAXGj/gVkKFrEICfStRfwUlGNfeCEjNRa32JEWOUTlYXPyyKvA==} + engines: {node: '>=14'} + + graceful-fs@4.2.11: + resolution: {integrity: sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==} + + highlight.js@10.7.3: + resolution: {integrity: sha512-tzcUFauisWKNHaRkN4Wjl/ZA07gENAjFl3J/c480dprkGTg5EQstgaNFqBfUqCq54kZRIEcreTsAgF/m2quD7A==} + + hosted-git-info@9.0.3: + resolution: {integrity: sha512-Hc+ghLoSt6QaYZUv0WBiIvmMDZuZZ7oaDvdH8MbfOO4lOsxdXLEvuC6ePoGs9H1X9oCLyq6+NVN0MKqD+ydxyg==} + engines: {node: ^20.17.0 || >=22.9.0} + + http-proxy-agent@7.0.2: + resolution: {integrity: sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==} + engines: {node: '>= 14'} + + https-proxy-agent@7.0.6: + resolution: {integrity: sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==} + engines: {node: '>= 14'} + + ignore@7.0.5: + resolution: {integrity: sha512-Hs59xBNfUIunMFgWAbGX5cq6893IbWg4KnrjbYwX3tx0ztorVgTDA6B2sxf8ejHJ4wz8BqGUMYlnzNBer5NvGg==} + engines: {node: '>= 4'} + + isexe@2.0.0: + resolution: {integrity: sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==} + + jiti@2.7.0: + resolution: {integrity: sha512-AC/7JofJvZGrrneWNaEnJeOLUx+JlGt7tNa0wZiRPT4MY1wmfKjt2+6O2p2uz2+skll8OZZmJMNqeke7kKbNgQ==} + hasBin: 
true + + json-bigint@1.0.0: + resolution: {integrity: sha512-SiPv/8VpZuWbvLSMtTDU8hEfrZWg/mH/nV/b4o0CYbSxu1UIQPLdwKOCIyLQX+VIPO5vrLX3i8qtqFyhdPSUSQ==} + + json-schema-to-ts@3.1.1: + resolution: {integrity: sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g==} + engines: {node: '>=16'} + + jwa@2.0.1: + resolution: {integrity: sha512-hRF04fqJIP8Abbkq5NKGN0Bbr3JxlQ+qhZufXVr0DvujKy93ZCbXZMHDL4EOtodSbCWxOqR8MS1tXA5hwqCXDg==} + + jws@4.0.1: + resolution: {integrity: sha512-EKI/M/yqPncGUUh44xz0PxSidXFr/+r0pA70+gIYhjv+et7yxM+s29Y+VGDkovRofQem0fs7Uvf4+YmAdyRduA==} + + long@5.3.2: + resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==} + + lru-cache@11.5.1: + resolution: {integrity: sha512-RPimw/7aMdv2oqRrxKwvZXcPfwBrn/JZ2xYcY9Hus/6LaS3VOAKVWKWgNLCFSiOm1ESXinjsDlidVU7JlnCN2A==} + engines: {node: 20 || >=22} + + marked@15.0.12: + resolution: {integrity: sha512-8dD6FusOQSrpv9Z1rdNMdlSgQOIP880DHqnohobOmYLElGEqAL/JvxvuxZO16r4HtjTlfPRDC1hbvxC9dPN2nA==} + engines: {node: '>= 18'} + hasBin: true + + minimatch@10.2.5: + resolution: {integrity: sha512-MULkVLfKGYDFYejP07QOurDLLQpcjk7Fw+7jXS2R2czRQzR56yHRveU5NDJEOviH+hETZKSkIk5c+T23GjFUMg==} + engines: {node: 18 || 20 || >=22} + + minipass@7.1.3: + resolution: {integrity: sha512-tEBHqDnIoM/1rXME1zgka9g6Q2lcoCkxHLuc7ODJ5BxbP5d4c2Z5cGgtXAku59200Cx7diuHTOYfSBD8n6mm8A==} + engines: {node: '>=16 || 14 >=14.17'} + + ms@2.1.3: + resolution: {integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==} + + node-domexception@1.0.0: + resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==} + engines: {node: '>=10.5.0'} + deprecated: Use your platform's native DOMException instead + + node-fetch@3.3.2: + resolution: {integrity: sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA==} + 
engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} + + openai@6.26.0: + resolution: {integrity: sha512-zd23dbWTjiJ6sSAX6s0HrCZi41JwTA1bQVs0wLQPZ2/5o2gxOJA5wh7yOAUgwYybfhDXyhwlpeQf7Mlgx8EOCA==} + hasBin: true + peerDependencies: + ws: ^8.18.0 + zod: ^3.25 || ^4.0 + peerDependenciesMeta: + ws: + optional: true + zod: + optional: true + + p-retry@4.6.2: + resolution: {integrity: sha512-312Id396EbJdvRONlngUx0NydfrIQ5lsYu0znKVUzVvArzEIt08V1qhtyESbGVd1FGX7UKtiFp5uwKZdM8wIuQ==} + engines: {node: '>=8'} + + partial-json@0.1.7: + resolution: {integrity: sha512-Njv/59hHaokb/hRUjce3Hdv12wd60MtM9Z5Olmn+nehe0QDAsRtRbJPvJ0Z91TusF0SuZRIvnM+S4l6EIP8leA==} + + path-expression-matcher@1.5.0: + resolution: {integrity: sha512-cbrerZV+6rvdQrrD+iGMcZFEiiSrbv9Tfdkvnusy6y0x0GKBXREFg/Y65GhIfm0tnLntThhzCnfKwp1WRjeCyQ==} + engines: {node: '>=14.0.0'} + + path-key@3.1.1: + resolution: {integrity: sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==} + engines: {node: '>=8'} + + path-scurry@2.0.2: + resolution: {integrity: sha512-3O/iVVsJAPsOnpwWIeD+d6z/7PmqApyQePUtCndjatj/9I5LylHvt5qluFaBT3I5h3r1ejfR056c+FCv+NnNXg==} + engines: {node: 18 || 20 || >=22} + + proper-lockfile@4.1.2: + resolution: {integrity: sha512-TjNPblN4BwAWMXU8s9AEz4JmQxnD1NNL7bNOY/AKUzyamc379FWASUhc/K1pL2noVb+XmZKLL68cjzLsiOAMaA==} + + protobufjs@7.6.4: + resolution: {integrity: sha512-RJJPTTpvFfHcWLkIa2JFWK4XvtSzS0yEWDmunqHXli1h3JlkbcQZXDZdcWxv+JK3Xsl5/UFDPZ0iGm7DAengYw==} + engines: {node: '>=12.0.0'} + + resolve-pkg-maps@1.0.0: + resolution: {integrity: sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==} + + retry@0.12.0: + resolution: {integrity: sha512-9LkiTwjUh6rT555DtE9rTX+BKByPfrMzEAtnlEtdEwr3Nkffwiihqe2bWADg+OQRjt9gl6ICdmB/ZFDCGAtSow==} + engines: {node: '>= 4'} + + retry@0.13.1: + resolution: {integrity: sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==} + engines: {node: '>= 4'} + + 
safe-buffer@5.2.1: + resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==} + + semver@7.8.0: + resolution: {integrity: sha512-AcM7dV/5ul4EekoQ29Agm5vri8JNqRyj39o0qpX6vDF2GZrtutZl5RwgD1XnZjiTAfncsJhMI48QQH3sN87YNA==} + engines: {node: '>=10'} + hasBin: true + + semver@7.8.4: + resolution: {integrity: sha512-rUCObTnP32Q08R2uuIrt7r9PlEonuTmtuXYcW6s5kjdlj3xbnwe+21yXptAUYcMAABLkYYTtnmzb3w3EDZfueA==} + engines: {node: '>=10'} + hasBin: true + + shebang-command@2.0.0: + resolution: {integrity: sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==} + engines: {node: '>=8'} + + shebang-regex@3.0.0: + resolution: {integrity: sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==} + engines: {node: '>=8'} + + signal-exit@3.0.7: + resolution: {integrity: sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==} + + strnum@2.4.0: + resolution: {integrity: sha512-sHrVyWWdq28RbhjuJdZsA1SnGRJV6NiXbk6AXBxDOsgAcA+lmpUZCYjOdLBxkXMwis6RRe7dlZt4VlIWFVzkmg==} + + ts-algebra@2.0.0: + resolution: {integrity: sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw==} + + tslib@2.8.1: + resolution: {integrity: sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==} + + tsx@4.19.2: + resolution: {integrity: sha512-pOUl6Vo2LUq/bSa8S5q7b91cgNSjctn9ugq/+Mvow99qW6x/UZYwzxy/3NmqoT66eHYfCVvFvACC58UBPFf28g==} + engines: {node: '>=18.0.0'} + hasBin: true + + typebox@1.1.38: + resolution: {integrity: sha512-pZ0aQPmMmXoUvSbeuWf/Hzsc+avNw/Zd6VeE8CFgkVGWyuHPJvqeJJDeJqLve+K70LvjYIoleGcoJHPT17cWoA==} + + undici-types@6.20.0: + resolution: {integrity: sha512-Ny6QZ2Nju20vw1SRHe3d9jVu6gJ+4e3+MMpqu7pqE5HT6WsTSlce++GQmK5UXS8mzV8DSYHrQH+Xrf2jVcuKNg==} + + undici@8.3.0: + resolution: {integrity: 
sha512-TkUDgb6tl7KOGZ+7e8E3d2FYgUQgF6z5YypqjWmixVQSQERFcVrVg0ySADm2LVLRh5ljAaHTCR5Fmz3Q34rB7Q==} + engines: {node: '>=22.19.0'} + + web-streams-polyfill@3.3.3: + resolution: {integrity: sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==} + engines: {node: '>= 8'} + + which@2.0.2: + resolution: {integrity: sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==} + engines: {node: '>= 8'} + hasBin: true + + ws@8.21.0: + resolution: {integrity: sha512-Vsp28b7DRcimFQvrqu2Wek3z1iYxDCWqHYB8Qsnk/S4RfaCQzPGPyBNuVjJV3cd6UiKtUtp6sNM77gWvzcCH+g==} + engines: {node: '>=10.0.0'} + peerDependencies: + bufferutil: ^4.0.1 + utf-8-validate: '>=5.0.2' + peerDependenciesMeta: + bufferutil: + optional: true + utf-8-validate: + optional: true + + xml-naming@0.1.0: + resolution: {integrity: sha512-k8KO9hrMyNk6tUWqUfkTEZbezRRpONVOzUTnc97VnCvyj6Tf9lyUR9EDAIeiVLv56jsMcoXEwjW8Kv5yPY52lw==} + engines: {node: '>=16.0.0'} + + yaml@2.9.0: + resolution: {integrity: sha512-2AvhNX3mb8zd6Zy7INTtSpl1F15HW6Wnqj0srWlkKLcpYl/gMIMJiyuGq2KeI2YFxUPjdlB+3Lc10seMLtL4cA==} + engines: {node: '>= 14.6'} + hasBin: true + + zod-to-json-schema@3.25.2: + resolution: {integrity: sha512-O/PgfnpT1xKSDeQYSCfRI5Gy3hPf91mKVDuYLUHZJMiDFptvP41MSnWofm8dnCm0256ZNfZIM7DSzuSMAFnjHA==} + peerDependencies: + zod: ^3.25.28 || ^4 + + zod@4.4.3: + resolution: {integrity: sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ==} + +snapshots: + + '@anthropic-ai/sdk@0.91.1(zod@4.4.3)': + dependencies: + json-schema-to-ts: 3.1.1 + optionalDependencies: + zod: 4.4.3 + + '@aws-crypto/crc32@5.2.0': + dependencies: + '@aws-crypto/util': 5.2.0 + '@aws-sdk/types': 3.973.12 + tslib: 2.8.1 + + '@aws-crypto/sha256-browser@5.2.0': + dependencies: + '@aws-crypto/sha256-js': 5.2.0 + '@aws-crypto/supports-web-crypto': 5.2.0 + '@aws-crypto/util': 5.2.0 + '@aws-sdk/types': 3.973.12 + '@aws-sdk/util-locate-window': 3.965.7 + 
'@smithy/util-utf8': 2.3.0 + tslib: 2.8.1 + + '@aws-crypto/sha256-js@5.2.0': + dependencies: + '@aws-crypto/util': 5.2.0 + '@aws-sdk/types': 3.973.12 + tslib: 2.8.1 + + '@aws-crypto/supports-web-crypto@5.2.0': + dependencies: + tslib: 2.8.1 + + '@aws-crypto/util@5.2.0': + dependencies: + '@aws-sdk/types': 3.973.12 + '@smithy/util-utf8': 2.3.0 + tslib: 2.8.1 + + '@aws-sdk/client-bedrock-runtime@3.1048.0': + dependencies: + '@aws-crypto/sha256-browser': 5.2.0 + '@aws-crypto/sha256-js': 5.2.0 + '@aws-sdk/core': 3.974.20 + '@aws-sdk/credential-provider-node': 3.972.55 + '@aws-sdk/eventstream-handler-node': 3.972.21 + '@aws-sdk/middleware-eventstream': 3.972.17 + '@aws-sdk/middleware-websocket': 3.972.28 + '@aws-sdk/token-providers': 3.1048.0 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/fetch-http-handler': 5.4.7 + '@smithy/node-http-handler': 4.7.3 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/core@3.974.20': + dependencies: + '@aws-sdk/types': 3.973.12 + '@aws-sdk/xml-builder': 3.972.29 + '@aws/lambda-invoke-store': 0.2.4 + '@smithy/core': 3.24.7 + '@smithy/signature-v4': 5.4.7 + '@smithy/types': 4.14.4 + bowser: 2.14.1 + tslib: 2.8.1 + + '@aws-sdk/credential-provider-env@3.972.46': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/credential-provider-http@3.972.48': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/fetch-http-handler': 5.4.7 + '@smithy/node-http-handler': 4.7.8 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/credential-provider-ini@3.972.53': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/credential-provider-env': 3.972.46 + '@aws-sdk/credential-provider-http': 3.972.48 + '@aws-sdk/credential-provider-login': 3.972.52 + '@aws-sdk/credential-provider-process': 3.972.46 + '@aws-sdk/credential-provider-sso': 3.972.52 + 
'@aws-sdk/credential-provider-web-identity': 3.972.52 + '@aws-sdk/nested-clients': 3.997.20 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/credential-provider-imds': 4.3.9 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/credential-provider-login@3.972.52': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/nested-clients': 3.997.20 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/credential-provider-node@3.972.55': + dependencies: + '@aws-sdk/credential-provider-env': 3.972.46 + '@aws-sdk/credential-provider-http': 3.972.48 + '@aws-sdk/credential-provider-ini': 3.972.53 + '@aws-sdk/credential-provider-process': 3.972.46 + '@aws-sdk/credential-provider-sso': 3.972.52 + '@aws-sdk/credential-provider-web-identity': 3.972.52 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/credential-provider-imds': 4.3.9 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/credential-provider-process@3.972.46': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/credential-provider-sso@3.972.52': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/nested-clients': 3.997.20 + '@aws-sdk/token-providers': 3.1066.0 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/credential-provider-web-identity@3.972.52': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/nested-clients': 3.997.20 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/eventstream-handler-node@3.972.21': + dependencies: + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/middleware-eventstream@3.972.17': + dependencies: + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + 
'@aws-sdk/middleware-websocket@3.972.28': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/fetch-http-handler': 5.4.7 + '@smithy/signature-v4': 5.4.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/nested-clients@3.997.20': + dependencies: + '@aws-crypto/sha256-browser': 5.2.0 + '@aws-crypto/sha256-js': 5.2.0 + '@aws-sdk/core': 3.974.20 + '@aws-sdk/signature-v4-multi-region': 3.996.34 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/fetch-http-handler': 5.4.7 + '@smithy/node-http-handler': 4.7.8 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/signature-v4-multi-region@3.996.34': + dependencies: + '@aws-sdk/types': 3.973.12 + '@smithy/signature-v4': 5.4.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/token-providers@3.1048.0': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/nested-clients': 3.997.20 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/token-providers@3.1066.0': + dependencies: + '@aws-sdk/core': 3.974.20 + '@aws-sdk/nested-clients': 3.997.20 + '@aws-sdk/types': 3.973.12 + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/types@3.973.12': + dependencies: + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@aws-sdk/util-locate-window@3.965.7': + dependencies: + tslib: 2.8.1 + + '@aws-sdk/xml-builder@3.972.29': + dependencies: + '@smithy/types': 4.14.4 + fast-xml-parser: 5.7.3 + tslib: 2.8.1 + + '@aws/lambda-invoke-store@0.2.4': {} + + '@babel/runtime@7.29.7': {} + + '@earendil-works/pi-agent-core@0.79.4(ws@8.21.0)(zod@4.4.3)': + dependencies: + '@earendil-works/pi-ai': 0.79.4(ws@8.21.0)(zod@4.4.3) + ignore: 7.0.5 + typebox: 1.1.38 + yaml: 2.9.0 + transitivePeerDependencies: + - '@modelcontextprotocol/sdk' + - bufferutil + - supports-color + - utf-8-validate + - ws + - zod + + '@earendil-works/pi-ai@0.79.4(ws@8.21.0)(zod@4.4.3)': + dependencies: + '@anthropic-ai/sdk': 
0.91.1(zod@4.4.3) + '@aws-sdk/client-bedrock-runtime': 3.1048.0 + '@google/genai': 1.52.0 + '@mistralai/mistralai': 2.2.1 + '@smithy/node-http-handler': 4.7.3 + http-proxy-agent: 7.0.2 + https-proxy-agent: 7.0.6 + openai: 6.26.0(ws@8.21.0)(zod@4.4.3) + partial-json: 0.1.7 + typebox: 1.1.38 + transitivePeerDependencies: + - '@modelcontextprotocol/sdk' + - bufferutil + - supports-color + - utf-8-validate + - ws + - zod + + '@earendil-works/pi-coding-agent@0.79.4(ws@8.21.0)(zod@4.4.3)': + dependencies: + '@earendil-works/pi-agent-core': 0.79.4(ws@8.21.0)(zod@4.4.3) + '@earendil-works/pi-ai': 0.79.4(ws@8.21.0)(zod@4.4.3) + '@earendil-works/pi-tui': 0.79.4 + '@silvia-odwyer/photon-node': 0.3.4 + chalk: 5.6.2 + cross-spawn: 7.0.6 + diff: 8.0.4 + glob: 13.0.6 + highlight.js: 10.7.3 + hosted-git-info: 9.0.3 + ignore: 7.0.5 + jiti: 2.7.0 + minimatch: 10.2.5 + proper-lockfile: 4.1.2 + semver: 7.8.0 + typebox: 1.1.38 + undici: 8.3.0 + yaml: 2.9.0 + optionalDependencies: + '@mariozechner/clipboard': 0.3.9 + transitivePeerDependencies: + - '@modelcontextprotocol/sdk' + - bufferutil + - supports-color + - utf-8-validate + - ws + - zod + + '@earendil-works/pi-tui@0.79.4': + dependencies: + get-east-asian-width: 1.6.0 + marked: 15.0.12 + + '@esbuild/aix-ppc64@0.23.1': + optional: true + + '@esbuild/android-arm64@0.23.1': + optional: true + + '@esbuild/android-arm@0.23.1': + optional: true + + '@esbuild/android-x64@0.23.1': + optional: true + + '@esbuild/darwin-arm64@0.23.1': + optional: true + + '@esbuild/darwin-x64@0.23.1': + optional: true + + '@esbuild/freebsd-arm64@0.23.1': + optional: true + + '@esbuild/freebsd-x64@0.23.1': + optional: true + + '@esbuild/linux-arm64@0.23.1': + optional: true + + '@esbuild/linux-arm@0.23.1': + optional: true + + '@esbuild/linux-ia32@0.23.1': + optional: true + + '@esbuild/linux-loong64@0.23.1': + optional: true + + '@esbuild/linux-mips64el@0.23.1': + optional: true + + '@esbuild/linux-ppc64@0.23.1': + optional: true + + 
'@esbuild/linux-riscv64@0.23.1': + optional: true + + '@esbuild/linux-s390x@0.23.1': + optional: true + + '@esbuild/linux-x64@0.23.1': + optional: true + + '@esbuild/netbsd-x64@0.23.1': + optional: true + + '@esbuild/openbsd-arm64@0.23.1': + optional: true + + '@esbuild/openbsd-x64@0.23.1': + optional: true + + '@esbuild/sunos-x64@0.23.1': + optional: true + + '@esbuild/win32-arm64@0.23.1': + optional: true + + '@esbuild/win32-ia32@0.23.1': + optional: true + + '@esbuild/win32-x64@0.23.1': + optional: true + + '@google/genai@1.52.0': + dependencies: + google-auth-library: 10.7.0 + p-retry: 4.6.2 + protobufjs: 7.6.4 + ws: 8.21.0 + transitivePeerDependencies: + - bufferutil + - supports-color + - utf-8-validate + + '@mariozechner/clipboard-darwin-arm64@0.3.9': + optional: true + + '@mariozechner/clipboard-darwin-universal@0.3.9': + optional: true + + '@mariozechner/clipboard-darwin-x64@0.3.9': + optional: true + + '@mariozechner/clipboard-linux-arm64-gnu@0.3.9': + optional: true + + '@mariozechner/clipboard-linux-arm64-musl@0.3.9': + optional: true + + '@mariozechner/clipboard-linux-riscv64-gnu@0.3.9': + optional: true + + '@mariozechner/clipboard-linux-x64-gnu@0.3.9': + optional: true + + '@mariozechner/clipboard-linux-x64-musl@0.3.9': + optional: true + + '@mariozechner/clipboard-win32-arm64-msvc@0.3.9': + optional: true + + '@mariozechner/clipboard-win32-x64-msvc@0.3.9': + optional: true + + '@mariozechner/clipboard@0.3.9': + optionalDependencies: + '@mariozechner/clipboard-darwin-arm64': 0.3.9 + '@mariozechner/clipboard-darwin-universal': 0.3.9 + '@mariozechner/clipboard-darwin-x64': 0.3.9 + '@mariozechner/clipboard-linux-arm64-gnu': 0.3.9 + '@mariozechner/clipboard-linux-arm64-musl': 0.3.9 + '@mariozechner/clipboard-linux-riscv64-gnu': 0.3.9 + '@mariozechner/clipboard-linux-x64-gnu': 0.3.9 + '@mariozechner/clipboard-linux-x64-musl': 0.3.9 + '@mariozechner/clipboard-win32-arm64-msvc': 0.3.9 + '@mariozechner/clipboard-win32-x64-msvc': 0.3.9 + optional: true + + 
'@mistralai/mistralai@2.2.1': + dependencies: + ws: 8.21.0 + zod: 4.4.3 + zod-to-json-schema: 3.25.2(zod@4.4.3) + transitivePeerDependencies: + - bufferutil + - utf-8-validate + + '@nodable/entities@2.2.0': {} + + '@opentelemetry/api-logs@0.54.0': + dependencies: + '@opentelemetry/api': 1.9.0 + + '@opentelemetry/api@1.9.0': {} + + '@opentelemetry/context-async-hooks@1.28.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + + '@opentelemetry/core@1.27.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/semantic-conventions': 1.27.0 + + '@opentelemetry/core@1.28.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/semantic-conventions': 1.27.0 + + '@opentelemetry/exporter-trace-otlp-proto@0.54.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/otlp-exporter-base': 0.54.0(@opentelemetry/api@1.9.0) + '@opentelemetry/otlp-transformer': 0.54.0(@opentelemetry/api@1.9.0) + '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-trace-base': 1.27.0(@opentelemetry/api@1.9.0) + + '@opentelemetry/otlp-exporter-base@0.54.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/otlp-transformer': 0.54.0(@opentelemetry/api@1.9.0) + + '@opentelemetry/otlp-transformer@0.54.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/api-logs': 0.54.0 + '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-logs': 0.54.0(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-metrics': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-trace-base': 1.27.0(@opentelemetry/api@1.9.0) + protobufjs: 7.6.4 + + 
'@opentelemetry/propagator-b3@1.28.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0) + + '@opentelemetry/propagator-jaeger@1.28.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0) + + '@opentelemetry/resources@1.27.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/semantic-conventions': 1.27.0 + + '@opentelemetry/resources@1.28.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0) + '@opentelemetry/semantic-conventions': 1.27.0 + + '@opentelemetry/sdk-logs@0.54.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/api-logs': 0.54.0 + '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0) + + '@opentelemetry/sdk-metrics@1.27.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0) + + '@opentelemetry/sdk-trace-base@1.27.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/resources': 1.27.0(@opentelemetry/api@1.9.0) + '@opentelemetry/semantic-conventions': 1.27.0 + + '@opentelemetry/sdk-trace-base@1.28.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0) + '@opentelemetry/resources': 1.28.0(@opentelemetry/api@1.9.0) + '@opentelemetry/semantic-conventions': 1.27.0 + + '@opentelemetry/sdk-trace-node@1.28.0(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + 
'@opentelemetry/context-async-hooks': 1.28.0(@opentelemetry/api@1.9.0) + '@opentelemetry/core': 1.28.0(@opentelemetry/api@1.9.0) + '@opentelemetry/propagator-b3': 1.28.0(@opentelemetry/api@1.9.0) + '@opentelemetry/propagator-jaeger': 1.28.0(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-trace-base': 1.28.0(@opentelemetry/api@1.9.0) + semver: 7.8.4 + + '@opentelemetry/semantic-conventions@1.27.0': {} + + '@opentelemetry/semantic-conventions@1.28.0': {} + + '@protobufjs/aspromise@1.1.2': {} + + '@protobufjs/base64@1.1.2': {} + + '@protobufjs/codegen@2.0.5': {} + + '@protobufjs/eventemitter@1.1.1': {} + + '@protobufjs/fetch@1.1.1': + dependencies: + '@protobufjs/aspromise': 1.1.2 + + '@protobufjs/float@1.0.2': {} + + '@protobufjs/path@1.1.2': {} + + '@protobufjs/pool@1.1.0': {} + + '@protobufjs/utf8@1.1.1': {} + + '@silvia-odwyer/photon-node@0.3.4': {} + + '@smithy/core@3.24.7': + dependencies: + '@aws-crypto/crc32': 5.2.0 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@smithy/credential-provider-imds@4.3.9': + dependencies: + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@smithy/fetch-http-handler@5.4.7': + dependencies: + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@smithy/is-array-buffer@2.2.0': + dependencies: + tslib: 2.8.1 + + '@smithy/node-http-handler@4.7.3': + dependencies: + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@smithy/node-http-handler@4.7.8': + dependencies: + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@smithy/signature-v4@5.4.7': + dependencies: + '@smithy/core': 3.24.7 + '@smithy/types': 4.14.4 + tslib: 2.8.1 + + '@smithy/types@4.14.4': + dependencies: + tslib: 2.8.1 + + '@smithy/util-buffer-from@2.2.0': + dependencies: + '@smithy/is-array-buffer': 2.2.0 + tslib: 2.8.1 + + '@smithy/util-utf8@2.3.0': + dependencies: + '@smithy/util-buffer-from': 2.2.0 + tslib: 2.8.1 + + '@types/node@22.10.2': + dependencies: + undici-types: 6.20.0 + + 
'@types/retry@0.12.0': {} + + agent-base@7.1.4: {} + + anynum@1.0.0: {} + + balanced-match@4.0.4: {} + + base64-js@1.5.1: {} + + bignumber.js@9.3.1: {} + + bowser@2.14.1: {} + + brace-expansion@5.0.6: + dependencies: + balanced-match: 4.0.4 + + buffer-equal-constant-time@1.0.1: {} + + chalk@5.6.2: {} + + cross-spawn@7.0.6: + dependencies: + path-key: 3.1.1 + shebang-command: 2.0.0 + which: 2.0.2 + + data-uri-to-buffer@4.0.1: {} + + debug@4.4.3: + dependencies: + ms: 2.1.3 + + diff@8.0.4: {} + + dotenv@17.2.3: {} + + ecdsa-sig-formatter@1.0.11: + dependencies: + safe-buffer: 5.2.1 + + esbuild@0.23.1: + optionalDependencies: + '@esbuild/aix-ppc64': 0.23.1 + '@esbuild/android-arm': 0.23.1 + '@esbuild/android-arm64': 0.23.1 + '@esbuild/android-x64': 0.23.1 + '@esbuild/darwin-arm64': 0.23.1 + '@esbuild/darwin-x64': 0.23.1 + '@esbuild/freebsd-arm64': 0.23.1 + '@esbuild/freebsd-x64': 0.23.1 + '@esbuild/linux-arm': 0.23.1 + '@esbuild/linux-arm64': 0.23.1 + '@esbuild/linux-ia32': 0.23.1 + '@esbuild/linux-loong64': 0.23.1 + '@esbuild/linux-mips64el': 0.23.1 + '@esbuild/linux-ppc64': 0.23.1 + '@esbuild/linux-riscv64': 0.23.1 + '@esbuild/linux-s390x': 0.23.1 + '@esbuild/linux-x64': 0.23.1 + '@esbuild/netbsd-x64': 0.23.1 + '@esbuild/openbsd-arm64': 0.23.1 + '@esbuild/openbsd-x64': 0.23.1 + '@esbuild/sunos-x64': 0.23.1 + '@esbuild/win32-arm64': 0.23.1 + '@esbuild/win32-ia32': 0.23.1 + '@esbuild/win32-x64': 0.23.1 + + extend@3.0.2: {} + + fast-xml-builder@1.2.0: + dependencies: + path-expression-matcher: 1.5.0 + xml-naming: 0.1.0 + + fast-xml-parser@5.7.3: + dependencies: + '@nodable/entities': 2.2.0 + fast-xml-builder: 1.2.0 + path-expression-matcher: 1.5.0 + strnum: 2.4.0 + + fetch-blob@3.2.0: + dependencies: + node-domexception: 1.0.0 + web-streams-polyfill: 3.3.3 + + formdata-polyfill@4.0.10: + dependencies: + fetch-blob: 3.2.0 + + fsevents@2.3.3: + optional: true + + gaxios@7.1.5: + dependencies: + extend: 3.0.2 + https-proxy-agent: 7.0.6 + node-fetch: 3.3.2 + 
transitivePeerDependencies: + - supports-color + + gcp-metadata@8.1.2: + dependencies: + gaxios: 7.1.5 + google-logging-utils: 1.1.3 + json-bigint: 1.0.0 + transitivePeerDependencies: + - supports-color + + get-east-asian-width@1.6.0: {} + + get-tsconfig@4.14.0: + dependencies: + resolve-pkg-maps: 1.0.0 + + glob@13.0.6: + dependencies: + minimatch: 10.2.5 + minipass: 7.1.3 + path-scurry: 2.0.2 + + google-auth-library@10.7.0: + dependencies: + base64-js: 1.5.1 + ecdsa-sig-formatter: 1.0.11 + gaxios: 7.1.5 + gcp-metadata: 8.1.2 + google-logging-utils: 1.1.3 + jws: 4.0.1 + transitivePeerDependencies: + - supports-color + + google-logging-utils@1.1.3: {} + + graceful-fs@4.2.11: {} + + highlight.js@10.7.3: {} + + hosted-git-info@9.0.3: + dependencies: + lru-cache: 11.5.1 + + http-proxy-agent@7.0.2: + dependencies: + agent-base: 7.1.4 + debug: 4.4.3 + transitivePeerDependencies: + - supports-color + + https-proxy-agent@7.0.6: + dependencies: + agent-base: 7.1.4 + debug: 4.4.3 + transitivePeerDependencies: + - supports-color + + ignore@7.0.5: {} + + isexe@2.0.0: {} + + jiti@2.7.0: {} + + json-bigint@1.0.0: + dependencies: + bignumber.js: 9.3.1 + + json-schema-to-ts@3.1.1: + dependencies: + '@babel/runtime': 7.29.7 + ts-algebra: 2.0.0 + + jwa@2.0.1: + dependencies: + buffer-equal-constant-time: 1.0.1 + ecdsa-sig-formatter: 1.0.11 + safe-buffer: 5.2.1 + + jws@4.0.1: + dependencies: + jwa: 2.0.1 + safe-buffer: 5.2.1 + + long@5.3.2: {} + + lru-cache@11.5.1: {} + + marked@15.0.12: {} + + minimatch@10.2.5: + dependencies: + brace-expansion: 5.0.6 + + minipass@7.1.3: {} + + ms@2.1.3: {} + + node-domexception@1.0.0: {} + + node-fetch@3.3.2: + dependencies: + data-uri-to-buffer: 4.0.1 + fetch-blob: 3.2.0 + formdata-polyfill: 4.0.10 + + openai@6.26.0(ws@8.21.0)(zod@4.4.3): + optionalDependencies: + ws: 8.21.0 + zod: 4.4.3 + + p-retry@4.6.2: + dependencies: + '@types/retry': 0.12.0 + retry: 0.13.1 + + partial-json@0.1.7: {} + + path-expression-matcher@1.5.0: {} + + path-key@3.1.1: 
{} + + path-scurry@2.0.2: + dependencies: + lru-cache: 11.5.1 + minipass: 7.1.3 + + proper-lockfile@4.1.2: + dependencies: + graceful-fs: 4.2.11 + retry: 0.12.0 + signal-exit: 3.0.7 + + protobufjs@7.6.4: + dependencies: + '@protobufjs/aspromise': 1.1.2 + '@protobufjs/base64': 1.1.2 + '@protobufjs/codegen': 2.0.5 + '@protobufjs/eventemitter': 1.1.1 + '@protobufjs/fetch': 1.1.1 + '@protobufjs/float': 1.0.2 + '@protobufjs/path': 1.1.2 + '@protobufjs/pool': 1.1.0 + '@protobufjs/utf8': 1.1.1 + '@types/node': 22.10.2 + long: 5.3.2 + + resolve-pkg-maps@1.0.0: {} + + retry@0.12.0: {} + + retry@0.13.1: {} + + safe-buffer@5.2.1: {} + + semver@7.8.0: {} + + semver@7.8.4: {} + + shebang-command@2.0.0: + dependencies: + shebang-regex: 3.0.0 + + shebang-regex@3.0.0: {} + + signal-exit@3.0.7: {} + + strnum@2.4.0: + dependencies: + anynum: 1.0.0 + + ts-algebra@2.0.0: {} + + tslib@2.8.1: {} + + tsx@4.19.2: + dependencies: + esbuild: 0.23.1 + get-tsconfig: 4.14.0 + optionalDependencies: + fsevents: 2.3.3 + + typebox@1.1.38: {} + + undici-types@6.20.0: {} + + undici@8.3.0: {} + + web-streams-polyfill@3.3.3: {} + + which@2.0.2: + dependencies: + isexe: 2.0.0 + + ws@8.21.0: {} + + xml-naming@0.1.0: {} + + yaml@2.9.0: {} + + zod-to-json-schema@3.25.2(zod@4.4.3): + dependencies: + zod: 4.4.3 + + zod@4.4.3: {} diff --git a/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/run.ts b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/run.ts new file mode 100644 index 0000000000..03164e6311 --- /dev/null +++ b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/run.ts @@ -0,0 +1,197 @@ +/** + * WP-1 runner: install Pi, run a small tool-using agent task, and export the run + * to Agenta as OpenTelemetry traces via the agenta-otel extension. + * + * Auth: uses AuthStorage.create(), which reads ~/.pi/agent/auth.json. Log in once + * with `pnpm exec pi` -> `/login` -> "ChatGPT Plus/Pro (Codex)" (no API key needed), + * or set OPENAI_API_KEY / ANTHROPIC_API_KEY in the environment. 
+ * + * Run: `pnpm start` + */ +import dotenv from "dotenv"; +import { existsSync, mkdtempSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; + +import { + AuthStorage, + createAgentSession, + DefaultResourceLoader, + getAgentDir, + ModelRegistry, + SessionManager, +} from "@earendil-works/pi-coding-agent"; + +import agentaOtel, { runConfig, shutdownTracing } from "./agenta-otel.ts"; + +// Load env before anything reads it: poc-local .env first, then walk up to the +// repo-root .env.test.local for the shared dev-box Agenta credentials. +function loadEnv(): void { + dotenv.config(); + let dir = dirname(fileURLToPath(import.meta.url)); + for (let i = 0; i < 8; i++) { + const candidate = join(dir, ".env.test.local"); + if (existsSync(candidate)) { + dotenv.config({ path: candidate }); + break; + } + const parent = dirname(dir); + if (parent === dir) break; + dir = parent; + } +} + +type Scenario = { name: string; seed: (dir: string) => void; prompts: string[] }; + +const SCENARIOS: Record = { + simple: { + name: "simple", + seed: (dir) => + writeFileSync( + join(dir, "notes.txt"), + "TODO: greet the user by name (use 'Mahmoud')\n" + + "TODO: add a two-line haiku about tracing\n", + ), + prompts: [ + "Read notes.txt in the current directory, then create greeting.txt that " + + "addresses each TODO. Keep it short.", + ], + }, + // Many tool calls across several turns, ending in a structured return. 
+ complex: { + name: "complex", + seed: (dir) => { + writeFileSync( + join(dir, "alpha.py"), + "def add(a, b):\n return a + b\n\n\ndef sub(a, b):\n return a - b\n", + ); + writeFileSync( + join(dir, "beta.py"), + "import math\n\n\ndef area(r):\n return math.pi * r * r\n", + ); + writeFileSync(join(dir, "README.md"), "# demo\n\nA tiny demo package.\n"); + }, + prompts: [ + "Explore this directory: list the files, read every .py file, and use bash " + + "(wc -l) to count the total number of lines across the .py files. Then write " + + "REPORT.md describing what each .py file does and the total line count. " + + "Finally, reply with ONLY a JSON object: " + + '{"files": ["..."], "total_py_lines": , "report": "REPORT.md"}.', + ], + }, + // A longer, multi-prompt session: each prompt is its own trace, all sharing one session.id. + session: { + name: "session", + seed: () => {}, + prompts: [ + "Create todo.md with exactly 3 short tasks about adding distributed tracing to a service.", + "Append 2 more tasks to todo.md, then show me the full file with the bash 'cat' command.", + 'Read todo.md and reply with ONLY a JSON object: {"count": , "tasks": ["..."]}.', + ], + }, +}; + +function pickScenario(cliPrompts: string[]): Scenario { + if (cliPrompts.length > 0) { + return { name: "custom", seed: SCENARIOS.complex.seed, prompts: cliPrompts }; + } + const key = process.env.PI_SCENARIO || "complex"; + return SCENARIOS[key] ?? SCENARIOS.complex; +} + +async function main(): Promise { + loadEnv(); + + // A throwaway working dir seeded per scenario so the agent actually uses tools. + const cwd = mkdtempSync(join(tmpdir(), "pi-poc-")); + const scenario = pickScenario(process.argv.slice(2)); + scenario.seed(cwd); + + const authStorage = AuthStorage.create(); + const modelRegistry = ModelRegistry.create(authStorage); + const available = await modelRegistry.getAvailable(); + if (available.length === 0) { + console.error( + "\nNo model is available. 
Authenticate Pi first:\n" + + " pnpm exec pi then /login -> \"ChatGPT Plus/Pro (Codex)\"\n" + + "or export OPENAI_API_KEY / ANTHROPIC_API_KEY.\n", + ); + process.exit(1); + } + + const wanted = process.env.PI_MODEL; // "gpt-5.5" or "openai-codex/gpt-5.5" + const model = + (wanted && + available.find( + (m: any) => m.id === wanted || `${m.provider}/${m.id}` === wanted, + )) || + available.find((m: any) => m.id === "gpt-5.5") || + available.find((m: any) => !/spark|mini/i.test(m.id)) || + available[0]; + if (wanted && model.id !== wanted && `${model.provider}/${model.id}` !== wanted) { + console.warn(`[run] PI_MODEL="${wanted}" not available; using ${model.id}`); + } + console.log(`[run] scenario: ${scenario.name} (${scenario.prompts.length} prompt(s))`); + console.log(`[run] model: ${model.provider}/${model.id}`); + console.log(`[run] cwd: ${cwd}`); + + const loader = new DefaultResourceLoader({ + cwd, + agentDir: getAgentDir(), + extensionFactories: [agentaOtel], + }); + await loader.reload(); + + const { session } = await createAgentSession({ + cwd, + model, + authStorage, + modelRegistry, + tools: ["read", "bash", "edit", "write", "ls"], + sessionManager: SessionManager.inMemory(cwd), + resourceLoader: loader, + }); + + // Hand the session id + model to the extension so spans carry them. 
+ runConfig.sessionId = session.sessionId; + runConfig.provider = model.provider; + runConfig.requestModel = model.id; + + session.subscribe((event: any) => { + if ( + event.type === "message_update" && + event.assistantMessageEvent?.type === "text_delta" + ) { + process.stdout.write(event.assistantMessageEvent.delta); + } else if (event.type === "tool_execution_start") { + process.stdout.write(`\n[tool] ${event.toolName}\n`); + } + }); + + const traceIds: string[] = []; + for (let i = 0; i < scenario.prompts.length; i++) { + const p = scenario.prompts[i]; + console.log(`\n[run] prompt ${i + 1}/${scenario.prompts.length}: ${p}\n`); + await session.prompt(p); + if (runConfig.traceId) traceIds.push(runConfig.traceId); + } + + console.log("\n\n[run] flushing spans to Agenta..."); + session.dispose(); + await shutdownTracing(); + + const host = (process.env.AGENTA_HOST || "").replace(/\/+$/, ""); + console.log("[run] flushed."); + console.log(`[run] session_id=${session.sessionId}`); + traceIds.forEach((tid, i) => { + console.log(`[run] trace ${i + 1}: ${tid}`); + console.log(` ${host}/api/spans/?trace_id=${tid}`); + }); + process.exit(0); +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/docs/design/agent-workflows/scratch/wp-1-pi-tracing/tracing-in-the-agent-service.md b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/tracing-in-the-agent-service.md new file mode 100644 index 0000000000..9c53d2f4bd --- /dev/null +++ b/docs/design/agent-workflows/scratch/wp-1-pi-tracing/tracing-in-the-agent-service.md @@ -0,0 +1,115 @@ +> **Historical record.** This is a work-package note. It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md). 
+# Tracing the agent run into the response, like completion and chat + +Status: built and verified end to end against the dev box (2026-06-15). Audience: +whoever works on the agent service (WP-2) and its tracing. + +This is the follow-on to [integrating-the-tracing-extension.md](integrating-the-tracing-extension.md). +That doc made a standalone Pi run show up in Agenta as its own trace. This one wires +the same extension into the WP-2 agent service so the agent's whole run becomes part +of the `/invoke` trace, the way completion and chat nest their LLM spans under the +workflow span. + +## What changed and why + +Completion and chat are traced as one tree: the SDK opens a workflow span for the +`/invoke` request, the LLM call nests under it, and the response carries that +`trace_id`. Open the trace and you see the whole call. + +The agent service runs the model work in a separate Node process (the Pi wrapper), so +its spans were not part of that tree. The WP-1 doc flagged the fix as future work: +thread a W3C `traceparent` across the boundary and start the agent span as its child. +That is what this change does. + +The result is one tree under the response's `trace_id`: + +``` +_agent workflow (the Python /invoke span, root) + invoke_agent AGENT (the Pi run, now a child of _agent) + turn N CHAIN + chat LLM model, tokens, cost, message thread + execute_tool ... TOOL +``` + +Verified shape from a live run (trace `0f47e5f5...`): four spans, one trace, the +`chat` span carrying `ag.data.inputs`/`outputs` as a message thread, token usage +(598/21/619), and cost, with nothing in `ag.unsupported`. + +## How it works + +Three seams carry the context from the Python service to the Pi spans. + +1. **Capture (Python, `services/oss/src/agent.py`).** Inside the instrumented + `_agent` handler the current OpenTelemetry span is the workflow span. 
`_trace_context()` + reads it with the SDK's `propagation.inject()`, which yields the `traceparent`, + `baggage`, and the request `Authorization`. It also reads the OTLP endpoint from + `ag.tracing.otlp_url`, the exact URL the Python spans use. This is best effort: if + capture fails the run still works, just without cross-trace linking. + +2. **Carry (`services/oss/src/agent_pi`).** `HarnessRequest` gains a `TraceContext` + (`ports.py`). `TraceContext.to_wire()` serializes it to the camelCase shape the + wrapper expects, and both harness adapters send it: the local subprocess one + (`pi_harness.py`) and the HTTP sidecar one (`pi_http_harness.py`). + +3. **Consume (Node, `services/agent/src/agenta-otel.ts`).** When a `traceparent` is + present the extension starts `invoke_agent` as a child of that remote span, so the + whole Pi subtree shares the caller's `trace_id`. It exports each trace to the + endpoint and with the `Authorization` the caller passed, falling back to env. The + runner (`runPi.ts`) flushes the trace before it returns the result. + +Because the Python span and the Pi spans share one `trace_id` and the Pi root points +at the Python span, Agenta merges them into one tree at ingest. No backend change. + +## What is different from the POC extension + +The service build keeps the POC's span tree and every load-bearing attribute choice +(read the [five rules](integrating-the-tracing-extension.md#what-you-must-not-change-and-why) +again before touching attributes). It adds three things the service needs: + +- **Per-run state, not module globals.** The POC ran one prompt at a time. The HTTP + sidecar can drive several runs in one process, so all span state lives in the + closure `createAgentaOtel()` returns. Only the tracer, provider, and exporter cache + stay process wide. +- **A remote parent.** `invoke_agent` nests under the incoming `traceparent` instead + of starting a fresh root. 
The parent has no end event in this process, so the + per-trace batch flushes by trace id after the run rather than only on root-end. +- **Per-trace export target.** The OTLP endpoint and `Authorization` come from the run + config, so one shared process can serve more than one project. They fall back to + `AGENTA_HOST` / `AGENTA_API_KEY` when the caller passes nothing. + +## Auth and endpoint + +The Node side ships spans to the same place and with the same credentials as the +Python span. When the request carries `Authorization` (the project key or service +secret) the wrapper uses it verbatim, matching how the SDK exporter authorizes per +trace. With auth disabled locally there is no request credential, so the wrapper falls +back to the container's `AGENTA_API_KEY`. Set `AGENTA_AGENT_CAPTURE_CONTENT=0` on the +Python service to drop prompts, completions, and tool I/O from the spans. + +For the HTTP sidecar the endpoint passed from Python is the URL the Python container +uses to reach Agenta. The sidecar must be able to reach the same host. On one Docker +network the internal hostname resolves from both; if it does not, the sidecar's +`AGENTA_HOST` fallback applies. + +## How to verify + +1. Start the services app (`entrypoints.main:app`, which mounts the agent at + `/agent/v0`) with `AGENTA_HOST` and `AGENTA_API_KEY` set and a Pi login or provider + key available. +2. POST a chat-style body to `/agent/v0/invoke` and read `x-ag-trace-id` from the + response headers (it equals `trace_id` in the body). +3. Fetch the trace and confirm the merged tree and the totals: + ``` + curl -s "${AGENTA_HOST}/api/spans/?trace_id=<trace_id>" -H "Authorization: ApiKey ${AGENTA_API_KEY}" + ``` + Expect `_agent` (workflow) over `invoke_agent` (agent) over `turn N` (chain) over + `chat` (chat), all sharing one `trace_id`, with token usage and cost on the `chat` + span and nothing under `ag.unsupported`.
+ +## Files + +- `services/oss/src/agent.py` — `_trace_context()` captures the workflow span context. +- `services/oss/src/agent_pi/ports.py` — `TraceContext` and `HarnessRequest.trace`. +- `services/oss/src/agent_pi/pi_harness.py`, `pi_http_harness.py` — forward the context. +- `services/agent/src/agenta-otel.ts` — the service build of the extension. +- `services/agent/src/runPi.ts` — registers the extension, sets run config, flushes. diff --git a/docs/design/agent-workflows/scratch/wp-2-agent-service/README.md b/docs/design/agent-workflows/scratch/wp-2-agent-service/README.md new file mode 100644 index 0000000000..433e368998 --- /dev/null +++ b/docs/design/agent-workflows/scratch/wp-2-agent-service/README.md @@ -0,0 +1,125 @@ +> **Historical record.** This is a work-package note. It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md). +# WP-2: Agent service wrapping Pi + +Status: not started. + +## Goal + +Stand up a new service that wraps Pi and exposes an interface like Agenta's completion/chat +services, so we can talk to an agent: set it up (auth, AGENTS.md), send a message, and get the response streamed back. Local only for the POC. No Daytona yet. + +Basically we want: + +- A new docker service that has the same structure as completion and chat +- that opens endpoints for the same interface as chat +- that you can send a message history and context to and get back a response + + + + +## Scope + +In: + +- A thin TypeScript harness-wrapper that drives Pi's SDK (`createAgentSession`). +- Configure the agent fully in memory: AGENTS.md, LLM auth, model. Skills and custom tools + can be stubbed for the first cut. +- Expose our own protocol on a port: a send-message / get-response surface that mirrors the + shape of the existing completion/chat services.
+ +Out (later work packages): + +- Daytona sandbox. The wrapper runs as a local process for the POC. +- Swapping in other harnesses (Codex, Claude Code). Design the protocol so it is possible, + but only implement Pi here. +- Persisting sessions or storing config server-side. Use a config passed in at startup. +- Streaming the multi-message output back to the caller. +- Multi-messages +- Tools + +In step 1 we will hard-code the auth for pi.dev (the OpenAI API key, for instance, or Codex). We won't have any configuration, just the ability to run things. The docker compose setup will reload automatically on change, which means we can simply change the files in the volume locally and change things there. + +We will make sure in the implementation to think about the ports and adapters first, so that even though the first MVP is very simple, it has the right ports and adapters. + +First, between our agent implementation and calling pi.dev and setting it up, there is a clear port. Pi is one implementation of this port. + +There is also another port for setting up the run environment. So it's not just setting up the agent but also the run environment. + +This is because you might run pi.dev or Claude Code locally, just as you might run each in Daytona or something else. + +We need to set these up, each with an adapter: starting env - shutting down - pausing - connecting volume. + +Then set up pi.dev: setting up - invoking - stopping? (all the RPC interactions) - shutting down. + +For pi.dev it might make sense to have two adapters: one for RPC and the other for JSON. + +Success for this WP1 is: +- I go to the UI +- Create a new agent (with some hard-coded config, say hello world) +- I run it in the playground and I see the output. + +Note that instrumentation might be needed here; we are working in parallel on the research for that. + + +As soon as we have that we can start working on adding a config first to the playground, which includes AGENTS.md, then authentication (model used), then setting up tools.
then we can talk about streaming, multi messages, intermediate messages. + + + + +--- The rest of the article might be out of date for some parts. The main requirements are above --- + + +## Approach (grounded in research) + +See [`../research/pi-interaction.md`](../research/pi-interaction.md), +[`../research/auth-secrets.md`](../research/auth-secrets.md), and +[`../research/diskless-in-memory-config.md`](../research/diskless-in-memory-config.md). + +- Use the **SDK**, not RPC. The SDK is what exposes the in-memory overrides and runtime + credential injection; RPC mode cannot inject credentials post-spawn. +- Inject everything in memory: + - AGENTS.md via `systemPromptOverride` / `appendSystemPrompt` / `agentsFilesOverride`, + with `noContextFiles` so no on-disk AGENTS.md leaks in. + - LLM auth via `setRuntimeApiKey(provider, key)` or `AuthStorage.inMemory()` (env at + spawn also works). + - State via `SessionManager.inMemory()`, `SettingsManager.inMemory()`, + `ModelRegistry.inMemory()`. +- Diskless: set `TMPDIR` to a per-run tmpfs for bash output spillover; pre-install `rg`/`fd` + so search tools do not write binaries to disk. +- Stream output via `session.subscribe()` callbacks (`message_update` -> `text_delta`), + mapping Pi events onto the service's streamed response. +- This wrapper is the "works with our port" contract and the swappable-harness seam. Keep + the protocol harness-agnostic. + +## Interface to mirror + +Match the existing Agenta completion/chat service surface so callers and the playground can +treat an agent like the other workflow types. Reconcile the single-output completion/chat +shape with Pi's multi-message output (the response is a list of messages, not one +completion). + +## Definition of done + +- The service starts locally with a passed-in config (AGENTS.md text, model, provider key). +- A caller can send a message and receive the streamed multi-message response. 
+- Auth and AGENTS.md are applied in memory, with nothing invocation-specific written to a + persistent disk. +- The same wrapper binary runs as a plain local process (parity baseline for later sandbox + and pull-config-and-run-locally work). + +## Open questions + +- Where the service lives in the repo (a new entry under `services/`, or alongside `api/`), + and how a Node service fits the Python backend. Decide before writing code. +- The exact protocol on the port (JSON-lines over stdio, a small HTTP/SSE server, or + websockets). Pick the one that matches how Agenta calls completion/chat today. +- How the multi-message output maps to the completion/chat response contract. +- Whether WP-1's tracing extension is embedded here from the start or added after. + +## Links + +- [`../research/pi-interaction.md`](../research/pi-interaction.md) +- [`../research/auth-secrets.md`](../research/auth-secrets.md) +- [`../research/diskless-in-memory-config.md`](../research/diskless-in-memory-config.md) +- [`../research/sandbox-sharing.md`](../research/sandbox-sharing.md) +- [Project README](../README.md) diff --git a/docs/design/agent-workflows/scratch/wp-2-agent-service/implementation-plan.md b/docs/design/agent-workflows/scratch/wp-2-agent-service/implementation-plan.md new file mode 100644 index 0000000000..881030e73b --- /dev/null +++ b/docs/design/agent-workflows/scratch/wp-2-agent-service/implementation-plan.md @@ -0,0 +1,282 @@ +> **Historical record.** This is a work-package note. It describes the design as it was at the time and may reference components that no longer exist. For the current design see the [agent-workflows docs](../../README.md); for the live state see [sdk-local-backend/status.md](../sdk-local-backend/status.md). +# WP-2 implementation plan: agent service wrapping Pi + +Status: MVP built and verified by curl (2026-06-15). Decisions below were taken; the +"Implemented" section records what shipped. 
Original decision points are kept marked +**[DECISION]** for history. + +> Note (current state): the sections below describe the iterative MVP, including a +> standalone entrypoint (`agent_main.py`) and dedicated composes +> (`docker-compose.agent.yml`, `docker-compose.stack.yml`). Those were **removed** in +> favor of the integrated path only: the agent is mounted in `entrypoints/main.py` at +> `/agent/v0` and the `agent-pi` sidecar lives in +> `hosting/docker-compose/ee/docker-compose.dev.yml`. The standalone run commands below +> are historical. See `qa.md` for the rationale. + +## Implemented (MVP, verified by curl) + +Per the decisions: a Python service exposes the Agenta `/invoke` contract (auth, +middleware, CORS via `ag.create_app`) and calls a thin TypeScript Pi wrapper. Standalone, +verified with curl. Pi runs on the local login (`openai-codex` / `gpt-5.5`). + +What shipped: + +- TypeScript Pi wrapper: `services/agent/` (`src/runPi.ts`, `src/cli.ts`). One-shot + JSON-over-stdio: read a request on stdin, drive Pi's SDK (`createAgentSession`) with + AGENTS.md injected in memory, write the reply as JSON on stdout. Pinned + `@earendil-works/pi-coding-agent@0.79.4`. Editable config in `services/agent/config/` + (`AGENTS.md`, `agent.json`), read per request so edits need no restart. +- Python service: `services/oss/src/agent.py` mirrors `chat.py` (`ag.create_app` + + `ag.workflow` + `ag.route`, `is_chat` flag). Ports and adapters in + `services/oss/src/agent_pi/`: `Harness` port + `PiHarness` (spawns the wrapper over the + JSON transport), `Runtime` port + `LocalRuntime` (local subprocess; Daytona slots in + here later). +- Standalone entrypoint: `services/entrypoints/agent_main.py` mounts only the agent app + + `/health` for isolated local runs. 
+ +How to run and verify locally: + +```bash +cd services/agent && pnpm install # once +cd ../ && set -a && source ../.env.test.local && set +a +AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED=false \ + uv run uvicorn entrypoints.agent_main:app --host 0.0.0.0 --port 8090 + +curl -s -X POST http://localhost:8090/agent/v0/invoke -H "Content-Type: application/json" \ + -d '{"data":{"inputs":{"messages":[{"role":"user","content":"Hi, who are you?"}]}}}' +# -> {"data":{"outputs":{"role":"assistant","content":"Hi! I'm your friendly hello-world AI assistant."}}, "status":{"code":200}, ...} +``` + +## Dockerized (verified by curl) + +The agent now runs fully in Docker via a dedicated, self-contained compose that does not +touch other stacks. Two containers: + +- `agent-pi`: the TypeScript Pi wrapper as an HTTP sidecar + (`services/agent/src/server.ts`, `docker/Dockerfile.dev`). It copies the read-only + mounted `~/.pi/agent` login into a writable container path at startup, so OAuth refresh + never writes back to the host. `node_modules` is baked into the image; `src` is + bind-mounted so `tsx watch` hot-reloads code edits. Adding npm deps needs a rebuild. +- `agent-api`: the Python agent service, built from the current services dev Dockerfile + (`agenta-agent-api:dev`, a dedicated tag). Selects the HTTP harness via + `AGENTA_AGENT_PI_URL` and calls the sidecar in-network. Published on host port 8092. + +The Python -> Pi seam is now two adapters behind the same Harness port: `PiHarness` +(subprocess, local) and `PiHttpHarness` (HTTP, docker). `agent.py` picks by env. 
+ +Run and verify: + +```bash +docker compose -f services/agent/docker-compose.agent.yml up --build -d +curl localhost:8092/health +curl -s -X POST localhost:8092/agent/v0/invoke -H 'Content-Type: application/json' \ + -d '{"data":{"inputs":{"messages":[{"role":"user","content":"Hi, who are you?"}]}}}' +# -> 200, {"data":{"outputs":{"role":"assistant","content":"Hello from your friendly Docker agent!"}}, ...} +docker compose -f services/agent/docker-compose.agent.yml down # tear down +``` + +Note: do not reuse the stale `agenta-oss-dev-services:latest` image (Python 3.11, old SDK +without `route(app=...)`); the compose builds a fresh `agenta-agent-api:dev` from the +current Dockerfile instead. + +Known gaps / next steps: auth header is bypassed for local curl; streaming, multi-message +output, and tools; tracing across the boundary is being wired in (OTel deps + `agenta-otel.ts` +in the wrapper, `TraceContext` in the ports) and the HTTP path / OTLP target still need +finishing; registering `agenta:builtin:agent:v0` as a real workflow type + template (WP-6) +and pointing a real dev stack at the sidecar so it runs from the playground. + +--- + +Status: draft for review. Add inline comments anywhere. Decision points are marked +**[DECISION]** and have a recommended default. + +## Context + +Agenta runs prompt-style workflows today (completion, chat, LLM-as-a-judge). Each is a +Python FastAPI app exposing `/invoke` and `/inspect`, all mounted in one `services` +container (`services/entrypoints/main.py`). The backend and playground call a service by +POSTing a `WorkflowInvokeRequest` to `{serviceUrl}/invoke` and reading +`WorkflowBatchResponse.data.outputs` back. + +WP-2 adds a new kind of workflow: an agent. An agent runs a harness (Pi by default) that +drives a model over multiple turns. Pi is a TypeScript/Node SDK +(`@earendil-works/pi-coding-agent`, pinned `0.79.4`). It has no Python SDK. 
So the agent +service is a Node service, the first non-Python service in the dev stack. + +This work package builds only the service. It runs Pi locally (no Daytona), with hardcoded +config (AGENTS.md text, model, provider key from env). The goal is to stand up the right +ports and adapters even for the simplest MVP, so Daytona and other harnesses slot in later +without reshaping the service. + +Source: `wp-2-agent-service/README.md` and the research it links +(`research/pi-interaction.md`, `research/diskless-in-memory-config.md`). + +## What I confirmed in the codebase + +- All Python services run in one `services` container, each mounted at its own path and + exposing `/invoke` + `/inspect` (`services/entrypoints/main.py:135`). +- The chat handler takes `inputs`, `messages`, and `parameters` + (`services/oss/src/chat.py:18`). The routing decorator pulls these from the + `WorkflowInvokeRequest` envelope. +- The playground resolves `serviceUrl` from the workflow's `data.url` (or builds it from + `data.uri`) and POSTs directly from the browser to `{serviceUrl}/invoke` + (`web/packages/agenta-entities/src/workflow/state/runnableSetup.ts:246`). So the service + needs the same request/response shapes and CORS as the Python services + (`services/entrypoints/main.py:115`). +- The dev stack hot-reloads via bind mounts plus uvicorn `--reload`, and traefik routes + `PathPrefix(/services/)` after stripping the prefix + (`hosting/docker-compose/oss/docker-compose.dev.yml:351`). +- Research confirms Pi runs fully diskless through its SDK: in-memory auth, AGENTS.md, + model, and sessions (`research/diskless-in-memory-config.md`). + +## Scope + +In: +- A new Node/TypeScript service that exposes the Agenta `/invoke` contract directly. +- Drives Pi through its SDK (`createAgentSession`) in-process, config in memory. +- Hardcoded config: AGENTS.md text, model id, provider key from env. Config read from a + mounted file so it is editable and hot-reloads. 
+- Ports and adapters wired from the start (see Architecture). +- Dockerized with hot-reload, wired into the OSS dev compose and traefik. + +Out (later WPs, per the design doc): +- Daytona sandbox. The runtime adapter is the local process for now. +- Streaming and multi-message output. This cut returns the final assistant text as a + single `data.outputs`. +- Custom tools and skills. Stubbed for the first cut. +- Server-side config persistence. Config is passed in at startup. +- Other harnesses (Codex, Claude Code). Design the port for them, implement only Pi. + +## Architecture: ports and adapters + +The service is harness-agnostic at its core, with the two ports the design doc calls out. + +``` +HTTP layer (Fastify or Express): POST /invoke, POST /inspect, GET /health, CORS + | +Core (no Pi, no Daytona): + AgentRunner.run(config, messages, inputs) -> { output } + | | + Port: Harness Port: Runtime (environment) + setup(config) start() / shutdown() + invoke(messages, inputs) pause() / connectVolume() + stop() / shutdown() + | | + Adapter: PiSdkHarness Adapter: LocalRuntime + (createAgentSession, (in-process; the Node process + in-memory auth + AGENTS.md itself is the run environment) + + model, SessionManager + .inMemory()) [later: DaytonaRuntime in WP-3] + [later: PiRpcHarness] +``` + +- Harness port: the seam between our service and the agent engine. Pi is one + implementation. The MVP ships one adapter, `PiSdkHarness`. The doc also floats RPC and + JSON adapters; the port shape leaves room for `PiRpcHarness` later. + **[DECISION]** Drive Pi via the SDK in-process for the MVP (recommended: simplest for a + Node service, gives in-memory auth + AGENTS.md + model), rather than spawning `pi --mode + rpc`. +- Runtime port: the seam for the run environment (start, shutdown, pause, connect volume). + The MVP adapter is `LocalRuntime` (the Node process). `DaytonaRuntime` lands in WP-3 + behind the same port. 
+ +### PiSdkHarness (the MVP adapter) + +Per `research/diskless-in-memory-config.md`: +- `AuthStorage.inMemory()` + `setRuntimeApiKey(provider, key)` for the LLM key. +- `DefaultResourceLoader` with `noContextFiles: true` and `agentsFilesOverride` (or + `systemPromptOverride`) to inject AGENTS.md text in memory. +- `SessionManager.inMemory()`, `SettingsManager.inMemory()`, + `ModelRegistry.inMemory(auth)` so nothing persists. +- `model: getModel(provider, modelId)`. +- `TMPDIR` set to a tmpfs for Pi's bash output spillover (the one forced write). +- MVP run: `await session.prompt(text)`, then read the final assistant text from + `session.messages` (or the `agent_end` event). Return it as `data.outputs`. No + streaming. + +## HTTP contract (mirror chat) + +- `POST /invoke`: accept `{ data: { parameters, inputs }, references?, ... }`. Pull the + user message from `inputs`/`messages` the way chat does + (`services/oss/src/chat.py:18`). Return + `{ version, data: { outputs }, status: { code: 200 }, trace_id, span_id }`. +- `POST /inspect`: return the parameters/inputs schema. The MVP can return a minimal + static schema, enough for the backend inspect path. +- `GET /health`: `{ status: "ok" }`. +- CORS: allow the same origins as the Python services so the browser can call it directly. + +Auth note: the Python services verify an `Authorization: Secret {token}` header via SDK +middleware. The local MVP can accept the header without verifying it. Real verification is +a later concern. Flagging this as a known gap. + +## Repo placement and Docker + +- New Node project at `services/agent/`: own `package.json`, `tsconfig.json`, `src/` (with + `http/`, `core/`, `adapters/pi/`, `adapters/runtime/`), `config/` (the editable + AGENTS.md and model config), and `docker/Dockerfile.dev` + `docker/Dockerfile.gh`. +- Pin `@earendil-works/pi-coding-agent@0.79.4` and `@earendil-works/pi-ai@0.79.4`. +- Hot-reload: run with `tsx watch` (or `node --watch`). 
Bind-mount `services/agent/src` and + `services/agent/config`; keep `node_modules` in the image via an anonymous volume so the + host/container split does not break it. +- New compose service block in `hosting/docker-compose/oss/docker-compose.dev.yml` (model it on + the existing `services` block at line 351). Own port (for example 8090), traefik router + `PathPrefix(/agent/)` that strips the prefix, env_file for the provider key. +- The provider key (for example `OPENAI_API_KEY`) goes in the dev env file the compose + service reads. + +## Verification + +1. Bring up the OSS dev stack with the new service: + `./hosting/docker-compose/run.sh --oss --dev --build`. +2. `curl http://localhost/agent/health` returns ok. +3. `curl -X POST http://localhost/agent/invoke` with a chat-style body and a message; + confirm the response carries the agent reply in `data.outputs`. This is the core WP-2 + definition of done. +4. Edit `services/agent/config/AGENTS.md`; confirm the change is picked up without a + rebuild. +5. End-to-end demo (only if decided in scope below): register an agent workflow whose + `data.url` points at the agent service, open it in the playground, send a message, see + the output. + +## Decisions to confirm + +**[DECISION 1] Service shape.** Recommended: a pure Node service that speaks `/invoke` +directly (matches the doc, fewest moving parts). Alternative: a Python shim in the existing +services container that bridges to a Node Pi sidecar (reuses Agenta auth/tracing +middleware, adds a hop). +> Your call: We should use Python and have it call TS for the moment. The Python layer provides authentication, middleware, and a bunch of other things. + +**[DECISION 2] How far this iteration goes.** Option A: standalone service, verified by +curl (the true WP-2 definition of done). Option B: also wire the minimal end-to-end so you +can create an agent and run it in the playground (overlaps WP-6's workflow-type +registration).
+> Your call: Let's start with the standalone service verified by curl + +**[DECISION 3] LLM key for Pi.** `.env.test.local` only has Agenta cloud creds, not a model +key. Pi needs a real provider key to run. Which provider and model for the hardcoded +"hello world" agent (for example OpenAI `gpt-4o-mini`)? Can you supply the key as an env +var for a live verification, or should I build without live verification for now? +> Your call: I have set up + +**[DECISION 4] Pi driving mode.** Recommended: SDK in-process. Alternative: `pi --mode rpc` +subprocess. SDK is simpler here and supports in-memory auth and AGENTS.md. +> Your call: +I have set up auth What's left — your one-time Pi login +`~/.pi/agent` doesn't exist yet, so no model is available. Pi can't reuse the `~/.codex` token directly; it needs its own login (same ChatGPT account, browser OAuth — I can't drive that for you): + +```bash +cd docs/design/agent-workflows/wp-1-pi-tracing/poc +pnpm exec pi # TUI opens +# type: /login → choose "ChatGPT Plus/Pro (Codex)" → finish browser OAuth → quit +pnpm start # runs the agent, exports the trace +``` + +(Or `export OPENAI_API_KEY=...` / `ANTHROPIC_API_KEY=...` instead of logging in.) + +After `pnpm start`, watch for `[agenta-otel] exporting spans to .../api/otlp/v1/traces` and `[run] flushed`, then open Agenta observability on the dev box and find the `invoke_agent` trace — verify the tree types correctly and the `chat` span carries model, latency, and token usage. + +Want me to wait while you log in, then I'll run it and verify the trace in Agenta together — or would you rather I add the Pi-native model-usage cost (`gen_ai.usage.cost`) display check to the verification while you do that? + + + Logged in to ChatGPT Plus/Pro (Codex Subscription). Selected gpt-5.5. 
Credentials saved + to /home/mahmoud/.pi/agent/auth.json diff --git a/docs/design/agent-workflows/scratch/wp-2-agent-service/qa.md b/docs/design/agent-workflows/scratch/wp-2-agent-service/qa.md new file mode 100644 index 0000000000..b7d25221d9 --- /dev/null +++ b/docs/design/agent-workflows/scratch/wp-2-agent-service/qa.md @@ -0,0 +1,176 @@ +# Agent service: Q&A + +Running notes answering review questions about the agent workflow implementation +(branch `feat/agent-workflows`). Questions are in no particular order. + +--- + +## Q: Why a separate entrypoint `agent_main.py` instead of `main.py`? + +Short answer: `agent_main.py` is not a replacement for `main.py`. It is an extra, +lightweight runner for testing the agent in isolation. The real integration lives in +`main.py`, and that is what the 8280 stack actually runs. + +The two entrypoints: + +- `services/entrypoints/main.py` is the full services app. It mounts every service + (chat, completion, all the managed evaluators, and now the agent at `/agent/v0`). This + is the production/dev container entrypoint and the path the playground uses + (`/services/agent/v0/...`). The agent is a first-class part of it: + `app.mount("/agent/v0", agent_v0_app)`. + +- `services/entrypoints/agent_main.py` mounts only the agent app plus `/health`. + +Why we added `agent_main.py`: + +1. Isolated, fast iteration. Early on the deliverable was "a standalone agent service + verified by curl" (no full stack). Running `main.py` pulls in the whole managed + evaluator surface (litellm, all the builtins) and `ag.init()` for the full app. + `agent_main.py` lets you run just the agent: + `uv run uvicorn entrypoints.agent_main:app --port 8090` and curl it, without the rest. + +2. The dedicated `:8092` Docker compose. Before the agent was integrated into the real + stack, it ran standalone in its own compose. That container ran `agent_main.py`. + +3. A place for cross-origin CORS. 
When the playground had to call the agent on a + different port (`:8092` vs the web on `:8280`), the browser needed a credentialed CORS + policy (echo the specific origin + allow credentials). `agent_main.py` sets that + (`allow_origin_regex` + `allow_credentials=True`). `main.py` keeps the stricter + shared services CORS, which is fine for it because, once integrated, the agent is + served same-origin (`/services/agent/v0`) so there is no CORS at all. + +Net: `main.py` is the real, integrated path (same-origin, used by the 8280 stack). +`agent_main.py` was a convenience runner for isolated local/standalone testing and the +old dedicated compose. + +**Update (decision): dropped.** We removed `agent_main.py` and the two standalone +composes (`docker-compose.agent.yml`, `docker-compose.stack.yml`) to keep only the +integrated path: the agent mounted in `entrypoints/main.py` at `/agent/v0`, served by +the normal services container, with the `agent-pi` sidecar wired into +`hosting/docker-compose/ee/docker-compose.dev.yml`. If we ever want isolated runs again, +the cleaner approach is a profile/override on the real compose rather than a parallel +entrypoint. + +--- + +## Q: How does the agent service use the workflow middleware? Which parts does it have access to (secrets, invoke, inspect, ...)? + +The agent gets the whole Agenta workflow machinery "for free" because it is built the +same way as chat and completion: `ag.create_app()` + `ag.workflow(schemas=...)` + +`ag.route("/", flags={"is_chat": True})` in `services/oss/src/agent.py`. That was the +point of the Python-front decision: the Python layer provides auth, middleware, +tracing, secrets, and the invoke/inspect contract; the Node wrapper only runs Pi. + +There are **two middleware layers**. + +### Layer 1 — HTTP/ASGI middleware (per request) + +Added by `ag.create_app()` (`sdks/.../decorators/routing.py:64`). Outermost first: + +- **CORSMiddleware** — cross-origin headers.
Irrelevant on the integrated same-origin + path; it mattered only for the old cross-port setup. +- **AuthMiddleware** — verifies the caller against `{host}/api/access/permissions/check` + and puts the resolved credential on `request.state.auth["credentials"]` (a signed + `Secret`). With `AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED=false` it passes the raw + `Authorization` through without a remote check. This is the credential everything + downstream uses. +- **OTelMiddleware** — opens the request's tracing context, i.e. the workflow span the + whole run nests under. + +### Layer 2 — Workflow middleware (inside `wf.invoke`) + +Set on the workflow object (`decorators/running.py:197`), run in order around the +handler: + +- **VaultMiddleware** — resolves secrets for the credential: it fetches the project's + vault secrets from `{api_url}/secrets/`, combines them with any local secrets, checks + access, and exposes them on the running context. (More on "access" below.) +- **ResolverMiddleware** — resolves which handler to run from the revision URI, hydrates + references / revision / config from the backend when needed, and resolves embeds in + parameters. +- **NormalizerMiddleware** — maps the request to the handler's arguments by inspecting + its signature (`inputs`, `messages`, `parameters` pulled from `data`), calls + `_agent(...)`, and wraps the return value into the response envelope, attaching + `trace_id` / `span_id`. + +### What the agent actually has access to / uses + +- **invoke** — yes, fully. `POST /services/agent/v0/invoke` runs the entire chain + (auth -> vault -> resolver -> normalizer -> `_agent`). `_agent` receives `inputs`, + `messages`, and `parameters` already mapped for it. +- **inspect** — yes. `POST /services/agent/v0/inspect` returns the agent's interface, + i.e. `AGENT_SCHEMAS` (chat `messages` in, `message` out, config = `model` + + `agents_md`). This is what tells the playground to render a chat box and the two + config fields. 
(Known bug: inspect currently 500s under session-cookie auth; it did + not block the playground because the create flow takes the schema from the catalog + template.) +- **auth / credentials** — yes. The resolved `Secret` credential is available to the + handler and to tracing export. +- **tracing** — yes. `_agent` reads the active workflow span via `_trace_context()` and + threads the `traceparent` (plus endpoint/auth) to the Pi sidecar, so the Pi spans + nest under the `/invoke` span in one trace. +- **secrets** — available but **not consumed yet**. VaultMiddleware resolves the + project's secrets on every invoke and exposes them on the running context. Chat and + completion use them automatically because litellm reads them. The agent handler does + not read them today; the Pi model auth currently comes from the mounted + `~/.pi/agent` (Codex login) or `AGENTA_API_KEY`/provider env on the sidecar. Wiring + the resolved secrets into the Pi run (the "startup hook injects the provider/tool + keys" step) is exactly where this plugs in: read the secrets in `_agent`, pass them in + the harness request, and have the wrapper inject them (`setRuntimeApiKey` / env). That + is the planned secrets work, not yet built. + +One detail: the route passes `secrets=None` into `wf.invoke`, so the agent does not +hand secrets in; VaultMiddleware fetches them itself from the credential. The gap is +only on the consuming side (the handler), not the resolving side. + +--- + +## Q: Why does tracing look different / broken now vs the old trace? + +Reference old trace `6ab51033...`: root `invoke_agent`, four `turn`s, several +`chat gpt-5.5` spans, and `execute_tool ls/read/bash/write` — 14 spans, with +cumulative token + cost rolled up onto the `turn` and `invoke_agent` spans. + +Current trace (e.g. `329698f7...`): `_agent -> invoke_agent -> turn 0 -> chat` — 4 +spans; the `chat` span has tokens + cost, the parents do not. 
+ +Tracing is **not broken** (spans land, nest correctly, the `chat` span carries model, +tokens, cost). Two things changed: + +### 1. Different agent and task (the big, expected difference) + +The old trace is the WP-1 POC: tools enabled (`read/bash/edit/write/ls`) and a task +that needs them ("read notes.txt, write greeting.txt"). That drives a multi-turn loop +with tool calls, so you get many turns, many `chat` spans, and `execute_tool` spans. + +The current app is the hello-world chat agent: `tools=[]` and "answer in one or two +short sentences". So it does exactly one turn, no tools, one `chat`. Same +instrumentation, a trivial run. To get a rich trace again, give the agent tools +(built-in `read/bash/...` or the WP-7 runnable tools) and a task that uses them. + +### 2. Cumulative token/cost rollup is lost across the process boundary (a real regression) + +In the old (standalone) trace, all spans were exported by one process in one batch, so +Agenta's per-ingest-batch cumulative computation could build the roll-up tree and put +cumulative tokens/cost on `turn` and `invoke_agent`. + +Now the trace is split across **two exporters**: +- Python (services container) exports `_agent` (the workflow span). +- Node (`agent-pi`) exports `invoke_agent -> turn -> chat` (the Pi spans), where + `invoke_agent`'s parent is the **remote** `_agent`. + +Agenta builds the cumulative tree per ingest batch and "attaches a span only if its +parent is already seen" (see the `orderParentFirst` comment in `agenta-otel.ts`). In the +Node batch, `invoke_agent`'s parent (`_agent`) is in the **other** (Python) batch, so the +Pi subtree is dropped from the cumulative tree. Result: the leaf `chat` keeps its raw +`incremental` tokens, but `cumulative` is missing on `chat` and there is no token/cost +rollup on `turn` / `invoke_agent` / `_agent`. (Duration still rolls up because it is +computed differently.) + +So the agent- and turn-level token/cost totals you used to see are gone. 
This is a +side effect of nesting the agent under the Agenta workflow span (the integration goal). +The fix belongs on the tracing side (owned by the instrumentation work): compute the +cumulative roll-up across the whole trace by `trace_id` rather than per ingest batch, so +a trace split between the Python workflow span and the Node Pi spans still aggregates. +Until then, per-span (leaf `chat`) tokens/cost are correct; the rolled-up agent totals +are not. diff --git a/docs/design/agent-workflows/scratch/wp-3-daytona-sandbox/README.md b/docs/design/agent-workflows/scratch/wp-3-daytona-sandbox/README.md new file mode 100644 index 0000000000..89a775f7e0 --- /dev/null +++ b/docs/design/agent-workflows/scratch/wp-3-daytona-sandbox/README.md @@ -0,0 +1,99 @@ +# WP-3: Daytona sandbox running Pi + +Status: **POC complete** against Daytona cloud (`target=eu`). See +[`poc/`](poc/README.md). Ran in parallel with WP-1 and WP-2. + +## Goal + +Prove the sandbox track end to end: create a Daytona sandbox with Pi installed, inject the +agent's files and secrets, run an agent, stream the output back, and tear down. This takes +the local Pi wrapper (WP-2) and shows it running inside a sandbox. The two can be developed +in parallel, since the Daytona lifecycle and image work do not depend on the wrapper being +finished. + +## What the POC established + +The POC ([`poc/`](poc/README.md)) does the full loop against Daytona cloud and answers the +key unknowns: + +- **Bake Pi into a snapshot.** `build_snapshot.py` builds `agenta-pi-harness` from + `node:22-bookworm` + Pi `0.79.4` + ripgrep/fd in ~26s. Daytona injects its toolbox daemon + into the custom image, so `process.exec` / `fs` / sessions work on a plain node base (no + need to layer on `daytonaio/sandbox`). +- **Cold start is sub-second on a warm runner.** Creating a sandbox from the prebuilt snapshot is + ~0.7-1.1s on a warm runner, with an occasional few-second spike when a runner pulls the + custom image cold.
That beats installing Pi per run (npm install alone is ~3s). +- **Inject config + secret, run, stream, tear down.** `run_agent.py` lays an `AGENTS.md` + and a task file into a per-run dir, injects the provider credential (env var or uploaded + credential file), runs Pi headless in `--mode json`, streams the typed event lines, and + deletes the sandbox. The agent honored the injected `AGENTS.md` and used tools + (`read`, `read`, `write`). +- **Gotcha: Pi blocks on a trust prompt.** With an `AGENTS.md` in cwd, Pi asks to trust + project-local files and hangs in a non-interactive session. Pass `--approve` and run with + stdin from `/dev/null`. This was the main trap. + +Full findings, the measured numbers, and how to run it: [`poc/README.md`](poc/README.md). + +## Scope + +In: + +- Create a Daytona sandbox from the Python SDK (`pip install daytona`, + `Daytona` / `AsyncDaytona`): `create` -> `process.exec` / sessions -> `stop` -> `delete`. +- Bake Pi into a Daytona snapshot (declarative `Image` builder or Dockerfile) so runs skip + per-run `npm install`. Pre-install `rg` / `fd`. +- Inject files (`fs.upload_file` / `upload_files`) and secrets (`env_vars` at create, or + per-exec `env`). +- Run Pi headless and stream stdout/stderr back (session with `run_async=True`, + `get_session_command_logs_async`). +- Expose and use the port via `get_preview_link(port)` (the "works with our port" contract). +- One shared long-lived sandbox (`auto_stop_interval: 0`), per-run working directory plus a + per-run tmpfs for `TMPDIR`, bounded concurrency. + +Out: + +- Volume-per-execution. Not feasible in Daytona (volumes mount at create time only); use the + per-run dir + tmpfs approach instead. +- The provider abstraction for non-Daytona sandboxes. Keep the seam thin, but only implement + Daytona here. + +## Approach (grounded in research) + +See [`../research/daytona-sandbox.md`](../research/daytona-sandbox.md) and +[`../research/sandbox-sharing.md`](../research/sandbox-sharing.md). 
+ +## Definition of done + +- [x] A script creates a sandbox from a Pi snapshot, injects an AGENTS.md and a provider + key, runs an agent, streams the multi-message output, and tears down cleanly. +- [x] Nothing invocation-specific is written to a persistent volume. No volume is mounted; + each run uses a per-run dir plus a `TMPDIR` inside it, and the sandbox is deleted at the + end. +- [x] Cold-start with the custom snapshot is measured and recorded (`poc/README.md`). + +## Open questions + +Answered by the POC: + +- Daytona cloud works end to end with the provided `eu` credentials; the node-base snapshot + gets a working toolbox; cold start from the prebuilt snapshot is sub-second on a warm runner. +- Secret injection has two working paths: `env_vars` at create (secret-as-env) and an + uploaded credential file via `fs.upload_file` (secret-as-file). + +Still open: + +- Self-hosted Daytona vs Daytona cloud (AGPL review if self-host-and-modify). POC used + cloud only. +- Whether an actively streaming session resets the auto-stop idle timer. Sidestepped with + `auto_stop_interval=0` and owning the lifecycle; not independently confirmed. +- Realistic safe parallel-run count for one small sandbox (needs load testing). +- The snapshot build/version pipeline: who builds and pins `agenta-pi-harness` per agent + revision, and where that runs (CI or config-publish time).
+ +## Links + +- [`poc/`](poc/README.md) — the working POC (build snapshot, run agent, bench cold start) +- [`../research/daytona-sandbox.md`](../research/daytona-sandbox.md) +- [`../research/sandbox-sharing.md`](../research/sandbox-sharing.md) +- [`../research/diskless-in-memory-config.md`](../research/diskless-in-memory-config.md) +- [Project README](../README.md) diff --git a/docs/design/agent-workflows/scratch/wp-3-daytona-sandbox/poc/README.md b/docs/design/agent-workflows/scratch/wp-3-daytona-sandbox/poc/README.md new file mode 100644 index 0000000000..452322d858 --- /dev/null +++ b/docs/design/agent-workflows/scratch/wp-3-daytona-sandbox/poc/README.md @@ -0,0 +1,118 @@ +# WP-3 POC: run a Pi agent in a Daytona cloud sandbox + +Bakes Pi into a Daytona snapshot, then creates a sandbox from it, injects the agent's +credential and config, runs the agent headless, streams its multi-message output back, +and tears the sandbox down. Runs against **Daytona cloud** (`target=eu`). + +This is the sandbox half of the agent runtime. It validates the `DaytonaRuntime` adapter +that WP-2 leaves behind its `Runtime` port (`start` -> create sandbox, inject config -> +lay down the per-run dir, `invoke` -> run Pi and stream, `shutdown` -> delete). + +## What's here + +- `build_snapshot.py` — bake Pi (+ ripgrep, fd) into the reusable `agenta-pi-harness` + snapshot so per-run cold start skips `npm install`. Run once. +- `run_agent.py` — the deliverable. Create -> inject -> run -> stream -> tear down. +- `bench_coldstart.py` — measure cold start, Pi snapshot vs the default image. +- `cleanup.py` — list sandboxes and delete leaked WP-3 ones. + +## Setup + +Needs `uv` and Daytona cloud credentials. Export them (the dev values live in +`hosting/docker-compose/ee/.env.ee.dev.local`): + +```bash +export DAYTONA_API_KEY=dtn_... +export DAYTONA_API_URL=https://app.daytona.io/api +export DAYTONA_TARGET=eu +``` + +Each script declares its own deps inline, so `uv run