diff --git a/apps/cli/src/commands/_cloud-attach.ts b/apps/cli/src/commands/_cloud-attach.ts index 71fb48e..203ab8f 100644 --- a/apps/cli/src/commands/_cloud-attach.ts +++ b/apps/cli/src/commands/_cloud-attach.ts @@ -2,11 +2,12 @@ import { spawn } from 'node:child_process'; import { appendFileSync } from 'node:fs'; import { homedir } from 'node:os'; import { join } from 'node:path'; -import { spinner } from '@clack/prompts'; +import { log, spinner } from '@clack/prompts'; import { DEFAULT_RELAY_PORT } from '@agentbox/sandbox-docker'; import type { BoxRecord, Provider } from '@agentbox/core'; import type { AttachOpenIn } from '@agentbox/config'; import { agentResumeArgs } from '../agent-sessions.js'; +import { withFirewallRepair } from '../lib/firewall-repair.js'; import { providerForBox } from '../provider/registry.js'; import { runWrappedAttach } from '../wrapped-pty/index.js'; import { pasteHostClipboardImage, uploadImageFileToBox } from '../lib/paste-image.js'; @@ -185,6 +186,24 @@ export async function cloudAgentAttach(args: CloudAgentAttachArgs): Promise log.success(line) }, + () => provider.exec(box, ['true']), + ); + } // Attaching to a box that just came back up (a stop / cloud idle-timeout // resume): if the user passed no args of their own and the box has a resumable // claude/codex session, launch resuming it (claude --resume / codex resume @@ -213,10 +232,9 @@ export async function cloudAgentAttach(args: CloudAgentAttachArgs): Promise } | null> => { diff --git a/apps/cli/src/commands/recover.ts b/apps/cli/src/commands/recover.ts index d357cec..c7f7a13 100644 --- a/apps/cli/src/commands/recover.ts +++ b/apps/cli/src/commands/recover.ts @@ -35,6 +35,7 @@ import { import { ensureRelay, generateRelayToken, readState, recordBox } from '@agentbox/sandbox-docker'; import { Command } from 'commander'; import { restoreAgentSessions } from '../agent-sessions.js'; +import { withFirewallRepair } from '../lib/firewall-repair.js'; import { resolveBoxOrExit } from '../box-ref.js'; import { providerForBox } from '../provider/registry.js'; import { cloudBackendForProvider } from '../provider/cloud-backend.js'; @@ -46,6 +47,7 @@ import { handleLifecycleError } from './_errors.js'; interface RecoverOpts { all?: boolean; attach?: boolean; + firewallSync?: boolean; provider?: string; adopt?: boolean; attachIn?: string; @@ -98,7 +100,10 @@ async function readBoxBranch(box: BoxRecord): Promise { * (optionally) attach. Returns false on a non-fatal skip (e.g. Hetzner key * gone) so `--all` keeps going. */ -async function recoverKnownBox(box: BoxRecord, opts: { attach: boolean }): Promise { +async function recoverKnownBox( + box: BoxRecord, + opts: { attach: boolean; firewallSync: boolean }, +): Promise { if (await hetznerKeyMissing(box)) { log.warn( `${box.name}: per-box SSH key not found at ${hetznerKeyPath(box.cloud?.sandboxId ?? box.id)} — this box was created on another host and can't be controlled from here. Skipping.`, @@ -106,7 +111,15 @@ async function recoverKnownBox(box: BoxRecord, opts: { attach: boolean }): Promi return false; } const provider = await providerForBox(box); - const record = await provider.reconnect(box); + // Reconnect is an explicit connection-establishment, so a connect failure may + // be a host IP change that locked the Hetzner firewall — self-heal it (only + // when the egress actually changed) and retry once. + const record = await withFirewallRepair( + provider, + box, + { enabled: opts.firewallSync, onLog: (line) => log.success(line) }, + () => provider.reconnect(box), + ); log.success(`reconnected ${record.name}`); // Bring back exactly the box's last agent: resume its session if there's one // to resume, else start it fresh (adopted box / cleared pointer; the only @@ -245,6 +258,10 @@ export const recoverCommand = new Command('recover') ) .option('--all', 'recover every box in local state (skips attach)') .option('--no-attach', 'restore only; do not attach to the agent') + .option( + '--no-firewall-sync', + "don't auto-sync a Hetzner box's firewall to your current egress IP on a connect failure", + ) .option('--provider ', 'cloud provider for --adopt (daytona|hetzner|vercel|e2b)') .option('--adopt', 'rebuild local state from a live sandbox that is missing from this host') .action(async function (this: Command, idOrName: string | undefined) { @@ -264,7 +281,10 @@ export const recoverCommand = new Command('recover') } const adopted = await adoptUnknownBox(provider, idOrName); if (!adopted) return; - await recoverKnownBox(adopted, { attach: opts.attach !== false }); + await recoverKnownBox(adopted, { + attach: opts.attach !== false, + firewallSync: opts.firewallSync !== false, + }); return; } @@ -277,7 +297,13 @@ export const recoverCommand = new Command('recover') let ok = 0; for (const box of state.boxes) { try { - if (await recoverKnownBox(box, { attach: false })) ok++; + if ( + await recoverKnownBox(box, { + attach: false, + firewallSync: opts.firewallSync !== false, + }) + ) + ok++; } catch (err) { log.warn( `${box.name}: recover failed: ${err instanceof Error ? err.message : String(err)}`, @@ -289,7 +315,10 @@ export const recoverCommand = new Command('recover') } const box = await resolveBoxOrExit(idOrName); - await recoverKnownBox(box, { attach: opts.attach !== false }); + await recoverKnownBox(box, { + attach: opts.attach !== false, + firewallSync: opts.firewallSync !== false, + }); } catch (err) { handleLifecycleError(err); } diff --git a/apps/cli/src/lib/firewall-repair.ts b/apps/cli/src/lib/firewall-repair.ts new file mode 100644 index 0000000..2c1ad57 --- /dev/null +++ b/apps/cli/src/lib/firewall-repair.ts @@ -0,0 +1,36 @@ +import type { BoxRecord, Provider } from '@agentbox/core'; + +export interface FirewallRepairOptions { + /** When false, skip repair entirely (the `--no-firewall-sync` opt-out). */ + enabled: boolean; + /** Surface what the repair did (e.g. "firewall updated: …"). */ + onLog: (line: string) => void; +} + +/** + * Run a connection-ESTABLISHMENT attempt with one self-heal retry: if it throws + * and `provider.repairReachability` reports it changed something (today: a + * Hetzner host egress-IP change that locked the per-box firewall), retry once. + * When nothing changed (or repair is unsupported/disabled) the original error + * is rethrown — a non-IP failure isn't masked. + * + * Use this ONLY at establish sites (`recover`, the initial attach connect), + * never around a mid-session reconnect: a checkpoint stops the box and drops the + * connection, and that must not be mistaken for an IP change. + */ +export async function withFirewallRepair( + provider: Provider, + box: BoxRecord, + opts: FirewallRepairOptions, + attempt: () => Promise, +): Promise { + try { + return await attempt(); + } catch (err) { + if (!opts.enabled || !provider.repairReachability) throw err; + const repair = await provider.repairReachability(box).catch(() => null); + if (!repair?.changed) throw err; + opts.onLog(repair.detail ?? 'firewall synced to current egress IP'); + return await attempt(); + } +} diff --git a/packages/core/src/cloud-backend.ts b/packages/core/src/cloud-backend.ts index 0b7f280..490095c 100644 --- a/packages/core/src/cloud-backend.ts +++ b/packages/core/src/cloud-backend.ts @@ -193,6 +193,18 @@ export interface CloudBackend { */ refreshPreviewUrl?(h: CloudHandle, port: number): Promise; + /** + * Re-establish host→box reachability when establishing a connection fails for + * a reason the backend can self-heal. Today only Hetzner implements it: a host + * egress-IP change locks the per-box Cloud Firewall, so this re-syncs the + * firewall to the current egress IP — but ONLY when it actually changed (else + * `{ changed: false }`, so the caller surfaces the original error). The CLI + * calls it ONLY on a connection-ESTABLISHMENT failure (`recover`, the initial + * attach connect), never on a mid-session drop (a checkpoint stops the box — + * not an IP change). Backends with public URLs / no host transport omit it. + */ + repairReachability?(h: CloudHandle): Promise<{ changed: boolean; detail?: string }>; + /** * Browser-bound signed preview URL with the auth token embedded in the URL * (no header needed). Used for `agentbox url` / `agentbox screen` — anywhere diff --git a/packages/core/src/provider.ts b/packages/core/src/provider.ts index f018b84..1dfd2a8 100644 --- a/packages/core/src/provider.ts +++ b/packages/core/src/provider.ts @@ -317,6 +317,17 @@ export interface Provider { * provider has no cheaper reconnect path. */ reconnect(box: BoxRecord): Promise; + /** + * Self-heal host→box reachability when establishing a connection fails for a + * reason the provider can repair. Today only the Hetzner cloud provider acts: + * a host egress-IP change locks the per-box firewall, so this re-syncs it to + * the current egress — but ONLY when it actually changed (`{ changed: false }` + * otherwise, so the caller rethrows the original error). The CLI calls it ONLY + * on a connection-ESTABLISHMENT failure (`recover`, the initial attach + * connect), never on a mid-session drop. Optional — docker and public-URL + * clouds omit it. + */ + repairReachability?(box: BoxRecord): Promise<{ changed: boolean; detail?: string }>; pause(box: BoxRecord): Promise; resume(box: BoxRecord): Promise; stop(box: BoxRecord): Promise; diff --git a/packages/sandbox-cloud/src/cloud-provider.ts b/packages/sandbox-cloud/src/cloud-provider.ts index 719ff94..f5a2d76 100644 --- a/packages/sandbox-cloud/src/cloud-provider.ts +++ b/packages/sandbox-cloud/src/cloud-provider.ts @@ -1103,6 +1103,12 @@ export function createCloudProvider( ); }, + async repairReachability(box: BoxRecord): Promise<{ changed: boolean; detail?: string }> { + // Delegate to the backend (only Hetzner self-heals its firewall); other + // backends have no host transport to repair. + return (await backend.repairReachability?.(handleFor(box))) ?? { changed: false }; + }, + async pause(box: BoxRecord): Promise { await backend.pause(handleFor(box)); await persistLastState(box, 'paused'); diff --git a/packages/sandbox-hetzner/src/backend.ts b/packages/sandbox-hetzner/src/backend.ts index 938f065..331d291 100644 --- a/packages/sandbox-hetzner/src/backend.ts +++ b/packages/sandbox-hetzner/src/backend.ts @@ -48,7 +48,9 @@ import { detectEgressIp } from './egress-ip.js'; import { createPerBoxFirewall, deletePerBoxFirewall, + firewallNeedsSync, normalizeSourceCidr, + syncFirewallSource, } from './firewall.js'; import { pollUntil } from './poll.js'; import { readPreparedState } from './prepared-state.js'; @@ -230,13 +232,71 @@ function buildSshTarget(state: PerBoxState, vpsIp: string, controlPath?: string) }; } +/** + * The per-box firewall's current SSH source vs the host's live egress IP. Best- + * effort and called ONLY on a connection-failure path (the open-failed hint and + * `recover`'s auto-sync), never on the happy path — `egressIpCached` keeps the + * probe from storming across retries. Returns null when there's no firewall or + * we can't determine the state. + */ +interface FirewallEgressStatus { + firewallId: number; + /** The CIDR the firewall currently allows for inbound SSH (`source_ips[0]`). */ + allowedSource: string | undefined; + /** The host's current egress IP as a `/32` CIDR. */ + currentEgress: string; + /** Friendly box ref for the `firewall sync` hint (the `agentbox.box` label). */ + boxRef: string; +} + +async function firewallEgressStatus(sandboxId: string): Promise { + const id = Number.parseInt(sandboxId, 10); + if (!Number.isFinite(id)) return null; + const server = await client().getServer(id); + if (!server) return null; + const firewallId = Number.parseInt(server.labels['agentbox.firewall'] ?? '', 10); + if (!Number.isFinite(firewallId)) return null; + const firewall = await client().getFirewall(firewallId); + const sshRule = firewall?.rules.find((r) => r.direction === 'in' && r.port === '22'); + const allowedSource = sshRule?.source_ips?.[0]; + // A FRESH probe (not cached): this runs only on a connection-failure path, and + // a stale value could read the new IP as "unchanged" and skip the heal — the + // very mismatch we exist to catch. The poller already de-dupes its recover + // calls and `recover --all` is sequential, so fresh probing here won't storm. + const currentEgress = `${await detectEgressIp({})}/32`; + return { + firewallId, + allowedSource, + currentEgress, + boxRef: server.labels['agentbox.box'] ?? sandboxId, + }; +} + async function ensureTunnel(sandboxId: string, state: PerBoxState, vpsIp: string): Promise { if (tunnels.has(sandboxId)) return; - await tunnels.open({ - boxId: sandboxId, - vpsHost: vpsIp, - identity: state.identity, - }); + try { + await tunnels.open({ + boxId: sandboxId, + vpsHost: vpsIp, + identity: state.identity, + }); + } catch (err) { + // A host egress-IP change locks us out of the per-box firewall, surfacing as + // an opaque SSH connect timeout. Best-effort: detect the mismatch and enrich + // the error with the fix. Never let the diagnostic mask the original error + // on a match (box is just down) or a probe failure. + const s = await firewallEgressStatus(sandboxId).catch(() => null); + if (s && firewallNeedsSync(s.allowedSource, s.currentEgress)) { + throw new Error( + `${(err as Error).message}\n\n` + + `hetzner: SSH is blocked by the box firewall — it allows ${s.allowedSource ?? '(no rule)'} ` + + `but your egress IP is now ${s.currentEgress}. Your IP changed; run:\n` + + ` agentbox hetzner firewall sync ${s.boxRef}\n` + + `(or \`agentbox recover ${s.boxRef}\`, which auto-syncs).`, + ); + } + throw err; + } } /** @@ -639,6 +699,22 @@ export const hetznerBackend: CloudBackend = { return { url: `http://127.0.0.1:${String(localPort)}` }; }, + async repairReachability(h): Promise<{ changed: boolean; detail?: string }> { + // Re-sync the per-box firewall to the host's CURRENT egress IP, but only + // when it actually changed — the host laptop moved networks and the + // firewall is now blocking us. Called by the CLI ONLY on a connection- + // establishment failure (`recover`, the initial attach connect), never on a + // mid-session drop. A `0.0.0.0/0` firewall (explicit dynamic-IP opt-in) is + // already open, so it's a no-op. + const s = await firewallEgressStatus(h.sandboxId).catch(() => null); + if (!s || !firewallNeedsSync(s.allowedSource, s.currentEgress)) return { changed: false }; + await syncFirewallSource(client(), s.firewallId, s.currentEgress); + return { + changed: true, + detail: `firewall updated: SSH now allowed from ${s.currentEgress} (was ${s.allowedSource ?? '(no rule)'})`, + }; + }, + async startInBoxPortless(h, opts): Promise { // Bring up a `portless` proxy *inside the VPS* mirroring the host's // mode so `.localhost:

` resolves to the same content on diff --git a/packages/sandbox-hetzner/src/firewall.ts b/packages/sandbox-hetzner/src/firewall.ts index 019c3a6..99ac841 100644 --- a/packages/sandbox-hetzner/src/firewall.ts +++ b/packages/sandbox-hetzner/src/firewall.ts @@ -133,6 +133,21 @@ export async function deletePerBoxFirewall( } } +/** + * Whether the firewall's allowed SSH source needs re-syncing to the current + * egress: true when they differ AND the firewall isn't already wide-open + * (`0.0.0.0/0`, the explicit dynamic-IP opt-in). Pure so the hint + auto-sync + * decision is unit-testable without the Hetzner API. An absent allowed source + * (no SSH rule) counts as a mismatch worth syncing. + */ +export function firewallNeedsSync( + allowedSource: string | undefined, + currentEgress: string, +): boolean { + if (allowedSource === '0.0.0.0/0') return false; + return allowedSource !== currentEgress; +} + /** * Normalize a source spec into a CIDR. Accepts: * - bare IPv4 → appends `/32` diff --git a/packages/sandbox-hetzner/test/firewall.test.ts b/packages/sandbox-hetzner/test/firewall.test.ts index 60e88d9..13c3cf4 100644 --- a/packages/sandbox-hetzner/test/firewall.test.ts +++ b/packages/sandbox-hetzner/test/firewall.test.ts @@ -1,5 +1,23 @@ import { describe, expect, it } from 'vitest'; -import { normalizeSourceCidr, sshOnlyInboundRule } from '../src/firewall.js'; +import { firewallNeedsSync, normalizeSourceCidr, sshOnlyInboundRule } from '../src/firewall.js'; + +describe('firewallNeedsSync', () => { + it('no sync when the allowed source already matches the current egress', () => { + expect(firewallNeedsSync('1.2.3.4/32', '1.2.3.4/32')).toBe(false); + }); + + it('sync when the egress IP changed', () => { + expect(firewallNeedsSync('1.2.3.4/32', '5.6.7.8/32')).toBe(true); + }); + + it('never syncs a wide-open (0.0.0.0/0) firewall — explicit dynamic-IP opt-in', () => { + expect(firewallNeedsSync('0.0.0.0/0', '5.6.7.8/32')).toBe(false); + }); + + it('syncs when there is no SSH rule at all (absent allowed source)', () => { + expect(firewallNeedsSync(undefined, '5.6.7.8/32')).toBe(true); + }); +}); describe('normalizeSourceCidr', () => { it('appends /32 to a bare IPv4', () => {