From a27f479090286e7b63c46a4775c6abedbcc35f70 Mon Sep 17 00:00:00 2001 From: Marco D'Alia Date: Mon, 29 Jun 2026 23:14:48 +0100 Subject: [PATCH 1/3] feat(hetzner): self-heal the per-box firewall on a host egress-IP change MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A Hetzner box's firewall locks SSH to the host's egress IP at create time and is never re-synced. When the host IP changes (laptop moves networks), every comms op fails with an opaque `ssh ControlMaster failed … Operation timed out` and the user has to know to run `agentbox hetzner firewall sync`. Two fixes, both gated to the connection-failure path so the happy path never pays the egress-detect cost, and the firewall is re-synced ONLY when the IP actually changed: 1. Hint (read-only): wrap `tunnels.open` in `ensureTunnel` — the one choke point all of exec/scp/forward/poller/attach funnel through. On a real mismatch it turns the opaque timeout into "firewall allows X but your egress is now Y — run `firewall sync`/`recover`". Safe on a checkpoint drop (box merely stopped, IP unchanged → no hint). 2. Auto-sync, scoped to connection ESTABLISHMENT only. New optional `repairReachability` on CloudBackend/Provider (Hetzner-only): re-syncs the firewall to the current egress, but only when it changed (else changed:false). A `withFirewallRepair` CLI helper retries the attempt once iff something changed, wired at the two establish sites — `recover` (provider.reconnect) and the INITIAL attach connect (`_cloud-attach` buildAttach). Deliberately NOT the mid-session reconnect closure: a checkpoint stops the box and drops the PTY, which must not be mistaken for an IP change. `--no-firewall-sync` opts out on recover (shared/untrusted egress). A short-TTL egress cache avoids probe storms across retries / `recover --all`. `0.0.0.0/0` (explicit dynamic-IP opt-in) is never hinted or synced. Verified live on a Hetzner box: locking the firewall to a bogus IP makes `shell` fail with the hint (no auto-repair), `recover` auto-syncs back + reconnects, and `--no-firewall-sync` leaves it locked. Unit tests cover firewallNeedsSync + the egress TTL cache. Claude-Session: https://claude.ai/code/session_01Ja5HgEjwyER5BhhFCpPUup --- apps/cli/src/commands/_cloud-attach.ts | 24 ++++-- apps/cli/src/commands/recover.ts | 39 +++++++-- apps/cli/src/lib/firewall-repair.ts | 36 ++++++++ packages/core/src/cloud-backend.ts | 12 +++ packages/core/src/provider.ts | 11 +++ packages/sandbox-cloud/src/cloud-provider.ts | 6 ++ packages/sandbox-hetzner/src/backend.ts | 84 +++++++++++++++++-- packages/sandbox-hetzner/src/egress-ip.ts | 30 +++++++ packages/sandbox-hetzner/src/firewall.ts | 15 ++++ .../sandbox-hetzner/test/egress-ip.test.ts | 34 +++++++- .../sandbox-hetzner/test/firewall.test.ts | 20 ++++- 11 files changed, 292 insertions(+), 19 deletions(-) create mode 100644 apps/cli/src/lib/firewall-repair.ts diff --git a/apps/cli/src/commands/_cloud-attach.ts b/apps/cli/src/commands/_cloud-attach.ts index 71fb48ee..2d577e4c 100644 --- a/apps/cli/src/commands/_cloud-attach.ts +++ b/apps/cli/src/commands/_cloud-attach.ts @@ -2,11 +2,12 @@ import { spawn } from 'node:child_process'; import { appendFileSync } from 'node:fs'; import { homedir } from 'node:os'; import { join } from 'node:path'; -import { spinner } from '@clack/prompts'; +import { log, spinner } from '@clack/prompts'; import { DEFAULT_RELAY_PORT } from '@agentbox/sandbox-docker'; import type { BoxRecord, Provider } from '@agentbox/core'; import type { AttachOpenIn } from '@agentbox/config'; import { agentResumeArgs } from '../agent-sessions.js'; +import { withFirewallRepair } from '../lib/firewall-repair.js'; import { providerForBox } from '../provider/registry.js'; import { runWrappedAttach } from '../wrapped-pty/index.js'; import { pasteHostClipboardImage, uploadImageFileToBox } from '../lib/paste-image.js'; @@ -213,10 +214,17 @@ export async function cloudAgentAttach(args: CloudAgentAttachArgs): Promise log.success(line) }, + () => buildAttach(box, 'agent', { sessionName: args.sessionName, command }), + ); // claude only, and only when this host can capture a clipboard image (macOS, // or a Linux desktop with xclip/wl-paste). Otherwise Ctrl+V forwards verbatim. const canPaste = args.mode === 'claude' && (await clipboardCaptureAvailable()); @@ -231,6 +239,12 @@ export async function cloudAgentAttach(args: CloudAgentAttachArgs): Promise } | null> => { diff --git a/apps/cli/src/commands/recover.ts b/apps/cli/src/commands/recover.ts index d357cec0..c7f7a130 100644 --- a/apps/cli/src/commands/recover.ts +++ b/apps/cli/src/commands/recover.ts @@ -35,6 +35,7 @@ import { import { ensureRelay, generateRelayToken, readState, recordBox } from '@agentbox/sandbox-docker'; import { Command } from 'commander'; import { restoreAgentSessions } from '../agent-sessions.js'; +import { withFirewallRepair } from '../lib/firewall-repair.js'; import { resolveBoxOrExit } from '../box-ref.js'; import { providerForBox } from '../provider/registry.js'; import { cloudBackendForProvider } from '../provider/cloud-backend.js'; @@ -46,6 +47,7 @@ import { handleLifecycleError } from './_errors.js'; interface RecoverOpts { all?: boolean; attach?: boolean; + firewallSync?: boolean; provider?: string; adopt?: boolean; attachIn?: string; @@ -98,7 +100,10 @@ async function readBoxBranch(box: BoxRecord): Promise { * (optionally) attach. Returns false on a non-fatal skip (e.g. Hetzner key * gone) so `--all` keeps going. */ -async function recoverKnownBox(box: BoxRecord, opts: { attach: boolean }): Promise { +async function recoverKnownBox( + box: BoxRecord, + opts: { attach: boolean; firewallSync: boolean }, +): Promise { if (await hetznerKeyMissing(box)) { log.warn( `${box.name}: per-box SSH key not found at ${hetznerKeyPath(box.cloud?.sandboxId ?? box.id)} — this box was created on another host and can't be controlled from here. Skipping.`, @@ -106,7 +111,15 @@ async function recoverKnownBox(box: BoxRecord, opts: { attach: boolean }): Promi return false; } const provider = await providerForBox(box); - const record = await provider.reconnect(box); + // Reconnect is an explicit connection-establishment, so a connect failure may + // be a host IP change that locked the Hetzner firewall — self-heal it (only + // when the egress actually changed) and retry once. + const record = await withFirewallRepair( + provider, + box, + { enabled: opts.firewallSync, onLog: (line) => log.success(line) }, + () => provider.reconnect(box), + ); log.success(`reconnected ${record.name}`); // Bring back exactly the box's last agent: resume its session if there's one // to resume, else start it fresh (adopted box / cleared pointer; the only @@ -245,6 +258,10 @@ export const recoverCommand = new Command('recover') ) .option('--all', 'recover every box in local state (skips attach)') .option('--no-attach', 'restore only; do not attach to the agent') + .option( + '--no-firewall-sync', + "don't auto-sync a Hetzner box's firewall to your current egress IP on a connect failure", + ) .option('--provider ', 'cloud provider for --adopt (daytona|hetzner|vercel|e2b)') .option('--adopt', 'rebuild local state from a live sandbox that is missing from this host') .action(async function (this: Command, idOrName: string | undefined) { @@ -264,7 +281,10 @@ export const recoverCommand = new Command('recover') } const adopted = await adoptUnknownBox(provider, idOrName); if (!adopted) return; - await recoverKnownBox(adopted, { attach: opts.attach !== false }); + await recoverKnownBox(adopted, { + attach: opts.attach !== false, + firewallSync: opts.firewallSync !== false, + }); return; } @@ -277,7 +297,13 @@ export const recoverCommand = new Command('recover') let ok = 0; for (const box of state.boxes) { try { - if (await recoverKnownBox(box, { attach: false })) ok++; + if ( + await recoverKnownBox(box, { + attach: false, + firewallSync: opts.firewallSync !== false, + }) + ) + ok++; } catch (err) { log.warn( `${box.name}: recover failed: ${err instanceof Error ? err.message : String(err)}`, @@ -289,7 +315,10 @@ export const recoverCommand = new Command('recover') } const box = await resolveBoxOrExit(idOrName); - await recoverKnownBox(box, { attach: opts.attach !== false }); + await recoverKnownBox(box, { + attach: opts.attach !== false, + firewallSync: opts.firewallSync !== false, + }); } catch (err) { handleLifecycleError(err); } diff --git a/apps/cli/src/lib/firewall-repair.ts b/apps/cli/src/lib/firewall-repair.ts new file mode 100644 index 00000000..2c1ad579 --- /dev/null +++ b/apps/cli/src/lib/firewall-repair.ts @@ -0,0 +1,36 @@ +import type { BoxRecord, Provider } from '@agentbox/core'; + +export interface FirewallRepairOptions { + /** When false, skip repair entirely (the `--no-firewall-sync` opt-out). */ + enabled: boolean; + /** Surface what the repair did (e.g. "firewall updated: …"). */ + onLog: (line: string) => void; +} + +/** + * Run a connection-ESTABLISHMENT attempt with one self-heal retry: if it throws + * and `provider.repairReachability` reports it changed something (today: a + * Hetzner host egress-IP change that locked the per-box firewall), retry once. + * When nothing changed (or repair is unsupported/disabled) the original error + * is rethrown — a non-IP failure isn't masked. + * + * Use this ONLY at establish sites (`recover`, the initial attach connect), + * never around a mid-session reconnect: a checkpoint stops the box and drops the + * connection, and that must not be mistaken for an IP change. + */ +export async function withFirewallRepair( + provider: Provider, + box: BoxRecord, + opts: FirewallRepairOptions, + attempt: () => Promise, +): Promise { + try { + return await attempt(); + } catch (err) { + if (!opts.enabled || !provider.repairReachability) throw err; + const repair = await provider.repairReachability(box).catch(() => null); + if (!repair?.changed) throw err; + opts.onLog(repair.detail ?? 'firewall synced to current egress IP'); + return await attempt(); + } +} diff --git a/packages/core/src/cloud-backend.ts b/packages/core/src/cloud-backend.ts index 0b7f280c..490095c1 100644 --- a/packages/core/src/cloud-backend.ts +++ b/packages/core/src/cloud-backend.ts @@ -193,6 +193,18 @@ export interface CloudBackend { */ refreshPreviewUrl?(h: CloudHandle, port: number): Promise; + /** + * Re-establish host→box reachability when establishing a connection fails for + * a reason the backend can self-heal. Today only Hetzner implements it: a host + * egress-IP change locks the per-box Cloud Firewall, so this re-syncs the + * firewall to the current egress IP — but ONLY when it actually changed (else + * `{ changed: false }`, so the caller surfaces the original error). The CLI + * calls it ONLY on a connection-ESTABLISHMENT failure (`recover`, the initial + * attach connect), never on a mid-session drop (a checkpoint stops the box — + * not an IP change). Backends with public URLs / no host transport omit it. + */ + repairReachability?(h: CloudHandle): Promise<{ changed: boolean; detail?: string }>; + /** * Browser-bound signed preview URL with the auth token embedded in the URL * (no header needed). Used for `agentbox url` / `agentbox screen` — anywhere diff --git a/packages/core/src/provider.ts b/packages/core/src/provider.ts index f018b841..1dfd2a86 100644 --- a/packages/core/src/provider.ts +++ b/packages/core/src/provider.ts @@ -317,6 +317,17 @@ export interface Provider { * provider has no cheaper reconnect path. */ reconnect(box: BoxRecord): Promise; + /** + * Self-heal host→box reachability when establishing a connection fails for a + * reason the provider can repair. Today only the Hetzner cloud provider acts: + * a host egress-IP change locks the per-box firewall, so this re-syncs it to + * the current egress — but ONLY when it actually changed (`{ changed: false }` + * otherwise, so the caller rethrows the original error). The CLI calls it ONLY + * on a connection-ESTABLISHMENT failure (`recover`, the initial attach + * connect), never on a mid-session drop. Optional — docker and public-URL + * clouds omit it. + */ + repairReachability?(box: BoxRecord): Promise<{ changed: boolean; detail?: string }>; pause(box: BoxRecord): Promise; resume(box: BoxRecord): Promise; stop(box: BoxRecord): Promise; diff --git a/packages/sandbox-cloud/src/cloud-provider.ts b/packages/sandbox-cloud/src/cloud-provider.ts index 719ff941..f5a2d766 100644 --- a/packages/sandbox-cloud/src/cloud-provider.ts +++ b/packages/sandbox-cloud/src/cloud-provider.ts @@ -1103,6 +1103,12 @@ export function createCloudProvider( ); }, + async repairReachability(box: BoxRecord): Promise<{ changed: boolean; detail?: string }> { + // Delegate to the backend (only Hetzner self-heals its firewall); other + // backends have no host transport to repair. + return (await backend.repairReachability?.(handleFor(box))) ?? { changed: false }; + }, + async pause(box: BoxRecord): Promise { await backend.pause(handleFor(box)); await persistLastState(box, 'paused'); diff --git a/packages/sandbox-hetzner/src/backend.ts b/packages/sandbox-hetzner/src/backend.ts index 938f0651..2083b764 100644 --- a/packages/sandbox-hetzner/src/backend.ts +++ b/packages/sandbox-hetzner/src/backend.ts @@ -44,11 +44,13 @@ import { type HetznerServer, type HetznerServerStatus, } from './client.js'; -import { detectEgressIp } from './egress-ip.js'; +import { detectEgressIp, egressIpCached } from './egress-ip.js'; import { createPerBoxFirewall, deletePerBoxFirewall, + firewallNeedsSync, normalizeSourceCidr, + syncFirewallSource, } from './firewall.js'; import { pollUntil } from './poll.js'; import { readPreparedState } from './prepared-state.js'; @@ -230,13 +232,67 @@ function buildSshTarget(state: PerBoxState, vpsIp: string, controlPath?: string) }; } +/** + * The per-box firewall's current SSH source vs the host's live egress IP. Best- + * effort and called ONLY on a connection-failure path (the open-failed hint and + * `recover`'s auto-sync), never on the happy path — `egressIpCached` keeps the + * probe from storming across retries. Returns null when there's no firewall or + * we can't determine the state. + */ +interface FirewallEgressStatus { + firewallId: number; + /** The CIDR the firewall currently allows for inbound SSH (`source_ips[0]`). */ + allowedSource: string | undefined; + /** The host's current egress IP as a `/32` CIDR. */ + currentEgress: string; + /** Friendly box ref for the `firewall sync` hint (the `agentbox.box` label). */ + boxRef: string; +} + +async function firewallEgressStatus(sandboxId: string): Promise { + const id = Number.parseInt(sandboxId, 10); + if (!Number.isFinite(id)) return null; + const server = await client().getServer(id); + if (!server) return null; + const firewallId = Number.parseInt(server.labels['agentbox.firewall'] ?? '', 10); + if (!Number.isFinite(firewallId)) return null; + const firewall = await client().getFirewall(firewallId); + const sshRule = firewall?.rules.find((r) => r.direction === 'in' && r.port === '22'); + const allowedSource = sshRule?.source_ips?.[0]; + const currentEgress = `${await egressIpCached({})}/32`; + return { + firewallId, + allowedSource, + currentEgress, + boxRef: server.labels['agentbox.box'] ?? sandboxId, + }; +} + async function ensureTunnel(sandboxId: string, state: PerBoxState, vpsIp: string): Promise { if (tunnels.has(sandboxId)) return; - await tunnels.open({ - boxId: sandboxId, - vpsHost: vpsIp, - identity: state.identity, - }); + try { + await tunnels.open({ + boxId: sandboxId, + vpsHost: vpsIp, + identity: state.identity, + }); + } catch (err) { + // A host egress-IP change locks us out of the per-box firewall, surfacing as + // an opaque SSH connect timeout. Best-effort: detect the mismatch and enrich + // the error with the fix. Never let the diagnostic mask the original error + // on a match (box is just down) or a probe failure. + const s = await firewallEgressStatus(sandboxId).catch(() => null); + if (s && firewallNeedsSync(s.allowedSource, s.currentEgress)) { + throw new Error( + `${(err as Error).message}\n\n` + + `hetzner: SSH is blocked by the box firewall — it allows ${s.allowedSource ?? '(no rule)'} ` + + `but your egress IP is now ${s.currentEgress}. Your IP changed; run:\n` + + ` agentbox hetzner firewall sync ${s.boxRef}\n` + + `(or \`agentbox recover ${s.boxRef}\`, which auto-syncs).`, + ); + } + throw err; + } } /** @@ -639,6 +695,22 @@ export const hetznerBackend: CloudBackend = { return { url: `http://127.0.0.1:${String(localPort)}` }; }, + async repairReachability(h): Promise<{ changed: boolean; detail?: string }> { + // Re-sync the per-box firewall to the host's CURRENT egress IP, but only + // when it actually changed — the host laptop moved networks and the + // firewall is now blocking us. Called by the CLI ONLY on a connection- + // establishment failure (`recover`, the initial attach connect), never on a + // mid-session drop. A `0.0.0.0/0` firewall (explicit dynamic-IP opt-in) is + // already open, so it's a no-op. + const s = await firewallEgressStatus(h.sandboxId).catch(() => null); + if (!s || !firewallNeedsSync(s.allowedSource, s.currentEgress)) return { changed: false }; + await syncFirewallSource(client(), s.firewallId, s.currentEgress); + return { + changed: true, + detail: `firewall updated: SSH now allowed from ${s.currentEgress} (was ${s.allowedSource ?? '(no rule)'})`, + }; + }, + async startInBoxPortless(h, opts): Promise { // Bring up a `portless` proxy *inside the VPS* mirroring the host's // mode so `.localhost:

` resolves to the same content on diff --git a/packages/sandbox-hetzner/src/egress-ip.ts b/packages/sandbox-hetzner/src/egress-ip.ts index ecea72fb..95e203d8 100644 --- a/packages/sandbox-hetzner/src/egress-ip.ts +++ b/packages/sandbox-hetzner/src/egress-ip.ts @@ -65,6 +65,36 @@ export async function detectEgressIp(opts: DetectEgressIpOptions = {}): Promise< ); } +/** Default TTL for the cached egress lookup (ms). */ +const EGRESS_CACHE_TTL_MS = 60_000; + +let egressCache: { ip: string; at: number } | null = null; + +/** + * `detectEgressIp` with a short TTL cache. Only ever called on a connection- + * failure path (the firewall-mismatch hint + `recover`'s auto-sync), where a + * single host IP-change can otherwise trigger a probe storm — the cloud poller + * backs off and re-hits the tunnel open repeatedly, and a multi-box `recover + * --all` would re-probe per box. The egress IP is host-global, so one cached + * value serves every box. Throws (same as `detectEgressIp`) when all probes + * fail and there's no fresh cache entry. + */ +export async function egressIpCached( + opts: DetectEgressIpOptions & { ttlMs?: number; now?: () => number } = {}, +): Promise { + const ttl = opts.ttlMs ?? EGRESS_CACHE_TTL_MS; + const now = opts.now ?? Date.now; + if (egressCache && now() - egressCache.at < ttl) return egressCache.ip; + const ip = await detectEgressIp(opts); + egressCache = { ip, at: now() }; + return ip; +} + +/** Test seam: drop the cached egress so each case starts cold. */ +export function __resetEgressCache(): void { + egressCache = null; +} + async function probe(url: string, fetchImpl: typeof fetch): Promise { const res = await fetchImpl(url, { method: 'GET' }); if (!res.ok) return null; diff --git a/packages/sandbox-hetzner/src/firewall.ts b/packages/sandbox-hetzner/src/firewall.ts index 019c3a6a..99ac8410 100644 --- a/packages/sandbox-hetzner/src/firewall.ts +++ b/packages/sandbox-hetzner/src/firewall.ts @@ -133,6 +133,21 @@ export async function deletePerBoxFirewall( } } +/** + * Whether the firewall's allowed SSH source needs re-syncing to the current + * egress: true when they differ AND the firewall isn't already wide-open + * (`0.0.0.0/0`, the explicit dynamic-IP opt-in). Pure so the hint + auto-sync + * decision is unit-testable without the Hetzner API. An absent allowed source + * (no SSH rule) counts as a mismatch worth syncing. + */ +export function firewallNeedsSync( + allowedSource: string | undefined, + currentEgress: string, +): boolean { + if (allowedSource === '0.0.0.0/0') return false; + return allowedSource !== currentEgress; +} + /** * Normalize a source spec into a CIDR. Accepts: * - bare IPv4 → appends `/32` diff --git a/packages/sandbox-hetzner/test/egress-ip.test.ts b/packages/sandbox-hetzner/test/egress-ip.test.ts index c0b4357a..df063882 100644 --- a/packages/sandbox-hetzner/test/egress-ip.test.ts +++ b/packages/sandbox-hetzner/test/egress-ip.test.ts @@ -1,5 +1,5 @@ -import { describe, expect, it } from 'vitest'; -import { detectEgressIp } from '../src/egress-ip.js'; +import { afterEach, describe, expect, it } from 'vitest'; +import { detectEgressIp, egressIpCached, __resetEgressCache } from '../src/egress-ip.js'; function fakeFetch(map: Record): typeof fetch { // Use Parameters[0] instead of `RequestInfo | URL` so this @@ -49,3 +49,33 @@ describe('detectEgressIp', () => { ).rejects.toThrow(/could not auto-detect/i); }); }); + +describe('egressIpCached', () => { + afterEach(() => __resetEgressCache()); + + /** A fetch that counts how many times it was invoked. */ + function countingFetch(body: string): { fetchImpl: typeof fetch; calls: () => number } { + let n = 0; + const fetchImpl = (async () => { + n += 1; + return new Response(body, { status: 200 }) as unknown as Response; + }) as typeof fetch; + return { fetchImpl, calls: () => n }; + } + + it('probes once within the TTL window, re-probes after it', async () => { + const { fetchImpl, calls } = countingFetch('203.0.113.9\n'); + const opts = { probes: ['https://p'], fetchImpl, ttlMs: 1000 }; + let t = 10_000; + const now = () => t; + + expect(await egressIpCached({ ...opts, now })).toBe('203.0.113.9'); + t = 10_500; // within TTL → cached, no new probe + expect(await egressIpCached({ ...opts, now })).toBe('203.0.113.9'); + expect(calls()).toBe(1); + + t = 11_500; // past TTL → re-probe + expect(await egressIpCached({ ...opts, now })).toBe('203.0.113.9'); + expect(calls()).toBe(2); + }); +}); diff --git a/packages/sandbox-hetzner/test/firewall.test.ts b/packages/sandbox-hetzner/test/firewall.test.ts index 60e88d9e..13c3cf43 100644 --- a/packages/sandbox-hetzner/test/firewall.test.ts +++ b/packages/sandbox-hetzner/test/firewall.test.ts @@ -1,5 +1,23 @@ import { describe, expect, it } from 'vitest'; -import { normalizeSourceCidr, sshOnlyInboundRule } from '../src/firewall.js'; +import { firewallNeedsSync, normalizeSourceCidr, sshOnlyInboundRule } from '../src/firewall.js'; + +describe('firewallNeedsSync', () => { + it('no sync when the allowed source already matches the current egress', () => { + expect(firewallNeedsSync('1.2.3.4/32', '1.2.3.4/32')).toBe(false); + }); + + it('sync when the egress IP changed', () => { + expect(firewallNeedsSync('1.2.3.4/32', '5.6.7.8/32')).toBe(true); + }); + + it('never syncs a wide-open (0.0.0.0/0) firewall — explicit dynamic-IP opt-in', () => { + expect(firewallNeedsSync('0.0.0.0/0', '5.6.7.8/32')).toBe(false); + }); + + it('syncs when there is no SSH rule at all (absent allowed source)', () => { + expect(firewallNeedsSync(undefined, '5.6.7.8/32')).toBe(true); + }); +}); describe('normalizeSourceCidr', () => { it('appends /32 to a bare IPv4', () => { From e6dfc230f20e0ab44fec462d0b0f6c2d1ffa2ef4 Mon Sep 17 00:00:00 2001 From: Marco D'Alia Date: Mon, 29 Jun 2026 23:24:33 +0100 Subject: [PATCH 2/3] =?UTF-8?q?fix(hetzner):=20address=20Bugbot=20?= =?UTF-8?q?=E2=80=94=20bound=20egress=20cache=20+=20cover=20all=20attach?= =?UTF-8?q?=20establishes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Stale egress cache could mask a real IP change: cut the cache TTL from 60s to 5s. It only exists to dedup a burst of failure-path probes (poller backoff, `recover --all`), not to remember the IP over time — a long TTL would hide the very IP change we're detecting. 2. The firewall self-heal wrapped only the final buildAttach, but the resume probe and the detached pre-start connect first — a firewall block there aborted the attach (or silently dropped the resumed session) before repair ran. Move the repair to a single up-front warm-up (`exec true`, Hetzner-only) that opens the tunnel + self-heals BEFORE any later establish touch, which then reuse the live master. Verified live: a locked firewall is now auto-synced on `claude attach` before it connects. Claude-Session: https://claude.ai/code/session_01Ja5HgEjwyER5BhhFCpPUup --- apps/cli/src/commands/_cloud-attach.ts | 34 +++++++++++++++-------- packages/sandbox-hetzner/src/egress-ip.ts | 16 ++++++++--- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/apps/cli/src/commands/_cloud-attach.ts b/apps/cli/src/commands/_cloud-attach.ts index 2d577e4c..203ab8fc 100644 --- a/apps/cli/src/commands/_cloud-attach.ts +++ b/apps/cli/src/commands/_cloud-attach.ts @@ -186,6 +186,24 @@ export async function cloudAgentAttach(args: CloudAgentAttachArgs): Promise log.success(line) }, + () => provider.exec(box, ['true']), + ); + } // Attaching to a box that just came back up (a stop / cloud idle-timeout // resume): if the user passed no args of their own and the box has a resumable // claude/codex session, launch resuming it (claude --resume / codex resume @@ -214,17 +232,9 @@ export async function cloudAgentAttach(args: CloudAgentAttachArgs): Promise log.success(line) }, - () => buildAttach(box, 'agent', { sessionName: args.sessionName, command }), - ); + // The tunnel is already established (and firewall-healed) by the up-front warm + // -up above, so this reuses the live master. + let spec = await buildAttach(box, 'agent', { sessionName: args.sessionName, command }); // claude only, and only when this host can capture a clipboard image (macOS, // or a Linux desktop with xclip/wl-paste). Otherwise Ctrl+V forwards verbatim. const canPaste = args.mode === 'claude' && (await clipboardCaptureAvailable()); @@ -243,7 +253,7 @@ export async function cloudAgentAttach(args: CloudAgentAttachArgs): Promise number } = {}, From ec544984caf21981ee87d3740837fab17a549499 Mon Sep 17 00:00:00 2001 From: Marco D'Alia Date: Mon, 29 Jun 2026 23:30:08 +0100 Subject: [PATCH 3/3] =?UTF-8?q?fix(hetzner):=20drop=20the=20egress=20cache?= =?UTF-8?q?=20=E2=80=94=20always=20probe=20fresh=20on=20the=20failure=20pa?= =?UTF-8?q?th?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bugbot (round 2): even a 5s-TTL cache could read a just-changed egress IP as "unchanged" in the firewall comparison and skip the heal — the exact mismatch this exists to catch. The cache only dedup'd failure-path probes, but the cloud poller already de-dupes its recover calls and `recover --all` is sequential, so a fresh `detectEgressIp` in `firewallEgressStatus` won't storm. Remove the cache entirely; correctness over a marginal probe dedup. Claude-Session: https://claude.ai/code/session_01Ja5HgEjwyER5BhhFCpPUup --- packages/sandbox-hetzner/src/backend.ts | 8 +++- packages/sandbox-hetzner/src/egress-ip.ts | 38 ------------------- .../sandbox-hetzner/test/egress-ip.test.ts | 34 +---------------- 3 files changed, 8 insertions(+), 72 deletions(-) diff --git a/packages/sandbox-hetzner/src/backend.ts b/packages/sandbox-hetzner/src/backend.ts index 2083b764..331d2912 100644 --- a/packages/sandbox-hetzner/src/backend.ts +++ b/packages/sandbox-hetzner/src/backend.ts @@ -44,7 +44,7 @@ import { type HetznerServer, type HetznerServerStatus, } from './client.js'; -import { detectEgressIp, egressIpCached } from './egress-ip.js'; +import { detectEgressIp } from './egress-ip.js'; import { createPerBoxFirewall, deletePerBoxFirewall, @@ -259,7 +259,11 @@ async function firewallEgressStatus(sandboxId: string): Promise r.direction === 'in' && r.port === '22'); const allowedSource = sshRule?.source_ips?.[0]; - const currentEgress = `${await egressIpCached({})}/32`; + // A FRESH probe (not cached): this runs only on a connection-failure path, and + // a stale value could read the new IP as "unchanged" and skip the heal — the + // very mismatch we exist to catch. The poller already de-dupes its recover + // calls and `recover --all` is sequential, so fresh probing here won't storm. + const currentEgress = `${await detectEgressIp({})}/32`; return { firewallId, allowedSource, diff --git a/packages/sandbox-hetzner/src/egress-ip.ts b/packages/sandbox-hetzner/src/egress-ip.ts index 2013e68b..ecea72fb 100644 --- a/packages/sandbox-hetzner/src/egress-ip.ts +++ b/packages/sandbox-hetzner/src/egress-ip.ts @@ -65,44 +65,6 @@ export async function detectEgressIp(opts: DetectEgressIpOptions = {}): Promise< ); } -/** - * Default TTL for the cached egress lookup (ms). Deliberately SHORT: the cache - * only exists to dedup a *burst* of failure-path probes (the poller's backoff - * re-hitting the tunnel, or `recover --all` walking many boxes within a couple - * seconds), not to remember the IP across time. A long TTL would mask a *real* - * IP change that happens right after a probe — the very thing we're detecting — - * so we keep the staleness window to a few seconds and re-probe after. - */ -const EGRESS_CACHE_TTL_MS = 5_000; - -let egressCache: { ip: string; at: number } | null = null; - -/** - * `detectEgressIp` with a short TTL cache. Only ever called on a connection- - * failure path (the firewall-mismatch hint + `recover`'s auto-sync), where a - * single host IP-change can otherwise trigger a probe storm — the cloud poller - * backs off and re-hits the tunnel open repeatedly, and a multi-box `recover - * --all` would re-probe per box. The egress IP is host-global, so one cached - * value serves every box. The short TTL (see above) bounds how long a stale - * value can hide a fresh IP change. Throws (same as `detectEgressIp`) when all - * probes fail and there's no fresh cache entry. - */ -export async function egressIpCached( - opts: DetectEgressIpOptions & { ttlMs?: number; now?: () => number } = {}, -): Promise { - const ttl = opts.ttlMs ?? EGRESS_CACHE_TTL_MS; - const now = opts.now ?? Date.now; - if (egressCache && now() - egressCache.at < ttl) return egressCache.ip; - const ip = await detectEgressIp(opts); - egressCache = { ip, at: now() }; - return ip; -} - -/** Test seam: drop the cached egress so each case starts cold. */ -export function __resetEgressCache(): void { - egressCache = null; -} - async function probe(url: string, fetchImpl: typeof fetch): Promise { const res = await fetchImpl(url, { method: 'GET' }); if (!res.ok) return null; diff --git a/packages/sandbox-hetzner/test/egress-ip.test.ts b/packages/sandbox-hetzner/test/egress-ip.test.ts index df063882..c0b4357a 100644 --- a/packages/sandbox-hetzner/test/egress-ip.test.ts +++ b/packages/sandbox-hetzner/test/egress-ip.test.ts @@ -1,5 +1,5 @@ -import { afterEach, describe, expect, it } from 'vitest'; -import { detectEgressIp, egressIpCached, __resetEgressCache } from '../src/egress-ip.js'; +import { describe, expect, it } from 'vitest'; +import { detectEgressIp } from '../src/egress-ip.js'; function fakeFetch(map: Record): typeof fetch { // Use Parameters[0] instead of `RequestInfo | URL` so this @@ -49,33 +49,3 @@ describe('detectEgressIp', () => { ).rejects.toThrow(/could not auto-detect/i); }); }); - -describe('egressIpCached', () => { - afterEach(() => __resetEgressCache()); - - /** A fetch that counts how many times it was invoked. */ - function countingFetch(body: string): { fetchImpl: typeof fetch; calls: () => number } { - let n = 0; - const fetchImpl = (async () => { - n += 1; - return new Response(body, { status: 200 }) as unknown as Response; - }) as typeof fetch; - return { fetchImpl, calls: () => n }; - } - - it('probes once within the TTL window, re-probes after it', async () => { - const { fetchImpl, calls } = countingFetch('203.0.113.9\n'); - const opts = { probes: ['https://p'], fetchImpl, ttlMs: 1000 }; - let t = 10_000; - const now = () => t; - - expect(await egressIpCached({ ...opts, now })).toBe('203.0.113.9'); - t = 10_500; // within TTL → cached, no new probe - expect(await egressIpCached({ ...opts, now })).toBe('203.0.113.9'); - expect(calls()).toBe(1); - - t = 11_500; // past TTL → re-probe - expect(await egressIpCached({ ...opts, now })).toBe('203.0.113.9'); - expect(calls()).toBe(2); - }); -});