Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 29 additions & 5 deletions apps/cli/src/commands/_cloud-attach.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@ import { spawn } from 'node:child_process';
import { appendFileSync } from 'node:fs';
import { homedir } from 'node:os';
import { join } from 'node:path';
import { spinner } from '@clack/prompts';
import { log, spinner } from '@clack/prompts';
import { DEFAULT_RELAY_PORT } from '@agentbox/sandbox-docker';
import type { BoxRecord, Provider } from '@agentbox/core';
import type { AttachOpenIn } from '@agentbox/config';
import { agentResumeArgs } from '../agent-sessions.js';
import { withFirewallRepair } from '../lib/firewall-repair.js';
import { providerForBox } from '../provider/registry.js';
import { runWrappedAttach } from '../wrapped-pty/index.js';
import { pasteHostClipboardImage, uploadImageFileToBox } from '../lib/paste-image.js';
Expand Down Expand Up @@ -185,6 +186,24 @@ export async function cloudAgentAttach(args: CloudAgentAttachArgs): Promise<void
box = await provider.start(box);
s.stop('box running');
}
// Hetzner only: open the SSH tunnel UP FRONT, self-healing a stale firewall (a
// host egress-IP change locks the per-box firewall) BEFORE any of the later
// establish touches — the resume probe, the detached pre-start, buildAttach.
// Whichever of those connected first would otherwise be an unguarded
// establish: a firewall block there aborts the attach (or silently drops the
// resumed session, since the resume probe swallows exec errors). Doing it once
// here covers them all. Repairs ONLY on an actual connect failure; otherwise a
// `true` over the already-open master is a cheap no-op. This is an ESTABLISH
// path — distinct from the mid-session `reconnect` closure below, which must
// NOT touch the firewall (a checkpoint/pause drop isn't an IP change).
if (box.provider === 'hetzner') {
await withFirewallRepair(
provider,
box,
{ enabled: true, onLog: (line) => log.success(line) },
() => provider.exec(box, ['true']),
);
}
// Attaching to a box that just came back up (a stop / cloud idle-timeout
// resume): if the user passed no args of their own and the box has a resumable
// claude/codex session, launch resuming it (claude --resume <id> / codex resume
Expand Down Expand Up @@ -213,10 +232,9 @@ export async function cloudAgentAttach(args: CloudAgentAttachArgs): Promise<void
await startDetachedSession(provider, box, args.sessionName, command);
}

let spec = await provider.buildAttach(box, 'agent', {
Comment thread
cursor[bot] marked this conversation as resolved.
sessionName: args.sessionName,
command,
});
// The tunnel is already established (and firewall-healed) by the up-front
// warm-up above, so this reuses the live master.
let spec = await buildAttach(box, 'agent', { sessionName: args.sessionName, command });
// claude only, and only when this host can capture a clipboard image (macOS,
// or a Linux desktop with xclip/wl-paste). Otherwise Ctrl+V forwards verbatim.
const canPaste = args.mode === 'claude' && (await clipboardCaptureAvailable());
Expand All @@ -231,6 +249,12 @@ export async function cloudAgentAttach(args: CloudAgentAttachArgs): Promise<void
// a reboot this lands in a freshly-created tmux session (the snapshot is
// filesystem-only); a blip on a still-running box re-attaches the same live
// session. Returns null to give up (cancelled or timed out).
//
// NOTE: deliberately NO firewall repair here. This is a MID-SESSION drop — a
// checkpoint stops the box (the PTY drops) and we wait for it to come back;
// the host IP didn't change, so re-syncing the firewall would be wrong.
// Firewall self-heal belongs only to establish paths (the up-front warm-up
// above, and `agentbox recover`).
const reconnect = async (
signal: AbortSignal,
): Promise<{ command: string; argv: string[]; env?: Record<string, string> } | null> => {
Expand Down
39 changes: 34 additions & 5 deletions apps/cli/src/commands/recover.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import {
import { ensureRelay, generateRelayToken, readState, recordBox } from '@agentbox/sandbox-docker';
import { Command } from 'commander';
import { restoreAgentSessions } from '../agent-sessions.js';
import { withFirewallRepair } from '../lib/firewall-repair.js';
import { resolveBoxOrExit } from '../box-ref.js';
import { providerForBox } from '../provider/registry.js';
import { cloudBackendForProvider } from '../provider/cloud-backend.js';
Expand All @@ -46,6 +47,7 @@ import { handleLifecycleError } from './_errors.js';
interface RecoverOpts {
all?: boolean;
attach?: boolean;
firewallSync?: boolean;
provider?: string;
adopt?: boolean;
attachIn?: string;
Expand Down Expand Up @@ -98,15 +100,26 @@ async function readBoxBranch(box: BoxRecord): Promise<string | undefined> {
* (optionally) attach. Returns false on a non-fatal skip (e.g. Hetzner key
* gone) so `--all` keeps going.
*/
async function recoverKnownBox(box: BoxRecord, opts: { attach: boolean }): Promise<boolean> {
async function recoverKnownBox(
box: BoxRecord,
opts: { attach: boolean; firewallSync: boolean },
): Promise<boolean> {
if (await hetznerKeyMissing(box)) {
log.warn(
`${box.name}: per-box SSH key not found at ${hetznerKeyPath(box.cloud?.sandboxId ?? box.id)} — this box was created on another host and can't be controlled from here. Skipping.`,
);
return false;
}
const provider = await providerForBox(box);
const record = await provider.reconnect(box);
// Reconnect is an explicit connection-establishment, so a connect failure may
// be a host IP change that locked the Hetzner firewall — self-heal it (only
// when the egress actually changed) and retry once.
const record = await withFirewallRepair(
provider,
box,
{ enabled: opts.firewallSync, onLog: (line) => log.success(line) },
() => provider.reconnect(box),
);
log.success(`reconnected ${record.name}`);
// Bring back exactly the box's last agent: resume its session if there's one
// to resume, else start it fresh (adopted box / cleared pointer; the only
Expand Down Expand Up @@ -245,6 +258,10 @@ export const recoverCommand = new Command('recover')
)
.option('--all', 'recover every box in local state (skips attach)')
.option('--no-attach', 'restore only; do not attach to the agent')
.option(
'--no-firewall-sync',
"don't auto-sync a Hetzner box's firewall to your current egress IP on a connect failure",
)
.option('--provider <name>', 'cloud provider for --adopt (daytona|hetzner|vercel|e2b)')
.option('--adopt', 'rebuild local state from a live sandbox that is missing from this host')
.action(async function (this: Command, idOrName: string | undefined) {
Expand All @@ -264,7 +281,10 @@ export const recoverCommand = new Command('recover')
}
const adopted = await adoptUnknownBox(provider, idOrName);
if (!adopted) return;
await recoverKnownBox(adopted, { attach: opts.attach !== false });
await recoverKnownBox(adopted, {
attach: opts.attach !== false,
firewallSync: opts.firewallSync !== false,
});
return;
}

Expand All @@ -277,7 +297,13 @@ export const recoverCommand = new Command('recover')
let ok = 0;
for (const box of state.boxes) {
try {
if (await recoverKnownBox(box, { attach: false })) ok++;
if (
await recoverKnownBox(box, {
attach: false,
firewallSync: opts.firewallSync !== false,
})
)
ok++;
} catch (err) {
log.warn(
`${box.name}: recover failed: ${err instanceof Error ? err.message : String(err)}`,
Expand All @@ -289,7 +315,10 @@ export const recoverCommand = new Command('recover')
}

const box = await resolveBoxOrExit(idOrName);
await recoverKnownBox(box, { attach: opts.attach !== false });
await recoverKnownBox(box, {
attach: opts.attach !== false,
firewallSync: opts.firewallSync !== false,
});
} catch (err) {
handleLifecycleError(err);
}
Expand Down
36 changes: 36 additions & 0 deletions apps/cli/src/lib/firewall-repair.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import type { BoxRecord, Provider } from '@agentbox/core';

/**
 * Caller-supplied settings for `withFirewallRepair`.
 */
export interface FirewallRepairOptions {
/**
 * When false, skip repair entirely (the `--no-firewall-sync` opt-out): a failed
 * attempt rethrows its error without `provider.repairReachability` being called.
 */
enabled: boolean;
/**
 * Surface what the repair did (e.g. "firewall updated: …"). Invoked after a
 * repair reports a change and before the retry, with the repair's `detail` or
 * a generic fallback message when no detail was returned.
 */
onLog: (line: string) => void;
}

/**
 * Wrap a connection-ESTABLISHMENT attempt with a single self-heal retry.
 *
 * Flow: run `attempt`; if it throws, ask `provider.repairReachability` whether
 * it could fix anything (today: a Hetzner per-box firewall locked by a host
 * egress-IP change). Only when the repair reports `changed` is `attempt` run a
 * second — and last — time. In every other case (repair disabled, provider has
 * no repair hook, repair itself failed, or nothing changed) the FIRST error is
 * rethrown untouched, so a non-IP failure is never masked.
 *
 * Establish sites only (`recover`, the initial attach connect). Do not wrap a
 * mid-session reconnect: a checkpoint stops the box and drops the connection,
 * and that must not be mistaken for an IP change.
 *
 * @param provider Provider owning the box; its optional `repairReachability` is the heal hook.
 * @param box The box being connected to.
 * @param opts Opt-out flag and log sink for the repair.
 * @param attempt The connection attempt; invoked once, or twice after a successful repair.
 * @returns Whatever `attempt` resolves to.
 * @throws The first attempt's error when no repair happened; the retry's error when the retry fails.
 */
export async function withFirewallRepair<T>(
  provider: Provider,
  box: BoxRecord,
  opts: FirewallRepairOptions,
  attempt: () => Promise<T>,
): Promise<T> {
  let firstFailure: unknown;
  try {
    return await attempt();
  } catch (err) {
    firstFailure = err;
  }

  // Opted out, or this provider has nothing to heal: surface the original error.
  if (!(opts.enabled && provider.repairReachability)) throw firstFailure;

  // The repair is best-effort — a failing repair counts as "nothing changed".
  const outcome = await provider.repairReachability(box).then(
    (result) => result,
    () => null,
  );
  if (outcome == null || !outcome.changed) throw firstFailure;

  opts.onLog(outcome.detail ?? 'firewall synced to current egress IP');
  return await attempt();
}
12 changes: 12 additions & 0 deletions packages/core/src/cloud-backend.ts
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,18 @@ export interface CloudBackend {
*/
refreshPreviewUrl?(h: CloudHandle, port: number): Promise<CloudPreviewUrl>;

/**
 * Re-establish host→box reachability when establishing a connection fails for
 * a reason the backend can self-heal. Today only Hetzner implements it: a host
 * egress-IP change locks the per-box Cloud Firewall, so this re-syncs the
 * firewall to the current egress IP — but ONLY when it actually changed (else
 * `{ changed: false }`, so the caller surfaces the original error). The CLI
 * calls it ONLY on a connection-ESTABLISHMENT failure (`recover`, the initial
 * attach connect), never on a mid-session drop (a checkpoint stops the box —
 * not an IP change). Backends with public URLs / no host transport omit it.
 *
 * @param h Handle of the sandbox whose reachability should be repaired.
 * @returns `changed: true` when the backend modified something (so a retry is
 *   worthwhile), with an optional human-readable `detail` for logging;
 *   `changed: false` when there was nothing to repair.
 */
repairReachability?(h: CloudHandle): Promise<{ changed: boolean; detail?: string }>;

/**
* Browser-bound signed preview URL with the auth token embedded in the URL
* (no header needed). Used for `agentbox url` / `agentbox screen` — anywhere
Expand Down
11 changes: 11 additions & 0 deletions packages/core/src/provider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,17 @@ export interface Provider {
* provider has no cheaper reconnect path.
*/
reconnect(box: BoxRecord): Promise<BoxRecord>;
/**
 * Self-heal host→box reachability when establishing a connection fails for a
 * reason the provider can repair. Today only the Hetzner cloud provider acts:
 * a host egress-IP change locks the per-box firewall, so this re-syncs it to
 * the current egress — but ONLY when it actually changed (`{ changed: false }`
 * otherwise, so the caller rethrows the original error). The CLI calls it ONLY
 * on a connection-ESTABLISHMENT failure (`recover`, the initial attach
 * connect), never on a mid-session drop. Optional — docker and public-URL
 * clouds omit it.
 *
 * @param box The box whose reachability should be repaired.
 * @returns `changed: true` when something was repaired (a retry is worthwhile),
 *   with an optional human-readable `detail` for logging; `changed: false`
 *   when nothing was repaired.
 */
repairReachability?(box: BoxRecord): Promise<{ changed: boolean; detail?: string }>;
pause(box: BoxRecord): Promise<void>;
resume(box: BoxRecord): Promise<void>;
stop(box: BoxRecord): Promise<void>;
Expand Down
6 changes: 6 additions & 0 deletions packages/sandbox-cloud/src/cloud-provider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1103,6 +1103,12 @@ export function createCloudProvider(
);
},

/**
 * Forward a reachability repair to the backend. Backends that do not
 * implement the hook (no host transport to repair) yield `{ changed: false }`.
 */
async repairReachability(box: BoxRecord): Promise<{ changed: boolean; detail?: string }> {
  // Optional call: the handle is only built when the backend has the hook.
  const outcome = await backend.repairReachability?.(handleFor(box));
  return outcome ?? { changed: false };
},

async pause(box: BoxRecord): Promise<void> {
await backend.pause(handleFor(box));
await persistLastState(box, 'paused');
Expand Down
86 changes: 81 additions & 5 deletions packages/sandbox-hetzner/src/backend.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ import { detectEgressIp } from './egress-ip.js';
import {
createPerBoxFirewall,
deletePerBoxFirewall,
firewallNeedsSync,
normalizeSourceCidr,
syncFirewallSource,
} from './firewall.js';
import { pollUntil } from './poll.js';
import { readPreparedState } from './prepared-state.js';
Expand Down Expand Up @@ -230,13 +232,71 @@ function buildSshTarget(state: PerBoxState, vpsIp: string, controlPath?: string)
};
}

/**
 * Snapshot of a per-box firewall's inbound-SSH allow-list alongside the host's
 * live egress IP, as produced by `firewallEgressStatus`.
 */
interface FirewallEgressStatus {
  /** Hetzner firewall id, parsed from the server's `agentbox.firewall` label. */
  firewallId: number;
  /** The CIDR the firewall currently allows for inbound SSH (`source_ips[0]`). */
  allowedSource: string | undefined;
  /** The host's current egress IP as a `/32` CIDR. */
  currentEgress: string;
  /** Friendly box ref for the `firewall sync` hint (the `agentbox.box` label). */
  boxRef: string;
}

/**
 * The per-box firewall's current SSH source vs the host's live egress IP.
 * Best-effort and meant ONLY for a connection-failure path (the open-failed
 * hint in `ensureTunnel` and `repairReachability`), never the happy path.
 *
 * The egress IP is probed FRESH on every call (see the note in the body); it
 * is deliberately not cached.
 *
 * @param sandboxId Hetzner server id as a decimal string.
 * @returns The status, or null when the state can't be determined: the id is
 *   not numeric, the server is gone, the server carries no usable
 *   `agentbox.firewall` label, or the labelled firewall can't be fetched.
 *   A firewall that exists but has no inbound port-22 rule is NOT null — it
 *   yields `allowedSource: undefined`.
 * @throws Whatever the API client or the egress probe throws; callers wrap the
 *   call in `.catch(() => null)`.
 */
async function firewallEgressStatus(sandboxId: string): Promise<FirewallEgressStatus | null> {
  const id = Number.parseInt(sandboxId, 10);
  if (!Number.isFinite(id)) return null;
  const server = await client().getServer(id);
  if (!server) return null;
  const firewallId = Number.parseInt(server.labels['agentbox.firewall'] ?? '', 10);
  if (!Number.isFinite(firewallId)) return null;
  const firewall = await client().getFirewall(firewallId);
  // A label pointing at a firewall we can't fetch is "no firewall", not "no SSH
  // rule": reporting it as a mismatch would trigger a bogus "your IP changed"
  // hint and a sync against a firewall that isn't there.
  if (!firewall) return null;
  const sshRule = firewall.rules.find((r) => r.direction === 'in' && r.port === '22');
  const allowedSource = sshRule?.source_ips?.[0];
  // A FRESH probe (not cached): this runs only on a connection-failure path, and
  // a stale value could read the new IP as "unchanged" and skip the heal — the
  // very mismatch we exist to catch. The poller already de-dupes its recover
  // calls and `recover --all` is sequential, so fresh probing here won't storm.
  const currentEgress = `${await detectEgressIp({})}/32`;
  return {
    firewallId,
    allowedSource,
    currentEgress,
    boxRef: server.labels['agentbox.box'] ?? sandboxId,
  };
}

/**
 * Open the SSH tunnel for a box, returning immediately when `tunnels` already
 * has an entry for it.
 *
 * The open is attempted exactly once, inside the guarded `try`. On failure the
 * error is enriched with a firewall hint when — and only when — the per-box
 * firewall's allowed SSH source no longer matches the host's egress IP.
 *
 * @param sandboxId Box id used as the tunnel key and for the firewall lookup.
 * @param state Per-box state; its `identity` is passed to the tunnel.
 * @param vpsIp Address of the VPS to tunnel to.
 * @throws The original open error, or an enriched `Error` (original attached as
 *   `cause`) when a firewall/egress mismatch is detected.
 */
async function ensureTunnel(sandboxId: string, state: PerBoxState, vpsIp: string): Promise<void> {
  if (tunnels.has(sandboxId)) return;
  try {
    await tunnels.open({
      boxId: sandboxId,
      vpsHost: vpsIp,
      identity: state.identity,
    });
  } catch (err) {
    // A host egress-IP change locks us out of the per-box firewall, surfacing as
    // an opaque SSH connect timeout. Best-effort: detect the mismatch and enrich
    // the error with the fix. Never let the diagnostic mask the original error
    // on a match (box is just down) or a probe failure.
    const s = await firewallEgressStatus(sandboxId).catch(() => null);
    if (s && firewallNeedsSync(s.allowedSource, s.currentEgress)) {
      // `err` is unknown: don't assume it is an Error when reading its message.
      const originalMessage = err instanceof Error ? err.message : String(err);
      const enriched = new Error(
        `${originalMessage}\n\n` +
          `hetzner: SSH is blocked by the box firewall — it allows ${s.allowedSource ?? '(no rule)'} ` +
          `but your egress IP is now ${s.currentEgress}. Your IP changed; run:\n` +
          `  agentbox hetzner firewall sync ${s.boxRef}\n` +
          `(or \`agentbox recover ${s.boxRef}\`, which auto-syncs).`,
      );
      // Keep the underlying failure reachable for debugging.
      throw Object.assign(enriched, { cause: err });
    }
    throw err;
  }
}

/**
Expand Down Expand Up @@ -639,6 +699,22 @@ export const hetznerBackend: CloudBackend = {
return { url: `http://127.0.0.1:${String(localPort)}` };
},

/**
 * Re-sync the per-box firewall to the host's CURRENT egress IP, but only when
 * the two actually disagree (the host moved networks and the firewall is now
 * blocking us). Intended for connection-establishment failures (`recover`,
 * the initial attach connect), never a mid-session drop.
 *
 * Reports `{ changed: false }` when the status can't be read or no sync is
 * needed — which includes a `0.0.0.0/0` firewall (explicit dynamic-IP opt-in),
 * since that is already open.
 */
async repairReachability(h): Promise<{ changed: boolean; detail?: string }> {
  // Status lookup is best-effort: any failure reads as "nothing to repair".
  const status = await firewallEgressStatus(h.sandboxId).catch(() => null);
  if (!status) return { changed: false };

  const { firewallId, allowedSource, currentEgress } = status;
  if (!firewallNeedsSync(allowedSource, currentEgress)) return { changed: false };

  await syncFirewallSource(client(), firewallId, currentEgress);
  const previousSource = allowedSource ?? '(no rule)';
  return {
    changed: true,
    detail: `firewall updated: SSH now allowed from ${currentEgress} (was ${previousSource})`,
  };
},

async startInBoxPortless(h, opts): Promise<void> {
// Bring up a `portless` proxy *inside the VPS* mirroring the host's
// mode so `<boxName>.localhost:<P>` resolves to the same content on
Expand Down
15 changes: 15 additions & 0 deletions packages/sandbox-hetzner/src/firewall.ts
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,21 @@ export async function deletePerBoxFirewall(
}
}

/**
 * Decide whether the firewall's allowed SSH source should be re-synced to the
 * host's current egress.
 *
 * No sync is needed when the firewall is wide-open (`0.0.0.0/0`, the explicit
 * dynamic-IP opt-in) or when the allowed source already equals the current
 * egress. Anything else — including an absent allowed source (no SSH rule) —
 * is a mismatch worth syncing. Pure, so the hint + auto-sync decision is
 * unit-testable without the Hetzner API.
 *
 * @param allowedSource CIDR the firewall currently allows, or undefined when there is no rule.
 * @param currentEgress The host's current egress as a CIDR.
 * @returns true when a sync is warranted.
 */
export function firewallNeedsSync(
  allowedSource: string | undefined,
  currentEgress: string,
): boolean {
  const wideOpen = allowedSource === '0.0.0.0/0';
  const alreadyCurrent = allowedSource === currentEgress;
  return !(wideOpen || alreadyCurrent);
}

/**
* Normalize a source spec into a CIDR. Accepts:
* - bare IPv4 → appends `/32`
Expand Down
Loading
Loading