diff --git a/src/runner/LightRunClient.ts b/src/runner/LightRunClient.ts index 0ffc892..95870cf 100644 --- a/src/runner/LightRunClient.ts +++ b/src/runner/LightRunClient.ts @@ -25,6 +25,16 @@ export interface RunState { const POLL_INTERVAL_MS = 500; +/* + * Safety net for a wedged run. A bounded node (timeout > 0) is killed by + * light-runner at its timeout and turns terminal shortly after; if polling + * still sees `running` past timeout + this grace (image pull, extraction and + * teardown all happen outside the container's own timeout window), the run is + * stuck - fail loudly instead of looping forever. Nodes with timeout 0 opt out + * of any limit and keep polling indefinitely. + */ +const POLL_GRACE_MS = 300_000; + let seq = 0; function sleep(ms: number): Promise { @@ -122,7 +132,7 @@ export class LightRunClient { const accepted = (await res.json()) as { id: string }; onRunId(accepted.id); - const state = await this.pollUntilDone(accepted.id, onLog); + const state = await this.pollUntilDone(accepted.id, onLog, node.timeout); let output: Record = {}; const artifactName = OUTPUT_FILE; @@ -174,8 +184,9 @@ export class LightRunClient { } } - private async pollUntilDone(runId: string, onLog?: (line: string) => void): Promise { + private async pollUntilDone(runId: string, onLog?: (line: string) => void, timeoutMs = 0): Promise { let printed = 0; + const deadline = timeoutMs > 0 ? Date.now() + timeoutMs + POLL_GRACE_MS : 0; while (true) { const res = await fetch(`${this.url}/runs/${runId}`, { headers: this.headers() }); if (!res.ok) { @@ -189,6 +200,13 @@ export class LightRunClient { printed = state.logs.length; } if (state.status !== 'running') return state; + if (deadline > 0 && Date.now() > deadline) { + throw new Error( + `light-run run ${runId} still running after ${Math.round( + (timeoutMs + POLL_GRACE_MS) / 1000, + )}s; treating it as wedged`, + ); + } await sleep(POLL_INTERVAL_MS); } }