From dfc090609ad5c3dff798c2305a1bacb6b51d7a85 Mon Sep 17 00:00:00 2001 From: aymericcousaert Date: Mon, 25 May 2026 10:45:04 +0200 Subject: [PATCH 1/2] feat(worker): surface child memory state on failures, drop no-op --optimize-for-size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a processing's child task aborts during a heavy run, the only diagnostic that previously made it to the run document was `child process exited with code N`. This is not actionable for operators tuning a deployment: they cannot tell whether the child hit a V8 heap ceiling, was OOM-killed by the cgroup, or died for another reason. Changes: - Print a one-line memory snapshot to stderr at child start and end (`task start mem rss=...MB heap=.../...MB ext=...MB`). Because the parent worker already captures the child's stderr via `buildErrorMessageFromStderr`, these lines are surfaced into the run's error message on non-zero exits without polluting successful runs. - Add `exitCodeHint(code)`: a small mapping from 134 (SIGABRT, the V8 `Check failed: (result.ptr) != nullptr` / `std::bad_alloc` signature) and 137 (SIGKILL, the cgroup OOM-kill signature) to a human-readable hint pointing at `NODE_OPTIONS=--max-old-space-size` and `mem_limit` respectively. The hint is appended to the run error message. - Drop `--optimize-for-size` from the Dockerfile worker CMD (introduced in cbadef8). The flag has no effect on the heavy work — each run is executed in a child process spawned without this flag (see worker/src/worker.ts) — and only slightly slows down the orchestrator's GC. Memory tuning should be done at the container level via `NODE_OPTIONS` and `mem_limit`, which propagate to both parent and child. - Unit tests for `formatMemoryUsage` and `exitCodeHint`. 
Discovered while investigating a recurring nightly OOM on a customer deployment where the only visible error was "child process exited with code 134" — a 30s-readable hint would have saved a week. --- Dockerfile | 5 ++- .../worker-operations.unit.spec.ts | 38 ++++++++++++++++++- worker/src/task/index.ts | 6 +++ worker/src/utils/worker-operations.ts | 21 ++++++++++ worker/src/worker.ts | 6 ++- 5 files changed, 72 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index efcdf063..5e1afee6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -108,7 +108,10 @@ COPY package.json README.md LICENSE BUILD.json* ./ EXPOSE 9090 # USER node # This would be great to use, but not possible as the volumes are mounted as root WORKDIR /app/worker -CMD ["node", "--disable-warning=ExperimentalWarning", "--optimize-for-size", "index.ts"] +# Heavy per-run work happens in a child process spawned by the orchestrator (see worker/src/worker.ts). +# The orchestrator itself only schedules; memory tuning should be done at the container level via +# NODE_OPTIONS=--max-old-space-size=... and mem_limit, which propagate to both the orchestrator and the child. 
+CMD ["node", "--disable-warning=ExperimentalWarning", "index.ts"] # ============================= # Install production dependencies for API diff --git a/tests/features/worker-utils/worker-operations.unit.spec.ts b/tests/features/worker-utils/worker-operations.unit.spec.ts index 10ff70cc..8d38ce4c 100644 --- a/tests/features/worker-utils/worker-operations.unit.spec.ts +++ b/tests/features/worker-utils/worker-operations.unit.spec.ts @@ -1,5 +1,5 @@ import { test, expect } from '@playwright/test' -import { buildErrorMessageFromStderr } from '../../../worker/src/utils/worker-operations.ts' +import { buildErrorMessageFromStderr, formatMemoryUsage, exitCodeHint } from '../../../worker/src/utils/worker-operations.ts' test.describe('buildErrorMessageFromStderr', () => { test('falls back to errMessage when stderr is empty', () => { @@ -30,3 +30,39 @@ test.describe('buildErrorMessageFromStderr', () => { expect(buildErrorMessageFromStderr('a\n\nb\n', 'fb')).toBe('a\nb') }) }) + +test.describe('formatMemoryUsage', () => { + test('renders all components rounded to MB', () => { + const mb = 1024 * 1024 + expect(formatMemoryUsage({ + rss: 256 * mb, + heapUsed: 128 * mb, + heapTotal: 200 * mb, + external: 16 * mb, + arrayBuffers: 0 + })).toBe('rss=256MB heap=128/200MB ext=16MB') + }) + + test('returns a string when called without arguments', () => { + expect(typeof formatMemoryUsage()).toBe('string') + }) +}) + +test.describe('exitCodeHint', () => { + test('returns a V8/SIGABRT hint for code 134', () => { + expect(exitCodeHint(134)).toContain('SIGABRT') + expect(exitCodeHint(134)).toContain('NODE_OPTIONS') + }) + + test('returns an OOM-kill hint for code 137', () => { + expect(exitCodeHint(137)).toContain('SIGKILL') + expect(exitCodeHint(137)).toContain('mem_limit') + }) + + test('returns empty string for unrelated codes', () => { + expect(exitCodeHint(1)).toBe('') + expect(exitCodeHint(143)).toBe('') + expect(exitCodeHint(null)).toBe('') + 
expect(exitCodeHint(undefined)).toBe('') + }) +}) diff --git a/worker/src/task/index.ts b/worker/src/task/index.ts index 862a208d..5993b960 100644 --- a/worker/src/task/index.ts +++ b/worker/src/task/index.ts @@ -3,9 +3,14 @@ import nodemailer from 'nodemailer' import config from '#config' import mongo from '#mongo' import { run, stop } from './task.ts' +import { formatMemoryUsage } from '../utils/worker-operations.ts' let exitCode = 0 +// Memory diagnostic: print on stderr so the parent worker captures it via +// buildErrorMessageFromStderr when the child exits non-zero. +console.error(`task start mem ${formatMemoryUsage()}`) + process.on('SIGTERM', function onSigterm () { console.info('Received SIGTERM signal, shutdown gracefully...') exitCode = 143 @@ -28,4 +33,5 @@ if (err) exitCode = 1 await mongo.close() mailTransport.close() +console.error(`task end mem ${formatMemoryUsage()}`) process.exit(exitCode) diff --git a/worker/src/utils/worker-operations.ts b/worker/src/utils/worker-operations.ts index b290fd71..21eba1bb 100644 --- a/worker/src/utils/worker-operations.ts +++ b/worker/src/utils/worker-operations.ts @@ -18,3 +18,24 @@ export const buildErrorMessageFromStderr = (stderr: string, errMessage: string): if (!lines.length) lines.push(errMessage) return lines.join('\n') } + +/** + * Format a Node.js MemoryUsage as a compact one-liner, suitable for logging. + * All values are rounded to MB. + */ +export const formatMemoryUsage = (mem: NodeJS.MemoryUsage = process.memoryUsage()): string => { + const mb = (n: number) => Math.round(n / 1024 / 1024) + return `rss=${mb(mem.rss)}MB heap=${mb(mem.heapUsed)}/${mb(mem.heapTotal)}MB ext=${mb(mem.external)}MB` +} + +/** + * Map a non-zero child exit code to a human hint about likely causes. + * Returns an empty string when no specific hint applies. + * - 134 = SIGABRT, the signature of a V8 fatal allocation failure (std::bad_alloc / Check failed: (result.ptr) != nullptr). 
+ * - 137 = SIGKILL, the signature of an OOM-kill from the host kernel / docker cgroup. + */ +export const exitCodeHint = (code: number | null | undefined): string => { + if (code === 134) return 'le processus enfant a abandonné (SIGABRT, code 134) — typique d\'une allocation V8 impossible. Vérifier NODE_OPTIONS=--max-old-space-size et la limite mémoire du conteneur.' + if (code === 137) return 'le processus enfant a été tué (SIGKILL, code 137) — typique d\'un OOM-kill par le noyau / cgroup docker. Augmenter mem_limit du conteneur.' + return '' +} diff --git a/worker/src/worker.ts b/worker/src/worker.ts index e107892d..96c817be 100644 --- a/worker/src/worker.ts +++ b/worker/src/worker.ts @@ -18,7 +18,7 @@ import locks from '#locks' import limits from './utils/limits.ts' import { initMetrics } from './utils/metrics.ts' import { finish } from './utils/runs.ts' -import { buildErrorMessageFromStderr } from './utils/worker-operations.ts' +import { buildErrorMessageFromStderr, exitCodeHint } from './utils/worker-operations.ts' const debug = Debug('worker') const debugLoop = Debug('worker-loop') @@ -243,7 +243,9 @@ async function iter (run: Run) { await finish(run) } catch (err: any) { // Build back the original error message from the stderr of the child process - const errorMessage = buildErrorMessageFromStderr(stderr, err.message) + let errorMessage = buildErrorMessageFromStderr(stderr, err.message) + const hint = exitCodeHint(err.code) + if (hint) errorMessage = `${errorMessage}\n${hint}` if (run) { // case of interruption by a SIGTERM From 00cc57a2e5ce951dc0991a76cdc797fc475d1835 Mon Sep 17 00:00:00 2001 From: aymericcousaert Date: Mon, 25 May 2026 10:51:17 +0200 Subject: [PATCH 2/2] review: K8s-friendly wording in exitCodeHint + const errorMessage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Mention both `mem_limit` (Docker Compose) and `resources.limits.memory` (Kubernetes) so the hint is actionable on either platform. 
- Replace `let errorMessage` + reassignment with a single `const` ternary in worker.ts:iter (no behavior change). The existing unit tests still pass — they assert `.toContain('mem_limit')`, which remains true. --- worker/src/utils/worker-operations.ts | 4 ++-- worker/src/worker.ts | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/worker/src/utils/worker-operations.ts b/worker/src/utils/worker-operations.ts index 21eba1bb..4a4ddd41 100644 --- a/worker/src/utils/worker-operations.ts +++ b/worker/src/utils/worker-operations.ts @@ -35,7 +35,7 @@ export const formatMemoryUsage = (mem: NodeJS.MemoryUsage = process.memoryUsage( * - 137 = SIGKILL, the signature of an OOM-kill from the host kernel / docker cgroup. */ export const exitCodeHint = (code: number | null | undefined): string => { - if (code === 134) return 'le processus enfant a abandonné (SIGABRT, code 134) — typique d\'une allocation V8 impossible. Vérifier NODE_OPTIONS=--max-old-space-size et la limite mémoire du conteneur.' - if (code === 137) return 'le processus enfant a été tué (SIGKILL, code 137) — typique d\'un OOM-kill par le noyau / cgroup docker. Augmenter mem_limit du conteneur.' + if (code === 134) return 'le processus enfant a abandonné (SIGABRT, code 134) — typique d\'une allocation V8 impossible. Vérifier NODE_OPTIONS=--max-old-space-size et la limite mémoire du conteneur (mem_limit / resources.limits.memory).' + if (code === 137) return 'le processus enfant a été tué (SIGKILL, code 137) — typique d\'un OOM-kill par le noyau / cgroup. Augmenter la limite mémoire du conteneur (mem_limit / resources.limits.memory).' 
return '' } diff --git a/worker/src/worker.ts b/worker/src/worker.ts index 96c817be..f1b762df 100644 --- a/worker/src/worker.ts +++ b/worker/src/worker.ts @@ -242,10 +242,11 @@ async function iter (run: Run) { }) await finish(run) } catch (err: any) { - // Build back the original error message from the stderr of the child process - let errorMessage = buildErrorMessageFromStderr(stderr, err.message) + // Build back the original error message from the stderr of the child process, + // appending a hint when the child exit code matches a known OOM signature. + const baseMessage = buildErrorMessageFromStderr(stderr, err.message) const hint = exitCodeHint(err.code) - if (hint) errorMessage = `${errorMessage}\n${hint}` + const errorMessage = hint ? `${baseMessage}\n${hint}` : baseMessage if (run) { // case of interruption by a SIGTERM