From 005536a91b1988de614d598047fe6c5a4be0971a Mon Sep 17 00:00:00 2001 From: Chris George Date: Sat, 2 May 2026 11:57:50 -0700 Subject: [PATCH 1/3] feat(api): reserve HealthStatus enum + health field on ContainerSnapshot Adds a new public enum HealthStatus { none, starting, healthy, unhealthy } and a new optional 'health: HealthStatus?' field on ContainerSnapshot, defaulting to nil at all construction sites. Motivation ---------- External orchestrators that drive the API server (the canonical use case is the compose-spec depends_on: condition: service_healthy gate) need to know whether a container is up AND healthy, not just up. Today ContainerSnapshot exposes .running for any started container, so consumers have to fall back to .running and treat liveness == health. Real workloads (databases that take seconds to accept connections, queue brokers that warm up an in-memory state) hit this regularly and end up either waiting too long or proceeding too early. Scope of this PR (deliberately minimal) --------------------------------------- This PR is data-shape only. It adds the enum and the field to the SDK. It does NOT wire a healthcheck observer into the daemon: at runtime the field is always nil, so the on-the-wire behavior is unchanged modulo one new Codable key on ContainerSnapshot. Why ship a nil-only field? ~~~~~~~~~~~~~~~~~~~~~~~~~~ A container-level healthcheck observer is a non-trivial design discussion (where does the spec live? does the API server exec into the container, or does the runtime drive it? does it leak into the sandbox boundary?) and we'd rather have that discussion separately, referencing a concrete companion issue. Reserving the SDK shape now lets downstream tools start coding against the field with the 'always nil today' guarantee documented inline; flipping the implementation on later does not require another SDK-shape PR. Wire compatibility ------------------ ContainerSnapshot is marshaled as Codable JSON over XPC. 
Adding an optional field is forward-compatible: - Older clients reading from a newer server: ignore the new key. - Newer clients reading from an older server: decode health as nil. Files ----- - Sources/ContainerResource/Container/HealthStatus.swift (new): the enum, with cases documented and a note on the daemon-side observer caveat. - Sources/ContainerResource/Container/ContainerSnapshot.swift: new optional field + init parameter (default nil). Companion issue --------------- Filed at apple/container with the design proposal for the eventual healthcheck observer; this PR is deliberately the smaller surface so the data shape can land independently of that discussion. --- .../Container/ContainerSnapshot.swift | 11 +++++- .../Container/HealthStatus.swift | 35 +++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 Sources/ContainerResource/Container/HealthStatus.swift diff --git a/Sources/ContainerResource/Container/ContainerSnapshot.swift b/Sources/ContainerResource/Container/ContainerSnapshot.swift index bae992423..0168b7a2e 100644 --- a/Sources/ContainerResource/Container/ContainerSnapshot.swift +++ b/Sources/ContainerResource/Container/ContainerSnapshot.swift @@ -39,16 +39,25 @@ public struct ContainerSnapshot: Codable, Sendable { public var networks: [Attachment] /// When the container was started. public var startedDate: Date? + /// The most recently observed health of the container. + /// + /// At present the daemon does not run a container-level healthcheck + /// observer, so this field is always `nil`. The shape is reserved so that + /// downstream tools (e.g. `compose`) have a stable type to read from once + /// a healthcheck observer is wired into the API server. + public var health: HealthStatus? public init( configuration: ContainerConfiguration, status: RuntimeStatus, networks: [Attachment], - startedDate: Date? = nil + startedDate: Date? = nil, + health: HealthStatus? 
= nil ) { self.configuration = configuration self.status = status self.networks = networks self.startedDate = startedDate + self.health = health } } diff --git a/Sources/ContainerResource/Container/HealthStatus.swift b/Sources/ContainerResource/Container/HealthStatus.swift new file mode 100644 index 000000000..3f13a4ffc --- /dev/null +++ b/Sources/ContainerResource/Container/HealthStatus.swift @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the container project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import Foundation + +/// The observed health status of a container, as derived from a periodic +/// healthcheck probe. +/// +/// At present the daemon does not run a container-level healthcheck observer, +/// so ``ContainerSnapshot/health`` is always `nil`. This type is reserved for +/// downstream tools (e.g. `compose`) that want a stable shape to read from +/// once a healthcheck observer is wired into the API server. +public enum HealthStatus: String, CaseIterable, Sendable, Codable { + /// No healthcheck has been configured or no result is yet available. + case none + /// The healthcheck is running but has not yet produced a successful probe. + case starting + /// The most recent probe(s) reported the container as healthy. 
+ case healthy + /// The most recent probe(s) reported the container as unhealthy. + case unhealthy +} From 403b37bfadee3065d48454d5040933e475ff0b6b Mon Sep 17 00:00:00 2001 From: Chris George Date: Sat, 2 May 2026 16:09:31 -0700 Subject: [PATCH 2/3] feat(api): wire container healthcheck observer end to end Implements the full healthcheck observer that populates `ContainerSnapshot.health` (the read-only field reserved by CHAOS-1319) by running the configured probe inside the running container, interpreting exit codes through a Docker-compatible state machine, and writing the result back through the `ContainersService` actor under a generation-gated update path. Motivation ---------- CHAOS-1319 reserved the SDK shape (`HealthStatus` enum + optional `health` field on `ContainerSnapshot`) but the daemon never populated it; the field is always `nil` today, so external orchestrators (the canonical use case is a compose-spec orchestrator implementing `depends_on.condition: service_healthy`) can only block on image-baked healthchecks and only when the underlying runtime owns the probe loop. Real workloads (databases that take seconds to accept connections, queue brokers that warm up an in-memory state) need a container-level healthcheck observer that the daemon owns. This PR adds it. What this PR changes -------------------- - Sources/ContainerResource/Container/Healthcheck.swift (new): public Codable / Sendable struct mirroring the Docker / compose-spec schema (`test`, `interval`, `timeout`, `retries`, `start_period`, `start_interval`, `disable`). Validates the probe shape (`NONE` / `CMD` / `CMD-SHELL`) and rejects malformed inputs with actionable error messages. - Sources/ContainerResource/Container/ContainerConfiguration.swift: new optional `healthcheck: Healthcheck?` field, `decodeIfPresent` on the wire so legacy on-disk configurations decode unchanged. 
- Sources/Services/ContainerAPIService/Server/Containers/ HealthStateMachine.swift (new): pure value type that maps probe outcomes to `HealthStatus`. Implements the Docker-compatible flow: initial `.starting`, immediate transition to `.healthy` on the first successful probe (including during the `start_period` grace window), `retries` consecutive failures post-grace transition to `.unhealthy`, recovery to `.healthy` without restart. - Sources/Services/ContainerAPIService/Server/Containers/ HealthProber.swift (new): `HealthProber` protocol plus production `SandboxClientHealthProber` that drives an existing `SandboxClient` to spawn a fresh `__container_healthcheck_` synthetic process per probe, races `wait()` against a per-probe timeout, and signals `SIGKILL` on timeout to unblock the synthetic wait task before draining the task group. - Sources/Services/ContainerAPIService/Server/Containers/ HealthMonitor.swift (new): per-container observer manager actor that mirrors `ExitMonitor`. `register(id:generation:startedAt: healthcheck:prober:onUpdate:)` cancels any prior observer, fires the initial `.starting` (or `.none` for disabled checks) callback, and runs the probe loop. `unregister(id:)` is idempotent and triggers cooperative cancellation. - Sources/Services/ContainerAPIService/Server/Containers/ ContainersService.swift: new private `healthMonitor: HealthMonitor` field; new `healthGeneration: UInt64` token on `ContainerState` bumped on every transition into `.running`; observer registered inside `startProcess` once the init process is up; unregister wired into `handleContainerExit`. New private `applyHealthUpdate(id: generation:status:)` is the single mutation entry; it drops updates whose generation no longer matches the live container or whose status is no longer `.running`, closing the late-callback / restart race. 
- Sources/Services/ContainerAPIService/Client/Flags.swift: seven new flags on `Flags.Management` covering `--health-cmd`, `--health-interval`, `--health-timeout`, `--health-retries`, `--health-start-period`, `--health-start-interval`, and `--no-healthcheck`. - Sources/Services/ContainerAPIService/Client/Utility.swift: new private `makeHealthcheck(management:)` that translates the flag bag into a `Healthcheck`. Rejects orphan `--health-*` flags without `--health-cmd` to catch typos at submit time. - Package.swift: `ContainerAPIServiceTests` gains a dependency on the `ContainerAPIService` target so the new tests can use the `@testable` import. - Tests: - Tests/ContainerResourceTests/HealthcheckTest.swift: 12 tests covering shape parsing (`CMD` / `CMD-SHELL` / `NONE`), validation error paths, the `disable` flag, the `probeInterval` selection rule (start-interval inside the grace window only), and a legacy-config Codable round-trip regression. - Tests/ContainerAPIServiceTests/HealthStateMachineTest.swift: 10 tests exercising every transition documented in the design: initial state, success during grace, failure during grace, failures past grace toward `retries`, success resets the counter, `unhealthy` recovers without restart, disabled machine ignores inputs, retries=0 corner case. - Tests/ContainerAPIServiceTests/HealthMonitorTest.swift: 4 tests against a `ScriptedProber` actor (deterministic probe outcomes) and a `StatusRecorder` (ordered update capture). Covers the disabled-check single-callback path, the `.starting` -> `.healthy` transition, the consecutive-failure -> `.unhealthy` path, and the unregister-cancels-loop guarantee. 
Design notes ------------ The implementation follows the architecture recommendation produced during a design consult (see CHAOS-1381 thread): observer placement in a dedicated actor (mirroring `ExitMonitor`), probe execution through the existing `createProcess` / `startProcess` / `wait` path (no new XPC route added), Docker-compatible state machine semantics, and generation-gated snapshot updates rather than relying on cancellation alone to suppress stale callbacks. Wire compatibility ------------------ `ContainerConfiguration.healthcheck` is a new optional field, decoded with `decodeIfPresent`. Containers persisted by older daemons round-trip cleanly (covered by `testLegacyContainerConfigurationDecodesWithoutHealthcheck`). New CLI flags are independent and have no effect when omitted, so older clients hitting a newer daemon and vice versa both behave identically to today. Known limitations (intentional, follow-up work) ----------------------------------------------- - The `--health-cmd` CLI shape currently accepts only the shell form (translated to `["CMD-SHELL", cmd]`). The richer `["CMD", "exec", "arg1", ...]` form is reachable via API clients that build `Healthcheck` directly (e.g. compose orchestrators). Adding a CLI surface for CMD-form probes is a follow-up. - Daemon restart does not rehydrate health state. On daemon launch, observers are restarted from `.starting` rather than persisting probe counters. Per the design consult this is deliberate scope for v1. - Probe intervals use Foundation `TimeInterval` (Double seconds). Compose-spec duration strings (`30s`, `1m30s`) are parsed by the client (e.g. container-compose) before reaching the API. Pairs with CHAOS-1319 --------------------- CHAOS-1319 reserved the SDK shape (`ContainerSnapshot.health`). This PR is the runtime that populates it, closing the loop for compose-spec `depends_on.condition: service_healthy` against container-compose orchestrators. 
CHAOS-1319's PR (full-chaos/container#13) should land first or be batched with this one. Verification ------------ - `swift build -c release` clean on macOS 26 / Apple silicon. - `swift test --filter 'HealthcheckTest|HealthStateMachineTest| HealthMonitorTest'` passes 26/26: 12 Healthcheck data shape + Codable + validation, 10 pure HealthStateMachine transitions, 4 HealthMonitor actor lifecycle / cancellation tests. --- Package.swift | 1 + .../Container/ContainerConfiguration.swift | 6 + .../Container/Healthcheck.swift | 214 ++++++++++++++++++ .../ContainerAPIService/Client/Flags.swift | 42 ++++ .../ContainerAPIService/Client/Utility.swift | 29 +++ .../Server/Containers/ContainersService.swift | 34 +++ .../Server/Containers/HealthMonitor.swift | 127 +++++++++++ .../Server/Containers/HealthProber.swift | 132 +++++++++++ .../Containers/HealthStateMachine.swift | 60 +++++ .../HealthMonitorTest.swift | 194 ++++++++++++++++ .../HealthStateMachineTest.swift | 113 +++++++++ .../HealthcheckTest.swift | 173 ++++++++++++++ 12 files changed, 1125 insertions(+) create mode 100644 Sources/ContainerResource/Container/Healthcheck.swift create mode 100644 Sources/Services/ContainerAPIService/Server/Containers/HealthMonitor.swift create mode 100644 Sources/Services/ContainerAPIService/Server/Containers/HealthProber.swift create mode 100644 Sources/Services/ContainerAPIService/Server/Containers/HealthStateMachine.swift create mode 100644 Tests/ContainerAPIServiceTests/HealthMonitorTest.swift create mode 100644 Tests/ContainerAPIServiceTests/HealthStateMachineTest.swift create mode 100644 Tests/ContainerResourceTests/HealthcheckTest.swift diff --git a/Package.swift b/Package.swift index 442aa8b9a..bf3f7d958 100644 --- a/Package.swift +++ b/Package.swift @@ -203,6 +203,7 @@ let package = Package( name: "ContainerAPIServiceTests", dependencies: [ .product(name: "Containerization", package: "containerization"), + "ContainerAPIService", "ContainerResource", "ContainerRuntimeLinuxClient", 
"ContainerRuntimeClient", diff --git a/Sources/ContainerResource/Container/ContainerConfiguration.swift b/Sources/ContainerResource/Container/ContainerConfiguration.swift index 4dc75b9ad..99e898afb 100644 --- a/Sources/ContainerResource/Container/ContainerConfiguration.swift +++ b/Sources/ContainerResource/Container/ContainerConfiguration.swift @@ -61,6 +61,10 @@ public struct ContainerConfiguration: Sendable, Codable { public var shmSize: UInt64? /// Signal to send to the container process on stop (from image config). public var stopSignal: String? + /// Optional periodic healthcheck spec. When set and not effectively + /// disabled, the API server starts a per-container observer that runs + /// the configured probe and updates ``ContainerSnapshot/health``. + public var healthcheck: Healthcheck? enum CodingKeys: String, CodingKey { case id @@ -85,6 +89,7 @@ public struct ContainerConfiguration: Sendable, Codable { case capDrop case shmSize case stopSignal + case healthcheck } /// Create a configuration from the supplied Decoder, initializing missing @@ -120,6 +125,7 @@ public struct ContainerConfiguration: Sendable, Codable { capDrop = try container.decodeIfPresent([String].self, forKey: .capDrop) ?? [] shmSize = try container.decodeIfPresent(UInt64.self, forKey: .shmSize) stopSignal = try container.decodeIfPresent(String.self, forKey: .stopSignal) + healthcheck = try container.decodeIfPresent(Healthcheck.self, forKey: .healthcheck) } public struct DNSConfiguration: Sendable, Codable { diff --git a/Sources/ContainerResource/Container/Healthcheck.swift b/Sources/ContainerResource/Container/Healthcheck.swift new file mode 100644 index 000000000..c443f8a4b --- /dev/null +++ b/Sources/ContainerResource/Container/Healthcheck.swift @@ -0,0 +1,214 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the container project authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import ContainerizationError +import Foundation + +/// Configuration for a periodic, container-level healthcheck. +/// +/// The shape mirrors the Docker / compose-spec healthcheck schema so that +/// downstream tools (the canonical use case is a compose-spec orchestrator +/// implementing `depends_on.condition: service_healthy`) can populate this +/// type directly from a `docker-compose.yml` `healthcheck:` block. +/// +/// Semantics applied by the daemon's healthcheck observer: +/// +/// 1. When the observer starts and the healthcheck is enabled, the +/// container's ``ContainerSnapshot/health`` is set to +/// ``HealthStatus/starting``. +/// 2. While the wall-clock age of the container is within ``startPeriod``, +/// failed probes do not advance the consecutive failure counter. +/// Successful probes during the grace period transition the container +/// immediately to ``HealthStatus/healthy``. +/// 3. After the grace period elapses, ``retries`` consecutive failed probes +/// transition the container to ``HealthStatus/unhealthy``. A subsequent +/// successful probe resets the counter and transitions back to +/// ``HealthStatus/healthy`` without requiring a restart. +/// 4. A probe that does not return within ``timeout`` counts as a failed +/// probe. +/// 5. 
``test`` of `["NONE"]` and ``disable`` set to `true` both bypass the +/// probe loop; ``ContainerSnapshot/health`` reports ``HealthStatus/none``. +public struct Healthcheck: Codable, Sendable, Equatable { + /// The probe specification. + /// + /// Compatible shapes: + /// - `["NONE"]` — disable any healthcheck inherited from the image. + /// - `["CMD", "executable", "arg1", ...]` — run `executable` with the + /// supplied arguments directly inside the container. Exit code `0` + /// means healthy, any other exit code means unhealthy. + /// - `["CMD-SHELL", "shell command string"]` — run the entire command + /// string through the container's default shell (`/bin/sh -c`). + public let test: [String] + + /// Time between consecutive probes, in seconds. Must be positive and finite. Defaults to 30 seconds. + public let interval: TimeInterval + + /// Per-probe deadline, in seconds. A probe that does not return within + /// this window counts as a failed probe. Must be positive and finite. Defaults to 30 seconds. + public let timeout: TimeInterval + + /// Number of consecutive failed probes that transition the container + /// from ``HealthStatus/healthy`` (or ``HealthStatus/starting``) to + /// ``HealthStatus/unhealthy``. Defaults to 3. + public let retries: Int + + /// Optional grace window, in seconds, during which failed probes do not + /// count toward ``retries``. The first successful probe during this + /// window transitions the container immediately to + /// ``HealthStatus/healthy``. When `nil`, no grace is applied. + public let startPeriod: TimeInterval? + + /// Optional probe interval used while the container is still within + /// ``startPeriod``. When `nil`, ``interval`` is used during the grace + /// window as well. + public let startInterval: TimeInterval? + + /// Skip the probe loop entirely. Equivalent to ``test`` = `["NONE"]`. + public let disable: Bool? + + /// Default probe interval applied when the configuration omits one. 
+ public static let defaultInterval: TimeInterval = 30 + /// Default per-probe deadline applied when the configuration omits one. + public static let defaultTimeout: TimeInterval = 30 + /// Default consecutive-failure threshold applied when the configuration + /// omits one. + public static let defaultRetries: Int = 3 + + public init( + test: [String], + interval: TimeInterval = Healthcheck.defaultInterval, + timeout: TimeInterval = Healthcheck.defaultTimeout, + retries: Int = Healthcheck.defaultRetries, + startPeriod: TimeInterval? = nil, + startInterval: TimeInterval? = nil, + disable: Bool? = nil + ) throws { + self.test = test + self.interval = interval + self.timeout = timeout + self.retries = retries + self.startPeriod = startPeriod + self.startInterval = startInterval + self.disable = disable + try validate() + } + + enum CodingKeys: String, CodingKey { + case test + case interval + case timeout + case retries + case startPeriod + case startInterval + case disable + } + + public init(from decoder: Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + test = try container.decode([String].self, forKey: .test) + interval = try container.decodeIfPresent(TimeInterval.self, forKey: .interval) ?? Healthcheck.defaultInterval + timeout = try container.decodeIfPresent(TimeInterval.self, forKey: .timeout) ?? Healthcheck.defaultTimeout + retries = try container.decodeIfPresent(Int.self, forKey: .retries) ?? 
Healthcheck.defaultRetries + startPeriod = try container.decodeIfPresent(TimeInterval.self, forKey: .startPeriod) + startInterval = try container.decodeIfPresent(TimeInterval.self, forKey: .startInterval) + disable = try container.decodeIfPresent(Bool.self, forKey: .disable) + try validate() + } + + public func encode(to encoder: Encoder) throws { + var container = encoder.container(keyedBy: CodingKeys.self) + try container.encode(test, forKey: .test) + try container.encode(interval, forKey: .interval) + try container.encode(timeout, forKey: .timeout) + try container.encode(retries, forKey: .retries) + try container.encodeIfPresent(startPeriod, forKey: .startPeriod) + try container.encodeIfPresent(startInterval, forKey: .startInterval) + try container.encodeIfPresent(disable, forKey: .disable) + } + + /// Whether the healthcheck is effectively disabled (no probe loop is + /// started and ``ContainerSnapshot/health`` reports ``HealthStatus/none``). + public var isEffectivelyDisabled: Bool { + if disable == true { return true } + if test.count == 1 && test[0] == "NONE" { return true } + return false + } + + /// The probe interval that should be used at the supplied wall-clock age + /// of the container. Returns ``startInterval`` while the container is + /// still within ``startPeriod``, otherwise ``interval``. 
+ public func probeInterval(forContainerAge age: TimeInterval) -> TimeInterval { + if let startPeriod, age < startPeriod, let startInterval { + return startInterval + } + return interval + } + + private func validate() throws { + guard !test.isEmpty else { + throw ContainerizationError( + .invalidArgument, + message: "healthcheck test must not be empty" + ) + } + if !isEffectivelyDisabled { + switch test[0] { + case "CMD", "CMD-SHELL": + guard test.count >= 2 else { + throw ContainerizationError( + .invalidArgument, + message: "healthcheck test '\(test[0])' requires at least one argument" + ) + } + default: + throw ContainerizationError( + .invalidArgument, + message: "healthcheck test must start with 'NONE', 'CMD', or 'CMD-SHELL' (got '\(test[0])')" + ) + } + } + guard interval.isFinite, interval > 0 else { + throw ContainerizationError( + .invalidArgument, + message: "healthcheck interval must be positive and finite (got \(interval))" + ) + } + guard timeout.isFinite, timeout > 0 else { + throw ContainerizationError( + .invalidArgument, + message: "healthcheck timeout must be positive and finite (got \(timeout))" + ) + } + guard retries >= 0 else { + throw ContainerizationError( + .invalidArgument, + message: "healthcheck retries must be non-negative (got \(retries))" + ) + } + if let startPeriod, !(startPeriod.isFinite && startPeriod >= 0) { + throw ContainerizationError( + .invalidArgument, + message: "healthcheck start_period must be non-negative and finite (got \(startPeriod))" + ) + } + if let startInterval, !(startInterval.isFinite && startInterval > 0) { + throw ContainerizationError( + .invalidArgument, + message: "healthcheck start_interval must be positive and finite (got \(startInterval))" + ) + } + } +} diff --git a/Sources/Services/ContainerAPIService/Client/Flags.swift b/Sources/Services/ContainerAPIService/Client/Flags.swift index f8361ef68..65cc5926f 100644 --- a/Sources/Services/ContainerAPIService/Client/Flags.swift +++ 
b/Sources/Services/ContainerAPIService/Client/Flags.swift @@ -345,6 +345,48 @@ public struct Flags { @Option(name: [.customLong("volume"), .short], help: "Bind 
mount a volume into the container") public var volumes: [String] = [] + @Option( + name: .customLong("health-cmd"), + help: "Healthcheck command to run inside the container (executed via /bin/sh -c)." + ) + public var healthCmd: String? + + @Option( + name: .customLong("health-interval"), + help: "Time between healthcheck probes, in seconds (default 30)." + ) + public var healthInterval: Double? + + @Option( + name: .customLong("health-timeout"), + help: "Per-probe deadline for the healthcheck, in seconds (default 30)." + ) + public var healthTimeout: Double? + + @Option( + name: .customLong("health-retries"), + help: "Number of consecutive failed probes before the container is reported unhealthy (default 3)." + ) + public var healthRetries: Int? + + @Option( + name: .customLong("health-start-period"), + help: "Grace window after start during which failed probes do not count, in seconds." + ) + public var healthStartPeriod: Double? + + @Option( + name: .customLong("health-start-interval"), + help: "Probe interval used while still within the grace window, in seconds." + ) + public var healthStartInterval: Double? + + @Flag( + name: .customLong("no-healthcheck"), + help: "Disable any image-baked healthcheck for this container." + ) + public var noHealthcheck: Bool = false + public func validate() throws { if dnsDisabled { let hasDNSConfig = diff --git a/Sources/Services/ContainerAPIService/Client/Utility.swift b/Sources/Services/ContainerAPIService/Client/Utility.swift index cf5b8d6df..cf6e95c66 100644 --- a/Sources/Services/ContainerAPIService/Client/Utility.swift +++ b/Sources/Services/ContainerAPIService/Client/Utility.swift @@ -268,9 +268,38 @@ public struct Utility { config.runtimeHandler = runtime } + config.healthcheck = try Self.makeHealthcheck(management: management) + return (config, kernel, management.initImage) } + private static func makeHealthcheck(management: Flags.Management) throws -> Healthcheck? 
{ + if management.noHealthcheck { + return try Healthcheck(test: ["NONE"]) + } + guard let cmd = management.healthCmd else { + // Reject orphan health-* flags without a command — catch typos early. + if management.healthInterval != nil || management.healthTimeout != nil + || management.healthRetries != nil || management.healthStartPeriod != nil + || management.healthStartInterval != nil + { + throw ContainerizationError( + .invalidArgument, + message: "--health-* flags require --health-cmd to be specified" + ) + } + return nil + } + return try Healthcheck( + test: ["CMD-SHELL", cmd], + interval: management.healthInterval ?? Healthcheck.defaultInterval, + timeout: management.healthTimeout ?? Healthcheck.defaultTimeout, + retries: management.healthRetries ?? Healthcheck.defaultRetries, + startPeriod: management.healthStartPeriod, + startInterval: management.healthStartInterval + ) + } + static func getAttachmentConfigurations( containerId: String, builtinNetworkId: String?, diff --git a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift index b18bf55d5..0b82dc170 100644 --- a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift +++ b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift @@ -35,6 +35,7 @@ public actor ContainersService { struct ContainerState { var snapshot: ContainerSnapshot var client: RuntimeClient? 
= nil + var healthGeneration: UInt64 = 0 func getClient() throws -> RuntimeClient { guard let client else { @@ -58,6 +59,7 @@ public actor ContainersService { private let runtimePlugins: [Plugin] private let exitMonitor: ExitMonitor private let containerSystemConfig: ContainerSystemConfig + private let healthMonitor: HealthMonitor private let lock: AsyncLock private var containers: [String: ContainerState] @@ -75,6 +77,7 @@ public actor ContainersService { let containerRoot = appRoot.appendingPathComponent("containers") try FileManager.default.createDirectory(at: containerRoot, withIntermediateDirectories: true) self.exitMonitor = ExitMonitor(log: log) + self.healthMonitor = HealthMonitor(log: log) self.lock = AsyncLock(log: log) self.containerRoot = containerRoot self.pluginLoader = pluginLoader @@ -570,7 +573,25 @@ public actor ContainersService { state.snapshot.status = .running state.snapshot.networks = sandboxSnapshot.networks state.snapshot.startedDate = Date() + state.healthGeneration &+= 1 + let healthGeneration = state.healthGeneration + let healthcheck = state.snapshot.configuration.healthcheck + let healthClient = client + let startedAt = state.snapshot.startedDate ?? Date() await self.setContainerState(id, state, context: context) + + if let healthcheck { + let prober = SandboxClientHealthProber(sandboxClient: healthClient, log: self.log) + await self.healthMonitor.register( + id: id, + generation: healthGeneration, + startedAt: startedAt, + healthcheck: healthcheck, + prober: prober + ) { [weak self] containerID, gen, status in + await self?.applyHealthUpdate(id: containerID, generation: gen, status: status) + } + } } catch { await self.exitMonitor.stopTracking(id: id) try? 
await client.stop(options: ContainerStopOptions.default)
@@ -945,6 +966,7 @@ public actor ContainersService {
         }
 
         await self.exitMonitor.stopTracking(id: id)
+        await self.healthMonitor.unregister(id: id)
 
         // Shutdown and deregister the runtime service
         self.log.info("shutting down runtime service", metadata: ["id": "\(id)"])
@@ -1117,6 +1139,18 @@ public actor ContainersService {
         self.containers[id] = state
     }
 
+    /// Apply a health-status update reported by the ``HealthMonitor`` observer.
+    /// The update is dropped when the container is no longer tracked, when
+    /// `generation` differs from the container's current `healthGeneration`,
+    /// or when its status is no longer ``RuntimeStatus/running``.
+    private func applyHealthUpdate(id: String, generation: UInt64, status: HealthStatus) async {
+        guard var state = self.containers[id] else { return }
+        guard state.healthGeneration == generation else { return }
+        guard state.snapshot.status == .running else { return }
+        state.snapshot.health = status
+        self.containers[id] = state
+    }
+
     private func getContainerState(id: String, context: AsyncLock.Context) throws -> ContainerState {
         try self._getContainerState(id: id)
     }
diff --git a/Sources/Services/ContainerAPIService/Server/Containers/HealthMonitor.swift b/Sources/Services/ContainerAPIService/Server/Containers/HealthMonitor.swift
new file mode 100644
index 000000000..0e3d6ae5c
--- /dev/null
+++ b/Sources/Services/ContainerAPIService/Server/Containers/HealthMonitor.swift
@@ -0,0 +1,143 @@
+//===----------------------------------------------------------------------===//
+// Copyright © 2026 Apple Inc. and the container project authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//===----------------------------------------------------------------------===//
+
+import ContainerResource
+import Foundation
+import Logging
+
+/// Per-container healthcheck observer manager. Mirrors the lifecycle pattern
+/// of ``ExitMonitor``: callers register a container at the moment it reaches
+/// ``RuntimeStatus/running`` and unregister when it transitions away from
+/// running. The actor owns the per-container observer ``Task`` and is the
+/// single point that may cancel them.
+///
+/// Updates flow back to the caller through the supplied ``onUpdate`` callback
+/// together with the generation token that was passed to ``register``. The
+/// receiver is expected to drop updates whose generation no longer matches
+/// the live container instance (see CHAOS-1381 design notes).
+public actor HealthMonitor {
+    /// Callback signature: `(containerID, generation, status)`.
+    public typealias HealthUpdateCallback = @Sendable (String, UInt64, HealthStatus) async -> Void
+
+    /// Live observer tasks keyed by container id; at most one per container.
+    private var tasks: [String: Task<Void, Never>] = [:]
+    private let log: Logger?
+
+    public init(log: Logger? = nil) {
+        self.log = log
+    }
+
+    /// Start observing the addressed container. Cancels any prior observer
+    /// for the same id. When ``Healthcheck/isEffectivelyDisabled`` is `true`
+    /// the callback is invoked once with ``HealthStatus/none`` and no probe
+    /// loop is started.
+    ///
+    /// Otherwise ``HealthStatus/starting`` is reported before this method
+    /// returns, and later transitions are reported from the observer task.
+    /// The task is stored before the first suspension point so that an
+    /// ``unregister(id:)`` arriving while the callback runs still cancels it.
+    public func register(
+        id: String,
+        generation: UInt64,
+        startedAt: Date,
+        healthcheck: Healthcheck,
+        prober: any HealthProber,
+        onUpdate: @escaping HealthUpdateCallback
+    ) async {
+        // Synchronous on purpose: no other call may interleave between
+        // dropping the previous observer and installing the new one.
+        tasks.removeValue(forKey: id)?.cancel()
+
+        if healthcheck.isEffectivelyDisabled {
+            await onUpdate(id, generation, .none)
+            return
+        }
+
+        // Delivered from its own task so the observer can be tracked first,
+        // while still being ordered before any probe-driven update.
+        let announceStarting = Task { await onUpdate(id, generation, .starting) }
+
+        let log = self.log
+        let task = Task { [prober] in
+            await announceStarting.value
+
+            var stateMachine = HealthStateMachine(configuration: healthcheck)
+            var lastReportedStatus = stateMachine.currentStatus
+
+            while !Task.isCancelled {
+                let now = Date()
+                let age = now.timeIntervalSince(startedAt)
+                let interval = healthcheck.probeInterval(forContainerAge: age)
+
+                do {
+                    try await Task.sleep(nanoseconds: UInt64(max(0, interval) * 1_000_000_000))
+                } catch {
+                    // Sleep only throws on cancellation.
+                    return
+                }
+
+                let probeResult = await prober.runProbe(
+                    containerID: id,
+                    test: healthcheck.test,
+                    timeout: healthcheck.timeout
+                )
+                // Unregistered while probing: the result is stale, drop it.
+                guard !Task.isCancelled else { return }
+
+                let probeAge = Date().timeIntervalSince(startedAt)
+                switch probeResult {
+                case .success:
+                    stateMachine.recordSuccess()
+                case .failure, .timedOut:
+                    stateMachine.recordFailure(containerAge: probeAge)
+                }
+
+                if stateMachine.currentStatus != lastReportedStatus {
+                    lastReportedStatus = stateMachine.currentStatus
+                    log?.info(
+                        "health status transition",
+                        metadata: [
+                            "id": "\(id)",
+                            "status": "\(stateMachine.currentStatus)",
+                            "result": "\(probeResult)",
+                        ])
+                    await onUpdate(id, generation, stateMachine.currentStatus)
+                }
+            }
+        }
+        tasks[id] = task
+
+        await announceStarting.value
+    }
+
+    /// Stop observing the addressed container if a task is registered. Idempotent.
+    public func unregister(id: String) async {
+        cancelExistingTask(id: id)
+    }
+
+    /// Cancel every registered observer. Used during daemon shutdown.
+    public func unregisterAll() async {
+        for task in tasks.values {
+            task.cancel()
+        }
+        tasks.removeAll()
+    }
+
+    /// Removes and cancels the observer task for `id`, if one is registered.
+    private func cancelExistingTask(id: String) {
+        tasks.removeValue(forKey: id)?.cancel()
+    }
+}
diff --git a/Sources/Services/ContainerAPIService/Server/Containers/HealthProber.swift b/Sources/Services/ContainerAPIService/Server/Containers/HealthProber.swift
new file mode 100644
index 000000000..ac7666b42
--- /dev/null
+++ b/Sources/Services/ContainerAPIService/Server/Containers/HealthProber.swift
@@ -0,0 +1,132 @@
+//===----------------------------------------------------------------------===//
+// Copyright © 2026 Apple Inc. and the container project authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//===----------------------------------------------------------------------===//
+
+import ContainerResource
+import ContainerSandboxServiceClient
+import Foundation
+import Logging
+
+/// The outcome of a single healthcheck probe attempt.
+public enum HealthProbeResult: Sendable, Equatable {
+    case success
+    case failure(exitCode: Int32?)
+    case timedOut
+}
+
+/// Abstracts the execution of a single probe so that the observer logic can
+/// be unit-tested without a running sandbox.
+public protocol HealthProber: Sendable {
+    /// Run a single probe inside the addressed container and return the
+    /// outcome.
The implementation is responsible for enforcing the supplied
+    /// `timeout`; callers expect this method to return promptly.
+    func runProbe(
+        containerID: String,
+        test: [String],
+        timeout: TimeInterval
+    ) async -> HealthProbeResult
+}
+
+/// Production ``HealthProber`` that drives an existing ``SandboxClient`` to
+/// spawn a fresh process per probe. Stdio is intentionally not forwarded so
+/// the probe leaves no log output behind; the exit code (or absence thereof
+/// on timeout) is the only signal consumed.
+public struct SandboxClientHealthProber: HealthProber {
+    private let sandboxClient: SandboxClient
+    private let log: Logger?
+    private static let probeIDPrefix = "__container_healthcheck_"
+
+    public init(sandboxClient: SandboxClient, log: Logger? = nil) {
+        self.sandboxClient = sandboxClient
+        self.log = log
+    }
+
+    public func runProbe(
+        containerID: String,
+        test: [String],
+        timeout: TimeInterval
+    ) async -> HealthProbeResult {
+        guard let processConfig = Self.makeProcessConfiguration(test: test) else {
+            return .failure(exitCode: nil)
+        }
+        let probeID = Self.probeIDPrefix + UUID().uuidString
+
+        do {
+            try await sandboxClient.createProcess(probeID, config: processConfig, stdio: [nil, nil, nil])
+            try await sandboxClient.startProcess(probeID)
+        } catch {
+            log?.warning(
+                "healthcheck probe failed to start",
+                metadata: [
+                    "id": "\(containerID)",
+                    "probe": "\(probeID)",
+                    "error": "\(error)",
+                ])
+            return .failure(exitCode: nil)
+        }
+
+        let outcome = await withTaskGroup(of: HealthProbeResult.self) { group in
+            group.addTask { [sandboxClient] in
+                do {
+                    let status = try await sandboxClient.wait(probeID)
+                    return status.exitCode == 0
+                        ? .success
+                        : .failure(exitCode: status.exitCode)
+                } catch is CancellationError {
+                    return .timedOut
+                } catch {
+                    return .failure(exitCode: nil)
+                }
+            }
+            group.addTask {
+                try? await Task.sleep(nanoseconds: UInt64(timeout * 1_000_000_000))  // NOTE(review): conversion traps on a negative or NaN timeout; confirm callers validate it
+                return .timedOut
+            }
+
+            let first = await group.next() ?? .failure(exitCode: nil)
+            // Unblock any still-running wait() by killing the synthetic probe.
+            // Done before draining the group so the wait task can return.
+            if first == .timedOut {
+                try? await sandboxClient.kill(probeID, signal: 9)
+            }
+            group.cancelAll()
+            for await _ in group {}
+            return first
+        }
+        return outcome
+    }
+
+    private static func makeProcessConfiguration(test: [String]) -> ProcessConfiguration? {
+        guard let kind = test.first else { return nil }
+        switch kind {
+        case "CMD":
+            guard test.count >= 2 else { return nil }
+            return ProcessConfiguration(
+                executable: test[1],
+                arguments: Array(test.dropFirst(2)),
+                environment: []
+            )
+        case "CMD-SHELL":
+            guard test.count >= 2 else { return nil }
+            return ProcessConfiguration(
+                executable: "/bin/sh",
+                arguments: ["-c", test[1]],
+                environment: []  // NOTE(review): the probe runs with an empty environment; confirm this is intended
+            )
+        default:
+            return nil
+        }
+    }
+}
diff --git a/Sources/Services/ContainerAPIService/Server/Containers/HealthStateMachine.swift b/Sources/Services/ContainerAPIService/Server/Containers/HealthStateMachine.swift
new file mode 100644
index 000000000..cb4a4e7a7
--- /dev/null
+++ b/Sources/Services/ContainerAPIService/Server/Containers/HealthStateMachine.swift
@@ -0,0 +1,74 @@
+//===----------------------------------------------------------------------===//
+// Copyright © 2026 Apple Inc. and the container project authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//===----------------------------------------------------------------------===//
+
+import ContainerResource
+import Foundation
+
+/// Pure state machine that maps a sequence of healthcheck probe outcomes to a
+/// ``HealthStatus`` for a single container. This type is intentionally
+/// dependency-free so the transition rules (Docker-compatible: grace window,
+/// retries threshold, recovery without restart) can be exercised in isolation
+/// by the unit-test layer.
+public struct HealthStateMachine: Sendable {
+    /// The healthcheck settings this machine applies.
+    public let configuration: Healthcheck
+    /// Status derived from the outcomes recorded so far.
+    public private(set) var currentStatus: HealthStatus
+    /// Failures counted since the last success.
+    public private(set) var consecutiveFailures: Int
+    /// Whether any probe has succeeded yet. Docker treats the first success
+    /// as the end of the start period, even if ``Healthcheck/startPeriod``
+    /// has not elapsed.
+    private var hasSucceeded = false
+
+    public init(configuration: Healthcheck) {
+        self.configuration = configuration
+        self.consecutiveFailures = 0
+        self.currentStatus = configuration.isEffectivelyDisabled ? .none : .starting
+    }
+
+    /// Record a probe that completed successfully (exit code zero). Resets the
+    /// consecutive failure counter, transitions the status to ``.healthy`` and
+    /// ends the start-period grace window. No-op when the healthcheck is
+    /// disabled.
+    public mutating func recordSuccess() {
+        guard !configuration.isEffectivelyDisabled else { return }
+        hasSucceeded = true
+        consecutiveFailures = 0
+        currentStatus = .healthy
+    }
+
+    /// Record a probe that did not complete successfully.
+    ///
+    /// A failure is ignored while the container's age is still within
+    /// ``Healthcheck/startPeriod`` and no probe has succeeded yet (grace
+    /// window). Otherwise the counter advances and the status transitions to
+    /// ``.unhealthy`` once it reaches ``Healthcheck/retries``.
+    ///
+    /// - Parameter containerAge: Time since the container started, in the
+    ///   same unit as ``Healthcheck/startPeriod``.
+    public mutating func recordFailure(containerAge: TimeInterval) {
+        guard !configuration.isEffectivelyDisabled else { return }
+        let inGraceWindow = configuration.startPeriod.map { containerAge < $0 } ?? false
+        if inGraceWindow && !hasSucceeded {
+            return
+        }
+        consecutiveFailures += 1
+        if consecutiveFailures >= configuration.retries {
+            currentStatus = .unhealthy
+        }
+    }
+}
diff --git a/Tests/ContainerAPIServiceTests/HealthMonitorTest.swift b/Tests/ContainerAPIServiceTests/HealthMonitorTest.swift
new file mode 100644
index 000000000..1865f2d36
--- /dev/null
+++ b/Tests/ContainerAPIServiceTests/HealthMonitorTest.swift
@@ -0,0 +1,194 @@
+//===----------------------------------------------------------------------===//
+// Copyright © 2026 Apple Inc. and the container project authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//===----------------------------------------------------------------------===//
+
+import ContainerResource
+import Foundation
+import Testing
+
+@testable import ContainerAPIService
+
+/// Mock prober that returns scripted results in order. Each call consumes one
+/// entry; once exhausted it parks for ten seconds (or until cancelled) and
+/// then reports a failure, so a cancelled observer stays in a known state.
+private actor ScriptedProber: HealthProber {
+    private var script: [HealthProbeResult]
+    private var calls: [(containerID: String, test: [String], timeout: TimeInterval)] = []
+
+    init(_ script: [HealthProbeResult]) {
+        self.script = script
+    }
+
+    func runProbe(
+        containerID: String,
+        test: [String],
+        timeout: TimeInterval
+    ) async -> HealthProbeResult {
+        calls.append((containerID, test, timeout))
+        if script.isEmpty {
+            try? await Task.sleep(nanoseconds: 10_000_000_000)
+            return .failure(exitCode: nil)
+        }
+        return script.removeFirst()
+    }
+
+    func recordedCalls() -> [(containerID: String, test: [String], timeout: TimeInterval)] {
+        calls
+    }
+}
+
+/// Collects the status updates emitted by the monitor and lets a test await a given count.
+private actor StatusRecorder {
+    private var updates: [(id: String, generation: UInt64, status: HealthStatus)] = []
+    private var continuations: [(Int, CheckedContinuation<Void, Never>)] = []
+
+    func record(id: String, generation: UInt64, status: HealthStatus) {
+        updates.append((id, generation, status))
+        // Wake any waiters whose threshold has been reached.
+        continuations = continuations.filter { (threshold, cont) in
+            if updates.count >= threshold {
+                cont.resume()
+                return false
+            }
+            return true
+        }
+    }
+
+    func waitForUpdates(count: Int) async {
+        if updates.count >= count { return }
+        await withCheckedContinuation { cont in
+            continuations.append((count, cont))
+        }
+    }
+
+    func snapshot() -> [(id: String, generation: UInt64, status: HealthStatus)] {
+        updates
+    }
+}
+
+struct HealthMonitorTest {
+    private func makeQuickHealthcheck(retries: Int = 1) throws -> Healthcheck {
+        try Healthcheck(
+            test: ["CMD-SHELL", "true"],
+            interval: 0.005,
+            timeout: 1,
+            retries: retries
+        )
+    }
+
+    @Test func disabledHealthcheckEmitsSingleNoneUpdate() async throws {
+        let monitor = HealthMonitor()
+        let prober = ScriptedProber([])
+        let recorder = StatusRecorder()
+
+        let h = try Healthcheck(test: ["NONE"])
+        await monitor.register(
+            id: "c1",
+            generation: 1,
+            startedAt: Date(),
+            healthcheck: h,
+            prober: prober
+        ) { id, gen, status in
+            await recorder.record(id: id, generation: gen, status: status)
+        }
+        await recorder.waitForUpdates(count: 1)
+        let updates = await recorder.snapshot()
+        #expect(updates.count == 1)
+        #expect(updates[0].id == "c1")
+        #expect(updates[0].generation == 1)
+        #expect(updates[0].status == .none)
+        await monitor.unregisterAll()
+    }
+
+    @Test func enabledHealthcheckEmitsStartingThenHealthy() async throws {
+        let monitor = HealthMonitor()
+        let prober = ScriptedProber([.success])
+        let recorder = StatusRecorder()
+
+        let h = try makeQuickHealthcheck()
+        await monitor.register(
+            id: "c1",
+            generation: 7,
+            startedAt: Date(),
+            healthcheck: h,
+            prober: prober
+        ) { id, gen, status in
+            await recorder.record(id: id, generation: gen, status: status)
+        }
+        await recorder.waitForUpdates(count: 2)
+        let updates = await recorder.snapshot()
+        #expect(updates.count >= 2)
+        #expect(updates[0].status == .starting)
+        #expect(updates[1].status == .healthy)
+        #expect(updates.allSatisfy { $0.id == "c1" && $0.generation == 7 })
+        await monitor.unregisterAll()
+    }
+
+    @Test func consecutiveFailuresEventuallyTransitionToUnhealthy() async throws {
+        let monitor = HealthMonitor()
+        let prober = ScriptedProber([
+            .failure(exitCode: 1),
+            .failure(exitCode: 1),
+            .failure(exitCode: 1),
+        ])
+        let recorder = StatusRecorder()
+
+        let h = try makeQuickHealthcheck(retries: 3)
+        await monitor.register(
+            id: "c1",
+            generation: 1,
+            startedAt: Date(),
+            healthcheck: h,
+            prober: prober
+        ) { id, gen, status in
+            await recorder.record(id: id, generation: gen, status: status)
+        }
+        await recorder.waitForUpdates(count: 2)
+        let updates = await recorder.snapshot()
+        let unhealthyUpdates = updates.filter { $0.status == .unhealthy }
+        #expect(!unhealthyUpdates.isEmpty)
+        await monitor.unregisterAll()
+    }
+
+    @Test func unregisterCancelsObserverLoop() async throws {
+        let monitor = HealthMonitor()
+        let prober = ScriptedProber([.success, .success, .success])
+        let recorder = StatusRecorder()
+
+        let h = try makeQuickHealthcheck()
+        await monitor.register(
+            id: "c1",
+            generation: 1,
+            startedAt: Date(),
+            healthcheck: h,
+            prober: prober
+        ) { id, gen, status in
+            await recorder.record(id: id, generation: gen, status: status)
+        }
+        // Allow at least one probe to land before cancelling.
+        await recorder.waitForUpdates(count: 2)
+        await monitor.unregister(id: "c1")
+
+        // Sleep briefly to let any in-flight probes finish, then capture.
+        try await Task.sleep(nanoseconds: 50_000_000)
+        let after = await recorder.snapshot().count
+
+        // Verify that no significant additional updates accrue beyond what
+        // arrived during the brief settle window after cancellation.
+        try await Task.sleep(nanoseconds: 100_000_000)
+        let later = await recorder.snapshot().count
+        #expect(later <= after + 1)
+    }
+}
diff --git a/Tests/ContainerAPIServiceTests/HealthStateMachineTest.swift b/Tests/ContainerAPIServiceTests/HealthStateMachineTest.swift
new file mode 100644
index 000000000..6865b00c3
--- /dev/null
+++ b/Tests/ContainerAPIServiceTests/HealthStateMachineTest.swift
@@ -0,0 +1,113 @@
+//===----------------------------------------------------------------------===//
+// Copyright © 2026 Apple Inc. and the container project authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//===----------------------------------------------------------------------===//
+
+import ContainerResource
+import Foundation
+import Testing
+
+@testable import ContainerAPIService
+
+struct HealthStateMachineTest {
+    private func makeHealthcheck(
+        retries: Int = 3,
+        startPeriod: TimeInterval? = nil
+    ) throws -> Healthcheck {
+        try Healthcheck(
+            test: ["CMD-SHELL", "true"],
+            retries: retries,
+            startPeriod: startPeriod
+        )
+    }
+
+    @Test func initialStateIsStartingWhenEnabled() throws {
+        let sm = HealthStateMachine(configuration: try makeHealthcheck())
+        #expect(sm.currentStatus == .starting)
+    }
+
+    @Test func initialStateIsNoneWhenDisabled() throws {
+        let h = try Healthcheck(test: ["NONE"])
+        let sm = HealthStateMachine(configuration: h)
+        #expect(sm.currentStatus == .none)
+    }
+
+    @Test func successDuringGraceTransitionsImmediatelyToHealthy() throws {
+        let h = try makeHealthcheck(startPeriod: 60)
+        var sm = HealthStateMachine(configuration: h)
+        sm.recordSuccess()
+        #expect(sm.currentStatus == .healthy)
+    }
+
+    @Test func failuresDuringGraceDoNotCount() throws {
+        let h = try makeHealthcheck(retries: 2, startPeriod: 60)
+        var sm = HealthStateMachine(configuration: h)
+        sm.recordFailure(containerAge: 5)
+        sm.recordFailure(containerAge: 10)
+        sm.recordFailure(containerAge: 15)
+        #expect(sm.currentStatus == .starting)
+        #expect(sm.consecutiveFailures == 0)
+    }
+
+    @Test func failuresAfterGraceCountTowardRetries() throws {
+        let h = try makeHealthcheck(retries: 3, startPeriod: 30)
+        var sm = HealthStateMachine(configuration: h)
+        sm.recordFailure(containerAge: 60)
+        #expect(sm.currentStatus == .starting)
+        #expect(sm.consecutiveFailures == 1)
+        sm.recordFailure(containerAge: 90)
+        #expect(sm.currentStatus == .starting)
+        #expect(sm.consecutiveFailures == 2)
+        sm.recordFailure(containerAge: 120)
+        #expect(sm.currentStatus == .unhealthy)
+        #expect(sm.consecutiveFailures == 3)
+    }
+
+    @Test func successResetsFailureCounter() throws {
+        let h = try makeHealthcheck(retries: 3)
+        var sm = HealthStateMachine(configuration: h)
+        sm.recordFailure(containerAge: 100)
+        sm.recordFailure(containerAge: 130)
+        #expect(sm.consecutiveFailures == 2)
+        sm.recordSuccess()
+        #expect(sm.currentStatus == .healthy)
+        #expect(sm.consecutiveFailures == 0)
+    }
+
+    @Test func unhealthyRecoversToHealthyOnSuccess() throws {
+        let h = try makeHealthcheck(retries: 1)
+        var sm = HealthStateMachine(configuration: h)
+        sm.recordFailure(containerAge: 100)
+        #expect(sm.currentStatus == .unhealthy)
+        sm.recordSuccess()
+        #expect(sm.currentStatus == .healthy)
+        #expect(sm.consecutiveFailures == 0)
+    }
+
+    @Test func disabledMachineIgnoresAllInputs() throws {
+        let h = try Healthcheck(test: ["NONE"])
+        var sm = HealthStateMachine(configuration: h)
+        sm.recordSuccess()
+        sm.recordFailure(containerAge: 100)
+        #expect(sm.currentStatus == .none)
+        #expect(sm.consecutiveFailures == 0)
+    }
+
+    @Test func retriesEqualsZeroFailsImmediatelyPostGrace() throws {
+        let h = try makeHealthcheck(retries: 0)
+        var sm = HealthStateMachine(configuration: h)
+        sm.recordFailure(containerAge: 100)
+        #expect(sm.currentStatus == .unhealthy)
+    }
+}
diff --git a/Tests/ContainerResourceTests/HealthcheckTest.swift b/Tests/ContainerResourceTests/HealthcheckTest.swift
new file mode 100644
index 000000000..d0e10b285
--- /dev/null
+++ b/Tests/ContainerResourceTests/HealthcheckTest.swift
@@ -0,0 +1,173 @@
+//===----------------------------------------------------------------------===//
+// Copyright © 2026 Apple Inc. and the container project authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//===----------------------------------------------------------------------===//
+
+import ContainerizationError
+import Foundation
+import Testing
+
+@testable import ContainerResource
+
+struct HealthcheckTest {
+    @Test func cmdFormParsesAndValidates() throws {
+        let h = try Healthcheck(test: ["CMD", "curl", "-f", "http://localhost"])
+        #expect(h.test == ["CMD", "curl", "-f", "http://localhost"])
+        #expect(h.interval == Healthcheck.defaultInterval)
+        #expect(h.timeout == Healthcheck.defaultTimeout)
+        #expect(h.retries == Healthcheck.defaultRetries)
+        #expect(!h.isEffectivelyDisabled)
+    }
+
+    @Test func cmdShellFormParsesAndValidates() throws {
+        let h = try Healthcheck(test: ["CMD-SHELL", "test -f /tmp/ready"])
+        #expect(h.test == ["CMD-SHELL", "test -f /tmp/ready"])
+        #expect(!h.isEffectivelyDisabled)
+    }
+
+    @Test func noneFormIsEffectivelyDisabled() throws {
+        let h = try Healthcheck(test: ["NONE"])
+        #expect(h.isEffectivelyDisabled)
+    }
+
+    @Test func disableFlagBypassesObserver() throws {
+        let h = try Healthcheck(test: ["CMD-SHELL", "true"], disable: true)
+        #expect(h.isEffectivelyDisabled)
+    }
+
+    @Test func emptyTestArrayRejected() {
+        #expect {
+            _ = try Healthcheck(test: [])
+        } throws: { error in
+            guard let err = error as? ContainerizationError else { return false }
+            #expect(err.code == .invalidArgument)
+            #expect(err.message.contains("must not be empty"))
+            return true
+        }
+    }
+
+    @Test func unknownTestKindRejected() {
+        #expect {
+            _ = try Healthcheck(test: ["BADKIND", "..."])
+        } throws: { error in
+            guard let err = error as? ContainerizationError else { return false }
+            #expect(err.code == .invalidArgument)
+            #expect(err.message.contains("must start with"))
+            return true
+        }
+    }
+
+    @Test func cmdWithoutArgumentsRejected() {
+        #expect {
+            _ = try Healthcheck(test: ["CMD"])
+        } throws: { error in
+            guard let err = error as? ContainerizationError else { return false }
+            #expect(err.code == .invalidArgument)
+            return true
+        }
+    }
+
+    @Test func nonPositiveIntervalRejected() {
+        #expect {
+            _ = try Healthcheck(test: ["CMD-SHELL", "true"], interval: 0)
+        } throws: { error in
+            guard let err = error as? ContainerizationError else { return false }
+            #expect(err.message.contains("interval"))
+            return true
+        }
+    }
+
+    @Test func negativeRetriesRejected() {
+        #expect {
+            _ = try Healthcheck(test: ["CMD-SHELL", "true"], retries: -1)
+        } throws: { error in
+            guard let err = error as? ContainerizationError else { return false }
+            #expect(err.message.contains("retries"))
+            return true
+        }
+    }
+
+    @Test func probeIntervalUsesStartIntervalDuringGrace() throws {
+        let h = try Healthcheck(
+            test: ["CMD-SHELL", "true"],
+            interval: 30,
+            startPeriod: 60,
+            startInterval: 5
+        )
+        #expect(h.probeInterval(forContainerAge: 0) == 5)
+        #expect(h.probeInterval(forContainerAge: 30) == 5)
+        #expect(h.probeInterval(forContainerAge: 60) == 30)
+        #expect(h.probeInterval(forContainerAge: 600) == 30)
+    }
+
+    @Test func probeIntervalFallsBackToIntervalWithoutStartInterval() throws {
+        let h = try Healthcheck(
+            test: ["CMD-SHELL", "true"],
+            interval: 30,
+            startPeriod: 60
+        )
+        #expect(h.probeInterval(forContainerAge: 0) == 30)
+        #expect(h.probeInterval(forContainerAge: 600) == 30)
+    }
+
+    @Test func roundTripThroughCodable() throws {
+        let original = try Healthcheck(
+            test: ["CMD-SHELL", "test -f /tmp/ready"],
+            interval: 15,
+            timeout: 5,
+            retries: 5,
+            startPeriod: 30,
+            startInterval: 2,
+            disable: false
+        )
+        let data = try JSONEncoder().encode(original)
+        let decoded = try JSONDecoder().decode(Healthcheck.self, from: data)
+        #expect(decoded.test == original.test)
+        #expect(decoded.interval == original.interval)
+        #expect(decoded.timeout == original.timeout)
+        #expect(decoded.retries == original.retries)
+        #expect(decoded.startPeriod == original.startPeriod)
+        #expect(decoded.startInterval == original.startInterval)
+        #expect(decoded.disable == original.disable)
+    }
+
+    @Test func legacyContainerConfigurationDecodesWithoutHealthcheck() throws {
+        let json = """
+            {
+              "id": "legacy",
+              "image": {
+                "reference": "redis:latest",
+                "descriptor": {
+                  "mediaType": "application/vnd.oci.image.manifest.v1+json",
+                  "digest": "sha256:0000000000000000000000000000000000000000000000000000000000000000",
+                  "size": 0
+                }
+              },
+              "initProcess": {
+                "executable": "/usr/local/bin/redis-server",
+                "arguments": [],
+                "environment": [],
+                "workingDirectory": "/",
+                "terminal": false,
+                "user": {"id": {"uid": 0, "gid": 0}},
+                "supplementalGroups": [],
+                "rlimits": []
+              }
+            }
+            """
+        let data = json.data(using: .utf8)!
+        let decoded = try JSONDecoder().decode(ContainerConfiguration.self, from: data)
+        #expect(decoded.healthcheck == nil)
+    }
+}
From be4aee006af158191ad9dcf69670cd186f72024b Mon Sep 17 00:00:00 2001
From: Chris George
Date: Wed, 6 May 2026 17:50:06 -0700
Subject: [PATCH 3/3] Fix none build issue

---
 .../Container/ContainerRun.swift              | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/Sources/ContainerCommands/Container/ContainerRun.swift b/Sources/ContainerCommands/Container/ContainerRun.swift
index 2957fe473..9f23adbe8 100644
--- a/Sources/ContainerCommands/Container/ContainerRun.swift
+++ b/Sources/ContainerCommands/Container/ContainerRun.swift
@@ -176,5 +176,37 @@ extension Application {
             }
             throw ArgumentParser.ExitCode(exitCode)
         }
+        /// Parses a Docker-style restart policy string.
+        ///
+        /// Accepts `no`, `always`, `unless-stopped`, `on-failure` and
+        /// `on-failure:<max-retries>`.
+        ///
+        /// - Parameter raw: The raw option value, if one was supplied.
+        /// - Returns: The parsed policy, or `nil` when `raw` is `nil`, empty,
+        ///   not a recognised policy, or carries a retry count that is not a
+        ///   non-negative integer.
+        static func parseRestartPolicy(_ raw: String?) -> RestartPolicy? {
+            guard let raw, !raw.isEmpty else { return nil }
+            switch raw {
+            case "no":
+                // Fully qualified: in an optional-returning context a bare
+                // `.none` would presumably resolve to `Optional.none`.
+                return RestartPolicy.none
+            case "always":
+                return RestartPolicy(mode: .always)
+            case "unless-stopped":
+                return RestartPolicy(mode: .unlessStopped)
+            default:
+                // Split on the first ":" only, keeping an empty retry field so
+                // that "on-failure:" is rejected instead of read as zero.
+                let parts = raw.split(separator: ":", maxSplits: 1, omittingEmptySubsequences: false)
+                guard parts[0] == "on-failure" else { return nil }
+                guard parts.count > 1 else {
+                    return RestartPolicy(mode: .onFailure, maxRetries: 0)
+                }
+                guard let retries = Int(parts[1]), retries >= 0 else { return nil }
+                return RestartPolicy(mode: .onFailure, maxRetries: retries)
+            }
+        }
     }
 }