From 99882a51ad22b2acc0683cecf22e60510da4d863 Mon Sep 17 00:00:00 2001 From: Eric Fitzgerald Date: Sat, 4 Jul 2026 13:13:21 -0400 Subject: [PATCH] fix(dev): harden docker-desktop image delivery and drop broken --no-workers (#517, #518, #519) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three follow-ups from the Docker Desktop dev-target work (#520), all in the dev-tooling layer (scripts/lib/deploy.py + dev k8s overlays). #517 — pre-import postgres/redis base images to avoid the first-run cgr.dev pull flake. Docker Desktop's containerd pulls cgr.dev/chainguard/{postgres,redis} independently of the host Docker daemon, and that first pull occasionally fails with a transient EOF, leaving the pods in ErrImagePull. build_and_push now `docker pull`s the base images on the host and imports them into the node's containerd alongside the tmi-* images, and the postgres/redis manifests are pinned to imagePullPolicy: IfNotPresent so the imported copy is used (a :latest tag otherwise defaults to Always and re-pulls, defeating the import). The redis pin is a per-overlay kustomize patch (redis.yml is shared with k3s, which remaps redis to redis:7-alpine); postgres is pinned directly in the docker-desktop postgres.yml (applied raw by deploy.py). The base-image set is db-aware: oracle uses an external ADB and deploys no Postgres pod, so only redis is imported there. #518 — remove the --no-workers bring-up path. It applied the raw leaf manifests (image: localhost:5000/tmi-*:dev), which only worked against the retired kind local registry and yields ErrImagePull on docker-desktop/k3s. No make target passes it, so it was developer-manual-only dead/broken code. Dropped the flag from devenv.py, the no_workers params from start/restart/apply_overlay, and the _no_workers_files helper. #519 — harden import_image_to_node against a Popen-raises-before-close hang. If the importer Popen raised before saver.stdout.close() ran, the parent kept the pipe's read end open and saver.wait() in the finally could deadlock once the pipe buffer filled. The importer Popen is now wrapped so saver's stdout is released (and saver killed) on any exception before the wait. Unit tests added for the db-aware base-image selection and the import teardown path; the --no-workers tests were removed. make test-dev-scripts (94) and make lint pass. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01Kk9GxWS9EpazjbwBKfMpUX --- .../docker-desktop-oracle/kustomization.yaml | 4 + .../k8s/dev/docker-desktop/kustomization.yaml | 4 + .../patches/redis-pullpolicy.yaml | 9 ++ .../k8s/dev/docker-desktop/postgres.yml | 4 + scripts/devenv.py | 17 ++-- scripts/lib/deploy.py | 88 ++++++++++++------- scripts/lib/tests/test_deploy.py | 66 +++++++++++--- scripts/lib/tests/test_devenv_cli.py | 12 --- 8 files changed, 139 insertions(+), 65 deletions(-) create mode 100644 deployments/k8s/dev/docker-desktop/patches/redis-pullpolicy.yaml diff --git a/deployments/k8s/dev/docker-desktop-oracle/kustomization.yaml b/deployments/k8s/dev/docker-desktop-oracle/kustomization.yaml index 2e07729a..937b1376 100644 --- a/deployments/k8s/dev/docker-desktop-oracle/kustomization.yaml +++ b/deployments/k8s/dev/docker-desktop-oracle/kustomization.yaml @@ -23,6 +23,10 @@ patches: target: kind: Deployment name: tmi-server + - path: ../docker-desktop/patches/redis-pullpolicy.yaml + target: + kind: Deployment + name: redis - path: ../docker-desktop/patches/extractor-image.yaml target: group: tmi.dev diff --git a/deployments/k8s/dev/docker-desktop/kustomization.yaml b/deployments/k8s/dev/docker-desktop/kustomization.yaml index 6c21e09a..a7074b6d 100644 --- a/deployments/k8s/dev/docker-desktop/kustomization.yaml +++ b/deployments/k8s/dev/docker-desktop/kustomization.yaml @@ -22,6 +22,10 @@ patches: target: kind: Deployment name: tmi-server + - path: patches/redis-pullpolicy.yaml + target: + kind: Deployment + name: redis - path: patches/extractor-image.yaml target: group: tmi.dev diff --git a/deployments/k8s/dev/docker-desktop/patches/redis-pullpolicy.yaml b/deployments/k8s/dev/docker-desktop/patches/redis-pullpolicy.yaml new file mode 100644 index 00000000..df70fa40 --- /dev/null +++ b/deployments/k8s/dev/docker-desktop/patches/redis-pullpolicy.yaml @@ -0,0 +1,9 @@ +# docker-desktop pins redis to IfNotPresent so the base image pre-imported into +# the node's containerd (deploy.py build_and_push, #517) is used and the node +# never re-pulls cgr.dev/chainguard/redis:latest (a :latest tag would otherwise +# default to Always). redis.yml has no imagePullPolicy field, so this is an `add`, +# not a `replace` (contrast server-pullpolicy.yaml, whose manifest sets Always). +# k3s does not use this patch: it remaps redis to redis:7-alpine (Docker Hub). +- op: add + path: /spec/template/spec/containers/0/imagePullPolicy + value: IfNotPresent diff --git a/deployments/k8s/dev/docker-desktop/postgres.yml b/deployments/k8s/dev/docker-desktop/postgres.yml index 9e8b8f85..066cc1f2 100644 --- a/deployments/k8s/dev/docker-desktop/postgres.yml +++ b/deployments/k8s/dev/docker-desktop/postgres.yml @@ -50,6 +50,10 @@ spec: containers: - name: postgres image: cgr.dev/chainguard/postgres:latest + # The image is pre-imported into the node's containerd by deploy.py + # (build_and_push), so use the imported copy and never re-pull from + # cgr.dev (a :latest tag would otherwise default to Always). See #517. + imagePullPolicy: IfNotPresent envFrom: - secretRef: { name: tmi-postgres } env: diff --git a/scripts/devenv.py b/scripts/devenv.py index e60be41f..47e5479e 100644 --- a/scripts/devenv.py +++ b/scripts/devenv.py @@ -16,7 +16,7 @@ cluster up|down the cluster target Global: --db postgres|oracle (default postgres), --cluster docker-desktop|k3s - (default docker-desktop), --no-workers, --yes + (default docker-desktop), --yes """ from __future__ import annotations @@ -39,7 +39,7 @@ def cmd_up(args) -> None: cluster.up(cluster=args.cluster) deploy.start(db=args.db, cluster_target=args.cluster, - no_workers=args.no_workers, skip_context_guard=args.yes) + skip_context_guard=args.yes) def cmd_down(args) -> None: @@ -50,14 +50,14 @@ def cmd_down(args) -> None: def cmd_restart(args) -> None: deploy.restart(db=args.db, cluster_target=args.cluster, - no_workers=args.no_workers, skip_context_guard=args.yes) + skip_context_guard=args.yes) def cmd_reset(args) -> None: log_info("dev-reset: redeploying the in-cluster stack (keeping cluster + db data)") deploy.teardown(db=args.db) deploy.start(db=args.db, cluster_target=args.cluster, - no_workers=args.no_workers, skip_context_guard=args.yes) + skip_context_guard=args.yes) log_success("dev-reset complete") @@ -70,7 +70,7 @@ def cmd_nuke(args) -> None: deploy.remove_local_images(args.db, cluster_target=args.cluster) _clean_logs_and_files() deploy.start(db=args.db, cluster_target=args.cluster, - no_workers=args.no_workers, skip_context_guard=args.yes) + skip_context_guard=args.yes) log_success(f"dev-nuke complete (fresh {args.cluster} environment up)") @@ -86,7 +86,7 @@ def cmd_status(args) -> None: def cmd_deploy(args) -> None: deploy.start(db=args.db, cluster_target=args.cluster, - no_workers=args.no_workers, skip_context_guard=args.yes) + skip_context_guard=args.yes) def cmd_logs(args) -> None: @@ -109,7 +109,7 @@ def _add_global_options( *, is_subparser: bool = False, ) -> None: - """Add --db/--no-workers/--yes to a parser (top-level or subparser). + """Add --db/--cluster/--yes to a parser (top-level or subparser). Defined as a helper so the exact same options are added to both the top-level parser and each subparser, enabling both orderings: @@ -127,15 +127,12 @@ def _add_global_options( default=argparse.SUPPRESS) parser.add_argument("--cluster", choices=["k3s", "docker-desktop"], default=argparse.SUPPRESS) - parser.add_argument("--no-workers", action="store_true", dest="no_workers", - default=argparse.SUPPRESS) parser.add_argument("--yes", action="store_true", default=argparse.SUPPRESS, help="Skip the local-kube-context safety check") else: parser.add_argument("--db", choices=["postgres", "oracle"], default="postgres") parser.add_argument("--cluster", choices=["k3s", "docker-desktop"], default="docker-desktop", help="Kube cluster target: docker-desktop (default) or k3s (remote)") - parser.add_argument("--no-workers", action="store_true", dest="no_workers") parser.add_argument("--yes", action="store_true", help="Skip the local-kube-context safety check") diff --git a/scripts/lib/deploy.py b/scripts/lib/deploy.py index 8068bea2..f20e3494 100644 --- a/scripts/lib/deploy.py +++ b/scripts/lib/deploy.py @@ -56,6 +56,19 @@ # low-throughput from the host (test setup only), so a port-forward is fine here. REDIS_PORT_FORWARD_PID = "/tmp/tmi-dev-redis-portforward.pid" +# Public base images the docker-desktop node would otherwise pull from cgr.dev on +# first bring-up. Docker Desktop Kubernetes' containerd pulls these independently +# of the host Docker daemon, and that first pull occasionally fails with a +# transient EOF from cgr.dev (#517), leaving postgres/redis in ErrImagePull. We +# instead `docker pull` them on the host and import them into the node's +# containerd alongside the tmi-* images, then pin imagePullPolicy: IfNotPresent on +# the postgres/redis manifests so the imported copy is used and no cgr.dev pull is +# attempted. KEEP THESE IN SYNC with the image refs in +# deployments/k8s/dev/docker-desktop/postgres.yml and deployments/k8s/dev/redis.yml. +DD_POSTGRES_IMAGE = "cgr.dev/chainguard/postgres:latest" +DD_REDIS_IMAGE = "cgr.dev/chainguard/redis:latest" +DD_BASE_IMAGES = (DD_POSTGRES_IMAGE, DD_REDIS_IMAGE) + # --------------------------------------------------------------------------- # Pure helpers @@ -95,15 +108,12 @@ def overlay_dir_for(db: str, cluster_target: str = "docker-desktop") -> str: raise ValueError(f"unknown cluster target: {cluster_target!r}") -def _no_workers_files(db: str) -> tuple[str, ...]: - """Return the per-file manifest list for --no-workers mode. - - controller.yml and redis.yml are shared across DB flavors; only the server - manifest filename differs (server.yml for postgres, server-oracle.yml for oracle). - All files physically live in DEV_DIR. - """ - server_file = "server-oracle.yml" if db == "oracle" else "server.yml" - return ("controller.yml", "redis.yml", server_file) +def dd_base_images_for(db: str) -> tuple[str, ...]: + """Public base images to pre-import into the docker-desktop node for the given + DB flavor (#517). Redis is always deployed; in-cluster Postgres is deployed + only for the postgres flavor (oracle uses an external ADB and brings up no + Postgres pod), so skip the Postgres base image there.""" + return (DD_REDIS_IMAGE,) if db == "oracle" else DD_BASE_IMAGES def content_hash(text: str) -> str: @@ -128,8 +138,20 @@ def import_image_to_node(ref: str, node: str) -> None: log_info(f"Importing {ref} -> {node} containerd (k8s.io)") saver = subprocess.Popen(save_cmd, stdout=subprocess.PIPE) try: - importer = subprocess.Popen(import_cmd, stdin=saver.stdout) - saver.stdout.close() # allow saver to receive SIGPIPE if importer exits + try: + importer = subprocess.Popen(import_cmd, stdin=saver.stdout) + except BaseException: + # The importer never started, so nobody will drain saver's stdout. + # Kill saver before the finally's wait() so it can't block forever + # writing into a pipe with no reader, then re-raise. + saver.kill() + raise + finally: + # Release the parent's copy of the pipe's read end unconditionally, + # whether or not the importer Popen raised. This both lets saver + # receive SIGPIPE if the importer exits and guarantees saver.wait() + # below cannot deadlock on a pipe the parent is still holding open. + saver.stdout.close() importer.communicate() if importer.returncode != 0: log_error(f"ctr import failed for {ref} (exit {importer.returncode})") @@ -383,6 +405,14 @@ def build_and_push(db: str, cluster_target: str = "docker-desktop") -> None: run_cmd(["docker", "push", ref]) if cluster_target == "docker-desktop": + # Import the public postgres/redis base images too, so first bring-up + # doesn't depend on the DD node's containerd reaching cgr.dev (#517). + # Pull on the host first (respects the host Docker's proxy/cache), then + # stream into the node's containerd exactly like the tmi-* images. + for base in dd_base_images_for(db): + log_info(f"Pulling base image {base}") + run_cmd(["docker", "pull", base]) + import_image_to_node(base, cluster.DD_NODE) log_success("All images built and imported into the docker-desktop node") else: log_success("All images built and pushed to local registry") @@ -503,28 +533,20 @@ def create_oracle_db_secret() -> None: log_success("oracle DB connection delivered as Secret/tmi-oracle-db") -def apply_overlay(no_workers: bool, db: str, cluster_target: str = "docker-desktop") -> None: +def apply_overlay(db: str, cluster_target: str = "docker-desktop") -> None: """Apply the dev overlay. - When --no-workers: apply the three core manifests individually to avoid - kustomize referencing the component CR files (which include TMIComponent - resources from ../platform/components/). - - Otherwise: render the full kustomize overlay with --load-restrictor - LoadRestrictionsNone (needed because the overlay references files outside - its own directory tree, e.g. ../../platform/components/). + Render the full kustomize overlay with --load-restrictor LoadRestrictionsNone + (needed because the overlay references files outside its own directory tree, + e.g. ../../platform/components/). """ project_root = get_project_root() - if no_workers: - for f in _no_workers_files(db): - kubectl(["apply", "-f", str(project_root / DEV_DIR / f)]) - else: - rendered = run_cmd( - ["kubectl", "kustomize", "--load-restrictor", "LoadRestrictionsNone", - str(project_root / overlay_dir_for(db, cluster_target))], - capture=True, - ).stdout - kubectl(["apply", "-f", "-"], input_text=rendered) + rendered = run_cmd( + ["kubectl", "kustomize", "--load-restrictor", "LoadRestrictionsNone", + str(project_root / overlay_dir_for(db, cluster_target))], + capture=True, + ).stdout + kubectl(["apply", "-f", "-"], input_text=rendered) def server_rollout_timeout(db: str) -> str: @@ -657,7 +679,7 @@ def server_http_status() -> tuple[bool, str]: # Orchestration entry points # --------------------------------------------------------------------------- -def start(*, db: str, cluster_target: str = "docker-desktop", no_workers: bool = False, +def start(*, db: str, cluster_target: str = "docker-desktop", skip_context_guard: bool = False) -> None: """Build images, deploy all components, wait for readiness, and start port-forwards.""" _preflight() @@ -678,7 +700,7 @@ def start(*, db: str, cluster_target: str = "docker-desktop", no_workers: bool = if db == "oracle": create_oracle_wallet_secret() create_oracle_db_secret() - apply_overlay(no_workers, db, cluster_target) + apply_overlay(db, cluster_target) # `kubectl apply` of an unchanged Deployment spec does not roll a new pod, so # a freshly-built :dev image (same tag) would not be picked up, and a pod # stuck in CrashLoopBackOff from a prior transient outage (e.g. the host DB @@ -691,7 +713,7 @@ def start(*, db: str, cluster_target: str = "docker-desktop", no_workers: bool = wait_and_forward(db, cluster_target) -def restart(*, db: str, cluster_target: str = "docker-desktop", no_workers: bool = False, +def restart(*, db: str, cluster_target: str = "docker-desktop", skip_context_guard: bool = False) -> None: """Rebuild the server image, re-deliver config, and roll the server deployment.""" _preflight() @@ -704,7 +726,7 @@ def restart(*, db: str, cluster_target: str = "docker-desktop", no_workers: bool if db == "oracle": create_oracle_wallet_secret() create_oracle_db_secret() - apply_overlay(no_workers, db, cluster_target) + apply_overlay(db, cluster_target) kubectl(["-n", NS, "rollout", "restart", "deploy/tmi-server"]) kubectl(["-n", NS, "rollout", "status", "deploy/tmi-server", f"--timeout={server_rollout_timeout(db)}"]) start_redis_port_forward() diff --git a/scripts/lib/tests/test_deploy.py b/scripts/lib/tests/test_deploy.py index 27103270..320dd0f9 100644 --- a/scripts/lib/tests/test_deploy.py +++ b/scripts/lib/tests/test_deploy.py @@ -2,6 +2,7 @@ import sys import unittest from pathlib import Path +from unittest import mock sys.path.insert(0, str(Path(__file__).resolve().parents[1])) import deploy # noqa: E402 @@ -74,18 +75,36 @@ def test_k3s_rewrites_url_host_to_postgres_service(self): self.assertIn("@postgres:5432/tmi_dev", out) -class TestNoWorkersFiles(unittest.TestCase): - def test_no_workers_files_oracle_uses_oracle_server(self): - self.assertIn("server-oracle.yml", deploy._no_workers_files("oracle")) +class TestDdBaseImages(unittest.TestCase): + """The docker-desktop base images pre-imported to dodge the cgr.dev first-run + pull flake (#517) must stay in sync with the refs in the manifests that + reference them, or the pre-imported copy won't match what the pods request.""" - def test_no_workers_files_postgres_uses_plain_server(self): - self.assertIn("server.yml", deploy._no_workers_files("postgres")) + def test_includes_postgres_and_redis(self): + self.assertIn("cgr.dev/chainguard/postgres:latest", deploy.DD_BASE_IMAGES) + self.assertIn("cgr.dev/chainguard/redis:latest", deploy.DD_BASE_IMAGES) - def test_no_workers_files_includes_controller_and_redis(self): - for db in ("postgres", "oracle"): - files = deploy._no_workers_files(db) - self.assertIn("controller.yml", files) - self.assertIn("redis.yml", files) + def test_postgres_ref_matches_docker_desktop_manifest(self): + text = (_DEV_DIR / "docker-desktop" / "postgres.yml").read_text() + self.assertIn("cgr.dev/chainguard/postgres:latest", deploy.DD_BASE_IMAGES) + self.assertIn("image: cgr.dev/chainguard/postgres:latest", text) + + def test_redis_ref_matches_shared_manifest(self): + text = (_DEV_DIR / "redis.yml").read_text() + self.assertIn("cgr.dev/chainguard/redis:latest", deploy.DD_BASE_IMAGES) + self.assertIn("image: cgr.dev/chainguard/redis:latest", text) + + def test_postgres_flavor_imports_both_base_images(self): + imgs = deploy.dd_base_images_for("postgres") + self.assertIn(deploy.DD_POSTGRES_IMAGE, imgs) + self.assertIn(deploy.DD_REDIS_IMAGE, imgs) + + def test_oracle_flavor_skips_postgres_base_image(self): + # Oracle uses an external ADB — no in-cluster Postgres pod — so importing + # the Postgres base image would be wasted work. + imgs = deploy.dd_base_images_for("oracle") + self.assertIn(deploy.DD_REDIS_IMAGE, imgs) + self.assertNotIn(deploy.DD_POSTGRES_IMAGE, imgs) class TestRenderConfigmapYaml(unittest.TestCase): @@ -276,5 +295,32 @@ def test_builds_docker_save_and_ctr_import_pair(self): ) +class TestImportImageToNode(unittest.TestCase): + """#519: if the importer Popen raises before we release the saver's stdout, + the saver must be torn down (stdout closed + killed + waited) so it can't + deadlock writing into a pipe with no reader — rather than left to hang.""" + + def test_importer_popen_raises_tears_down_saver(self): + saver = mock.MagicMock() + saver.returncode = 0 + + def popen_side_effect(*_args, **_kwargs): + # First call (docker save) succeeds; second call (ctr import) fails to + # spawn, e.g. FileNotFoundError if docker exec were unavailable. + popen_side_effect.calls += 1 + if popen_side_effect.calls == 1: + return saver + raise FileNotFoundError("docker exec not found") + popen_side_effect.calls = 0 + + with mock.patch.object(deploy.subprocess, "Popen", side_effect=popen_side_effect): + with self.assertRaises(FileNotFoundError): + deploy.import_image_to_node("tmi-server:dev", "desktop-control-plane") + + saver.stdout.close.assert_called_once() # pipe read end released + saver.kill.assert_called_once() # saver stopped, can't block + saver.wait.assert_called_once() # reaped in the finally + + if __name__ == "__main__": unittest.main() diff --git a/scripts/lib/tests/test_devenv_cli.py b/scripts/lib/tests/test_devenv_cli.py index 85f6990c..12d47807 100644 --- a/scripts/lib/tests/test_devenv_cli.py +++ b/scripts/lib/tests/test_devenv_cli.py @@ -51,10 +51,6 @@ def test_parse_up_defaults_to_docker_desktop(self): args = devenv_cli.build_parser().parse_args(["up"]) self.assertEqual(args.cluster, "docker-desktop") - def test_parse_up_no_workers_default_false(self): - args = devenv_cli.build_parser().parse_args(["up"]) - self.assertFalse(args.no_workers) - def test_parse_up_yes_default_false(self): args = devenv_cli.build_parser().parse_args(["up"]) self.assertFalse(args.yes) @@ -73,14 +69,6 @@ def test_parse_db_oracle_before_verb(self): args = devenv_cli.build_parser().parse_args(["--db", "oracle", "up"]) self.assertEqual(args.db, "oracle") - def test_parse_no_workers_after_verb(self): - args = devenv_cli.build_parser().parse_args(["up", "--no-workers"]) - self.assertTrue(args.no_workers) - - def test_no_workers_before_verb(self): - args = devenv_cli.build_parser().parse_args(["--no-workers", "up"]) - self.assertTrue(args.no_workers) - def test_parse_yes_after_verb(self): args = devenv_cli.build_parser().parse_args(["up", "--yes"]) self.assertTrue(args.yes)