diff --git a/deployments/k8s/dev/docker-desktop-oracle/kustomization.yaml b/deployments/k8s/dev/docker-desktop-oracle/kustomization.yaml index 2e07729a..937b1376 100644 --- a/deployments/k8s/dev/docker-desktop-oracle/kustomization.yaml +++ b/deployments/k8s/dev/docker-desktop-oracle/kustomization.yaml @@ -23,6 +23,10 @@ patches: target: kind: Deployment name: tmi-server + - path: ../docker-desktop/patches/redis-pullpolicy.yaml + target: + kind: Deployment + name: redis - path: ../docker-desktop/patches/extractor-image.yaml target: group: tmi.dev diff --git a/deployments/k8s/dev/docker-desktop/kustomization.yaml b/deployments/k8s/dev/docker-desktop/kustomization.yaml index 6c21e09a..a7074b6d 100644 --- a/deployments/k8s/dev/docker-desktop/kustomization.yaml +++ b/deployments/k8s/dev/docker-desktop/kustomization.yaml @@ -22,6 +22,10 @@ patches: target: kind: Deployment name: tmi-server + - path: patches/redis-pullpolicy.yaml + target: + kind: Deployment + name: redis - path: patches/extractor-image.yaml target: group: tmi.dev diff --git a/deployments/k8s/dev/docker-desktop/patches/redis-pullpolicy.yaml b/deployments/k8s/dev/docker-desktop/patches/redis-pullpolicy.yaml new file mode 100644 index 00000000..df70fa40 --- /dev/null +++ b/deployments/k8s/dev/docker-desktop/patches/redis-pullpolicy.yaml @@ -0,0 +1,9 @@ +# docker-desktop pins redis to IfNotPresent so the base image pre-imported into +# the node's containerd (deploy.py build_and_push, #517) is used and the node +# never re-pulls cgr.dev/chainguard/redis:latest (a :latest tag would otherwise +# default to Always). redis.yml has no imagePullPolicy field, so this is an `add`, +# not a `replace` (contrast server-pullpolicy.yaml, whose manifest sets Always). +# k3s does not use this patch: it remaps redis to redis:7-alpine (Docker Hub). +- op: add + path: /spec/template/spec/containers/0/imagePullPolicy + value: IfNotPresent diff --git a/deployments/k8s/dev/docker-desktop/postgres.yml b/deployments/k8s/dev/docker-desktop/postgres.yml index 9e8b8f85..066cc1f2 100644 --- a/deployments/k8s/dev/docker-desktop/postgres.yml +++ b/deployments/k8s/dev/docker-desktop/postgres.yml @@ -50,6 +50,10 @@ spec: containers: - name: postgres image: cgr.dev/chainguard/postgres:latest + # The image is pre-imported into the node's containerd by deploy.py + # (build_and_push), so use the imported copy and never re-pull from + # cgr.dev (a :latest tag would otherwise default to Always). See #517. + imagePullPolicy: IfNotPresent envFrom: - secretRef: { name: tmi-postgres } env: diff --git a/scripts/devenv.py b/scripts/devenv.py index e60be41f..47e5479e 100644 --- a/scripts/devenv.py +++ b/scripts/devenv.py @@ -16,7 +16,7 @@ cluster up|down the cluster target Global: --db postgres|oracle (default postgres), --cluster docker-desktop|k3s - (default docker-desktop), --no-workers, --yes + (default docker-desktop), --yes """ from __future__ import annotations @@ -39,7 +39,7 @@ def cmd_up(args) -> None: cluster.up(cluster=args.cluster) deploy.start(db=args.db, cluster_target=args.cluster, - no_workers=args.no_workers, skip_context_guard=args.yes) + skip_context_guard=args.yes) def cmd_down(args) -> None: @@ -50,14 +50,14 @@ def cmd_down(args) -> None: def cmd_restart(args) -> None: deploy.restart(db=args.db, cluster_target=args.cluster, - no_workers=args.no_workers, skip_context_guard=args.yes) + skip_context_guard=args.yes) def cmd_reset(args) -> None: log_info("dev-reset: redeploying the in-cluster stack (keeping cluster + db data)") deploy.teardown(db=args.db) deploy.start(db=args.db, cluster_target=args.cluster, - no_workers=args.no_workers, skip_context_guard=args.yes) + skip_context_guard=args.yes) log_success("dev-reset complete") @@ -70,7 +70,7 @@ def cmd_nuke(args) -> None: deploy.remove_local_images(args.db, cluster_target=args.cluster) _clean_logs_and_files() deploy.start(db=args.db, cluster_target=args.cluster, - no_workers=args.no_workers, skip_context_guard=args.yes) + skip_context_guard=args.yes) log_success(f"dev-nuke complete (fresh {args.cluster} environment up)") @@ -86,7 +86,7 @@ def cmd_status(args) -> None: def cmd_deploy(args) -> None: deploy.start(db=args.db, cluster_target=args.cluster, - no_workers=args.no_workers, skip_context_guard=args.yes) + skip_context_guard=args.yes) def cmd_logs(args) -> None: @@ -109,7 +109,7 @@ def _add_global_options( *, is_subparser: bool = False, ) -> None: - """Add --db/--no-workers/--yes to a parser (top-level or subparser). + """Add --db/--cluster/--yes to a parser (top-level or subparser). Defined as a helper so the exact same options are added to both the top-level parser and each subparser, enabling both orderings: @@ -127,15 +127,12 @@ def _add_global_options( default=argparse.SUPPRESS) parser.add_argument("--cluster", choices=["k3s", "docker-desktop"], default=argparse.SUPPRESS) - parser.add_argument("--no-workers", action="store_true", dest="no_workers", - default=argparse.SUPPRESS) parser.add_argument("--yes", action="store_true", default=argparse.SUPPRESS, help="Skip the local-kube-context safety check") else: parser.add_argument("--db", choices=["postgres", "oracle"], default="postgres") parser.add_argument("--cluster", choices=["k3s", "docker-desktop"], default="docker-desktop", help="Kube cluster target: docker-desktop (default) or k3s (remote)") - parser.add_argument("--no-workers", action="store_true", dest="no_workers") parser.add_argument("--yes", action="store_true", help="Skip the local-kube-context safety check") diff --git a/scripts/lib/deploy.py b/scripts/lib/deploy.py index 8068bea2..f20e3494 100644 --- a/scripts/lib/deploy.py +++ b/scripts/lib/deploy.py @@ -56,6 +56,19 @@ # low-throughput from the host (test setup only), so a port-forward is fine here. REDIS_PORT_FORWARD_PID = "/tmp/tmi-dev-redis-portforward.pid" +# Public base images the docker-desktop node would otherwise pull from cgr.dev on +# first bring-up. Docker Desktop Kubernetes' containerd pulls these independently +# of the host Docker daemon, and that first pull occasionally fails with a +# transient EOF from cgr.dev (#517), leaving postgres/redis in ErrImagePull. We +# instead `docker pull` them on the host and import them into the node's +# containerd alongside the tmi-* images, then pin imagePullPolicy: IfNotPresent on +# the postgres/redis manifests so the imported copy is used and no cgr.dev pull is +# attempted. KEEP THESE IN SYNC with the image refs in +# deployments/k8s/dev/docker-desktop/postgres.yml and deployments/k8s/dev/redis.yml. +DD_POSTGRES_IMAGE = "cgr.dev/chainguard/postgres:latest" +DD_REDIS_IMAGE = "cgr.dev/chainguard/redis:latest" +DD_BASE_IMAGES = (DD_POSTGRES_IMAGE, DD_REDIS_IMAGE) + # --------------------------------------------------------------------------- # Pure helpers @@ -95,15 +108,12 @@ def overlay_dir_for(db: str, cluster_target: str = "docker-desktop") -> str: raise ValueError(f"unknown cluster target: {cluster_target!r}") -def _no_workers_files(db: str) -> tuple[str, ...]: - """Return the per-file manifest list for --no-workers mode. - - controller.yml and redis.yml are shared across DB flavors; only the server - manifest filename differs (server.yml for postgres, server-oracle.yml for oracle). - All files physically live in DEV_DIR. - """ - server_file = "server-oracle.yml" if db == "oracle" else "server.yml" - return ("controller.yml", "redis.yml", server_file) +def dd_base_images_for(db: str) -> tuple[str, ...]: + """Public base images to pre-import into the docker-desktop node for the given + DB flavor (#517). Redis is always deployed; in-cluster Postgres is deployed + only for the postgres flavor (oracle uses an external ADB and brings up no + Postgres pod), so skip the Postgres base image there.""" + return (DD_REDIS_IMAGE,) if db == "oracle" else DD_BASE_IMAGES def content_hash(text: str) -> str: @@ -128,8 +138,20 @@ def import_image_to_node(ref: str, node: str) -> None: log_info(f"Importing {ref} -> {node} containerd (k8s.io)") saver = subprocess.Popen(save_cmd, stdout=subprocess.PIPE) try: - importer = subprocess.Popen(import_cmd, stdin=saver.stdout) - saver.stdout.close() # allow saver to receive SIGPIPE if importer exits + try: + importer = subprocess.Popen(import_cmd, stdin=saver.stdout) + except BaseException: + # The importer never started, so nobody will drain saver's stdout. + # Kill saver before the finally's wait() so it can't block forever + # writing into a pipe with no reader, then re-raise. + saver.kill() + raise + finally: + # Release the parent's copy of the pipe's read end unconditionally, + # whether or not the importer Popen raised. This both lets saver + # receive SIGPIPE if the importer exits and guarantees saver.wait() + # below cannot deadlock on a pipe the parent is still holding open. + saver.stdout.close() importer.communicate() if importer.returncode != 0: log_error(f"ctr import failed for {ref} (exit {importer.returncode})") @@ -383,6 +405,14 @@ def build_and_push(db: str, cluster_target: str = "docker-desktop") -> None: run_cmd(["docker", "push", ref]) if cluster_target == "docker-desktop": + # Import the public postgres/redis base images too, so first bring-up + # doesn't depend on the DD node's containerd reaching cgr.dev (#517). + # Pull on the host first (respects the host Docker's proxy/cache), then + # stream into the node's containerd exactly like the tmi-* images. + for base in dd_base_images_for(db): + log_info(f"Pulling base image {base}") + run_cmd(["docker", "pull", base]) + import_image_to_node(base, cluster.DD_NODE) log_success("All images built and imported into the docker-desktop node") else: log_success("All images built and pushed to local registry") @@ -503,28 +533,20 @@ def create_oracle_db_secret() -> None: log_success("oracle DB connection delivered as Secret/tmi-oracle-db") -def apply_overlay(no_workers: bool, db: str, cluster_target: str = "docker-desktop") -> None: +def apply_overlay(db: str, cluster_target: str = "docker-desktop") -> None: """Apply the dev overlay. - When --no-workers: apply the three core manifests individually to avoid - kustomize referencing the component CR files (which include TMIComponent - resources from ../platform/components/). - - Otherwise: render the full kustomize overlay with --load-restrictor - LoadRestrictionsNone (needed because the overlay references files outside - its own directory tree, e.g. ../../platform/components/). + Render the full kustomize overlay with --load-restrictor LoadRestrictionsNone + (needed because the overlay references files outside its own directory tree, + e.g. ../../platform/components/). """ project_root = get_project_root() - if no_workers: - for f in _no_workers_files(db): - kubectl(["apply", "-f", str(project_root / DEV_DIR / f)]) - else: - rendered = run_cmd( - ["kubectl", "kustomize", "--load-restrictor", "LoadRestrictionsNone", - str(project_root / overlay_dir_for(db, cluster_target))], - capture=True, - ).stdout - kubectl(["apply", "-f", "-"], input_text=rendered) + rendered = run_cmd( + ["kubectl", "kustomize", "--load-restrictor", "LoadRestrictionsNone", + str(project_root / overlay_dir_for(db, cluster_target))], + capture=True, + ).stdout + kubectl(["apply", "-f", "-"], input_text=rendered) def server_rollout_timeout(db: str) -> str: @@ -657,7 +679,7 @@ def server_http_status() -> tuple[bool, str]: # Orchestration entry points # --------------------------------------------------------------------------- -def start(*, db: str, cluster_target: str = "docker-desktop", no_workers: bool = False, +def start(*, db: str, cluster_target: str = "docker-desktop", skip_context_guard: bool = False) -> None: """Build images, deploy all components, wait for readiness, and start port-forwards.""" _preflight() @@ -678,7 +700,7 @@ def start(*, db: str, cluster_target: str = "docker-desktop", no_workers: bool = if db == "oracle": create_oracle_wallet_secret() create_oracle_db_secret() - apply_overlay(no_workers, db, cluster_target) + apply_overlay(db, cluster_target) # `kubectl apply` of an unchanged Deployment spec does not roll a new pod, so # a freshly-built :dev image (same tag) would not be picked up, and a pod # stuck in CrashLoopBackOff from a prior transient outage (e.g. the host DB @@ -691,7 +713,7 @@ def start(*, db: str, cluster_target: str = "docker-desktop", no_workers: bool = wait_and_forward(db, cluster_target) -def restart(*, db: str, cluster_target: str = "docker-desktop", no_workers: bool = False, +def restart(*, db: str, cluster_target: str = "docker-desktop", skip_context_guard: bool = False) -> None: """Rebuild the server image, re-deliver config, and roll the server deployment.""" _preflight() @@ -704,7 +726,7 @@ def restart(*, db: str, cluster_target: str = "docker-desktop", no_workers: bool if db == "oracle": create_oracle_wallet_secret() create_oracle_db_secret() - apply_overlay(no_workers, db, cluster_target) + apply_overlay(db, cluster_target) kubectl(["-n", NS, "rollout", "restart", "deploy/tmi-server"]) kubectl(["-n", NS, "rollout", "status", "deploy/tmi-server", f"--timeout={server_rollout_timeout(db)}"]) start_redis_port_forward() diff --git a/scripts/lib/tests/test_deploy.py b/scripts/lib/tests/test_deploy.py index 27103270..320dd0f9 100644 --- a/scripts/lib/tests/test_deploy.py +++ b/scripts/lib/tests/test_deploy.py @@ -2,6 +2,7 @@ import sys import unittest from pathlib import Path +from unittest import mock sys.path.insert(0, str(Path(__file__).resolve().parents[1])) import deploy # noqa: E402 @@ -74,18 +75,36 @@ def test_k3s_rewrites_url_host_to_postgres_service(self): self.assertIn("@postgres:5432/tmi_dev", out) -class TestNoWorkersFiles(unittest.TestCase): - def test_no_workers_files_oracle_uses_oracle_server(self): - self.assertIn("server-oracle.yml", deploy._no_workers_files("oracle")) +class TestDdBaseImages(unittest.TestCase): + """The docker-desktop base images pre-imported to dodge the cgr.dev first-run + pull flake (#517) must stay in sync with the refs in the manifests that + reference them, or the pre-imported copy won't match what the pods request.""" - def test_no_workers_files_postgres_uses_plain_server(self): - self.assertIn("server.yml", deploy._no_workers_files("postgres")) + def test_includes_postgres_and_redis(self): + self.assertIn("cgr.dev/chainguard/postgres:latest", deploy.DD_BASE_IMAGES) + self.assertIn("cgr.dev/chainguard/redis:latest", deploy.DD_BASE_IMAGES) - def test_no_workers_files_includes_controller_and_redis(self): - for db in ("postgres", "oracle"): - files = deploy._no_workers_files(db) - self.assertIn("controller.yml", files) - self.assertIn("redis.yml", files) + def test_postgres_ref_matches_docker_desktop_manifest(self): + text = (_DEV_DIR / "docker-desktop" / "postgres.yml").read_text() + self.assertIn("cgr.dev/chainguard/postgres:latest", deploy.DD_BASE_IMAGES) + self.assertIn("image: cgr.dev/chainguard/postgres:latest", text) + + def test_redis_ref_matches_shared_manifest(self): + text = (_DEV_DIR / "redis.yml").read_text() + self.assertIn("cgr.dev/chainguard/redis:latest", deploy.DD_BASE_IMAGES) + self.assertIn("image: cgr.dev/chainguard/redis:latest", text) + + def test_postgres_flavor_imports_both_base_images(self): + imgs = deploy.dd_base_images_for("postgres") + self.assertIn(deploy.DD_POSTGRES_IMAGE, imgs) + self.assertIn(deploy.DD_REDIS_IMAGE, imgs) + + def test_oracle_flavor_skips_postgres_base_image(self): + # Oracle uses an external ADB — no in-cluster Postgres pod — so importing + # the Postgres base image would be wasted work. + imgs = deploy.dd_base_images_for("oracle") + self.assertIn(deploy.DD_REDIS_IMAGE, imgs) + self.assertNotIn(deploy.DD_POSTGRES_IMAGE, imgs) class TestRenderConfigmapYaml(unittest.TestCase): @@ -276,5 +295,32 @@ def test_builds_docker_save_and_ctr_import_pair(self): ) +class TestImportImageToNode(unittest.TestCase): + """#519: if the importer Popen raises before we release the saver's stdout, + the saver must be torn down (stdout closed + killed + waited) so it can't + deadlock writing into a pipe with no reader — rather than left to hang.""" + + def test_importer_popen_raises_tears_down_saver(self): + saver = mock.MagicMock() + saver.returncode = 0 + + def popen_side_effect(*_args, **_kwargs): + # First call (docker save) succeeds; second call (ctr import) fails to + # spawn, e.g. FileNotFoundError if docker exec were unavailable. + popen_side_effect.calls += 1 + if popen_side_effect.calls == 1: + return saver + raise FileNotFoundError("docker exec not found") + popen_side_effect.calls = 0 + + with mock.patch.object(deploy.subprocess, "Popen", side_effect=popen_side_effect): + with self.assertRaises(FileNotFoundError): + deploy.import_image_to_node("tmi-server:dev", "desktop-control-plane") + + saver.stdout.close.assert_called_once() # pipe read end released + saver.kill.assert_called_once() # saver stopped, can't block + saver.wait.assert_called_once() # reaped in the finally + + if __name__ == "__main__": unittest.main() diff --git a/scripts/lib/tests/test_devenv_cli.py b/scripts/lib/tests/test_devenv_cli.py index 85f6990c..12d47807 100644 --- a/scripts/lib/tests/test_devenv_cli.py +++ b/scripts/lib/tests/test_devenv_cli.py @@ -51,10 +51,6 @@ def test_parse_up_defaults_to_docker_desktop(self): args = devenv_cli.build_parser().parse_args(["up"]) self.assertEqual(args.cluster, "docker-desktop") - def test_parse_up_no_workers_default_false(self): - args = devenv_cli.build_parser().parse_args(["up"]) - self.assertFalse(args.no_workers) - def test_parse_up_yes_default_false(self): args = devenv_cli.build_parser().parse_args(["up"]) self.assertFalse(args.yes) @@ -73,14 +69,6 @@ def test_parse_db_oracle_before_verb(self): args = devenv_cli.build_parser().parse_args(["--db", "oracle", "up"]) self.assertEqual(args.db, "oracle") - def test_parse_no_workers_after_verb(self): - args = devenv_cli.build_parser().parse_args(["up", "--no-workers"]) - self.assertTrue(args.no_workers) - - def test_no_workers_before_verb(self): - args = devenv_cli.build_parser().parse_args(["--no-workers", "up"]) - self.assertTrue(args.no_workers) - def test_parse_yes_after_verb(self): args = devenv_cli.build_parser().parse_args(["up", "--yes"]) self.assertTrue(args.yes)