diff --git a/Makefile b/Makefile index 08d06eed..b6576ce8 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,9 @@ SERVER_PORT ?= 8080 # Default database backend for dev environment (postgres|oracle) DB ?= postgres +# Default kube cluster target for dev environment (kind|k3s) +CLUSTER ?= kind + # ============================================================================ # ATOMIC COMPONENTS - Infrastructure Management # ============================================================================ @@ -317,38 +320,38 @@ tilt-down: ## Stop Tilt and restore the prod-shaped server @echo "prod-shaped server restored" # ============================================================================ -# DEV ENVIRONMENT — single orchestrator (scripts/devenv.py). DB=postgres|oracle +# DEV ENVIRONMENT — single orchestrator (scripts/devenv.py). DB=postgres|oracle CLUSTER=kind|k3s # ============================================================================ -dev-up: ## Bring up the full kind dev environment (cluster + db + deploy). DB=postgres|oracle - @uv run scripts/devenv.py --db $(DB) up +dev-up: ## Bring up the full dev environment (cluster + db + deploy). DB=postgres|oracle CLUSTER=kind|k3s + @uv run scripts/devenv.py --db $(DB) --cluster $(CLUSTER) up -dev-down: ## Tear down the kind dev environment; KEEP db data - @uv run scripts/devenv.py --db $(DB) down +dev-down: ## Tear down the dev environment; KEEP db data + @uv run scripts/devenv.py --db $(DB) --cluster $(CLUSTER) down dev-restart: ## Rebuild the server image + roll the server pod (cluster + db untouched) - @uv run scripts/devenv.py --db $(DB) restart + @uv run scripts/devenv.py --db $(DB) --cluster $(CLUSTER) restart dev-reset: ## Soft known-state: redeploy the stack with fresh images; KEEP db data - @uv run scripts/devenv.py --db $(DB) reset + @uv run scripts/devenv.py --db $(DB) --cluster $(CLUSTER) reset dev-nuke: ## Hard known-state: destroy everything incl. db data + images, rebuild - @uv run scripts/devenv.py --db $(DB) nuke + @uv run scripts/devenv.py --db $(DB) --cluster $(CLUSTER) nuke -dev-status: ## kind-aware dev environment status dashboard - @uv run scripts/devenv.py status +dev-status: ## dev environment status dashboard + @uv run scripts/devenv.py --cluster $(CLUSTER) status dev-logs: ## Stream the tmi-server pod logs - @uv run scripts/devenv.py logs + @uv run scripts/devenv.py --cluster $(CLUSTER) logs dev-deploy: ## (Re)apply manifests + rollout without recreating cluster/db - @uv run scripts/devenv.py --db $(DB) deploy + @uv run scripts/devenv.py --db $(DB) --cluster $(CLUSTER) deploy -dev-cluster-up: ## Create the local kind cluster + registry only - @uv run scripts/devenv.py cluster up +dev-cluster-up: ## Create the local cluster + registry only (kind); switch context (k3s) + @uv run scripts/devenv.py --cluster $(CLUSTER) cluster up -dev-cluster-down: ## Delete the local kind cluster only - @uv run scripts/devenv.py cluster down +dev-cluster-down: ## Delete the local kind cluster only (no-op for k3s) + @uv run scripts/devenv.py --cluster $(CLUSTER) cluster down dev-db-up: ## Start the postgres dev container only @uv run scripts/devenv.py db up diff --git a/deployments/k8s/dev/k3s/README-node-setup.md b/deployments/k8s/dev/k3s/README-node-setup.md new file mode 100644 index 00000000..2f4b3b2c --- /dev/null +++ b/deployments/k8s/dev/k3s/README-node-setup.md @@ -0,0 +1,97 @@ +# k3s dev target — one-time host & node setup + +`make dev-up CLUSTER=k3s` deploys TMI to the remote **k3s-rp** cluster +(nodes `rp2`/`rp3`/`rp4` = `192.168.1.2`/`.3`/`.4`, all arm64). Images are served +from an in-cluster registry at **`rp2:30500`** (plain HTTP). Two one-time, +out-of-band configuration steps are required before the first `dev-up CLUSTER=k3s`, +because they need root/SSH on machines the dev tooling cannot reach. + +## 0. Mac: make `rp2` resolve reliably + +The k3s kubeconfig context (`k3s-rp`) and the registry ref (`rp2:30500`) both use +the bare short name `rp2`. On macOS that resolves only via mDNS (`rp2.local`), and +the bare form is unreliable — in particular, kubectl's Go resolver returns +`lookup rp2: no such host` right after the node reboots (mDNS hasn't re-advertised +yet), which aborts `make dev-up CLUSTER=k3s` in pre-flight. Pin it in `/etc/hosts` +so every resolver (kubectl, docker, curl) resolves it deterministically: + +```bash +echo "192.168.1.2 rp2" | sudo tee -a /etc/hosts +# verify +dscacheutil -q host -a name rp2 # -> ip_address: 192.168.1.2 +``` + +One-time and persists across reboots. Use `rp2.local` or `192.168.1.2` directly if +you prefer not to edit `/etc/hosts`, but then the kubeconfig server URL and the +registry refs would need to match — pinning `rp2` keeps everything as-is. + +## 1. Mac: trust the registry over plain HTTP + +The registry has no TLS, so the Docker daemon must treat `rp2:30500` as insecure, +or `docker push` refuses it. + +Docker Desktop → **Settings → Docker Engine**, add `rp2:30500` to +`insecure-registries`: + +```json +{ + "insecure-registries": ["rp2:30500"] +} +``` + +**Apply & Restart**. Verify: + +```bash +docker info 2>/dev/null | grep -A2 "Insecure Registries" # should list rp2:30500 +``` + +## 2. Each k3s node: mirror rp2:30500 over HTTP + +containerd on every node must also be told the registry is plain HTTP, or pods +fail to pull with an `http: server gave HTTP response to HTTPS client` error. + +On **each** node (`rp2`, `rp3`, `rp4`), create/merge +`/etc/rancher/k3s/registries.yaml`. Note the endpoint is the **IP** +`192.168.1.2` (rp2), not the hostname: the nodes do not resolve each other's +bare hostnames (`lookup rp2: no such host`), so the mirror must dial an IP. The +mirror *key* stays `rp2:30500` so image references and the Mac's Docker config +are unchanged. + +```yaml +mirrors: + "rp2:30500": + endpoint: + - "http://192.168.1.2:30500" +``` + +Then restart k3s so containerd reloads: + +```bash +# server nodes (this cluster is 3x control-plane): +sudo systemctl restart k3s +``` + +Verify a node can pull once an image has been pushed (after the first build): + +```bash +sudo k3s crictl pull rp2:30500/tmi-server:dev # should succeed +``` + +### Optional helper + +To push the file to all three nodes at once (requires SSH + sudo on each): + +```bash +for n in rp2 rp3 rp4; do + ssh "$n" 'sudo mkdir -p /etc/rancher/k3s && \ + printf "mirrors:\n \"rp2:30500\":\n endpoint:\n - \"http://192.168.1.2:30500\"\n" | sudo tee /etc/rancher/k3s/registries.yaml >/dev/null && \ + sudo systemctl restart k3s' +done +``` + +## Notes + +- These steps are **idempotent** and only needed once per machine (they persist + across reboots). They do not affect the default `kind` dev path. +- `rp2` must resolve from the Mac — pin it in `/etc/hosts` (step 0). Relying on + mDNS alone is what breaks `dev-up` after a node reboot. diff --git a/deployments/k8s/dev/k3s/kustomization.yaml b/deployments/k8s/dev/k3s/kustomization.yaml new file mode 100644 index 00000000..12774ecf --- /dev/null +++ b/deployments/k8s/dev/k3s/kustomization.yaml @@ -0,0 +1,48 @@ +# k3s (CLUSTER=k3s) dev overlay — full parity with the kind dev stack, but every +# TMI image is served from the in-cluster registry at rp2:30500 instead of the +# kind-local localhost:5000. +# +# Postgres (postgres.yml) and the in-cluster registry (registry.yml) are NOT part +# of this overlay: deploy.py applies them as prerequisites (registry before the +# image build/push; Postgres before the server, mirroring how kind brings the DB +# up first). This overlay is the workload set: server, controller, redis, and the +# extractor/chunk-embed worker TMIComponents. +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: tmi-platform +resources: + - ../controller.yml + - ../redis.yml + - ../server.yml + - ../../platform/components/tmi-extractor.yml + - ../../platform/components/tmi-chunk-embed.yml +# Deployment container images (server, controller) — remapped by the images +# transformer (preserves the :dev tag). +images: + - name: localhost:5000/tmi-server + newName: rp2:30500/tmi-server + - name: localhost:5000/tmi-component-controller + newName: rp2:30500/tmi-component-controller + # Redis: the kind path uses cgr.dev/chainguard/redis, whose bundled jemalloc is + # compiled for 4KB pages and aborts with "Unsupported system page size" on the + # Pi 5 (BCM2712) nodes, which run a 16KB-page kernel. redis:7-alpine is built + # with libc malloc (page-size-agnostic) and runs the same redis-server with the + # same args. Verified starting cleanly on the nodes. + - name: cgr.dev/chainguard/redis + newName: redis + newTag: "7-alpine" +# The workers are TMIComponent custom resources; their .spec.image is not a +# PodSpec container image, so the images transformer cannot reach it — patch it. +patches: + - path: patches/extractor-image.yaml + target: + group: tmi.dev + version: v1alpha1 + kind: TMIComponent + name: tmi-extractor + - path: patches/chunkembed-image.yaml + target: + group: tmi.dev + version: v1alpha1 + kind: TMIComponent + name: tmi-chunk-embed diff --git a/deployments/k8s/dev/k3s/patches/chunkembed-image.yaml b/deployments/k8s/dev/k3s/patches/chunkembed-image.yaml new file mode 100644 index 00000000..bc933b3a --- /dev/null +++ b/deployments/k8s/dev/k3s/patches/chunkembed-image.yaml @@ -0,0 +1,3 @@ +- op: replace + path: /spec/image + value: rp2:30500/tmi-chunk-embed:dev diff --git a/deployments/k8s/dev/k3s/patches/extractor-image.yaml b/deployments/k8s/dev/k3s/patches/extractor-image.yaml new file mode 100644 index 00000000..ee6659bc --- /dev/null +++ b/deployments/k8s/dev/k3s/patches/extractor-image.yaml @@ -0,0 +1,3 @@ +- op: replace + path: /spec/image + value: rp2:30500/tmi-extractor:dev diff --git a/deployments/k8s/dev/k3s/postgres.yml b/deployments/k8s/dev/k3s/postgres.yml new file mode 100644 index 00000000..a469984a --- /dev/null +++ b/deployments/k8s/dev/k3s/postgres.yml @@ -0,0 +1,84 @@ +# In-cluster single-node Postgres for the k3s (CLUSTER=k3s) dev target. +# +# Uses the same vanilla Chainguard image as the kind dev DB (Dockerfile.postgres +# is just `FROM cgr.dev/chainguard/postgres:latest` + PGDATA/locale env, with no +# extensions), so the node pulls it straight from cgr.dev — no build/push to the +# in-cluster registry is needed. Schema is created by the server's GORM +# AutoMigrate at startup (same as kind). +# +# Credentials MUST match database.url in config-development.yml +# (postgres://tmi_dev:dev123@.../tmi_dev). The server reaches this DB via the +# `postgres` Service DNS name; deploy.py rewrites the config URL host to `postgres` +# for CLUSTER=k3s. +apiVersion: v1 +kind: Secret +metadata: + name: tmi-postgres + namespace: tmi-platform +type: Opaque +stringData: + POSTGRES_USER: "tmi_dev" + POSTGRES_PASSWORD: "dev123" + POSTGRES_DB: "tmi_dev" +--- +apiVersion: v1 +kind: Service +metadata: + name: postgres + namespace: tmi-platform +spec: + selector: { app: postgres } + ports: + - port: 5432 + targetPort: 5432 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: postgres + namespace: tmi-platform +spec: + serviceName: postgres + replicas: 1 + selector: + matchLabels: { app: postgres } + template: + metadata: + labels: { app: postgres } + spec: + containers: + - name: postgres + image: cgr.dev/chainguard/postgres:latest + envFrom: + - secretRef: { name: tmi-postgres } + env: + # Match Dockerfile.postgres so behavior mirrors the kind dev DB. + - { name: PGDATA, value: /var/lib/postgresql/data/pgdata } + - { name: LANG, value: en_US.UTF-8 } + - { name: LC_ALL, value: en_US.UTF-8 } + ports: + - containerPort: 5432 + volumeMounts: + - name: data + mountPath: /var/lib/postgresql/data + readinessProbe: + exec: { command: ["pg_isready", "-U", "tmi_dev", "-d", "tmi_dev"] } + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 3 + livenessProbe: + exec: { command: ["pg_isready", "-U", "tmi_dev"] } + initialDelaySeconds: 20 + periodSeconds: 10 + resources: + requests: { cpu: 100m, memory: 256Mi } + limits: { cpu: "1", memory: 1Gi } + volumeClaimTemplates: + - metadata: + name: data + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: longhorn + resources: + requests: + storage: 8Gi diff --git a/deployments/k8s/dev/k3s/registry.yml b/deployments/k8s/dev/k3s/registry.yml new file mode 100644 index 00000000..636a01ba --- /dev/null +++ b/deployments/k8s/dev/k3s/registry.yml @@ -0,0 +1,66 @@ +# In-cluster image registry for the k3s (CLUSTER=k3s) dev target. +# +# The Mac builds arm64 images and pushes them to this registry at rp2:30500 +# (NodePort 30500). The k3s nodes pull from it via a one-time +# /etc/rancher/k3s/registries.yaml mirror entry (see README-node-setup.md). +# +# registry:2 is multi-arch (arm64-capable) and is pulled from Docker Hub by the +# node, so there is no chicken-and-egg with this registry itself. +apiVersion: v1 +kind: Namespace +metadata: + name: tmi-platform +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: registry-data + namespace: tmi-platform +spec: + accessModes: ["ReadWriteOnce"] + storageClassName: longhorn + resources: + requests: + storage: 5Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: registry + namespace: tmi-platform +spec: + replicas: 1 + selector: + matchLabels: { app: registry } + template: + metadata: + labels: { app: registry } + spec: + containers: + - name: registry + image: registry:2 + ports: + - containerPort: 5000 + volumeMounts: + - name: data + mountPath: /var/lib/registry + resources: + requests: { cpu: 50m, memory: 128Mi } + limits: { cpu: "1", memory: 512Mi } + volumes: + - name: data + persistentVolumeClaim: + claimName: registry-data +--- +apiVersion: v1 +kind: Service +metadata: + name: registry + namespace: tmi-platform +spec: + type: NodePort + selector: { app: registry } + ports: + - port: 5000 + targetPort: 5000 + nodePort: 30500 diff --git a/scripts/devenv.py b/scripts/devenv.py index 71c40624..f54e2c29 100644 --- a/scripts/devenv.py +++ b/scripts/devenv.py @@ -16,7 +16,8 @@ cluster up|down the kind cluster only db up|down the postgres container only -Global: --db postgres|oracle (default postgres), --no-workers, --yes +Global: --db postgres|oracle (default postgres), --cluster kind|k3s + (default kind), --no-workers, --yes """ from __future__ import annotations @@ -41,47 +42,69 @@ def _db_profile(): return database.dev_profile() +def _uses_host_db(args) -> bool: + """True when the Mac-hosted Postgres container is in play: postgres DB on the + kind cluster. For CLUSTER=k3s the DB runs in-cluster, so the host container is + never touched.""" + return args.db == "postgres" and args.cluster == "kind" + + def cmd_up(args) -> None: - cluster.up() - if args.db == "postgres": + cluster.up(cluster=args.cluster) + if _uses_host_db(args): database.up(_db_profile()) - deploy.start(db=args.db, no_workers=args.no_workers, skip_context_guard=args.yes) + deploy.start(db=args.db, cluster_target=args.cluster, + no_workers=args.no_workers, skip_context_guard=args.yes) def cmd_down(args) -> None: deploy.teardown(db=args.db) - if args.db == "postgres": + if _uses_host_db(args): database.down(_db_profile()) # keep volume - cluster.down() + cluster.down(cluster=args.cluster) log_success("dev environment down (db data preserved)") def cmd_restart(args) -> None: - deploy.restart(db=args.db, no_workers=args.no_workers, skip_context_guard=args.yes) + deploy.restart(db=args.db, cluster_target=args.cluster, + no_workers=args.no_workers, skip_context_guard=args.yes) def cmd_reset(args) -> None: log_info("dev-reset: redeploying the in-cluster stack (keeping cluster + db data)") deploy.teardown(db=args.db) - if args.db == "postgres": + if _uses_host_db(args): database.up(_db_profile()) # ensure db is up (no data loss) - deploy.start(db=args.db, no_workers=args.no_workers, skip_context_guard=args.yes) + deploy.start(db=args.db, cluster_target=args.cluster, + no_workers=args.no_workers, skip_context_guard=args.yes) log_success("dev-reset complete") def cmd_nuke(args) -> None: log_info("dev-nuke: destroying EVERYTHING incl. db data + built images") + if args.cluster == "k3s": + # Namespace-scoped hard reset: we don't own the k3s cluster, so wipe the + # tmi-platform namespace (workloads + registry + Postgres data) and redeploy. + cluster.up(cluster="k3s") # ensure the k3s-rp context is active + deploy.teardown_k3s_namespace() + deploy.remove_local_images(args.db, cluster_target="k3s") + _clean_logs_and_files() + deploy.start(db=args.db, cluster_target="k3s", + no_workers=args.no_workers, skip_context_guard=args.yes) + log_success("dev-nuke complete (fresh k3s environment up)") + return deploy.teardown(db=args.db) - cluster.down() - if args.db == "postgres": + cluster.down(cluster=args.cluster) + if _uses_host_db(args): database.destroy(_db_profile()) # removes container + volume (data wiped) deploy.remove_local_images(args.db) _clean_logs_and_files() # rebuild from scratch - cluster.up() - if args.db == "postgres": + cluster.up(cluster=args.cluster) + if _uses_host_db(args): database.up(_db_profile()) - deploy.start(db=args.db, no_workers=args.no_workers, skip_context_guard=args.yes) + deploy.start(db=args.db, cluster_target=args.cluster, + no_workers=args.no_workers, skip_context_guard=args.yes) log_success("dev-nuke complete (fresh environment up)") @@ -96,7 +119,8 @@ def cmd_status(args) -> None: def cmd_deploy(args) -> None: - deploy.start(db=args.db, no_workers=args.no_workers, skip_context_guard=args.yes) + deploy.start(db=args.db, cluster_target=args.cluster, + no_workers=args.no_workers, skip_context_guard=args.yes) def cmd_logs(args) -> None: @@ -104,7 +128,7 @@ def cmd_logs(args) -> None: def cmd_cluster(args) -> None: - {"up": cluster.up, "down": cluster.down}[args.action]() + {"up": cluster.up, "down": cluster.down}[args.action](cluster=args.cluster) def cmd_db(args) -> None: @@ -145,12 +169,16 @@ def _add_global_options( if is_subparser: parser.add_argument("--db", choices=["postgres", "oracle"], default=argparse.SUPPRESS) + parser.add_argument("--cluster", choices=["kind", "k3s"], + default=argparse.SUPPRESS) parser.add_argument("--no-workers", action="store_true", dest="no_workers", default=argparse.SUPPRESS) parser.add_argument("--yes", action="store_true", default=argparse.SUPPRESS, help="Skip the local-kube-context safety check") else: parser.add_argument("--db", choices=["postgres", "oracle"], default="postgres") + parser.add_argument("--cluster", choices=["kind", "k3s"], default="kind", + help="Kube cluster target: kind (local, default) or k3s (remote k3s-rp)") parser.add_argument("--no-workers", action="store_true", dest="no_workers") parser.add_argument("--yes", action="store_true", help="Skip the local-kube-context safety check") diff --git a/scripts/lib/cluster.py b/scripts/lib/cluster.py index 347d2481..81d079ad 100644 --- a/scripts/lib/cluster.py +++ b/scripts/lib/cluster.py @@ -21,14 +21,30 @@ _PROJECT_ROOT = Path(__file__).resolve().parents[2] KIND_CONFIG = str(_PROJECT_ROOT / "deployments/k8s/dev/kind-cluster.yml") +# Remote k3s dev target (CLUSTER=k3s). We do not own this cluster: we select +# its context but never create/delete it. Images go to an in-cluster registry +# exposed at rp2:30500 (NodePort 30500). +K3S_CONTEXT = "k3s-rp" +K3S_REGISTRY = "rp2:30500" + + +def registry_for(cluster: str = "kind") -> str: + """Return the dev image-registry hostname for the given cluster target.""" + return K3S_REGISTRY if cluster == "k3s" else LOCAL_REGISTRY + + +def expected_context(cluster: str = "kind") -> str: + """Return the kube-context that must be active for the given cluster target.""" + return K3S_CONTEXT if cluster == "k3s" else f"kind-{CLUSTER_NAME}" + # Contexts we consider safe to deploy a dev environment into without --yes. _LOCAL_CONTEXT_PREFIXES = ("kind-", "k3d-") _LOCAL_CONTEXT_EXACT = {"k3s", "default", "rancher-desktop", "docker-desktop", "minikube"} -def local_image_ref(name: str, tag: str = "dev", registry: str = LOCAL_REGISTRY) -> str: - """Return the fully-qualified local-registry image reference.""" - return f"{registry}/{name}:{tag}" +def local_image_ref(name: str, tag: str = "dev", *, cluster: str = "kind") -> str: + """Return the fully-qualified dev-registry image reference for the cluster.""" + return f"{registry_for(cluster)}/{name}:{tag}" def is_local_kube_context(name: str) -> bool: @@ -116,8 +132,17 @@ def connect_registry_to_kind() -> None: ) -def up() -> None: - """Bring up the kind cluster and local dev registry.""" +def up(cluster: str = "kind") -> None: + """Bring up the dev cluster: create kind + local registry, or (k3s) select the + remote context without creating anything (the in-cluster registry and workloads + are applied by deploy).""" + if cluster == "k3s": + check_tool("kubectl") + log_info(f"Using existing k3s context '{K3S_CONTEXT}' (no cluster create)") + run_cmd(["kubectl", "config", "use-context", K3S_CONTEXT]) + log_success(f"kube context set to '{K3S_CONTEXT}'") + return + for tool in ("docker", "kind", "kubectl"): check_tool(tool) @@ -131,12 +156,21 @@ def up() -> None: run_cmd(["kind", "create", "cluster", "--config", KIND_CONFIG]) log_success(f"kind cluster '{CLUSTER_NAME}' created") + # `kind create` sets the kubectl context, but the skip-create path does not, + # so an active context from another cluster (e.g. k3s-rp) would linger and fail + # deploy's context guard. Always select the kind context — idempotent. + run_cmd(["kubectl", "config", "use-context", f"kind-{CLUSTER_NAME}"]) + connect_registry_to_kind() log_success(f"kind cluster '{CLUSTER_NAME}' ready; context kind-{CLUSTER_NAME}") -def down() -> None: - """Delete the kind cluster entirely.""" +def down(cluster: str = "kind") -> None: + """Delete the kind cluster entirely; no-op for the remote k3s cluster (not ours + to delete — namespace teardown is handled by deploy).""" + if cluster == "k3s": + log_info("cluster down is a no-op for k3s (remote cluster is not ours to delete)") + return check_tool("kind") run_cmd(["kind", "delete", "cluster", "--name", CLUSTER_NAME]) log_success(f"kind cluster '{CLUSTER_NAME}' deleted") diff --git a/scripts/lib/deploy.py b/scripts/lib/deploy.py index 53b81955..2746f253 100644 --- a/scripts/lib/deploy.py +++ b/scripts/lib/deploy.py @@ -80,8 +80,14 @@ def image_builds_for(db: str) -> list[tuple[str, str, dict]]: ] -def overlay_dir_for(db: str) -> str: - """Return the kustomize overlay directory path for the chosen DB flavor.""" +def overlay_dir_for(db: str, cluster_target: str = "kind") -> str: + """Return the kustomize overlay directory path for the chosen cluster + DB flavor. + + CLUSTER=k3s uses its own overlay (in-cluster registry image refs, full stack); + Oracle-on-k3s is out of scope, so k3s implies the postgres overlay. + """ + if cluster_target == "k3s": + return f"{DEV_DIR}/k3s" return f"{DEV_DIR}/oracle" if db == "oracle" else DEV_DIR @@ -108,6 +114,17 @@ def content_hash(text: str) -> str: # database-URL authority to this host. Docker Desktop maps host.docker.internal to # the host, reaching the 127.0.0.1:5432-published Postgres container. IN_CLUSTER_DB_HOST = "host.docker.internal" + + +def in_cluster_db_host(cluster_target: str = "kind") -> str: + """Host the in-cluster server uses to reach Postgres for the given cluster. + + kind: Postgres is a Mac container, reached via host.docker.internal. + k3s: Postgres runs in-cluster as the `postgres` Service (postgres.yml). + """ + return "postgres" if cluster_target == "k3s" else IN_CLUSTER_DB_HOST + + # Match the host (and optional :port) inside a postgres:// URL authority, after # the credentials '@'. Deliberately narrow: it touches ONLY a postgres URL's host, # never other localhost references in the config (redis host, OAuth callbacks, ...). @@ -179,14 +196,16 @@ def _preflight() -> None: sys.exit(1) -def _guard_context(skip: bool) -> str: +def _guard_context(skip: bool, cluster_target: str = "kind") -> str: ctx = current_kube_context() log_info(f"kubectl context: {ctx or '(none)'} namespace: {NS}") if not ctx: log_error("No kubectl context set. Run 'make dev-cluster-up'") sys.exit(1) - if not skip and not cluster.is_local_kube_context(ctx): - log_error(f"Context '{ctx}' is not a recognized local dev cluster. Re-run with --yes to override.") + expected = cluster.expected_context(cluster_target) + if not skip and ctx != expected and not cluster.is_local_kube_context(ctx): + log_error(f"Context '{ctx}' is not the expected '{expected}' for CLUSTER={cluster_target}, " + f"nor a recognized local dev cluster. Re-run with --yes to override.") sys.exit(1) return ctx @@ -297,8 +316,13 @@ def unstage_tmi_client(created: bool) -> None: # Image build + push # --------------------------------------------------------------------------- -def build_and_push(db: str) -> None: - """Build all images and push them to the local registry. +def build_and_push(db: str, cluster_target: str = "kind") -> None: + """Build all images and push them to the target cluster's registry. + + kind -> localhost:5000; k3s -> the in-cluster registry at rp2:30500. The Mac + and the k3s nodes are both arm64, so a plain host-arch `docker build` already + produces arm64 images — no buildx/--platform is needed; only the registry the + ref points at differs. All four Dockerfiles require the tmi-client staged in .docker-deps/. Stage once before the first build and clean up in a try/finally block @@ -311,7 +335,7 @@ def build_and_push(db: str) -> None: created = stage_tmi_client() try: for name, dockerfile, build_args_map in image_builds_for(db): - ref = cluster.local_image_ref(name) + ref = cluster.local_image_ref(name, cluster=cluster_target) log_info(f"Building {name} ({dockerfile}) -> {ref}") cmd = ["docker", "build", "-f", str(project_root / dockerfile)] @@ -347,11 +371,32 @@ def ensure_namespace() -> None: ) -def deliver_config() -> None: +def ensure_k3s_registry() -> None: + """Apply the in-cluster registry and wait for it (k3s prerequisite before push). + + The Mac builds images and pushes them to this registry (rp2:30500), and the + nodes pull from it, so it must be Running before build_and_push.""" + project_root = get_project_root() + kubectl(["apply", "-f", str(project_root / DEV_DIR / "k3s" / "registry.yml")]) + kubectl(["-n", NS, "rollout", "status", "deploy/registry", "--timeout=180s"]) + log_success("In-cluster registry ready (rp2:30500)") + + +def apply_k3s_postgres() -> None: + """Apply the in-cluster Postgres and wait for it (k3s prerequisite before the + server, mirroring how the kind path brings the host DB up first).""" + project_root = get_project_root() + kubectl(["apply", "-f", str(project_root / DEV_DIR / "k3s" / "postgres.yml")]) + kubectl(["-n", NS, "rollout", "status", "statefulset/postgres", "--timeout=180s"]) + log_success("In-cluster Postgres ready (svc/postgres:5432)") + + +def deliver_config(cluster_target: str = "kind") -> None: content = (get_project_root() / CONFIG_FILE).read_text() - # The on-disk config points the DB at localhost (for host-side tools); the - # in-cluster server reaches the host-published Postgres via host.docker.internal. - content = rewrite_db_host_for_incluster(content) + # The on-disk config points the DB at localhost (for host-side tools); rewrite + # it to the host the in-cluster server uses: host.docker.internal (kind) or the + # in-cluster `postgres` Service (k3s). + content = rewrite_db_host_for_incluster(content, db_host=in_cluster_db_host(cluster_target)) manifest = render_configmap_yaml( name=CONFIGMAP_NAME, namespace=NS, file_key="config.yml", content=content, ) @@ -421,7 +466,7 @@ def create_oracle_db_secret() -> None: log_success("oracle DB connection delivered as Secret/tmi-oracle-db") -def apply_overlay(no_workers: bool, db: str) -> None: +def apply_overlay(no_workers: bool, db: str, cluster_target: str = "kind") -> None: """Apply the dev overlay. When --no-workers: apply the three core manifests individually to avoid @@ -441,7 +486,7 @@ def apply_overlay(no_workers: bool, db: str) -> None: else: rendered = run_cmd( ["kubectl", "kustomize", "--load-restrictor", "LoadRestrictionsNone", - str(project_root / overlay_dir_for(db))], + str(project_root / overlay_dir_for(db, cluster_target))], capture=True, ).stdout kubectl(["apply", "-f", "-"], input_text=rendered) @@ -461,10 +506,15 @@ def server_rollout_timeout(db: str) -> str: return "1200s" if db == "oracle" else "180s" -def wait_and_forward(db: str = "postgres") -> None: +def wait_and_forward(db: str = "postgres", cluster_target: str = "kind") -> None: kubectl(["-n", NS, "rollout", "status", "deploy/tmi-component-controller", "--timeout=120s"]) kubectl(["-n", NS, "rollout", "status", "deploy/tmi-server", f"--timeout={server_rollout_timeout(db)}"]) start_redis_port_forward() + # k3s has no extraPortMappings, so preserve localhost:8080 with a server + # port-forward. Start it AFTER the redis forward, whose stop_port_forward() + # clears both pidfiles, so this one survives. + if cluster_target == "k3s": + start_server_port_forward() wait_for_server() log_success(f"Dev environment ready at {SERVER_URL}") @@ -509,6 +559,25 @@ def start_redis_port_forward() -> None: log_info(f"Port-forward started (PID {redis_proc.pid}): localhost:6379 -> svc/redis:6379") +def start_server_port_forward() -> None: + """Forward the in-cluster server to localhost:8080 (k3s only). + + kind publishes the server NodePort on localhost:8080 via extraPortMappings, so + no forward is needed there. A remote k3s cluster has no such mapping, so we + preserve the localhost:8080 contract with a port-forward. (For CATS/high- + throughput, hit the NodePort at rp2:30080 directly — the userspace forward + throttles under load, the #463 problem.) Stops only a prior SERVER forward so + it does not disturb the redis forward started just before it.""" + _stop_port_forward_pidfile(PORT_FORWARD_PID) + srv_proc = subprocess.Popen( + ["kubectl", "-n", NS, "port-forward", "svc/tmi-server", f"{HOST_PORT}:{HOST_PORT}"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + Path(PORT_FORWARD_PID).write_text(str(srv_proc.pid)) + log_info(f"Port-forward started (PID {srv_proc.pid}): localhost:{HOST_PORT} -> svc/tmi-server:{HOST_PORT}") + + def _stop_port_forward_pidfile(pid_path: str) -> None: p = Path(pid_path) if p.exists(): @@ -535,10 +604,11 @@ def tail_server_logs() -> None: kubectl(["-n", NS, "logs", "-f", "deploy/tmi-server", "--tail=200"], check=False) -def remove_local_images(db: str) -> None: +def remove_local_images(db: str, cluster_target: str = "kind") -> None: """Remove the locally-built dev images (used by `devenv.py nuke`).""" for name, _df, _args in image_builds_for(db): - run_cmd(["docker", "rmi", "-f", cluster.local_image_ref(name)], check=False) + run_cmd(["docker", "rmi", "-f", cluster.local_image_ref(name, cluster=cluster_target)], + check=False) def server_http_status() -> tuple[bool, str]: @@ -556,21 +626,27 @@ def server_http_status() -> tuple[bool, str]: # Orchestration entry points # --------------------------------------------------------------------------- -def start(*, db: str, no_workers: bool = False, skip_context_guard: bool = False) -> None: +def start(*, db: str, cluster_target: str = "kind", no_workers: bool = False, + skip_context_guard: bool = False) -> None: """Build images, deploy all components, wait for readiness, and start port-forwards.""" _preflight() - _guard_context(skip_context_guard) - cluster.ensure_registry() - cluster.connect_registry_to_kind() - build_and_push(db) + _guard_context(skip_context_guard, cluster_target) + if cluster_target == "k3s": + ensure_k3s_registry() # in-cluster registry must be up before push + else: + cluster.ensure_registry() + cluster.connect_registry_to_kind() + build_and_push(db, cluster_target) ensure_namespace() apply_platform_base() - deliver_config() + if cluster_target == "k3s": + apply_k3s_postgres() # in-cluster DB up before the server (AutoMigrate) + deliver_config(cluster_target) create_embedding_secret() if db == "oracle": create_oracle_wallet_secret() create_oracle_db_secret() - apply_overlay(no_workers, db) + apply_overlay(no_workers, db, cluster_target) # `kubectl apply` of an unchanged Deployment spec does not roll a new pod, so # a freshly-built :dev image (same tag) would not be picked up, and a pod # stuck in CrashLoopBackOff from a prior transient outage (e.g. the host DB @@ -580,24 +656,30 @@ def start(*, db: str, no_workers: bool = False, skip_context_guard: bool = False # backoff-cleared pods. imagePullPolicy:Always ensures the new image is pulled. kubectl(["-n", NS, "rollout", "restart", "deploy/tmi-component-controller"]) kubectl(["-n", NS, "rollout", "restart", "deploy/tmi-server"]) - wait_and_forward(db) + wait_and_forward(db, cluster_target) -def restart(*, db: str, no_workers: bool = False, skip_context_guard: bool = False) -> None: +def restart(*, db: str, cluster_target: str = "kind", no_workers: bool = False, + skip_context_guard: bool = False) -> None: """Rebuild the server image, re-deliver config, and roll the server deployment.""" _preflight() - _guard_context(skip_context_guard) - cluster.ensure_registry() - cluster.connect_registry_to_kind() - build_and_push(db) - deliver_config() + _guard_context(skip_context_guard, cluster_target) + if cluster_target == "k3s": + ensure_k3s_registry() + else: + cluster.ensure_registry() + cluster.connect_registry_to_kind() + build_and_push(db, cluster_target) + deliver_config(cluster_target) if db == "oracle": create_oracle_wallet_secret() create_oracle_db_secret() - apply_overlay(no_workers, db) + apply_overlay(no_workers, db, cluster_target) kubectl(["-n", NS, "rollout", "restart", "deploy/tmi-server"]) kubectl(["-n", NS, "rollout", "status", "deploy/tmi-server", f"--timeout={server_rollout_timeout(db)}"]) start_redis_port_forward() + if cluster_target == "k3s": + start_server_port_forward() wait_for_server() log_success(f"Server restarted; {SERVER_URL}") @@ -670,3 +752,12 @@ def teardown(*, db: str = "postgres") -> None: run_cmd(["docker", "stop", cluster.REGISTRY_CONTAINER], check=False) log_success("Dev environment torn down (cluster left intact)") + + +def teardown_k3s_namespace() -> None: + """Hard reset for k3s: delete the entire tmi-platform namespace (all workloads, + the in-cluster registry, and the Postgres PVC/data). Never touches the k3s + cluster itself — we do not own it.""" + stop_port_forward() + kubectl(["delete", "namespace", NS, "--ignore-not-found", "--wait=true"]) + log_success(f"Namespace {NS} deleted (k3s hard reset)") diff --git a/scripts/lib/tests/test_cluster.py b/scripts/lib/tests/test_cluster.py index faa87f87..c0a344f1 100644 --- a/scripts/lib/tests/test_cluster.py +++ b/scripts/lib/tests/test_cluster.py @@ -13,6 +13,40 @@ def test_local_image_ref_default_tag(self): def test_local_image_ref_custom_tag(self): self.assertEqual(cluster.local_image_ref("tmi-server", tag="x"), "localhost:5000/tmi-server:x") + def test_local_image_ref_kind_explicit(self): + self.assertEqual( + cluster.local_image_ref("tmi-server", cluster="kind"), + "localhost:5000/tmi-server:dev", + ) + + def test_local_image_ref_k3s(self): + self.assertEqual( + cluster.local_image_ref("tmi-server", cluster="k3s"), + "rp2:30500/tmi-server:dev", + ) + + +class TestRegistryFor(unittest.TestCase): + def test_registry_for_default_is_kind(self): + self.assertEqual(cluster.registry_for(), "localhost:5000") + + def test_registry_for_kind(self): + self.assertEqual(cluster.registry_for("kind"), "localhost:5000") + + def test_registry_for_k3s(self): + self.assertEqual(cluster.registry_for("k3s"), "rp2:30500") + + +class TestExpectedContext(unittest.TestCase): + def test_expected_context_default_is_kind(self): + self.assertEqual(cluster.expected_context(), "kind-tmi-dev") + + def test_expected_context_kind(self): + self.assertEqual(cluster.expected_context("kind"), "kind-tmi-dev") + + def test_expected_context_k3s(self): + self.assertEqual(cluster.expected_context("k3s"), "k3s-rp") + class TestIsLocalKubeContext(unittest.TestCase): def test_is_local_kube_context_kind_prefix(self): diff --git a/scripts/lib/tests/test_deploy.py b/scripts/lib/tests/test_deploy.py index 29e8e708..77abe687 100644 --- a/scripts/lib/tests/test_deploy.py +++ b/scripts/lib/tests/test_deploy.py @@ -39,6 +39,25 @@ def test_overlay_dir_oracle(self): def test_overlay_dir_postgres(self): self.assertFalse(deploy.overlay_dir_for("postgres").endswith("/oracle")) + def test_overlay_dir_k3s(self): + # CLUSTER=k3s uses the k3s overlay regardless of DB flavor. + self.assertTrue(deploy.overlay_dir_for("postgres", "k3s").endswith("/k3s")) + self.assertTrue(deploy.overlay_dir_for("oracle", "k3s").endswith("/k3s")) + + +class TestInClusterDbHost(unittest.TestCase): + def test_kind_uses_host_docker_internal(self): + self.assertEqual(deploy.in_cluster_db_host(), "host.docker.internal") + self.assertEqual(deploy.in_cluster_db_host("kind"), "host.docker.internal") + + def test_k3s_uses_postgres_service(self): + self.assertEqual(deploy.in_cluster_db_host("k3s"), "postgres") + + def test_k3s_rewrites_url_host_to_postgres_service(self): + src = 'url: "postgres://tmi_dev:dev123@localhost:5432/tmi_dev?sslmode=disable"' + out = deploy.rewrite_db_host_for_incluster(src, db_host=deploy.in_cluster_db_host("k3s")) + self.assertIn("@postgres:5432/tmi_dev", out) + class TestNoWorkersFiles(unittest.TestCase): def test_no_workers_files_oracle_uses_oracle_server(self): @@ -130,13 +149,23 @@ def test_kind_cluster_maps_host_to_nodeport(self): "kind-cluster.yml: extraPortMappings hostPort must equal deploy.HOST_PORT", ) - def test_no_server_port_forward_remains(self): - """The server must NOT be port-forwarded; only redis is (issue #463).""" + def test_server_port_forward_is_k3s_only(self): + """#463: the KIND server is reached via the NodePort, never a port-forward + (the userspace proxy collapsed under CATS load). A server port-forward + exists ONLY for the remote k3s target — which has no extraPortMappings, and + where CATS uses the NodePort at rp2:30080 directly — and every invocation is + gated on cluster_target == 'k3s'.""" src = (Path(deploy.__file__)).read_text() - self.assertNotRegex( - src, re.compile(r"port-forward.*svc/tmi-server"), - "deploy.py must not port-forward the server (use the NodePort)", - ) + # Exactly one server port-forward command, inside start_server_port_forward. + cmd_lines = re.findall(r'port-forward".*svc/tmi-server', src) + self.assertEqual(len(cmd_lines), 1, + "exactly one server port-forward command (the k3s helper)") + # Every call site (excluding the def) must be immediately gated on k3s. + call_sites = re.findall(r"(?