ericfitz · ericfitz · Jul 4, 2026 · Jul 3, 2026 · Jul 3, 2026 · Jul 3, 2026
diff --git a/Makefile b/Makefile
@@ -22,6 +22,9 @@ SERVER_PORT ?= 8080
 # Default database backend for dev environment (postgres|oracle)
 DB ?= postgres
 
+# Default Kubernetes cluster target for the dev environment (kind|k3s); override per invocation, e.g. `make dev-up CLUSTER=k3s`
+CLUSTER ?= kind
+
 # ============================================================================
 # ATOMIC COMPONENTS - Infrastructure Management
 # ============================================================================
@@ -317,38 +320,38 @@ tilt-down:  ## Stop Tilt and restore the prod-shaped server
 	@echo "prod-shaped server restored"
 
 # ============================================================================
-# DEV ENVIRONMENT — single orchestrator (scripts/devenv.py). DB=postgres|oracle
+# DEV ENVIRONMENT — single orchestrator (scripts/devenv.py). Knobs (overridable on the make command line): DB=postgres|oracle, CLUSTER=kind|k3s
 # ============================================================================
 
-dev-up:  ## Bring up the full kind dev environment (cluster + db + deploy). DB=postgres|oracle
-	@uv run scripts/devenv.py --db $(DB) up
+dev-up:  ## Bring up the full dev environment (cluster + db + deploy). DB=postgres|oracle CLUSTER=kind|k3s
+	@uv run scripts/devenv.py --db $(DB) --cluster $(CLUSTER) up
 
-dev-down:  ## Tear down the kind dev environment; KEEP db data
-	@uv run scripts/devenv.py --db $(DB) down
+dev-down:  ## Tear down the dev environment; KEEP db data
+	@uv run scripts/devenv.py --db $(DB) --cluster $(CLUSTER) down
 
 dev-restart:  ## Rebuild the server image + roll the server pod (cluster + db untouched)
-	@uv run scripts/devenv.py --db $(DB) restart
+	@uv run scripts/devenv.py --db $(DB) --cluster $(CLUSTER) restart
 
 dev-reset:  ## Soft known-state: redeploy the stack with fresh images; KEEP db data
-	@uv run scripts/devenv.py --db $(DB) reset
+	@uv run scripts/devenv.py --db $(DB) --cluster $(CLUSTER) reset
 
 dev-nuke:  ## Hard known-state: destroy everything incl. db data + images, rebuild
-	@uv run scripts/devenv.py --db $(DB) nuke
+	@uv run scripts/devenv.py --db $(DB) --cluster $(CLUSTER) nuke
 
-dev-status:  ## kind-aware dev environment status dashboard
-	@uv run scripts/devenv.py status
+dev-status:  ## dev environment status dashboard
+	@uv run scripts/devenv.py --cluster $(CLUSTER) status
 
 dev-logs:  ## Stream the tmi-server pod logs
-	@uv run scripts/devenv.py logs
+	@uv run scripts/devenv.py --cluster $(CLUSTER) logs
 
 dev-deploy:  ## (Re)apply manifests + rollout without recreating cluster/db
-	@uv run scripts/devenv.py --db $(DB) deploy
+	@uv run scripts/devenv.py --db $(DB) --cluster $(CLUSTER) deploy
 
-dev-cluster-up:  ## Create the local kind cluster + registry only
-	@uv run scripts/devenv.py cluster up
+dev-cluster-up:  ## Create the local cluster + registry only (kind); switch context (k3s)
+	@uv run scripts/devenv.py --cluster $(CLUSTER) cluster up
 
-dev-cluster-down:  ## Delete the local kind cluster only
-	@uv run scripts/devenv.py cluster down
+dev-cluster-down:  ## Delete the local kind cluster only (no-op for k3s)
+	@uv run scripts/devenv.py --cluster $(CLUSTER) cluster down
 
 dev-db-up:  ## Start the postgres dev container only
 	@uv run scripts/devenv.py db up

diff --git a/deployments/k8s/dev/k3s/README-node-setup.md b/deployments/k8s/dev/k3s/README-node-setup.md
@@ -0,0 +1,97 @@
+# k3s dev target — one-time host & node setup
+
+`make dev-up CLUSTER=k3s` deploys TMI to the remote **k3s-rp** cluster
+(nodes `rp2`/`rp3`/`rp4` = `192.168.1.2`/`.3`/`.4`, all arm64). Images are served
+from an in-cluster registry at **`rp2:30500`** (plain HTTP). Three one-time,
+out-of-band configuration steps (0–2 below) are required before the first
+`dev-up CLUSTER=k3s`, because they need root/SSH on machines the dev tooling cannot reach.
+
+## 0. Mac: make `rp2` resolve reliably
+
+The k3s kubeconfig context (`k3s-rp`) and the registry ref (`rp2:30500`) both use
+the bare short name `rp2`. On macOS that resolves only via mDNS (`rp2.local`), and
+the bare form is unreliable — in particular, kubectl's Go resolver returns
+`lookup rp2: no such host` right after the node reboots (mDNS hasn't re-advertised
+yet), which aborts `make dev-up CLUSTER=k3s` in pre-flight. Pin it in `/etc/hosts`
+so every resolver (kubectl, docker, curl) resolves it deterministically:
+
+```bash
+echo "192.168.1.2  rp2" | sudo tee -a /etc/hosts
+# verify
+dscacheutil -q host -a name rp2   # -> ip_address: 192.168.1.2
+```
+
+One-time and persists across reboots. Use `rp2.local` or `192.168.1.2` directly if
+you prefer not to edit `/etc/hosts`, but then the kubeconfig server URL and the
+registry refs would need to match — pinning `rp2` keeps everything as-is.
+
+## 1. Mac: trust the registry over plain HTTP
+
+The registry has no TLS, so the Docker daemon must treat `rp2:30500` as insecure,
+or `docker push` refuses it.
+
+Docker Desktop → **Settings → Docker Engine**, add `rp2:30500` to
+`insecure-registries`:
+
+```json
+{
+  "insecure-registries": ["rp2:30500"]
+}
+```
+
+**Apply & Restart**. Verify:
+
+```bash
+docker info 2>/dev/null | grep -A2 "Insecure Registries"   # should list rp2:30500
+```
+
+## 2. Each k3s node: mirror rp2:30500 over HTTP
+
+containerd on every node must also be told the registry is plain HTTP, or pods
+fail to pull with an `http: server gave HTTP response to HTTPS client` error.
+
+On **each** node (`rp2`, `rp3`, `rp4`), create/merge
+`/etc/rancher/k3s/registries.yaml`. Note the endpoint is the **IP**
+`192.168.1.2` (rp2), not the hostname: the nodes do not resolve each other's
+bare hostnames (`lookup rp2: no such host`), so the mirror must dial an IP. The
+mirror *key* stays `rp2:30500` so image references and the Mac's Docker config
+are unchanged.
+
+```yaml
+mirrors:
+  "rp2:30500":
+    endpoint:
+      - "http://192.168.1.2:30500"
+```
+
+Then restart k3s so containerd reloads:
+
+```bash
+# server nodes (this cluster is 3x control-plane):
+sudo systemctl restart k3s
+```
+
+Verify a node can pull once an image has been pushed (after the first build):
+
+```bash
+sudo k3s crictl pull rp2:30500/tmi-server:dev   # should succeed
+```
+
+### Optional helper
+
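+To push the file to all three nodes at once (requires SSH + sudo on each; note that this **overwrites** any existing `/etc/rancher/k3s/registries.yaml` rather than merging into it):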
+
+```bash
+for n in rp2 rp3 rp4; do
+  ssh "$n" 'sudo mkdir -p /etc/rancher/k3s && \
+    printf "mirrors:\n  \"rp2:30500\":\n    endpoint:\n      - \"http://192.168.1.2:30500\"\n" | sudo tee /etc/rancher/k3s/registries.yaml >/dev/null && \
+    sudo systemctl restart k3s'
+done
+```
+
+## Notes
+
+- These steps are only needed once per machine (they persist across reboots) and are safe to repeat, except that step 0's `tee -a` appends a duplicate `/etc/hosts` entry on every run.
+  They do not affect the default `kind` dev path.
+- `rp2` must resolve from the Mac — pin it in `/etc/hosts` (step 0). Relying on
+  mDNS alone is what breaks `dev-up` after a node reboot.
diff --git a/deployments/k8s/dev/k3s/kustomization.yaml b/deployments/k8s/dev/k3s/kustomization.yaml
@@ -0,0 +1,48 @@
+# k3s (CLUSTER=k3s) dev overlay — full parity with the kind dev stack, but every
+# TMI image is served from the in-cluster registry at rp2:30500 instead of the
+# kind-local localhost:5000.
+#
+# Postgres (postgres.yml) and the in-cluster registry (registry.yml) are NOT part
+# of this overlay: deploy.py applies them as prerequisites (registry before the
+# image build/push; Postgres before the server, mirroring how kind brings the DB
+# up first). This overlay is the workload set: server, controller, redis, and the
+# extractor/chunk-embed worker TMIComponents.
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+namespace: tmi-platform
+resources:
+  - ../controller.yml
+  - ../redis.yml
+  - ../server.yml
+  - ../../platform/components/tmi-extractor.yml
+  - ../../platform/components/tmi-chunk-embed.yml
+# Workload container images (server, controller, redis) — remapped by the images
+# transformer; server/controller set only newName, so their :dev tag is kept.
+images:
+  - name: localhost:5000/tmi-server
+    newName: rp2:30500/tmi-server
+  - name: localhost:5000/tmi-component-controller
+    newName: rp2:30500/tmi-component-controller
+  # Redis: the kind path uses cgr.dev/chainguard/redis, whose bundled jemalloc is
+  # compiled for 4KB pages and aborts with "Unsupported system page size" on the
+  # Pi 5 (BCM2712) nodes, which run a 16KB-page kernel. redis:7-alpine is built
+  # with libc malloc (page-size-agnostic) and runs the same redis-server with the
+  # same args. Verified starting cleanly on the nodes.
+  - name: cgr.dev/chainguard/redis
+    newName: redis
+    newTag: "7-alpine"
+# The workers are TMIComponent custom resources; their .spec.image is not a
+# PodSpec container image, so the images transformer cannot reach it — patch it.
+patches:
+  - path: patches/extractor-image.yaml
+    target:
+      group: tmi.dev
+      version: v1alpha1
+      kind: TMIComponent
+      name: tmi-extractor
+  - path: patches/chunkembed-image.yaml
+    target:
+      group: tmi.dev
+      version: v1alpha1
+      kind: TMIComponent
+      name: tmi-chunk-embed
diff --git a/deployments/k8s/dev/k3s/patches/chunkembed-image.yaml b/deployments/k8s/dev/k3s/patches/chunkembed-image.yaml
@@ -0,0 +1,3 @@
+- op: replace
+  path: /spec/image
+  value: rp2:30500/tmi-chunk-embed:dev
diff --git a/deployments/k8s/dev/k3s/patches/extractor-image.yaml b/deployments/k8s/dev/k3s/patches/extractor-image.yaml
@@ -0,0 +1,3 @@
+- op: replace
+  path: /spec/image
+  value: rp2:30500/tmi-extractor:dev
diff --git a/deployments/k8s/dev/k3s/postgres.yml b/deployments/k8s/dev/k3s/postgres.yml
@@ -0,0 +1,86 @@
+# In-cluster single-node Postgres for the k3s (CLUSTER=k3s) dev target.
+#
+# Uses the same vanilla Chainguard image as the kind dev DB (Dockerfile.postgres
+# is just `FROM cgr.dev/chainguard/postgres:latest` + PGDATA/locale env, with no
+# extensions), so the node pulls it straight from cgr.dev — no build/push to the
+# in-cluster registry is needed. Schema is created by the server's GORM
+# AutoMigrate at startup (same as kind).
+#
+# Credentials MUST match database.url in config-development.yml
+# (postgres://tmi_dev:dev123@.../tmi_dev). The server reaches this DB via the
+# `postgres` Service DNS name; deploy.py rewrites the config URL host to `postgres`
+# for CLUSTER=k3s.
+apiVersion: v1
+kind: Secret
+metadata:
+  name: tmi-postgres
+  namespace: tmi-platform
+type: Opaque
+stringData:
+  POSTGRES_USER: "tmi_dev"
+  POSTGRES_PASSWORD: "dev123"
+  POSTGRES_DB: "tmi_dev"
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: postgres
+  namespace: tmi-platform
+spec:
+  selector: { app: postgres }
+  ports:
+    - port: 5432
+      targetPort: 5432
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: postgres
+  namespace: tmi-platform
+spec:
+  serviceName: postgres
+  replicas: 1
+  selector:
+    matchLabels: { app: postgres }
+  template:
+    metadata:
+      labels: { app: postgres }
+    spec:
+      containers:
+        - name: postgres
+          image: cgr.dev/chainguard/postgres:latest
+          envFrom:
+            - secretRef: { name: tmi-postgres }
+          env:
+            # Match Dockerfile.postgres so behavior mirrors the kind dev DB.
+            - { name: PGDATA, value: /var/lib/postgresql/data/pgdata }
+            - { name: LANG, value: en_US.UTF-8 }
+            - { name: LC_ALL, value: en_US.UTF-8 }
+          ports:
+            - containerPort: 5432
+          volumeMounts:
+            - name: data
+              mountPath: /var/lib/postgresql/data
+          readinessProbe:
+            exec: { command: ["pg_isready", "-U", "tmi_dev", "-d", "tmi_dev"] }
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            timeoutSeconds: 3
+          livenessProbe:
+            exec: { command: ["pg_isready", "-U", "tmi_dev"] }
+            initialDelaySeconds: 20
+            periodSeconds: 10
+            # Same timeout as readiness; the 1s default is tight for an exec probe.
+            timeoutSeconds: 3
+          resources:
+            requests: { cpu: 100m, memory: 256Mi }
+            limits: { cpu: "1", memory: 1Gi }
+  volumeClaimTemplates:
+    - metadata:
+        name: data
+      spec:
+        accessModes: ["ReadWriteOnce"]
+        storageClassName: longhorn
+        resources:
+          requests:
+            storage: 8Gi
diff --git a/deployments/k8s/dev/k3s/registry.yml b/deployments/k8s/dev/k3s/registry.yml
@@ -0,0 +1,70 @@
+# In-cluster image registry for the k3s (CLUSTER=k3s) dev target.
+#
+# The Mac builds arm64 images and pushes them to this registry at rp2:30500
+# (NodePort 30500). The k3s nodes pull from it via a one-time
+# /etc/rancher/k3s/registries.yaml mirror entry (see README-node-setup.md).
+#
+# registry:2 is multi-arch (arm64-capable) and is pulled from Docker Hub by the
+# node, so there is no chicken-and-egg with this registry itself.
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: tmi-platform
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: registry-data
+  namespace: tmi-platform
+spec:
+  accessModes: ["ReadWriteOnce"]
+  storageClassName: longhorn
+  resources:
+    requests:
+      storage: 5Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: registry
+  namespace: tmi-platform
+spec:
+  replicas: 1
+  # Recreate, not the default RollingUpdate: the registry-data PVC is
+  # ReadWriteOnce, so a surge pod scheduled onto another node could never attach
+  # the volume and the rollout would hang.
+  strategy: { type: Recreate }
+  selector:
+    matchLabels: { app: registry }
+  template:
+    metadata:
+      labels: { app: registry }
+    spec:
+      containers:
+        - name: registry
+          image: registry:2
+          ports:
+            - containerPort: 5000
+          volumeMounts:
+            - name: data
+              mountPath: /var/lib/registry
+          resources:
+            requests: { cpu: 50m, memory: 128Mi }
+            limits: { cpu: "1", memory: 512Mi }
+      volumes:
+        - name: data
+          persistentVolumeClaim:
+            claimName: registry-data
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: registry
+  namespace: tmi-platform
+spec:
+  type: NodePort
+  selector: { app: registry }
+  ports:
+    - port: 5000
+      targetPort: 5000
+      nodePort: 30500