Skip to content
Merged
35 changes: 19 additions & 16 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ SERVER_PORT ?= 8080
# Default database backend for dev environment (postgres|oracle).
# Forwarded to scripts/devenv.py as --db by the dev-* targets. `?=` only assigns
# when DB is not already set, so the environment or command line can override
# it (e.g. `make dev-up DB=oracle`).
DB ?= postgres

# Default kube cluster target for dev environment (kind|k3s).
# Forwarded to scripts/devenv.py as --cluster by the dev-* targets; override
# per invocation with e.g. `make dev-up CLUSTER=k3s`.
CLUSTER ?= kind

# ============================================================================
# ATOMIC COMPONENTS - Infrastructure Management
# ============================================================================
Expand Down Expand Up @@ -317,38 +320,38 @@ tilt-down: ## Stop Tilt and restore the prod-shaped server
@echo "prod-shaped server restored"

# ============================================================================
# DEV ENVIRONMENT — single orchestrator (scripts/devenv.py). DB=postgres|oracle CLUSTER=kind|k3s
# ============================================================================
# Every target below is a thin wrapper that delegates to scripts/devenv.py via
# `uv run`. $(DB) and $(CLUSTER) are forwarded as --db / --cluster; the status,
# logs and cluster sub-commands take only --cluster. Each target is defined
# exactly once with a single recipe line so devenv.py runs once per invocation.

dev-up: ## Bring up the full dev environment (cluster + db + deploy). DB=postgres|oracle CLUSTER=kind|k3s
	@uv run scripts/devenv.py --db $(DB) --cluster $(CLUSTER) up

dev-down: ## Tear down the dev environment; KEEP db data
	@uv run scripts/devenv.py --db $(DB) --cluster $(CLUSTER) down

dev-restart: ## Rebuild the server image + roll the server pod (cluster + db untouched)
	@uv run scripts/devenv.py --db $(DB) --cluster $(CLUSTER) restart

dev-reset: ## Soft known-state: redeploy the stack with fresh images; KEEP db data
	@uv run scripts/devenv.py --db $(DB) --cluster $(CLUSTER) reset

dev-nuke: ## Hard known-state: destroy everything incl. db data + images, rebuild
	@uv run scripts/devenv.py --db $(DB) --cluster $(CLUSTER) nuke

dev-status: ## dev environment status dashboard
	@uv run scripts/devenv.py --cluster $(CLUSTER) status

dev-logs: ## Stream the tmi-server pod logs
	@uv run scripts/devenv.py --cluster $(CLUSTER) logs

dev-deploy: ## (Re)apply manifests + rollout without recreating cluster/db
	@uv run scripts/devenv.py --db $(DB) --cluster $(CLUSTER) deploy

dev-cluster-up: ## Create the local cluster + registry only (kind); switch context (k3s)
	@uv run scripts/devenv.py --cluster $(CLUSTER) cluster up

dev-cluster-down: ## Delete the local kind cluster only (no-op for k3s)
	@uv run scripts/devenv.py --cluster $(CLUSTER) cluster down

dev-db-up: ## Start the postgres dev container only
@uv run scripts/devenv.py db up
Expand Down
97 changes: 97 additions & 0 deletions deployments/k8s/dev/k3s/README-node-setup.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# k3s dev target — one-time host & node setup

`make dev-up CLUSTER=k3s` deploys TMI to the remote **k3s-rp** cluster
(nodes `rp2`/`rp3`/`rp4` = `192.168.1.2`/`.3`/`.4`, all arm64). Images are served
from an in-cluster registry at **`rp2:30500`** (plain HTTP). Two one-time,
out-of-band configuration steps are required before the first `dev-up CLUSTER=k3s`,
because they need root/SSH on machines the dev tooling cannot reach.

## 0. Mac: make `rp2` resolve reliably

The k3s kubeconfig context (`k3s-rp`) and the registry ref (`rp2:30500`) both use
the bare short name `rp2`. On macOS that resolves only via mDNS (`rp2.local`), and
the bare form is unreliable — in particular, kubectl's Go resolver returns
`lookup rp2: no such host` right after the node reboots (mDNS hasn't re-advertised
yet), which aborts `make dev-up CLUSTER=k3s` in pre-flight. Pin it in `/etc/hosts`
so every resolver (kubectl, docker, curl) resolves it deterministically:

```bash
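# append only if the entry is not already present, so re-running is harmless
grep -qE '^192\.168\.1\.2[[:space:]]+rp2([[:space:]]|$)' /etc/hosts || echo "192.168.1.2 rp2" | sudo tee -a /etc/hosts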
# verify
dscacheutil -q host -a name rp2 # -> ip_address: 192.168.1.2
```

This is a one-time change and the entry persists across reboots. If you prefer not
to edit `/etc/hosts`, you can use `rp2.local` or `192.168.1.2` directly instead, but
then the kubeconfig server URL and every registry reference must be changed to
match — pinning `rp2` keeps everything as-is.

## 1. Mac: trust the registry over plain HTTP

The registry has no TLS, so the Docker daemon must treat `rp2:30500` as insecure,
or `docker push` refuses it.

Docker Desktop → **Settings → Docker Engine**, add `rp2:30500` to
`insecure-registries`:

```json
{
  "insecure-registries": ["rp2:30500"]
}
```

**Apply & Restart**. Verify:

```bash
docker info 2>/dev/null | grep -A2 "Insecure Registries" # should list rp2:30500
```

## 2. Each k3s node: mirror rp2:30500 over HTTP

containerd on every node must also be told the registry is plain HTTP, or pods
fail to pull with an `http: server gave HTTP response to HTTPS client` error.

On **each** node (`rp2`, `rp3`, `rp4`), create/merge
`/etc/rancher/k3s/registries.yaml`. Note the endpoint is the **IP**
`192.168.1.2` (rp2), not the hostname: the nodes do not resolve each other's
bare hostnames (`lookup rp2: no such host`), so the mirror must dial an IP. The
mirror *key* stays `rp2:30500` so image references and the Mac's Docker config
are unchanged.

```yaml
mirrors:
  "rp2:30500":
    endpoint:
      - "http://192.168.1.2:30500"
```

Then restart k3s so containerd reloads:

```bash
# server nodes (this cluster is 3x control-plane):
sudo systemctl restart k3s
```

Verify a node can pull once an image has been pushed (after the first build):

```bash
sudo k3s crictl pull rp2:30500/tmi-server:dev # should succeed
```

### Optional helper

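To write the file on all three nodes at once (requires SSH + sudo on each). Note
that this **overwrites** any existing `/etc/rancher/k3s/registries.yaml`; if a node
already has one, merge the `mirrors` entry by hand instead: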

```bash
for n in rp2 rp3 rp4; do
  ssh "$n" 'sudo mkdir -p /etc/rancher/k3s && \
    printf "mirrors:\n  \"rp2:30500\":\n    endpoint:\n      - \"http://192.168.1.2:30500\"\n" | sudo tee /etc/rancher/k3s/registries.yaml >/dev/null && \
    sudo systemctl restart k3s'
done
```

## Notes

- These steps are **idempotent** and only needed once per machine (they persist
across reboots). They do not affect the default `kind` dev path.
- `rp2` must resolve from the Mac — pin it in `/etc/hosts` (step 0). Relying on
mDNS alone is what breaks `dev-up` after a node reboot.
48 changes: 48 additions & 0 deletions deployments/k8s/dev/k3s/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# k3s (CLUSTER=k3s) dev overlay — full parity with the kind dev stack, but every
# TMI image is served from the in-cluster registry at rp2:30500 instead of the
# kind-local localhost:5000.
#
# Postgres (postgres.yml) and the in-cluster registry (registry.yml) are NOT part
# of this overlay: deploy.py applies them as prerequisites (registry before the
# image build/push; Postgres before the server, mirroring how kind brings the DB
# up first). This overlay is the workload set: server, controller, redis, and the
# extractor/chunk-embed worker TMIComponents.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: tmi-platform

# Workload manifests: server, controller, redis and the two worker TMIComponents.
resources:
  - ../controller.yml
  - ../redis.yml
  - ../server.yml
  - ../../platform/components/tmi-extractor.yml
  - ../../platform/components/tmi-chunk-embed.yml

# Deployment container images (server, controller) are renamed by the images
# transformer. Only newName is given for them, so the existing :dev tag is kept.
images:
  - name: localhost:5000/tmi-server
    newName: rp2:30500/tmi-server
  - name: localhost:5000/tmi-component-controller
    newName: rp2:30500/tmi-component-controller
  # Redis: the kind path uses cgr.dev/chainguard/redis, whose bundled jemalloc is
  # compiled for 4KB pages and aborts with "Unsupported system page size" on the
  # Pi 5 (BCM2712) nodes, which run a 16KB-page kernel. redis:7-alpine is built
  # with libc malloc (page-size-agnostic) and runs the same redis-server with
  # the same args. Verified starting cleanly on the nodes.
  - name: cgr.dev/chainguard/redis
    newName: redis
    newTag: "7-alpine"

# The workers are TMIComponent custom resources. Their .spec.image is not a
# PodSpec container image, so the images transformer cannot reach it; each one
# is rewritten with a targeted patch instead.
patches:
  - target: { group: tmi.dev, version: v1alpha1, kind: TMIComponent, name: tmi-extractor }
    path: patches/extractor-image.yaml
  - target: { group: tmi.dev, version: v1alpha1, kind: TMIComponent, name: tmi-chunk-embed }
    path: patches/chunkembed-image.yaml
3 changes: 3 additions & 0 deletions deployments/k8s/dev/k3s/patches/chunkembed-image.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# JSON6902 patch: point the chunk-embed worker at the in-cluster registry image.
- op: replace
  path: /spec/image
  value: "rp2:30500/tmi-chunk-embed:dev"
3 changes: 3 additions & 0 deletions deployments/k8s/dev/k3s/patches/extractor-image.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# JSON6902 patch: point the extractor worker at the in-cluster registry image.
- op: replace
  path: /spec/image
  value: "rp2:30500/tmi-extractor:dev"
84 changes: 84 additions & 0 deletions deployments/k8s/dev/k3s/postgres.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# In-cluster single-node Postgres for the k3s (CLUSTER=k3s) dev target.
#
# Uses the same vanilla Chainguard image as the kind dev DB (Dockerfile.postgres
# is just `FROM cgr.dev/chainguard/postgres:latest` + PGDATA/locale env, with no
# extensions), so the node pulls it straight from cgr.dev — no build/push to the
# in-cluster registry is needed. Schema is created by the server's GORM
# AutoMigrate at startup (same as kind).
#
# Credentials MUST match database.url in config-development.yml
# (postgres://tmi_dev:dev123@.../tmi_dev). The server reaches this DB via the
# `postgres` Service DNS name; deploy.py rewrites the config URL host to `postgres`
# for CLUSTER=k3s.
# Dev-only database credentials, written as plain text under stringData.
# The postgres container in this file loads every key as an environment
# variable through envFrom/secretRef.
apiVersion: v1
kind: Secret
type: Opaque
metadata:
  name: tmi-postgres
  namespace: tmi-platform
stringData:
  POSTGRES_DB: "tmi_dev"
  POSTGRES_USER: "tmi_dev"
  POSTGRES_PASSWORD: "dev123"
---
# Service that exposes pods labelled app=postgres on port 5432 under the
# in-cluster DNS name `postgres`.
apiVersion: v1
kind: Service
metadata:
  name: postgres
  namespace: tmi-platform
spec:
  selector:
    app: postgres
  ports:
    - port: 5432
      targetPort: 5432
---
# Single-replica Postgres backed by an 8Gi Longhorn volume (one PVC per pod via
# volumeClaimTemplates). Credentials come from the tmi-postgres Secret.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: postgres
  namespace: tmi-platform
spec:
  serviceName: postgres
  replicas: 1
  selector:
    matchLabels: { app: postgres }
  template:
    metadata:
      labels: { app: postgres }
    spec:
      containers:
        - name: postgres
          image: cgr.dev/chainguard/postgres:latest
          envFrom:
            - secretRef: { name: tmi-postgres }
          env:
            # Match Dockerfile.postgres so behavior mirrors the kind dev DB.
            - { name: PGDATA, value: /var/lib/postgresql/data/pgdata }
            - { name: LANG, value: en_US.UTF-8 }
            - { name: LC_ALL, value: en_US.UTF-8 }
          ports:
            - containerPort: 5432
          volumeMounts:
            - name: data
              mountPath: /var/lib/postgresql/data
          # Ready once the server accepts connections for the dev database.
          readinessProbe:
            exec: { command: ["pg_isready", "-U", "tmi_dev", "-d", "tmi_dev"] }
            initialDelaySeconds: 5
            periodSeconds: 5
            timeoutSeconds: 3
          # Restart the container if the server stops accepting connections.
          # timeoutSeconds is set explicitly to match the readiness probe: the
          # default is 1s, and a slow exec of pg_isready would otherwise count
          # as a failure and cause needless restarts.
          livenessProbe:
            exec: { command: ["pg_isready", "-U", "tmi_dev"] }
            initialDelaySeconds: 20
            periodSeconds: 10
            timeoutSeconds: 3
          resources:
            requests: { cpu: 100m, memory: 256Mi }
            limits: { cpu: "1", memory: 1Gi }
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes: ["ReadWriteOnce"]
        storageClassName: longhorn
        resources:
          requests:
            storage: 8Gi
66 changes: 66 additions & 0 deletions deployments/k8s/dev/k3s/registry.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# In-cluster image registry for the k3s (CLUSTER=k3s) dev target.
#
# The Mac builds arm64 images and pushes them to this registry at rp2:30500
# (NodePort 30500). The k3s nodes pull from it via a one-time
# /etc/rancher/k3s/registries.yaml mirror entry (see README-node-setup.md).
#
# registry:2 is multi-arch (arm64-capable) and is pulled from Docker Hub by the
# node, so there is no chicken-and-egg with this registry itself.
# Namespace that every object in this file is created in.
apiVersion: v1
kind: Namespace
metadata: { name: tmi-platform }
---
# 5Gi Longhorn volume holding the registry's image data; mounted by the
# registry Deployment at /var/lib/registry.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: registry-data
  namespace: tmi-platform
spec:
  storageClassName: longhorn
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 5Gi
---
# Single-replica registry:2 storing image data on the registry-data PVC.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: registry
  namespace: tmi-platform
spec:
  replicas: 1
  # The data volume is ReadWriteOnce. With the default RollingUpdate strategy a
  # replacement pod is started before the old one is stopped, and if it lands on
  # a different node it cannot attach the volume while the old pod still holds
  # it. Recreate stops the old pod first so the volume is free for the new one.
  strategy:
    type: Recreate
  selector:
    matchLabels: { app: registry }
  template:
    metadata:
      labels: { app: registry }
    spec:
      containers:
        - name: registry
          image: registry:2
          ports:
            - containerPort: 5000
          # Report Ready only once the registry answers on its v2 API root, so a
          # rollout wait does not return before pushes can succeed.
          readinessProbe:
            httpGet:
              path: /v2/
              port: 5000
            initialDelaySeconds: 2
            periodSeconds: 5
            timeoutSeconds: 3
          volumeMounts:
            - name: data
              mountPath: /var/lib/registry
          resources:
            requests: { cpu: 50m, memory: 128Mi }
            limits: { cpu: "1", memory: 512Mi }
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: registry-data
---
# NodePort Service publishing the registry (container port 5000) on port 30500
# of every node.
apiVersion: v1
kind: Service
metadata:
  name: registry
  namespace: tmi-platform
spec:
  type: NodePort
  selector:
    app: registry
  ports:
    - port: 5000
      targetPort: 5000
      nodePort: 30500
Loading
Loading