From 3e2223d0c71faee0b50f85d76ee5cf9969cc79a9 Mon Sep 17 00:00:00 2001 From: fuddin-bit Date: Thu, 4 Jun 2026 11:53:40 -0400 Subject: [PATCH 1/7] Add GPU Health & DCGM panels to Grafana dashboard. Extend gpu-dashboard.json with temperature, power, VRAM %, memory-copy, XID, and optional profiling metrics; sync Helm ConfigMap and document PromQL in DASHBOARD.md and GRAFANA_DEPLOYMENT.md. Co-authored-by: Cursor --- DASHBOARD.md | 21 + GRAFANA_DEPLOYMENT.md | 854 ++++++++++++++++++++++++++ gpu-dashboard.json | 299 ++++++++- helm/grafana-dashboard-configmap.yaml | 773 +++++++++++++++++++++++ 4 files changed, 1946 insertions(+), 1 deletion(-) create mode 100644 GRAFANA_DEPLOYMENT.md create mode 100644 helm/grafana-dashboard-configmap.yaml diff --git a/DASHBOARD.md b/DASHBOARD.md index fbf2def..9965223 100644 --- a/DASHBOARD.md +++ b/DASHBOARD.md @@ -96,6 +96,7 @@ A Grafana dashboard is included in `gpu-dashboard.json` for more detailed GPU mo - **Idle GPU Workloads**: GPUs with zero compute activity for 30+ minutes - **Idle GPU Time by Deployment**: Deployments producing the most allocated GPU idle time (see [Prometheus Queries](#prometheus-queries) below) - **GPU Allocation Leaderboard**: Total GPU requests per namespace +- **GPU Health & DCGM**: Temperature, power, VRAM %, memory-copy util, XID errors, and optional DCGM profiling metrics ### Importing the Grafana Dashboard @@ -139,6 +140,26 @@ The overview row uses **two independent partitions** of the same total. Each pai Equivalently: **Engine active** = Total − Engine idle, and **VRAM free** = Total − VRAM allocated, when the same DCGM time series are counted. +### GPU Health & DCGM + +Panels in the **GPU Health & DCGM** row use additional dcgm-exporter counters. Profiling panels show no data unless your exporter exposes `DCGM_FI_PROF_*` metrics (same requirement as `DCGM_FI_PROF_GR_ENGINE_ACTIVE`). 
+ +| Panel | PromQL | +|-------|--------| +| Peak GPU temperature | `max(DCGM_FI_DEV_GPU_TEMP)` | +| Peak power (W) | `max(DCGM_FI_DEV_POWER_USAGE)` | +| XID errors (total) | `sum(DCGM_FI_DEV_XID_ERRORS)` | +| GPU temperature by node | `avg by (Hostname) (DCGM_FI_DEV_GPU_TEMP)` | +| Power draw by node | `sum by (Hostname) (DCGM_FI_DEV_POWER_USAGE)` | +| VRAM utilization % | `100 * avg by (Hostname, gpu) (DCGM_FI_DEV_FB_USED / DCGM_FI_DEV_FB_TOTAL)` | +| Memory copy utilization | `avg by (Hostname) (DCGM_FI_DEV_MEM_COPY_UTIL)` | +| XID errors (1h increase) | `sum by (Hostname, gpu) (increase(DCGM_FI_DEV_XID_ERRORS[1h]))` | +| SM active by node | `avg by (Hostname) (DCGM_FI_PROF_SM_ACTIVE)` | +| Tensor pipe active by node | `avg by (Hostname) (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE)` | +| DRAM active by node | `avg by (Hostname) (DCGM_FI_PROF_DRAM_ACTIVE)` | + +Note: gpu-pruner idle detection uses [`query.promql.j2`](gpu-pruner/src/query.promql.j2) at runtime; Grafana idle panels use related but simpler PromQL for visualization. + ### Idle GPU Time by Deployment Query This query identifies which Kubernetes Deployments are producing the most allocated GPU idle time while GPU utilization is at 0%. diff --git a/GRAFANA_DEPLOYMENT.md b/GRAFANA_DEPLOYMENT.md new file mode 100644 index 0000000..5981ce7 --- /dev/null +++ b/GRAFANA_DEPLOYMENT.md @@ -0,0 +1,854 @@ +# Deploying Grafana with GPU Dashboard using Helm + +This guide explains how to deploy a standalone Grafana instance with the GPU Pruner dashboard pre-configured using the official Grafana Helm chart. 
+ +## Table of Contents + +- [Overview](#overview) +- [Prerequisites](#prerequisites) +- [Quick Start](#quick-start) +- [Installation](#installation) +- [Configuration](#configuration) +- [Validation](#validation) +- [Troubleshooting](#troubleshooting) +- [Customization](#customization) +- [Security Considerations](#security-considerations) + +## Overview + +The GPU Pruner project includes a comprehensive Grafana dashboard (`gpu-dashboard.json`) that visualizes: + +- **Cluster GPU Overview**: Total GPUs, VRAM allocation, engine activity +- **GPU Utilization Heatmap**: Per-node GPU utilization over time +- **Running GPU Workloads**: All pods with GPU requests +- **Idle GPU Workloads**: GPUs with zero compute activity for 30+ minutes +- **Idle GPU Time by Deployment**: Deployments producing the most allocated GPU idle time +- **GPU Allocation Leaderboard**: Total GPU requests per namespace + +This deployment uses the **official Grafana Helm chart** (`grafana/grafana`) to create a dedicated Grafana instance for GPU monitoring, separate from the gpu-pruner deployment. + +## Prerequisites + +### Required Components + +Before deploying Grafana, ensure these components are running in your Kubernetes cluster: + +1. **Prometheus** - Collecting metrics from DCGM exporter and kube-state-metrics +2. **DCGM Exporter** - DaemonSet on GPU nodes exposing NVIDIA GPU metrics +3. 
**kube-state-metrics** - With pod labels enabled for deployment-level analysis + +### Required Tools + +- **Helm 3.x** - [Install Helm](https://helm.sh/docs/intro/install/) +- **kubectl** - Configured with cluster access +- **Kubernetes 1.19+** - With GPU nodes + +### Validation Commands + +Verify prerequisites before proceeding: + +```bash +# Check Prometheus is accessible +kubectl get svc -A | grep prometheus + +# Verify DCGM exporter pods on GPU nodes +kubectl get pods -A | grep dcgm + +# Check kube-state-metrics +kubectl get deploy -A | grep kube-state-metrics + +# Test Prometheus query (requires port-forward) +kubectl port-forward -n <prometheus-namespace> svc/<prometheus-service> 9090:9090 & +curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL' | jq '.data.result | length' +# Should return number of GPUs + +# Verify kube_pod_labels metric exists +curl -s 'http://localhost:9090/api/v1/query?query=kube_pod_labels' | jq '.data.result | length' +# Should return > 0 +``` + +### kube-state-metrics Configuration + +For the "Idle GPU Time by Deployment" panel to work, kube-state-metrics **must** be configured with: + +```yaml +--metric-labels-allowlist=pods=[*] +``` + +This enables the `kube_pod_labels` metric. 
Verify with: + +```bash +kubectl get deploy kube-state-metrics -n <kube-state-metrics-namespace> -o yaml | grep metric-labels-allowlist +``` + +If missing, update the deployment: + +```bash +kubectl set env deployment/kube-state-metrics \ + -n <kube-state-metrics-namespace> \ + KUBE_STATE_METRICS_ARGS='--metric-labels-allowlist=pods=[*]' +``` + +## Quick Start + +For a standard Kubernetes cluster with Prometheus Operator: + +```bash +# Add Grafana Helm repository +helm repo add grafana https://grafana.github.io/helm-charts +helm repo update + +# Install Grafana with GPU dashboard +helm install gpu-grafana grafana/grafana \ + -f helm/grafana-values.yaml \ + -f helm/grafana-values-vanilla-k8s.yaml \ + --set adminPassword='YOUR_SECURE_PASSWORD' \ + --set ingress.hosts[0]='grafana-gpu.example.com' \ + --set datasources."datasources\.yaml".datasources[0].url='http://prometheus-k8s.monitoring.svc.cluster.local:9090' \ + -n monitoring --create-namespace + +# Get admin password (if not set above) +kubectl get secret -n monitoring gpu-grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo + +# Access Grafana (port-forward) +kubectl port-forward -n monitoring svc/gpu-grafana 3000:3000 +``` + +Open http://localhost:3000 and log in with `admin` / password from above. 
+ +## Installation + +### Step 1: Add Grafana Helm Repository + +```bash +helm repo add grafana https://grafana.github.io/helm-charts +helm repo update +``` + +### Step 2: Choose Your Environment + +Select the appropriate values file for your Kubernetes environment: + +#### Option A: OpenShift + +```bash +helm install gpu-grafana grafana/grafana \ + -f helm/grafana-values.yaml \ + -f helm/grafana-values-openshift.yaml \ + --set adminPassword='YOUR_SECURE_PASSWORD' \ + -n monitoring --create-namespace +``` + +**Note**: For OpenShift, you'll need to create a ClusterRoleBinding for Prometheus access: + +```bash +oc adm policy add-cluster-role-to-user cluster-monitoring-view -z grafana -n monitoring +``` + +And set the Prometheus token in the datasource configuration (see [Configuration](#configuration)). + +#### Option B: Vanilla Kubernetes + +```bash +helm install gpu-grafana grafana/grafana \ + -f helm/grafana-values.yaml \ + -f helm/grafana-values-vanilla-k8s.yaml \ + --set adminPassword='YOUR_SECURE_PASSWORD' \ + --set ingress.hosts[0]='grafana-gpu.example.com' \ + --set datasources."datasources\.yaml".datasources[0].url='http://your-prometheus:9090' \ + -n monitoring --create-namespace +``` + +Update the Ingress hostname and Prometheus URL to match your environment. 
+ +### Step 3: Verify Deployment + +```bash +# Check pod status +kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana + +# Check logs +kubectl logs -n monitoring -l app.kubernetes.io/name=grafana + +# Verify service +kubectl get svc -n monitoring gpu-grafana +``` + +Expected output: +``` +NAME READY STATUS RESTARTS AGE +gpu-grafana 1/1 Running 0 2m +``` + +### Step 4: Access Grafana + +#### Via Port Forward (for testing) + +```bash +kubectl port-forward -n monitoring svc/gpu-grafana 3000:3000 +``` + +Open http://localhost:3000 + +#### Via Ingress (production) + +Access via the configured hostname: https://grafana-gpu.example.com + +#### Via OpenShift Route (OpenShift only) + +Get the route URL: + +```bash +oc get route -n monitoring grafana -o jsonpath='{.spec.host}' +``` + +### Step 5: Login + +- **Username**: `admin` +- **Password**: The password you set via `--set adminPassword` or retrieve it: + +```bash +kubectl get secret -n monitoring gpu-grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo +``` + +## Configuration + +### Prometheus Datasource URL + +The most critical configuration is the Prometheus datasource URL. 
Update it to match your cluster: + +**In `helm/grafana-values.yaml`** or via `--set`: + +```yaml +datasources: + datasources.yaml: + datasources: + - url: http://YOUR_PROMETHEUS_SERVICE:9090 +``` + +Common patterns: + +| Environment | Prometheus URL | +|-------------|----------------| +| OpenShift | `http://thanos-querier.openshift-monitoring.svc.cluster.local:9090` | +| Prometheus Operator | `http://prometheus-k8s.monitoring.svc.cluster.local:9090` | +| kube-prometheus-stack | `http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090` | +| Custom | `http://prometheus.prometheus.svc.cluster.local:9090` | + +### Datasource UID (Critical) + +The dashboard has a **hardcoded datasource UID**: `PBFA97CFB590B2093` + +**Do NOT change this UID** in the values file: + +```yaml +datasources: + datasources.yaml: + datasources: + - uid: PBFA97CFB590B2093 # Must match dashboard +``` + +If you need a different UID, you'll have to manually edit the dashboard after import (see [Troubleshooting](#datasource-uid-mismatch)). 
+ +### Admin Credentials + +**Option 1: Set via Helm** (simple, not recommended for production) + +```bash +--set adminPassword='YOUR_PASSWORD' +``` + +**Option 2: Use existing secret** (recommended for production) + +Create a secret first: + +```bash +kubectl create secret generic grafana-admin-secret \ + -n monitoring \ + --from-literal=admin-user=admin \ + --from-literal=admin-password='YOUR_SECURE_PASSWORD' +``` + +Update values: + +```yaml +admin: + existingSecret: grafana-admin-secret + userKey: admin-user + passwordKey: admin-password +``` + +### Ingress Configuration + +For **Nginx Ingress**: + +```yaml +ingress: + enabled: true + ingressClassName: nginx + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + hosts: + - grafana-gpu.example.com + tls: + - secretName: grafana-tls + hosts: + - grafana-gpu.example.com +``` + +For **Traefik Ingress**: + +```yaml +ingress: + enabled: true + ingressClassName: traefik + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + traefik.ingress.kubernetes.io/router.entrypoints: websecure +``` + +### OpenShift Route + +For OpenShift (already configured in `grafana-values-openshift.yaml`): + +```yaml +route: + enabled: true + host: grafana-gpu.apps.example.com + tls: + enabled: true + termination: edge +``` + +Or create manually: + +```bash +oc create route edge grafana \ + --service=gpu-grafana \ + --hostname=grafana-gpu.apps.example.com \ + -n monitoring +``` + +### Resource Limits + +Adjust based on your cluster size and dashboard usage: + +```yaml +resources: + limits: + cpu: 500m # Increase for large clusters or many concurrent users + memory: 512Mi # Increase if dashboards are slow to load + requests: + cpu: 250m + memory: 256Mi +``` + +### Persistence + +Enable persistence to retain dashboard edits and datasource configurations: + +```yaml +persistence: + enabled: true + size: 10Gi + storageClassName: default # Adjust for your cluster +``` + +## Validation + +### 1. 
Verify Grafana Pod is Running + +```bash +kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana +``` + +Expected: `STATUS: Running` + +### 2. Check Grafana Logs + +```bash +kubectl logs -n monitoring -l app.kubernetes.io/name=grafana --tail=50 +``` + +Look for: +- `HTTP Server Listen` +- No error messages about datasource or dashboard provisioning + +### 3. Test Prometheus Datasource + +Access Grafana UI → Configuration → Data Sources → Prometheus → Save & Test + +Expected: **"Data source is working"** (green checkmark) + +If it fails, verify: +- Prometheus URL is correct +- Network connectivity from Grafana pod to Prometheus service +- Prometheus is healthy: `kubectl get pods -n <prometheus-namespace>` + +### 4. Verify Dashboard is Loaded + +Grafana UI → Dashboards → GPU Monitoring → Waldorf GPU Usage & Idle Tracker + +Or check via API: + +```bash +kubectl port-forward -n monitoring svc/gpu-grafana 3000:3000 & +curl -s -u admin:YOUR_PASSWORD 'http://localhost:3000/api/search?query=gpu' | jq . +``` + +Expected: JSON response with dashboard UID and title. + +### 5. 
Validate Dashboard Panels + +Check each panel shows data (not "No data"): + +| Panel | Validation Query | Expected Result | +|-------|------------------|-----------------| +| Total GPUs | `count(DCGM_FI_DEV_GPU_UTIL)` | Number of GPUs in cluster | +| VRAM allocated | `count(DCGM_FI_DEV_FB_USED > 0)` | Number with VRAM in use | +| Engine idle | `count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m]) == 0)` | Idle GPU count | +| Running GPU Workloads | `sum by (namespace, pod) (kube_pod_container_resource_requests{resource="nvidia_com_gpu"})` | List of pods with GPUs | +| Peak GPU temperature | `max(DCGM_FI_DEV_GPU_TEMP)` | Max die temp (°C) | +| Peak power | `max(DCGM_FI_DEV_POWER_USAGE)` | Max per-GPU watts | +| XID errors | `sum(DCGM_FI_DEV_XID_ERRORS)` | Should be 0 in healthy clusters | +| VRAM utilization % | `100 * avg by (Hostname, gpu) (DCGM_FI_DEV_FB_USED / DCGM_FI_DEV_FB_TOTAL)` | Per-GPU VRAM fill | +| SM active (profiling) | `avg by (Hostname) (DCGM_FI_PROF_SM_ACTIVE)` | No data if profiling disabled | + +### 6. 
Test Prometheus Queries Manually + +Port-forward to Prometheus: + +```bash +kubectl port-forward -n <prometheus-namespace> svc/<prometheus-service> 9090:9090 +``` + +Run test queries: + +```bash +# Check DCGM metrics exist +curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL' | jq '.data.result | length' + +# GPU health metrics (GPU Health & DCGM dashboard row) +curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_GPU_TEMP' | jq '.data.result | length' +curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_POWER_USAGE' | jq '.data.result | length' +curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_MEM_COPY_UTIL' | jq '.data.result | length' +curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_XID_ERRORS' | jq '.data.result | length' + +# Check kube-state-metrics +curl -sG 'http://localhost:9090/api/v1/query' --data-urlencode 'query=kube_pod_container_resource_requests{resource="nvidia_com_gpu"}' | jq '.data.result | length' + +# Check pod labels (for deployment analysis) +curl -s 'http://localhost:9090/api/v1/query?query=kube_pod_labels' | jq '.data.result | length' +``` + +All should return `> 0` results. + +## Troubleshooting + +### Dashboard Shows "No Data" + +**Cause**: Prometheus datasource not configured correctly or metrics not available. + +**Solution**: + +1. Verify Prometheus datasource connection: + - Grafana UI → Configuration → Data Sources → Prometheus → Save & Test + - Should show "Data source is working" + +2. Check Prometheus has DCGM metrics: + ```bash + kubectl port-forward -n <prometheus-namespace> svc/<prometheus-service> 9090:9090 & + curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL' | jq . + ``` + +3. Verify DCGM exporter is running: + ```bash + kubectl get pods -A | grep dcgm + ``` + +4. Check Prometheus is scraping DCGM exporter: + - Prometheus UI → Status → Targets + - Look for `dcgm-exporter` job with state UP + +### Datasource UID Mismatch + +**Cause**: Dashboard expects datasource UID `PBFA97CFB590B2093` but your datasource has a different UID. 
+ +**Symptoms**: Dashboard panels show "Data source not found" or use the wrong datasource. + +**Solution A: Match the UID** (recommended) + +Configure datasource with the exact UID: + +```yaml +datasources: + datasources.yaml: + datasources: + - uid: PBFA97CFB590B2093 +``` + +Then reinstall or update Grafana: + +```bash +helm upgrade gpu-grafana grafana/grafana -f helm/grafana-values.yaml -n monitoring +``` + +**Solution B: Remap Dashboard** + +1. Open dashboard in Grafana +2. Click gear icon (Dashboard settings) +3. Go to JSON Model +4. Find and replace all instances of `"uid": "PBFA97CFB590B2093"` with your datasource UID +5. Save dashboard + +### Missing Metrics + +**Problem**: Some panels show "No data" but others work. + +**Missing `DCGM_FI_PROF_GR_ENGINE_ACTIVE`**: + +- Older DCGM versions may not export this metric +- Fallback: Dashboard uses `DCGM_FI_DEV_GPU_UTIL / 100` as alternative +- Verify with: `curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_PROF_GR_ENGINE_ACTIVE'` + +**Missing `kube_pod_labels`**: + +- kube-state-metrics not configured with `--metric-labels-allowlist=pods=[*]` +- Affects "Idle GPU Time by Deployment" panel only +- Fix: + ```bash + kubectl set env deployment/kube-state-metrics \ + -n <kube-state-metrics-namespace> \ + KUBE_STATE_METRICS_ARGS='--metric-labels-allowlist=pods=[*]' + ``` + +### Pod Labels Not Showing in Deployment Analysis + +**Cause**: Pods don't have the `app` label set to deployment name. + +**Solution**: Ensure your GPU workloads have proper labels: + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: my-gpu-workload +spec: + template: + metadata: + labels: + app: my-gpu-workload # This label is required +``` + +### OpenShift: Prometheus 403 Forbidden + +**Cause**: Grafana service account doesn't have permission to query Prometheus. 
+ +**Solution**: + +```bash +oc adm policy add-cluster-role-to-user cluster-monitoring-view -z grafana -n monitoring +``` + +And set the Prometheus token: + +```bash +# Get token +TOKEN=$(oc serviceaccounts get-token grafana -n monitoring) + +# Update datasource secureJsonData +kubectl edit secret gpu-grafana -n monitoring +# Add: httpHeaderValue1: 'Bearer <TOKEN>' +``` + +Or use `--set` during Helm install: + +```bash +--set datasources."datasources\.yaml".datasources[0].secureJsonData.httpHeaderValue1="Bearer $TOKEN" +``` + +### Dashboard Not Auto-Imported + +**Cause**: Dashboard provisioning failed or URL is unreachable. + +**Solution A: Manual Import** + +1. Download dashboard: `curl -O https://raw.githubusercontent.com/wseaton/gpu-pruner/main/gpu-dashboard.json` +2. Grafana UI → Dashboards → Import → Upload JSON file +3. Select Prometheus datasource (UID `PBFA97CFB590B2093`) +4. Click Import + +**Solution B: Use ConfigMap Method** + +```bash +# Create ConfigMap +kubectl apply -f helm/grafana-dashboard-configmap.yaml + +# Update Grafana values to use sidecar +helm upgrade gpu-grafana grafana/grafana \ + -f helm/grafana-values.yaml \ + --set sidecar.dashboards.enabled=true \ + -n monitoring +``` + +### Ingress Not Working + +**Missing Ingress Controller**: + +Verify ingress controller is installed: + +```bash +kubectl get pods -n ingress-nginx # or kube-system, or traefik-system +``` + +**Certificate Issues**: + +If using cert-manager: + +```bash +kubectl get certificate -n monitoring +kubectl describe certificate grafana-tls -n monitoring +``` + +**Alternative: Use NodePort or LoadBalancer**: + +```bash +helm upgrade gpu-grafana grafana/grafana \ + -f helm/grafana-values.yaml \ + --set service.type=NodePort \ + --set service.nodePort=30300 \ + -n monitoring +``` + +Access via: `http://<node-ip>:30300` + +## Customization + +### Adjust Idle Detection Window + +The dashboard uses a **30-minute window** to detect idle GPUs. To customize: + +1. Open dashboard in Grafana +2. 
Edit panel (e.g., "Engine idle (30m)") +3. Change query from `[30m]` to desired duration: + ```promql + count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[60m]) == 0) # 60 minutes + ``` +4. Update panel title to reflect new duration +5. Save dashboard + +### Modify GPU Model Assumptions + +The "GPU Memory per GPU" panel shows 140 GiB for H200 GPUs. For other models: + +1. Edit panel +2. Update query to use actual DCGM metric: + ```promql + max(DCGM_FI_DEV_FB_TOTAL) / 1024 # Returns actual GPU memory in GiB + ``` +3. Or hardcode for your GPU model: + ```promql + 80 # For A100 80GB + ``` + +### Add Custom Panels + +The dashboard includes a **GPU Health & DCGM** row with temperature, power, VRAM %, memory-copy utilization, XID errors, and optional profiling metrics. See [`DASHBOARD.md`](DASHBOARD.md#gpu-health--dcgm) for PromQL reference. + +To add more panels: + +1. Dashboard → Add panel → Add a new panel +2. Select Prometheus datasource +3. Enter PromQL query +4. Configure visualization (graph, table, stat, etc.) +5. Save panel + +Example metric already on the dashboard: + +```promql +avg by (Hostname) (DCGM_FI_DEV_GPU_TEMP) +``` + +### Dashboard Refresh Rate + +Change auto-refresh interval: + +1. Dashboard settings (gear icon) → Time options +2. Set refresh interval (e.g., 30s, 1m, 5m) +3. Save + +### Create Alerts + +To alert on idle GPUs: + +1. Edit "Engine idle (30m)" panel +2. Click "Alert" tab → Create alert rule +3. Configure: + - **Condition**: `WHEN last() OF query(A) IS ABOVE 5` + - **Evaluate every**: 5m + - **For**: 10m (grace period) +4. Add notification channel +5. Save + +## Security Considerations + +### 1. Secure Admin Credentials + +**Never hardcode passwords in values files**. 
Use Kubernetes secrets: + +```bash +kubectl create secret generic grafana-admin-secret \ + -n monitoring \ + --from-literal=admin-password="$(openssl rand -base64 32)" +``` + +Update values: + +```yaml +admin: + existingSecret: grafana-admin-secret + passwordKey: admin-password +``` + +### 2. RBAC for Prometheus Access + +Grant minimal permissions to Grafana service account: + +```yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: grafana-prometheus-reader +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: view # Or create custom role with only Prometheus read access +subjects: +- kind: ServiceAccount + name: grafana + namespace: monitoring +``` + +### 3. Network Policies + +Restrict Grafana network access: + +```yaml +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: grafana-netpol + namespace: monitoring +spec: + podSelector: + matchLabels: + app.kubernetes.io/name: grafana + policyTypes: + - Ingress + - Egress + ingress: + - from: + - namespaceSelector: + matchLabels: + name: ingress-nginx # Allow ingress controller + ports: + - protocol: TCP + port: 3000 + egress: + - to: + - namespaceSelector: + matchLabels: + name: monitoring # Allow Prometheus access + ports: + - protocol: TCP + port: 9090 + - to: # Allow DNS + - namespaceSelector: + matchLabels: + name: kube-system + ports: + - protocol: UDP + port: 53 +``` + +### 4. TLS/HTTPS + +Always use TLS for production deployments: + +**With Ingress + cert-manager**: + +```yaml +ingress: + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + tls: + - secretName: grafana-tls + hosts: + - grafana-gpu.example.com +``` + +**With OpenShift Route**: + +```yaml +route: + tls: + enabled: true + termination: edge # or reencrypt for end-to-end TLS +``` + +### 5. Anonymous Access + +Disable anonymous access in production: + +```yaml +grafana.ini: + auth.anonymous: + enabled: false +``` + +### 6. 
Datasource Token Rotation + +For OpenShift or token-based Prometheus auth, rotate tokens regularly: + +```bash +# Generate new token +NEW_TOKEN=$(oc serviceaccounts get-token grafana -n monitoring) + +# Update secret +kubectl patch secret gpu-grafana -n monitoring \ + -p "{\"data\":{\"prometheus-token\":\"$(printf '%s' "$NEW_TOKEN" | base64 | tr -d '\n')\"}}" + +# Restart Grafana +kubectl rollout restart deployment gpu-grafana -n monitoring +``` + +### 7. Audit Logging + +Enable Grafana audit logging: + +```yaml +grafana.ini: + log: + mode: console + level: info + log.console: + format: json + security: + disable_initial_admin_creation: false +``` + +## Additional Resources + +- [Grafana Helm Chart Documentation](https://github.com/grafana/helm-charts/tree/main/charts/grafana) +- [GPU Pruner Dashboard Documentation](DASHBOARD.md) +- [Prometheus Deployment Guide](PROMETHEUS_DEPLOYMENT.md) +- [DCGM Exporter Setup](https://docs.nvidia.com/datacenter/cloud-native/gpu-telemetry/dcgm-exporter.html) +- [kube-state-metrics Configuration](https://github.com/kubernetes/kube-state-metrics/blob/main/docs/cli-arguments.md) + +## Support + +For issues or questions: + +- GitHub Issues: https://github.com/wseaton/gpu-pruner/issues +- Dashboard Documentation: [DASHBOARD.md](DASHBOARD.md) +- Grafana Community: https://community.grafana.com/ diff --git a/gpu-dashboard.json b/gpu-dashboard.json index 6fa1a2f..dbd3391 100644 --- a/gpu-dashboard.json +++ b/gpu-dashboard.json @@ -450,6 +450,303 @@ } } } + }, + { + "title": "GPU Health & DCGM", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 72 }, + "collapsed": false + }, + { + "title": "Peak GPU temperature", + "description": "Highest GPU die temperature across the cluster (DCGM_FI_DEV_GPU_TEMP).", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 73 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "max(DCGM_FI_DEV_GPU_TEMP)", + "legendFormat": "Max °C" + } + ], + 
"fieldConfig": { + "defaults": { + "unit": "celsius", + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 80 }, + { "color": "red", "value": 85 } + ] + } + } + }, + "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" } + }, + { + "title": "Peak power (W)", + "description": "Highest per-GPU power draw in watts (DCGM_FI_DEV_POWER_USAGE). Useful for rack capacity and corroborating idle detection.", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 73 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "max(DCGM_FI_DEV_POWER_USAGE)", + "legendFormat": "Max W" + } + ], + "fieldConfig": { + "defaults": { + "unit": "watt", + "color": { "mode": "fixed", "fixedColor": "orange" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "orange", "value": null }] } + } + }, + "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" } + }, + { + "title": "XID errors (total)", + "description": "Sum of NVIDIA XID driver/hardware error counter (DCGM_FI_DEV_XID_ERRORS). 
Non-zero warrants investigation.", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 73 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_XID_ERRORS)", + "legendFormat": "XID" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "decimals": 0, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + } + } + }, + "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" } + }, + { + "title": "GPU temperature by node", + "description": "Average GPU temperature per host (DCGM_FI_DEV_GPU_TEMP).", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 77 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "avg by (Hostname) (DCGM_FI_DEV_GPU_TEMP)", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "celsius", + "min": 0, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 15, + "spanNulls": true + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 80 }, + { "color": "red", "value": 85 } + ] + } + } + } + }, + { + "title": "Power draw by node", + "description": "Sum of per-GPU power draw per host in watts (DCGM_FI_DEV_POWER_USAGE).", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 77 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "sum by (Hostname) (DCGM_FI_DEV_POWER_USAGE)", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "watt", + "min": 0, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 15, + "spanNulls": true + } + } + } + }, + { + "title": "VRAM utilization %", + "description": "Framebuffer used as a percentage of 
total VRAM per GPU (DCGM_FI_DEV_FB_USED / DCGM_FI_DEV_FB_TOTAL).", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 85 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "100 * avg by (Hostname, gpu) (DCGM_FI_DEV_FB_USED / DCGM_FI_DEV_FB_TOTAL)", + "legendFormat": "{{Hostname}} GPU {{gpu}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 10, + "spanNulls": true + } + } + } + }, + { + "title": "Memory copy utilization", + "description": "Average memory copy engine utilization per host (DCGM_FI_DEV_MEM_COPY_UTIL).", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 85 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "avg by (Hostname) (DCGM_FI_DEV_MEM_COPY_UTIL)", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 15, + "spanNulls": true + } + } + } + }, + { + "title": "XID errors (1h increase)", + "description": "XID errors increased over the last hour per GPU. Spikes indicate driver or hardware faults.", + "type": "timeseries", + "gridPos": { "h": 6, "w": 24, "x": 0, "y": 93 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "sum by (Hostname, gpu) (increase(DCGM_FI_DEV_XID_ERRORS[1h]))", + "legendFormat": "{{Hostname}} GPU {{gpu}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "decimals": 0, + "custom": { + "drawStyle": "bars", + "lineWidth": 1, + "fillOpacity": 50, + "spanNulls": true + } + } + } + }, + { + "title": "SM active by node", + "description": "DCGM profiling: streaming multiprocessor activity (DCGM_FI_PROF_SM_ACTIVE). 
No data if DCGM profiling is not enabled on the exporter.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 99 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "avg by (Hostname) (DCGM_FI_PROF_SM_ACTIVE)", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 15, + "spanNulls": true + } + } + } + }, + { + "title": "Tensor pipe active by node", + "description": "DCGM profiling: tensor pipe activity (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE). No data if DCGM profiling is not enabled.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 99 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "avg by (Hostname) (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE)", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 15, + "spanNulls": true + } + } + } + }, + { + "title": "DRAM active by node", + "description": "DCGM profiling: DRAM activity (DCGM_FI_PROF_DRAM_ACTIVE). 
No data if DCGM profiling is not enabled.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 99 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "avg by (Hostname) (DCGM_FI_PROF_DRAM_ACTIVE)", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 15, + "spanNulls": true + } + } + } } ], "schemaVersion": 39, @@ -460,5 +757,5 @@ "timezone": "browser", "title": "Waldorf GPU Usage & Idle Tracker", "uid": "prometheus", - "version": 1 + "version": 2 } diff --git a/helm/grafana-dashboard-configmap.yaml b/helm/grafana-dashboard-configmap.yaml new file mode 100644 index 0000000..3fe4be0 --- /dev/null +++ b/helm/grafana-dashboard-configmap.yaml @@ -0,0 +1,773 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-gpu-dashboard + namespace: monitoring + labels: + grafana_dashboard: "1" + app: grafana + annotations: + description: "GPU Pruner dashboard showing GPU utilization, idle workloads, and resource allocation" +data: + gpu-dashboard.json: | + { + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "title": "Cluster GPU Overview", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "collapsed": false + }, + { + "title": "Total GPUs", + "description": "All GPUs reporting DCGM metrics. 
Baseline for the two partitions below (engine activity and VRAM).", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "count(DCGM_FI_DEV_GPU_UTIL)", + "legendFormat": "Total" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "fixed", "fixedColor": "blue" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] } + } + }, + "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" } + }, + { + "title": "VRAM allocated (FB>0)", + "description": "GPUs with framebuffer memory in use (CUDA context / VRAM). Complements VRAM free (FB=0); the pair sums to Total. Not the same as compute-active.", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "count(DCGM_FI_DEV_FB_USED > 0)", + "legendFormat": "FB>0" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "fixed", "fixedColor": "green" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } + } + }, + "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" } + }, + { + "title": "VRAM free (FB=0)", + "description": "GPUs with no framebuffer in use. PromQL: count(DCGM_FI_DEV_FB_USED == 0). 
Equals Total − VRAM allocated (FB>0).", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "count(DCGM_FI_DEV_FB_USED == 0)", + "legendFormat": "FB=0" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "fixed", "fixedColor": "semi-dark-green" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "semi-dark-green", "value": null }] } + } + }, + "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" } + }, + { + "title": "Engine idle (30m)", + "description": "No graphics/compute engine activity for the full 30m window. PromQL: count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m]) == 0). Complements Engine active (30m); the pair sums to Total.", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m]) == 0)", + "legendFormat": "Idle" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "fixed", "fixedColor": "red" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }] } + } + }, + "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" } + }, + { + "title": "Engine active (30m)", + "description": "Had graphics/compute engine activity at least once in 30m. PromQL: count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m]) > 0). 
Equals Total − Engine idle (30m).", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m]) > 0)", + "legendFormat": "Active" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "fixed", "fixedColor": "orange" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "orange", "value": null }] } + } + }, + "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" } + }, + { + "title": "GPU Memory per GPU (H200)", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "max(DCGM_FI_DEV_FB_TOTAL) / 1024", + "legendFormat": "GiB" + } + ], + "fieldConfig": { + "defaults": { + "unit": "decgbytes", + "color": { "mode": "fixed", "fixedColor": "purple" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "purple", "value": null }] } + } + }, + "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" } + }, + { + "title": "GPU Utilization Heatmap by Node", + "type": "timeseries", + "gridPos": { "h": 6, "w": 16, "x": 0, "y": 5 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "avg by (Hostname) (DCGM_FI_DEV_GPU_UTIL)", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 15, + "spanNulls": true + } + } + } + }, + { + "title": "Running GPU Workloads", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }, + "collapsed": false + }, + { + "title": "Running GPU Workloads", + "description": "All pods with GPU requests, grouped by namespace (user)", + "type": "table", + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 12 }, + 
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "sum by (namespace, pod) (kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\", namespace!~\"cw-.*\"})", + "legendFormat": "", + "format": "table", + "instant": true + } + ], + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true }, + "renameByName": { + "namespace": "Namespace (User)", + "pod": "Pod", + "Value": "GPUs Requested" + }, + "indexByName": { "namespace": 0, "pod": 1, "Value": 2 } + } + } + ], + "fieldConfig": { + "defaults": {}, + "overrides": [ + { + "matcher": { "id": "byName", "options": "GPUs Requested" }, + "properties": [ + { "id": "custom.width", "value": 130 }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 4 }, + { "color": "red", "value": 8 } + ] + } + }, + { "id": "custom.displayMode", "value": "color-background" } + ] + } + ] + } + }, + { + "title": "Idle GPU Workloads", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }, + "collapsed": false + }, + { + "title": "Idle GPU Workloads (Zero Compute for 30m+)", + "description": "GPUs where max compute engine activity was 0 for the entire 30-minute window. 
These are candidates for pruning.", + "type": "table", + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 23 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "max by (Hostname, gpu, modelName) (max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m])) == 0", + "legendFormat": "", + "format": "table", + "instant": true + } + ], + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "Value": true }, + "renameByName": { + "Hostname": "Node", + "gpu": "GPU #", + "modelName": "GPU Model" + }, + "indexByName": { "Hostname": 0, "gpu": 1, "modelName": 2 } + } + } + ], + "fieldConfig": { + "defaults": { + "custom": { "displayMode": "color-text" }, + "color": { "mode": "fixed", "fixedColor": "red" } + } + } + }, + { + "title": "Idle GPU Time by Deployment - Historical Timeline", + "description": "Historical timeline showing idle GPU-hours by deployment over time. Each line represents a deployment's idle GPU allocation trend.", + "type": "timeseries", + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 33 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "(label_replace(sum by (label_app, namespace) (count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\") ) * on (pod, namespace) group_left(label_app) kube_pod_labels{label_app != \"\"}), \"deployment\", \"$1\", \"label_app\", \"(.+)\") or sum by (deployment, namespace) (label_replace(count by (pod, namespace) 
(label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\")), \"deployment\", \"$1\", \"pod\", \"^(.*)-[^-]+-[^-]+$\"))) * 0.5", + "legendFormat": "{{deployment}} ({{namespace}})", + "interval": "1m" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10, + "gradientMode": "opacity", + "spanNulls": true, + "showPoints": "auto", + "pointSize": 5, + "stacking": { "mode": "none" }, + "axisPlacement": "auto", + "axisLabel": "Idle GPU-Hours", + "scaleDistribution": { "type": "linear" } + }, + "color": { "mode": "palette-classic" }, + "decimals": 1, + "unit": "none", + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 2 }, + { "color": "orange", "value": 5 }, + { "color": "red", "value": 10 } + ] + } + }, + "overrides": [] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { + "calcs": ["last", "max", "mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + } + } + }, + { + "title": "Idle GPU Time by Deployment (30m)", + "description": "Deployments producing the most allocated GPU idle time at 0% utilization. Sorted by total idle GPU-hours (GPU count × 30min window). 
Uses kube_pod_labels when available; otherwise derives deployment name from pod name.", + "type": "table", + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 43 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "sort_desc((label_replace(sum by (label_app, namespace) (count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\") ) * on (pod, namespace) group_left(label_app) kube_pod_labels{label_app != \"\"}), \"deployment\", \"$1\", \"label_app\", \"(.+)\") or sum by (deployment, namespace) (label_replace(count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\")), \"deployment\", \"$1\", \"pod\", \"^(.*)-[^-]+-[^-]+$\"))) * 0.5)", + "legendFormat": "", + "format": "table", + "instant": true + } + ], + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true }, + "renameByName": { + "deployment": "Deployment", + "namespace": "Namespace", + "Value": "Idle GPU-Hours" + }, + "indexByName": { "Value": 2, "deployment": 0, "namespace": 1 } + } + 
} + ], + "fieldConfig": { + "defaults": { + "custom": { "displayMode": "color-background-solid" }, + "decimals": 1, + "unit": "none" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Idle GPU-Hours" }, + "properties": [ + { "id": "custom.width", "value": 150 }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 2 }, + { "color": "orange", "value": 5 }, + { "color": "red", "value": 10 } + ] + } + }, + { "id": "custom.displayMode", "value": "color-background" } + ] + } + ] + } + }, + { + "title": "Leaderboard", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 53 }, + "collapsed": false + }, + { + "title": "GPU Allocation Leaderboard (by Namespace)", + "description": "Total GPU requests per namespace, sorted descending. Namespace = user ({user}-dev pattern).", + "type": "barchart", + "gridPos": { "h": 10, "w": 12, "x": 0, "y": 54 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "sort_desc(sum by (namespace) (kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\", namespace!~\"cw-.*\"}))", + "legendFormat": "{{namespace}}", + "instant": true + } + ], + "options": { + "orientation": "horizontal", + "showValue": "always", + "barWidth": 0.7 + }, + "fieldConfig": { + "defaults": { + "unit": "none", + "color": { "mode": "palette-classic" }, + "displayName": "${__field.labels.namespace}" + } + } + }, + { + "title": "GPU Memory Allocation Leaderboard (GiB)", + "description": "Estimated GPU memory allocated per namespace. 
Calculated as GPUs requested × 140 GiB (H200 FB total).", + "type": "barchart", + "gridPos": { "h": 10, "w": 12, "x": 12, "y": 54 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "sort_desc(sum by (namespace) (kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\", namespace!~\"cw-.*\"}) * 140)", + "legendFormat": "{{namespace}}", + "instant": true + } + ], + "options": { + "orientation": "horizontal", + "showValue": "always", + "barWidth": 0.7 + }, + "fieldConfig": { + "defaults": { + "unit": "decgbytes", + "color": { "mode": "palette-classic" }, + "displayName": "${__field.labels.namespace}" + } + } + }, + { + "title": "GPU Memory Usage Over Time (by Node)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 64 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "sum by (Hostname) (DCGM_FI_DEV_FB_USED) / 1024", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "decgbytes", + "min": 0, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 20, + "spanNulls": true, + "stacking": { "mode": "normal" } + } + } + } + }, + { + "title": "GPU Health & DCGM", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 72 }, + "collapsed": false + }, + { + "title": "Peak GPU temperature", + "description": "Highest GPU die temperature across the cluster (DCGM_FI_DEV_GPU_TEMP).", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 73 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "max(DCGM_FI_DEV_GPU_TEMP)", + "legendFormat": "Max °C" + } + ], + "fieldConfig": { + "defaults": { + "unit": "celsius", + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 80 }, + { "color": "red", "value": 85 } + ] + } + } + }, + "options": { 
"colorMode": "background", "graphMode": "none", "textMode": "value" } + }, + { + "title": "Peak power (W)", + "description": "Highest per-GPU power draw in watts (DCGM_FI_DEV_POWER_USAGE). Useful for rack capacity and corroborating idle detection.", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 73 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "max(DCGM_FI_DEV_POWER_USAGE)", + "legendFormat": "Max W" + } + ], + "fieldConfig": { + "defaults": { + "unit": "watt", + "color": { "mode": "fixed", "fixedColor": "orange" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "orange", "value": null }] } + } + }, + "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" } + }, + { + "title": "XID errors (total)", + "description": "Sum of NVIDIA XID driver/hardware error counter (DCGM_FI_DEV_XID_ERRORS). Non-zero warrants investigation.", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 73 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_XID_ERRORS)", + "legendFormat": "XID" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "decimals": 0, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + } + } + }, + "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" } + }, + { + "title": "GPU temperature by node", + "description": "Average GPU temperature per host (DCGM_FI_DEV_GPU_TEMP).", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 77 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "avg by (Hostname) (DCGM_FI_DEV_GPU_TEMP)", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "celsius", + "min": 0, + "custom": { + "drawStyle": "line", + 
"lineWidth": 2, + "fillOpacity": 15, + "spanNulls": true + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 80 }, + { "color": "red", "value": 85 } + ] + } + } + } + }, + { + "title": "Power draw by node", + "description": "Sum of per-GPU power draw per host in watts (DCGM_FI_DEV_POWER_USAGE).", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 77 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "sum by (Hostname) (DCGM_FI_DEV_POWER_USAGE)", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "watt", + "min": 0, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 15, + "spanNulls": true + } + } + } + }, + { + "title": "VRAM utilization %", + "description": "Framebuffer used as a percentage of total VRAM per GPU (DCGM_FI_DEV_FB_USED / DCGM_FI_DEV_FB_TOTAL).", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 85 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "100 * avg by (Hostname, gpu) (DCGM_FI_DEV_FB_USED / DCGM_FI_DEV_FB_TOTAL)", + "legendFormat": "{{Hostname}} GPU {{gpu}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 10, + "spanNulls": true + } + } + } + }, + { + "title": "Memory copy utilization", + "description": "Average memory copy engine utilization per host (DCGM_FI_DEV_MEM_COPY_UTIL).", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 85 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "avg by (Hostname) (DCGM_FI_DEV_MEM_COPY_UTIL)", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "custom": { + "drawStyle": "line", 
+ "lineWidth": 2, + "fillOpacity": 15, + "spanNulls": true + } + } + } + }, + { + "title": "XID errors (1h increase)", + "description": "XID errors increased over the last hour per GPU. Spikes indicate driver or hardware faults.", + "type": "timeseries", + "gridPos": { "h": 6, "w": 24, "x": 0, "y": 93 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "sum by (Hostname, gpu) (increase(DCGM_FI_DEV_XID_ERRORS[1h]))", + "legendFormat": "{{Hostname}} GPU {{gpu}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "decimals": 0, + "custom": { + "drawStyle": "bars", + "lineWidth": 1, + "fillOpacity": 50, + "spanNulls": true + } + } + } + }, + { + "title": "SM active by node", + "description": "DCGM profiling: streaming multiprocessor activity (DCGM_FI_PROF_SM_ACTIVE). No data if DCGM profiling is not enabled on the exporter.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 99 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "avg by (Hostname) (DCGM_FI_PROF_SM_ACTIVE)", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 15, + "spanNulls": true + } + } + } + }, + { + "title": "Tensor pipe active by node", + "description": "DCGM profiling: tensor pipe activity (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE). 
No data if DCGM profiling is not enabled.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 99 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "avg by (Hostname) (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE)", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 15, + "spanNulls": true + } + } + } + }, + { + "title": "DRAM active by node", + "description": "DCGM profiling: DRAM activity (DCGM_FI_PROF_DRAM_ACTIVE). No data if DCGM profiling is not enabled.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 99 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "avg by (Hostname) (DCGM_FI_PROF_DRAM_ACTIVE)", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 15, + "spanNulls": true + } + } + } + } + ], + "schemaVersion": 39, + "tags": ["gpu", "waldorf", "llm-d"], + "templating": { "list": [] }, + "time": { "from": "now-6h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Waldorf GPU Usage & Idle Tracker", + "uid": "prometheus", + "version": 2 + } From b175ece348a750ae59f4b19fa8cc035d338e0e06 Mon Sep 17 00:00:00 2001 From: fuddin-bit Date: Thu, 4 Jun 2026 11:54:12 -0400 Subject: [PATCH 2/7] Add Grafana Helm deployment and ingress access docs. Include values for OpenShift and vanilla Kubernetes, dashboard import script, CoreWeave ingress guide, and README Helm quick start. 
Co-authored-by: Cursor --- COREWEAVE_INGRESS_GUIDE.md | 345 +++++++++++++++++++++++++++ GRAFANA_ACCESS.md | 294 +++++++++++++++++++++++ README.md | 19 ++ helm/QUICKSTART.md | 232 ++++++++++++++++++ helm/README.md | 272 +++++++++++++++++++++ helm/grafana-values-openshift.yaml | 151 ++++++++++++ helm/grafana-values-vanilla-k8s.yaml | 172 +++++++++++++ helm/grafana-values.yaml | 234 ++++++++++++++++++ import-dashboard.sh | 48 ++++ 9 files changed, 1767 insertions(+) create mode 100644 COREWEAVE_INGRESS_GUIDE.md create mode 100644 GRAFANA_ACCESS.md create mode 100644 helm/QUICKSTART.md create mode 100644 helm/README.md create mode 100644 helm/grafana-values-openshift.yaml create mode 100644 helm/grafana-values-vanilla-k8s.yaml create mode 100644 helm/grafana-values.yaml create mode 100755 import-dashboard.sh diff --git a/COREWEAVE_INGRESS_GUIDE.md b/COREWEAVE_INGRESS_GUIDE.md new file mode 100644 index 0000000..86f111d --- /dev/null +++ b/COREWEAVE_INGRESS_GUIDE.md @@ -0,0 +1,345 @@ +# CoreWeave Kubernetes (CKS) Ingress Guide + +This guide explains how to expose services for external access in CoreWeave Kubernetes Service (CKS). + +## Overview + +CoreWeave uses a **LoadBalancer + DNS annotation** pattern rather than traditional Ingress controllers. The cluster has **Istio** installed but standard users don't have permissions to create Gateway API resources or VirtualServices. + +## Available Methods + +### Method 1: LoadBalancer Service with DNS (Recommended for CKS) + +CoreWeave provides an **External Hostname Controller** that automatically creates DNS records for LoadBalancer services. + +#### How It Works + +1. Create a LoadBalancer service +2. Add the `service.beta.kubernetes.io/external-hostname` annotation +3. CoreWeave assigns a public IP and creates a DNS record in `.coreweave.app` domain +4. 
DNS status is reflected in the `.status.conditions` field of the Service + +#### Example: Expose Grafana with LoadBalancer + +**IMPORTANT**: You must add the `service.beta.kubernetes.io/coreweave-load-balancer-type: public` annotation to get a **public IP**. Without this annotation, CoreWeave assigns an internal VIP only. + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: gpu-grafana + namespace: fuddin-dev + annotations: + service.beta.kubernetes.io/coreweave-load-balancer-type: "public" # REQUIRED for public IP + service.beta.kubernetes.io/external-hostname: "gpu-grafana" + # This creates: gpu-grafana.<org-id>-<cluster>.coreweave.app +spec: + type: LoadBalancer + selector: + app.kubernetes.io/name: grafana + app.kubernetes.io/instance: gpu-grafana + ports: + - name: http + port: 80 + targetPort: 3000 + protocol: TCP +``` + +Apply and check the assigned hostname: + +```bash +kubectl apply -f grafana-loadbalancer.yaml + +# Wait for external IP assignment +kubectl get svc gpu-grafana -n fuddin-dev -w + +# Check the assigned DNS name in status +kubectl get svc gpu-grafana -n fuddin-dev -o jsonpath='{.status.conditions[?(@.type=="ExternalRecords")].message}' +``` + +The service will be accessible at: `http://gpu-grafana.<org-id>-<cluster>.coreweave.app` + +#### Wildcard DNS + +For wildcard DNS records (e.g., for multiple subdomains): + +```yaml +metadata: + annotations: + service.beta.kubernetes.io/external-hostname: "*" + # Creates: *.abc123-mycluster.coreweave.app +``` + +### Method 2: Port-Forward (Development/Testing) + +For temporary access without exposing services publicly: + +```bash +# Forward local port 3000 to Grafana service +kubectl port-forward -n fuddin-dev svc/gpu-grafana 3000:80 + +# Access at http://localhost:3000 +``` + +**Pros**: +- No cluster configuration needed +- Works immediately +- No public exposure + +**Cons**: +- Only accessible from your machine +- Connection breaks when command terminates +- Not suitable for production + +### Method 3: Istio VirtualService (Requires
Permissions) + +CoreWeave has **Istio** installed, but standard users don't have permissions to create VirtualServices or Gateways. This method requires cluster admin assistance. + +If you have permissions, you would create: + +```yaml +apiVersion: networking.istio.io/v1 +kind: VirtualService +metadata: + name: grafana-vs + namespace: fuddin-dev +spec: + hosts: + - "grafana.example.com" + gateways: + - istio-system/public-gateway # Shared cluster gateway + http: + - match: + - uri: + prefix: / + route: + - destination: + host: gpu-grafana.fuddin-dev.svc.cluster.local + port: + number: 80 +``` + +**Note**: This requires a shared Gateway to exist and permissions to create VirtualServices. + +## Comparison of Methods + +| Method | Access | Setup Complexity | Cost | Use Case | +|--------|--------|------------------|------|----------| +| **LoadBalancer + DNS** | Public internet | Low | Charges for public IP | Production, public dashboards | +| **Port-Forward** | Local only | Very low | Free | Development, debugging | +| **Istio VirtualService** | Shared gateway | Medium | Shared cost | Multi-service routing, advanced traffic control | + +## Recommended Approach for Grafana + +### Option A: LoadBalancer (Public Access) + +Best for production Grafana instance that multiple team members need to access. 
+ +```bash +# Update Grafana service to LoadBalancer +kubectl patch svc gpu-grafana -n fuddin-dev -p '{"spec":{"type":"LoadBalancer"}}' + +# Add REQUIRED annotation for public IP +kubectl annotate svc gpu-grafana -n fuddin-dev \ + service.beta.kubernetes.io/coreweave-load-balancer-type="public" + +# Add DNS annotation +kubectl annotate svc gpu-grafana -n fuddin-dev \ + service.beta.kubernetes.io/external-hostname="gpu-grafana" + +# Wait for external IP +kubectl get svc gpu-grafana -n fuddin-dev -w + +# Get the public IP +kubectl get svc gpu-grafana -n fuddin-dev -o jsonpath='{.status.loadBalancer.ingress[0].ip}' +``` + +### Option B: Port-Forward (Personal Access) + +Best for personal dashboards or development: + +```bash +# Add to your shell profile for automatic port-forward +alias grafana-forward='kubectl port-forward -n fuddin-dev svc/gpu-grafana 3000:80' + +# Run whenever you need access +grafana-forward +``` + +## Current Grafana Setup + +Your Grafana is currently deployed with: + +- **Service Type**: ClusterIP (internal only) +- **Namespace**: `fuddin-dev` +- **Port**: 80 (service) → 3000 (pod) +- **Access Method**: Port-forward only + +### Convert to LoadBalancer + +```bash +# Method 1: kubectl patch +kubectl patch svc gpu-grafana -n fuddin-dev -p '{"spec":{"type":"LoadBalancer"}}' +kubectl annotate svc gpu-grafana -n fuddin-dev \ + service.beta.kubernetes.io/external-hostname="gpu-grafana-fuddin" + +# Method 2: Helm upgrade +helm upgrade gpu-grafana grafana/grafana \ + --reuse-values \ + --set service.type=LoadBalancer \ + --set service.annotations."service\.beta\.kubernetes\.io/external-hostname"="gpu-grafana-fuddin" \ + -n fuddin-dev +``` + +## Cluster Architecture + +CoreWeave Kubernetes (CKS) uses: + +- **Istio** for service mesh (installed at cluster level) +- **Gateway API** (available but restricted permissions) +- **External Hostname Controller** for automatic DNS provisioning +- **LoadBalancer** services get public IPs automatically + +### 
Installed Components + +```bash +# Istio control plane +kubectl get svc -n istio-system istiod +# NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) +# istiod ClusterIP 10.16.0.170 <none> 15010/TCP,15012/TCP,443/TCP,15014/TCP + +# Gateway API CRDs available +kubectl api-resources | grep gateway +# httproutes +# gateways.gateway.networking.k8s.io +# virtualservices (Istio) +``` + +### Permissions + +Standard users in CKS can: +- ✅ Create/modify Services in their namespace +- ✅ Use LoadBalancer service type +- ✅ Add DNS annotations +- ❌ Create Gateway resources +- ❌ Create HTTPRoute resources +- ❌ Create VirtualService resources (Istio) +- ❌ List cluster-wide resources + +## Troubleshooting + +### LoadBalancer stuck in "Pending" + +```bash +kubectl describe svc gpu-grafana -n fuddin-dev + +# Check events for errors +kubectl get events -n fuddin-dev --sort-by='.lastTimestamp' | grep gpu-grafana +``` + +Common causes: +- Quota limits on public IPs +- Invalid annotation format +- Namespace resource limits + +### DNS not resolving + +```bash +# Check service status +kubectl get svc gpu-grafana -n fuddin-dev -o yaml + +# Look for ExternalRecords condition +kubectl get svc gpu-grafana -n fuddin-dev -o jsonpath='{.status.conditions[?(@.type=="ExternalRecords")]}' +``` + +The DNS record creation may take 1-2 minutes after the external IP is assigned. + +### Port-forward connection refused + +```bash +# Check if pod is running +kubectl get pods -n fuddin-dev -l app.kubernetes.io/name=grafana + +# Check pod logs +kubectl logs -n fuddin-dev -l app.kubernetes.io/name=grafana --tail=50 + +# Test service internally +kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \ + curl http://gpu-grafana.fuddin-dev.svc.cluster.local +``` + +## Cost Considerations + +- **Public IPs**: CoreWeave charges for LoadBalancer public IPs +- **Bandwidth**: Egress traffic may have costs +- **Port-Forward**: No additional cost (uses cluster credentials) + +For cost-effective access: +1.
Use port-forward for personal/development access +2. Use LoadBalancer only for production services that need public access +3. Share one LoadBalancer across multiple services using path-based routing (requires Istio VirtualService with permissions) + +## Security Best Practices + +### For LoadBalancer Services + +1. **Enable authentication** in Grafana (already configured with admin password) +2. **Use HTTPS**: Add TLS certificate +3. **Restrict source IPs**: Use `loadBalancerSourceRanges` +4. **Monitor access logs**: Enable Grafana audit logging +5. **Use NetworkPolicies**: Restrict pod-to-pod communication + +```yaml +spec: + type: LoadBalancer + loadBalancerSourceRanges: + - "1.2.3.4/32" # Your office IP + - "5.6.7.8/24" # Your VPN range +``` + +### For Port-Forward + +- ✅ Automatically secured by Kubernetes RBAC +- ✅ Requires valid cluster credentials +- ✅ No public exposure +- ⚠️ Ensure your local machine is secured + +## Next Steps + +1. **Decide on access method**: + - Public access → Use LoadBalancer with DNS + - Personal access → Use port-forward + +2. **If using LoadBalancer**: + ```bash + kubectl patch svc gpu-grafana -n fuddin-dev -p '{"spec":{"type":"LoadBalancer"}}' + kubectl annotate svc gpu-grafana -n fuddin-dev \ + service.beta.kubernetes.io/external-hostname="gpu-grafana-fuddin" + ``` + +3. **Monitor the service**: + ```bash + kubectl get svc gpu-grafana -n fuddin-dev -w + ``` + +4. 
**Access Grafana**: + - LoadBalancer: Wait for DNS record, then access via `http://EXTERNAL_HOSTNAME.CLUSTER_ID.coreweave.app` (the `external-hostname` annotation value followed by your cluster identifier) + - Port-forward: `kubectl port-forward -n fuddin-dev svc/gpu-grafana 3000:80` + +## References + +- [Create a Public DNS Name | CoreWeave](https://docs.coreweave.com/docs/products/networking/how-to/expose-service-dns) +- [Introduction to CoreWeave Kubernetes Service | CoreWeave](https://docs.coreweave.com/docs/products/cks) +- [Kubernetes Ingress Documentation](https://kubernetes.io/docs/concepts/services-networking/ingress/) +- [Exposing Applications for External Access | Kube by Example](https://kubebyexample.com/learning-paths/application-development-kubernetes/lesson-3-networking-kubernetes/exposing-0) + +## Summary + +**CoreWeave uses LoadBalancer services with DNS annotations, not traditional Ingress controllers.** + +For your Grafana deployment: +- **Quick access**: `kubectl port-forward -n fuddin-dev svc/gpu-grafana 3000:80` +- **Public access**: Convert service to LoadBalancer with DNS annotation +- **Advanced routing**: Request VirtualService permissions from cluster admin + +The simplest production-ready approach is to use LoadBalancer with the `service.beta.kubernetes.io/external-hostname` annotation. diff --git a/GRAFANA_ACCESS.md b/GRAFANA_ACCESS.md new file mode 100644 index 0000000..866136d --- /dev/null +++ b/GRAFANA_ACCESS.md @@ -0,0 +1,294 @@ +# Grafana GPU Dashboard - Access Information + +## ✅ Deployment Status: LIVE + +Your Grafana instance with the GPU Pruner dashboard is now deployed and publicly accessible! + +## 🌐 Access Details + +**Public URL (DNS - may take 5-10 minutes to update)**: http://gpu-grafana-fuddin.6787d4-waldorf.coreweave.app + +**Direct IP Access (Works immediately)**: http://166.19.16.227 + +**Credentials**: +- **Username**: `admin` +- **Password**: `GpuPruner2026!` + +**External IP**: `166.19.16.227` (CoreWeave Public LoadBalancer) + +## 📊 Dashboard Import + +The GPU dashboard is **not yet imported**.
After logging in, you need to import it: + +### Option 1: Automated Import Script + +```bash +./import-dashboard.sh http://gpu-grafana-fuddin.6787d4-waldorf.coreweave.app GpuPruner2026! +``` + +### Option 2: Manual Import via UI + +1. Access http://gpu-grafana-fuddin.6787d4-waldorf.coreweave.app +2. Login with `admin` / `GpuPruner2026!` +3. Navigate to **Dashboards** → **Import** → **Upload JSON file** +4. Select `gpu-dashboard.json` from this repository +5. Choose **Prometheus** datasource (UID: `PBFA97CFB590B2093`) +6. Click **Import** + +### Option 3: Import via API + +```bash +GRAFANA_URL="http://gpu-grafana-fuddin.6787d4-waldorf.coreweave.app" +ADMIN_PASSWORD="GpuPruner2026!" + +DASHBOARD_JSON=$(cat gpu-dashboard.json | jq '{dashboard: ., overwrite: true, folderId: 0}') + +curl -X POST \ + -H "Content-Type: application/json" \ + -u "admin:$ADMIN_PASSWORD" \ + -d "$DASHBOARD_JSON" \ + "$GRAFANA_URL/api/dashboards/db" +``` + +## 🔍 Verify Datasource + +After logging in, verify the Prometheus datasource is working: + +1. Go to **Configuration** → **Data Sources** → **Prometheus** +2. Click **Save & Test** +3. 
Should show: "Data source is working" ✓ + +If the datasource test fails: +- Check the Prometheus URL: `http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090` +- Verify Prometheus is accessible from the `fuddin-dev` namespace + +## 📝 Service Configuration + +**Namespace**: `fuddin-dev` +**Service Type**: `LoadBalancer` +**Service Name**: `gpu-grafana` +**Internal Port**: `80` → Pod Port `3000` +**NodePort**: `30265` + +### DNS Configuration + +**Annotation**: `service.beta.kubernetes.io/external-hostname: "gpu-grafana-fuddin"` +**Auto-generated FQDN**: `gpu-grafana-fuddin.6787d4-waldorf.coreweave.app` + +CoreWeave's External Hostname Controller automatically: +- Created the DNS record in the `.coreweave.app` domain +- Appended the cluster identifier `6787d4-waldorf` to prevent conflicts +- Set the DNS status in the Service `.status.conditions` field + +## 🔧 Management Commands + +### Check Service Status + +```bash +kubectl get svc gpu-grafana -n fuddin-dev +``` + +### View Service Details + +```bash +kubectl describe svc gpu-grafana -n fuddin-dev +``` + +### Check DNS Status + +```bash +kubectl get svc gpu-grafana -n fuddin-dev -o jsonpath='{.status.conditions[?(@.type=="ExternalRecords")]}' | jq . +``` + +### Check Grafana Pods + +```bash +kubectl get pods -n fuddin-dev -l app.kubernetes.io/name=grafana +``` + +### View Grafana Logs + +```bash +kubectl logs -n fuddin-dev -l app.kubernetes.io/name=grafana --tail=100 -f +``` + +### Restart Grafana + +```bash +kubectl rollout restart deployment gpu-grafana -n fuddin-dev +``` + +## ⚠️ Important Notes + +### Persistence + +**WARNING**: Persistence is currently **DISABLED**. 
This means: +- Dashboard customizations will be **lost** if the pod restarts +- Datasource changes will be **lost** if the pod restarts +- User accounts (other than admin) will be **lost** if the pod restarts + +To enable persistence: + +```bash +helm upgrade gpu-grafana grafana/grafana \ + --reuse-values \ + --set persistence.enabled=true \ + --set persistence.size=10Gi \ + -n fuddin-dev +``` + +### Security + +- ✅ Authentication is enabled (admin password required) +- ⚠️ HTTP only (no HTTPS/TLS) +- ⚠️ No IP restrictions (publicly accessible) +- ⚠️ Default admin password (should be changed for production) + +### Recommended Security Improvements + +1. **Change admin password**: + ```bash + # Login to Grafana UI + # Profile → Change Password + ``` + +2. **Enable HTTPS** (requires TLS certificate): + ```bash + # Add TLS certificate to cluster + kubectl create secret tls grafana-tls \ + --cert=grafana.crt \ + --key=grafana.key \ + -n fuddin-dev + + # Update service annotation + kubectl annotate svc gpu-grafana -n fuddin-dev \ + service.beta.kubernetes.io/external-hostname-tls="grafana-tls" + ``` + +3. **Restrict source IPs** (optional): + ```bash + kubectl patch svc gpu-grafana -n fuddin-dev -p '{ + "spec": { + "loadBalancerSourceRanges": ["YOUR.IP.ADDRESS/32"] + } + }' + ``` + +4. 
**Enable persistence** (as shown above) + +## 📊 Expected Dashboard Features + +Once imported, the GPU dashboard will show: + +- **Cluster GPU Overview** + - Total GPUs + - VRAM allocation (FB>0 vs FB=0) + - Engine activity (idle 30m vs active 30m) + - GPU memory per GPU + +- **GPU Utilization Heatmap** + - Per-node GPU utilization over time + +- **Running GPU Workloads** + - All pods with GPU requests + - Grouped by namespace + +- **Idle GPU Workloads** + - GPUs with zero compute activity for 30+ minutes + - Identifies wasted resources + +- **Idle GPU Time by Deployment** + - Historical analysis of which deployments waste the most GPU time + - Requires `kube_pod_labels` metric from kube-state-metrics + +- **GPU Allocation Leaderboard** + - Total GPU requests per namespace + +## 🐛 Troubleshooting + +### Cannot access the URL + +**Check DNS propagation**: +```bash +nslookup gpu-grafana-fuddin.6787d4-waldorf.coreweave.app +``` + +Should return the service's external IP: `166.19.16.227` + +**Check from your browser**: +- Try: http://gpu-grafana-fuddin.6787d4-waldorf.coreweave.app +- If DNS fails, try the external IP directly: http://166.19.16.227 + +### Dashboard shows "No Data" + +1. **Verify Prometheus datasource**: + - Configuration → Data Sources → Prometheus → Save & Test + +2. **Check Prometheus is accessible**: + ```bash + kubectl run curl-test --image=curlimages/curl:latest --rm -i --restart=Never -n fuddin-dev -- \ + curl -s 'http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090/api/v1/query?query=up' + ``` + +3. **Verify DCGM metrics exist**: + ```bash + # Port-forward to Prometheus + kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 & + + # Query DCGM metrics + curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL' | jq '.data.result | length' + ``` + +### "Idle GPU Time by Deployment" panel empty + +This panel requires `kube_pod_labels` metric.
Verify kube-state-metrics is configured with: + +```bash +kubectl get deploy kube-state-metrics -A -o yaml | grep metric-labels-allowlist +``` + +Should show: `--metric-labels-allowlist=pods=[*]` + +### Grafana pod not running + +```bash +# Check pod status +kubectl get pods -n fuddin-dev -l app.kubernetes.io/name=grafana + +# Check logs +kubectl logs -n fuddin-dev -l app.kubernetes.io/name=grafana --tail=50 + +# Describe pod for events +kubectl describe pod -n fuddin-dev -l app.kubernetes.io/name=grafana +``` + +## 📚 Additional Resources + +- **Main Documentation**: [GRAFANA_DEPLOYMENT.md](GRAFANA_DEPLOYMENT.md) +- **Dashboard Features**: [DASHBOARD.md](DASHBOARD.md) +- **Helm Configuration**: [helm/README.md](helm/README.md) +- **CoreWeave Ingress**: [COREWEAVE_INGRESS_GUIDE.md](COREWEAVE_INGRESS_GUIDE.md) +- **Import Script**: [import-dashboard.sh](import-dashboard.sh) + +## 🎯 Next Steps + +1. ✅ **Access Grafana**: http://gpu-grafana-fuddin.6787d4-waldorf.coreweave.app +2. ✅ **Login**: `admin` / `GpuPruner2026!` +3. ⏳ **Import Dashboard**: Use the import script or manual UI import +4. ⏳ **Verify Datasource**: Configuration → Data Sources → Prometheus → Save & Test +5. ⏳ **Enable Persistence**: To prevent data loss on pod restart +6. 
⏳ **Change Password**: For production security + +## 📞 Support + +For issues or questions: +- **GitHub Issues**: https://github.com/wseaton/gpu-pruner/issues +- **CoreWeave Docs**: https://docs.coreweave.com/ +- **Deployment Guide**: [GRAFANA_DEPLOYMENT.md](GRAFANA_DEPLOYMENT.md) + +--- + +**Deployment Date**: 2026-06-04 +**Deployed By**: fuddin@redhat.com +**Cluster**: coreweave-waldorf (6787d4) +**Namespace**: fuddin-dev diff --git a/README.md b/README.md index bc1b770..e5d9018 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,25 @@ Import `gpu-dashboard.json` into Grafana for advanced analytics and visualizatio See [DASHBOARD.md](DASHBOARD.md) for import instructions and [IDLE_GPU_QUERY.md](IDLE_GPU_QUERY.md) for querying idle GPU time by deployment. +#### Deploy Grafana with Helm + +For a complete standalone Grafana deployment with the GPU dashboard pre-configured: + +```bash +# Add Grafana Helm repository +helm repo add grafana https://grafana.github.io/helm-charts +helm repo update + +# Install Grafana with GPU dashboard +helm install gpu-grafana grafana/grafana \ + -f helm/grafana-values.yaml \ + --set adminPassword='YOUR_SECURE_PASSWORD' \ + --set datasources."datasources\.yaml".datasources[0].url='http://prometheus-k8s.monitoring.svc.cluster.local:9090' \ + -n monitoring --create-namespace +``` + +See [GRAFANA_DEPLOYMENT.md](GRAFANA_DEPLOYMENT.md) for complete deployment instructions, configuration options, and troubleshooting. + ## usage ```sh diff --git a/helm/QUICKSTART.md b/helm/QUICKSTART.md new file mode 100644 index 0000000..26afad8 --- /dev/null +++ b/helm/QUICKSTART.md @@ -0,0 +1,232 @@ +# Grafana GPU Dashboard - Quick Start Guide + +One-command deployments for common scenarios. 
+ +## Prerequisites Check + +```bash +# Verify prerequisites are met +kubectl get svc -A | grep prometheus # ✓ Prometheus exists +kubectl get pods -A | grep dcgm # ✓ DCGM exporter running +kubectl get deploy -A | grep kube-state # ✓ kube-state-metrics deployed +helm version # ✓ Helm 3.x installed +``` + +## Scenario 1: OpenShift with Prometheus Operator + +```bash +# One command deployment +helm repo add grafana https://grafana.github.io/helm-charts +helm repo update + +helm install gpu-grafana grafana/grafana \ + -f https://raw.githubusercontent.com/wseaton/gpu-pruner/main/helm/grafana-values.yaml \ + -f https://raw.githubusercontent.com/wseaton/gpu-pruner/main/helm/grafana-values-openshift.yaml \ + --set adminPassword='ChangeMe123!' \ + -n monitoring --create-namespace + +# Grant Prometheus access +oc adm policy add-cluster-role-to-user cluster-monitoring-view -z grafana -n monitoring + +# Get Route URL +echo "https://$(oc get route -n monitoring grafana -o jsonpath='{.spec.host}')" +``` + +**Login**: `admin` / `ChangeMe123!` + +## Scenario 2: Vanilla Kubernetes with nginx Ingress + +```bash +helm repo add grafana https://grafana.github.io/helm-charts +helm repo update + +helm install gpu-grafana grafana/grafana \ + -f https://raw.githubusercontent.com/wseaton/gpu-pruner/main/helm/grafana-values.yaml \ + -f https://raw.githubusercontent.com/wseaton/gpu-pruner/main/helm/grafana-values-vanilla-k8s.yaml \ + --set adminPassword='ChangeMe123!' 
\ + --set ingress.hosts[0]='grafana.example.com' \ + --set datasources."datasources\.yaml".datasources[0].url='http://prometheus-k8s.monitoring.svc.cluster.local:9090' \ + -n monitoring --create-namespace + +# Access via Ingress +echo "https://grafana.example.com" +``` + +**Login**: `admin` / `ChangeMe123!` + +## Scenario 3: Local Testing with Port-Forward + +```bash +helm repo add grafana https://grafana.github.io/helm-charts +helm repo update + +helm install gpu-grafana grafana/grafana \ + -f https://raw.githubusercontent.com/wseaton/gpu-pruner/main/helm/grafana-values.yaml \ + --set adminPassword='admin' \ + --set persistence.enabled=false \ + --set datasources."datasources\.yaml".datasources[0].url='http://prometheus-k8s.monitoring.svc.cluster.local:9090' \ + -n monitoring --create-namespace + +# Port-forward to access +kubectl port-forward -n monitoring svc/gpu-grafana 3000:3000 +``` + +**Access**: http://localhost:3000 +**Login**: `admin` / `admin` + +## Scenario 4: Air-Gapped Cluster (ConfigMap Method) + +```bash +# Step 1: Download files +curl -O https://raw.githubusercontent.com/wseaton/gpu-pruner/main/helm/grafana-dashboard-configmap.yaml +curl -O https://raw.githubusercontent.com/wseaton/gpu-pruner/main/helm/grafana-values.yaml + +# Step 2: Create ConfigMap +kubectl apply -f grafana-dashboard-configmap.yaml + +# Step 3: Deploy Grafana with sidecar +helm install gpu-grafana grafana/grafana \ + -f grafana-values.yaml \ + --set adminPassword='ChangeMe123!' \ + --set sidecar.dashboards.enabled=true \ + --set sidecar.dashboards.label=grafana_dashboard \ + --set dashboards=null \ + -n monitoring --create-namespace +``` + +## Scenario 5: Using LoadBalancer Service + +```bash +helm install gpu-grafana grafana/grafana \ + -f https://raw.githubusercontent.com/wseaton/gpu-pruner/main/helm/grafana-values.yaml \ + --set adminPassword='ChangeMe123!' 
\ + --set service.type=LoadBalancer \ + --set datasources."datasources\.yaml".datasources[0].url='http://prometheus:9090' \ + -n monitoring --create-namespace + +# Get LoadBalancer IP +kubectl get svc -n monitoring gpu-grafana -o jsonpath='{.status.loadBalancer.ingress[0].ip}' +``` + +## Post-Installation + +### Get Admin Password + +```bash +kubectl get secret -n monitoring gpu-grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo +``` + +### Verify Dashboard Loaded + +```bash +kubectl port-forward -n monitoring svc/gpu-grafana 3000:3000 & +curl -s -u admin:YOUR_PASSWORD http://localhost:3000/api/search?query=gpu | jq . +``` + +Expected: JSON with dashboard titled "Waldorf GPU Usage & Idle Tracker" + +### Test Prometheus Datasource + +Grafana UI → Configuration → Data Sources → Prometheus → Save & Test + +Should show: **"Data source is working"** ✓ + +## Customization + +### Change Prometheus URL + +```bash +--set datasources."datasources\.yaml".datasources[0].url='http://YOUR_PROMETHEUS:9090' +``` + +### Change Ingress Hostname + +```bash +--set ingress.hosts[0]='grafana-gpu.yourdomain.com' +``` + +### Enable Persistence + +```bash +--set persistence.enabled=true \ +--set persistence.size=20Gi \ +--set persistence.storageClassName=fast-ssd +``` + +### Increase Resources + +```bash +--set resources.limits.cpu=1000m \ +--set resources.limits.memory=1Gi \ +--set resources.requests.cpu=500m \ +--set resources.requests.memory=512Mi +``` + +## Troubleshooting + +### Dashboard Shows "No Data" + +```bash +# Test Prometheus connectivity +kubectl exec -n monitoring deploy/gpu-grafana -- wget -qO- http://prometheus-k8s.monitoring.svc.cluster.local:9090/api/v1/query?query=up + +# Check DCGM metrics exist (replace the namespace and service name with your Prometheus) +kubectl port-forward -n PROMETHEUS_NAMESPACE svc/PROMETHEUS_SERVICE 9090:9090 & +curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL' | jq .
+``` + +### Can't Access Grafana + +```bash +# Check pod status +kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana + +# Check logs +kubectl logs -n monitoring -l app.kubernetes.io/name=grafana --tail=50 + +# Use port-forward as fallback +kubectl port-forward -n monitoring svc/gpu-grafana 3000:3000 +``` + +### Forgot Admin Password + +```bash +# Retrieve existing password +kubectl get secret -n monitoring gpu-grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo + +# Or reset it +helm upgrade gpu-grafana grafana/grafana \ + -f helm/grafana-values.yaml \ + --set adminPassword='NewPassword123!' \ + --reuse-values \ + -n monitoring +``` + +## Upgrading + +```bash +helm repo update +helm upgrade gpu-grafana grafana/grafana \ + -f helm/grafana-values.yaml \ + -n monitoring +``` + +## Uninstalling + +```bash +helm uninstall gpu-grafana -n monitoring + +# Optional: Delete persistent data +kubectl delete pvc -n monitoring -l app.kubernetes.io/name=grafana +``` + +## Next Steps + +- **Detailed Guide**: [GRAFANA_DEPLOYMENT.md](../GRAFANA_DEPLOYMENT.md) +- **Dashboard Features**: [DASHBOARD.md](../DASHBOARD.md) +- **Configuration Reference**: [helm/README.md](README.md) + +## Need Help? + +- GitHub Issues: https://github.com/wseaton/gpu-pruner/issues +- Full Documentation: [GRAFANA_DEPLOYMENT.md](../GRAFANA_DEPLOYMENT.md) diff --git a/helm/README.md b/helm/README.md new file mode 100644 index 0000000..adb217c --- /dev/null +++ b/helm/README.md @@ -0,0 +1,272 @@ +# Grafana Helm Chart Deployment Files + +This directory contains Helm values files and Kubernetes manifests for deploying Grafana with the GPU Pruner dashboard. + +## Files + +| File | Description | +|------|-------------| +| `grafana-values.yaml` | **Base values file** - Core Grafana configuration with dashboard provisioning, Prometheus datasource, and resource settings. Use this as the foundation for all deployments. 
| +| `grafana-values-openshift.yaml` | **OpenShift overrides** - Route configuration, token-based Prometheus authentication, and OpenShift-specific security context. Merge with base values. | +| `grafana-values-vanilla-k8s.yaml` | **Vanilla Kubernetes overrides** - Ingress configuration for nginx/traefik, standard K8s security context. Merge with base values. | +| `grafana-dashboard-configmap.yaml` | **Dashboard ConfigMap** - Alternative provisioning method using ConfigMap + sidecar instead of direct URL import. | + +## Quick Start + +### OpenShift Deployment + +```bash +helm repo add grafana https://grafana.github.io/helm-charts +helm repo update + +helm install gpu-grafana grafana/grafana \ + -f helm/grafana-values.yaml \ + -f helm/grafana-values-openshift.yaml \ + --set adminPassword='YOUR_SECURE_PASSWORD' \ + -n monitoring --create-namespace + +# Grant Prometheus access +oc adm policy add-cluster-role-to-user cluster-monitoring-view -z grafana -n monitoring +``` + +Access via Route: +```bash +oc get route -n monitoring grafana -o jsonpath='{.spec.host}' +``` + +### Vanilla Kubernetes Deployment + +```bash +helm install gpu-grafana grafana/grafana \ + -f helm/grafana-values.yaml \ + -f helm/grafana-values-vanilla-k8s.yaml \ + --set adminPassword='YOUR_SECURE_PASSWORD' \ + --set ingress.hosts[0]='grafana-gpu.example.com' \ + --set datasources."datasources\.yaml".datasources[0].url='http://prometheus-k8s.monitoring.svc.cluster.local:9090' \ + -n monitoring --create-namespace +``` + +Access via Ingress: `https://grafana-gpu.example.com` + +## Configuration + +### Required Customizations + +Before deploying, update these values in `grafana-values.yaml` or via `--set`: + +1. 
**Admin Password**: + ```bash + --set adminPassword='YOUR_SECURE_PASSWORD' + ``` + Or use a secret (recommended): + ```bash + kubectl create secret generic grafana-admin-secret \ + -n monitoring \ + --from-literal=admin-password='YOUR_PASSWORD' + + --set admin.existingSecret=grafana-admin-secret + ``` + +2. **Prometheus URL**: + ```bash + --set datasources."datasources\.yaml".datasources[0].url='http://YOUR_PROMETHEUS:9090' + ``` + +3. **Ingress Hostname** (vanilla K8s): + ```bash + --set ingress.hosts[0]='grafana-gpu.example.com' + ``` + +4. **Route Hostname** (OpenShift): + ```bash + --set route.host='grafana-gpu.apps.example.com' + ``` + +### Critical: Datasource UID + +**Do NOT modify the datasource UID** in the values file. The dashboard has a hardcoded UID: + +```yaml +datasources: + datasources.yaml: + datasources: + - uid: PBFA97CFB590B2093 # Must match gpu-dashboard.json +``` + +If you change this UID, the dashboard will not work. + +## Dashboard Provisioning Methods + +### Method 1: Direct URL Import (Default) + +Configured in `grafana-values.yaml`: + +```yaml +dashboards: + gpu-pruner: + gpu-dashboard: + url: https://raw.githubusercontent.com/wseaton/gpu-pruner/main/gpu-dashboard.json +``` + +**Pros**: Simple, automatic updates when repo changes +**Cons**: Requires internet access from Grafana pod + +### Method 2: ConfigMap + Sidecar + +Apply the ConfigMap: + +```bash +kubectl apply -f helm/grafana-dashboard-configmap.yaml +``` + +Enable sidecar in values: + +```yaml +sidecar: + dashboards: + enabled: true + label: grafana_dashboard +``` + +**Pros**: Works in air-gapped clusters, no external dependencies +**Cons**: Requires manual updates when dashboard changes + +### Method 3: Manual Import + +1. Download dashboard: + ```bash + curl -O https://raw.githubusercontent.com/wseaton/gpu-pruner/main/gpu-dashboard.json + ``` + +2. Grafana UI → Dashboards → Import → Upload JSON + +3. 
Select Prometheus datasource + +**Pros**: Full control over dashboard version +**Cons**: Not automated, requires UI access + +## Prerequisites + +Ensure these components are running before deploying Grafana: + +- ✅ **Prometheus** - Accessible at configured URL +- ✅ **DCGM Exporter** - Running on GPU nodes +- ✅ **kube-state-metrics** - With `--metric-labels-allowlist=pods=[*]` +- ✅ **Persistent Storage** (optional) - For dashboard/datasource persistence + +Validation: + +```bash +# Check Prometheus +kubectl get svc -A | grep prometheus + +# Check DCGM exporter +kubectl get pods -A | grep dcgm + +# Verify DCGM metrics (replace the namespace and service name with your Prometheus) +kubectl port-forward -n PROMETHEUS_NAMESPACE svc/PROMETHEUS_SERVICE 9090:9090 & +curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL' | jq '.data.result | length' +``` + +## Upgrading + +To upgrade an existing deployment with new values: + +```bash +helm upgrade gpu-grafana grafana/grafana \ + -f helm/grafana-values.yaml \ + -f helm/grafana-values-vanilla-k8s.yaml \ + -n monitoring +``` + +To upgrade the Grafana chart version: + +```bash +helm repo update +helm search repo grafana/grafana --versions | head -5 # Check available versions + +helm upgrade gpu-grafana grafana/grafana \ + -f helm/grafana-values.yaml \ + --version 8.0.0 \ + -n monitoring +``` + +## Uninstalling + +```bash +helm uninstall gpu-grafana -n monitoring +``` + +To also delete persistent data: + +```bash +kubectl delete pvc -n monitoring -l app.kubernetes.io/name=grafana +``` + +## Troubleshooting + +### Dashboard shows "No data" + +1. Verify Prometheus datasource: + ```bash + kubectl exec -n monitoring -it deploy/gpu-grafana -- \ + wget -O- http://prometheus-k8s.monitoring.svc.cluster.local:9090/api/v1/query?query=up + ``` + +2.
Check Grafana logs: + ```bash + kubectl logs -n monitoring -l app.kubernetes.io/name=grafana --tail=50 + ``` + +### Can't login to Grafana + +Get admin password: + +```bash +kubectl get secret -n monitoring gpu-grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo +``` + +Reset admin password: + +```bash +kubectl delete secret gpu-grafana -n monitoring +helm upgrade gpu-grafana grafana/grafana \ + -f helm/grafana-values.yaml \ + --set adminPassword='NEW_PASSWORD' \ + -n monitoring +``` + +### Ingress not working + +Check ingress controller: + +```bash +kubectl get pods -n ingress-nginx +``` + +Verify ingress resource: + +```bash +kubectl get ingress -n monitoring +kubectl describe ingress gpu-grafana -n monitoring +``` + +Alternative: Use port-forward for testing: + +```bash +kubectl port-forward -n monitoring svc/gpu-grafana 3000:3000 +``` + +## Additional Documentation + +- **[GRAFANA_DEPLOYMENT.md](../GRAFANA_DEPLOYMENT.md)** - Complete deployment guide with validation steps and security considerations +- **[DASHBOARD.md](../DASHBOARD.md)** - Dashboard features and usage guide +- **[gpu-dashboard.json](../gpu-dashboard.json)** - Dashboard source JSON + +## Support + +For issues or questions: + +- GitHub Issues: https://github.com/wseaton/gpu-pruner/issues +- Grafana Documentation: https://grafana.com/docs/ +- Helm Chart: https://github.com/grafana/helm-charts/tree/main/charts/grafana diff --git a/helm/grafana-values-openshift.yaml b/helm/grafana-values-openshift.yaml new file mode 100644 index 0000000..52372ea --- /dev/null +++ b/helm/grafana-values-openshift.yaml @@ -0,0 +1,151 @@ +# Grafana Helm Chart Values for OpenShift +# +# OpenShift-specific configuration with Route and token-based Prometheus auth +# +# Usage: +# helm install gpu-grafana grafana/grafana \ +# -f helm/grafana-values.yaml \ +# -f helm/grafana-values-openshift.yaml \ +# -n monitoring --create-namespace + +# Override Prometheus datasource for OpenShift monitoring +datasources: 
+ datasources.yaml: + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + # OpenShift uses Thanos Querier as the query frontend + url: http://thanos-querier.openshift-monitoring.svc.cluster.local:9090 + access: proxy + isDefault: true + uid: PBFA97CFB590B2093 + editable: true + jsonData: + timeInterval: 30s + queryTimeout: 60s + httpMethod: POST + # Token authentication for OpenShift monitoring + httpHeaderName1: 'Authorization' + # IMPORTANT: Set this token via --set or use a secret + # To get a token: + # oc serviceaccounts get-token grafana -n monitoring + # secureJsonData: + # httpHeaderValue1: 'Bearer YOUR_OPENSHIFT_TOKEN_HERE' + +# Service account with additional OpenShift annotations +serviceAccount: + create: true + name: grafana + annotations: + serviceaccounts.openshift.io/oauth-redirectreference.grafana: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"grafana"}}' + +# RBAC with cluster-monitoring-view for Prometheus access +rbac: + create: true + pspEnabled: false + extraClusterRoleRules: + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + +# Additional ClusterRoleBinding for monitoring access +# Note: This requires creating the binding separately: +# oc adm policy add-cluster-role-to-user cluster-monitoring-view -z grafana -n monitoring +# Or apply this separately: +# --- +# apiVersion: rbac.authorization.k8s.io/v1 +# kind: ClusterRoleBinding +# metadata: +# name: grafana-cluster-monitoring-view +# roleRef: +# apiGroup: rbac.authorization.k8s.io +# kind: ClusterRole +# name: cluster-monitoring-view +# subjects: +# - kind: ServiceAccount +# name: grafana +# namespace: monitoring + +# OpenShift Route configuration (instead of Ingress) +route: + enabled: true + host: grafana-gpu.apps.example.com # Update to match your OpenShift cluster domain + tls: + enabled: true + termination: edge + insecureEdgeTerminationPolicy: Redirect + annotations: + 
haproxy.router.openshift.io/timeout: 4m + haproxy.router.openshift.io/cookie_name: grafana-session + +# Disable standard Ingress +ingress: + enabled: false + +# OpenShift-compatible security context +securityContext: + runAsNonRoot: true + # OpenShift assigns UIDs dynamically from project range + # runAsUser: 472 + # fsGroup: 472 + +# Pod security context for OpenShift +podSecurityContext: + runAsNonRoot: true + +# Environment variables for OpenShift +env: + GF_SECURITY_ADMIN_PASSWORD__FILE: /etc/secrets/admin-password + GF_SERVER_ROOT_URL: https://grafana-gpu.apps.example.com + GF_SERVER_DOMAIN: grafana-gpu.apps.example.com + GF_ANALYTICS_REPORTING_ENABLED: "false" + GF_ANALYTICS_CHECK_FOR_UPDATES: "false" + # Optional: Enable OAuth proxy for OpenShift SSO + # GF_AUTH_PROXY_ENABLED: "true" + # GF_AUTH_PROXY_HEADER_NAME: "X-Forwarded-User" + # GF_AUTH_PROXY_AUTO_SIGN_UP: "true" + +# Grafana configuration for OpenShift +grafana.ini: + server: + domain: grafana-gpu.apps.example.com + root_url: https://grafana-gpu.apps.example.com + serve_from_sub_path: false + analytics: + reporting_enabled: false + check_for_updates: false + log: + mode: console + level: info + security: + admin_user: admin + dashboards: + default_home_dashboard_path: /var/lib/grafana/dashboards/gpu-pruner/gpu-dashboard.json + +# Note: If you need to deploy a Route manually, use this YAML: +# --- +# apiVersion: route.openshift.io/v1 +# kind: Route +# metadata: +# name: grafana +# namespace: monitoring +# annotations: +# haproxy.router.openshift.io/timeout: 4m +# spec: +# host: grafana-gpu.apps.example.com +# to: +# kind: Service +# name: gpu-grafana +# weight: 100 +# port: +# targetPort: 3000 +# tls: +# termination: edge +# insecureEdgeTerminationPolicy: Redirect +# wildcardPolicy: None diff --git a/helm/grafana-values-vanilla-k8s.yaml b/helm/grafana-values-vanilla-k8s.yaml new file mode 100644 index 0000000..343baa2 --- /dev/null +++ b/helm/grafana-values-vanilla-k8s.yaml @@ -0,0 +1,172 @@ +# 
Grafana Helm Chart Values for Vanilla Kubernetes +# +# Standard Kubernetes configuration with Ingress and custom Prometheus +# +# Usage: +# helm install gpu-grafana grafana/grafana \ +# -f helm/grafana-values.yaml \ +# -f helm/grafana-values-vanilla-k8s.yaml \ +# -n monitoring --create-namespace + +# Override Prometheus datasource for custom Prometheus deployment +datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + # Update this URL to match your Prometheus service + # Common patterns: + # Prometheus Operator: http://prometheus-k8s.monitoring.svc.cluster.local:9090 + # Kube-prometheus-stack: http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090 + # Custom deployment: http://prometheus.prometheus.svc.cluster.local:9090 + url: http://prometheus-k8s.monitoring.svc.cluster.local:9090 + access: proxy + isDefault: true + uid: PBFA97CFB590B2093 + editable: true + jsonData: + timeInterval: 30s + queryTimeout: 60s + httpMethod: POST + # If your Prometheus requires authentication: + # basicAuth: true + # basicAuthUser: admin + # secureJsonData: + # basicAuthPassword: 'YOUR_PASSWORD_HERE' + +# Ingress configuration (nginx example) +ingress: + enabled: true + ingressClassName: nginx + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/backend-protocol: "HTTP" + # For large dashboards or long queries: + nginx.ingress.kubernetes.io/proxy-body-size: "10m" + nginx.ingress.kubernetes.io/proxy-read-timeout: "300" + hosts: + - grafana-gpu.example.com + tls: + - secretName: grafana-tls + hosts: + - grafana-gpu.example.com + path: / + pathType: Prefix + +# Alternative: Traefik Ingress configuration +# ingress: +# enabled: true +# ingressClassName: traefik +# annotations: +# cert-manager.io/cluster-issuer: letsencrypt-prod +# traefik.ingress.kubernetes.io/router.entrypoints: websecure +# 
traefik.ingress.kubernetes.io/router.tls: "true" +# hosts: +# - grafana-gpu.example.com +# tls: +# - secretName: grafana-tls +# hosts: +# - grafana-gpu.example.com + +# Service configuration (ClusterIP with Ingress) +service: + enabled: true + type: ClusterIP + port: 3000 + targetPort: 3000 + +# For LoadBalancer service (alternative to Ingress): +# service: +# type: LoadBalancer +# loadBalancerIP: 192.168.1.100 # Optional: static IP +# annotations: +# service.beta.kubernetes.io/aws-load-balancer-type: "nlb" # For AWS +# cloud.google.com/load-balancer-type: "Internal" # For GCP + +# For NodePort service (alternative to Ingress): +# service: +# type: NodePort +# nodePort: 30300 +# port: 3000 + +# Security context for standard Kubernetes +securityContext: + runAsNonRoot: true + runAsUser: 472 + fsGroup: 472 + +# Pod security context +podSecurityContext: + runAsNonRoot: true + fsGroup: 472 + +# Environment variables +env: + GF_SECURITY_ADMIN_PASSWORD__FILE: /etc/secrets/admin-password + GF_SERVER_ROOT_URL: https://grafana-gpu.example.com + GF_SERVER_DOMAIN: grafana-gpu.example.com + GF_ANALYTICS_REPORTING_ENABLED: "false" + GF_ANALYTICS_CHECK_FOR_UPDATES: "false" + +# Grafana configuration +grafana.ini: + server: + domain: grafana-gpu.example.com + root_url: https://grafana-gpu.example.com + analytics: + reporting_enabled: false + check_for_updates: false + log: + mode: console + level: info + auth.anonymous: + enabled: false + security: + admin_user: admin + dashboards: + default_home_dashboard_path: /var/lib/grafana/dashboards/gpu-pruner/gpu-dashboard.json + +# Service account +serviceAccount: + create: true + name: grafana + annotations: {} + +# RBAC +rbac: + create: true + pspEnabled: false + extraClusterRoleRules: + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + +# Optional: PodDisruptionBudget for high availability +# podDisruptionBudget: +# minAvailable: 1 + +# Optional: HorizontalPodAutoscaler +# autoscaling: +# enabled: 
true +# minReplicas: 1 +# maxReplicas: 3 +# targetCPU: 80 +# targetMemory: 80 + +# Optional: Node affinity to schedule on specific nodes +# nodeSelector: +# node-role.kubernetes.io/monitoring: "true" + +# Optional: Tolerations for tainted nodes +# tolerations: +# - key: monitoring +# operator: Equal +# value: "true" +# effect: NoSchedule diff --git a/helm/grafana-values.yaml b/helm/grafana-values.yaml new file mode 100644 index 0000000..1b3209d --- /dev/null +++ b/helm/grafana-values.yaml @@ -0,0 +1,234 @@ +# Grafana Helm Chart Values for GPU Dashboard +# +# This values file deploys Grafana with the GPU Pruner dashboard pre-configured. +# +# Usage: +# helm repo add grafana https://grafana.github.io/helm-charts +# helm install gpu-grafana grafana/grafana -f helm/grafana-values.yaml -n monitoring --create-namespace +# +# Prerequisites: +# - Prometheus accessible in cluster +# - DCGM exporter running on GPU nodes +# - kube-state-metrics with pod labels enabled + +# Replica count for Grafana +replicas: 1 + +# Grafana image configuration +image: + repository: grafana/grafana-oss + tag: "10.0.0" + pullPolicy: IfNotPresent + +# Admin user configuration +# SECURITY: Use a secret for production deployments +adminUser: admin +# adminPassword: "changeme" # Set via --set or use existingSecret + +# Use existing secret for admin credentials (recommended for production) +# admin: +# existingSecret: grafana-admin-secret +# userKey: admin-user +# passwordKey: admin-password + +# Persistence configuration for dashboards and plugins +persistence: + enabled: true + type: pvc + size: 10Gi + # storageClassName: default + accessModes: + - ReadWriteOnce + +# Prometheus datasource configuration +# CRITICAL: UID must match gpu-dashboard.json (PBFA97CFB590B2093) +datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + # IMPORTANT: Update this URL to match your Prometheus service + # Examples: + # OpenShift: 
https://thanos-querier.openshift-monitoring.svc.cluster.local:9091 + # Prometheus Operator: http://prometheus-k8s.monitoring.svc.cluster.local:9090 + # Custom: http://prometheus.prometheus.svc.cluster.local:9090 + url: http://prometheus-k8s.monitoring.svc.cluster.local:9090 + access: proxy + isDefault: true + # CRITICAL: This UID MUST match the hardcoded UID in gpu-dashboard.json + uid: PBFA97CFB590B2093 + editable: true + jsonData: + timeInterval: 30s + queryTimeout: 60s + httpMethod: POST + # For OpenShift with token authentication, uncomment: + # secureJsonData: + # httpHeaderValue1: 'Bearer YOUR_TOKEN_HERE' + # jsonData: + # httpHeaderName1: 'Authorization' + +# Dashboard provisioning configuration +dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: 'gpu-pruner' + orgId: 1 + folder: 'GPU Monitoring' + type: file + disableDeletion: false + editable: true + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards/gpu-pruner + +# Dashboard import configuration +dashboards: + gpu-pruner: + gpu-dashboard: + # Import dashboard from GitHub repository + url: https://raw.githubusercontent.com/wseaton/gpu-pruner/main/gpu-dashboard.json + datasource: Prometheus + # Alternative: Use local file via ConfigMap (see grafana-dashboard-configmap.yaml) + # configMapRef: + # name: grafana-gpu-dashboard + # key: gpu-dashboard.json + +# Service configuration +service: + enabled: true + type: ClusterIP + port: 3000 + targetPort: 3000 + # Uncomment for NodePort or LoadBalancer + # type: NodePort + # nodePort: 30300 + # type: LoadBalancer + +# Ingress configuration (for vanilla Kubernetes) +ingress: + enabled: false + ingressClassName: nginx + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + # For other ingress controllers, adjust annotations accordingly + hosts: + - grafana-gpu.example.com + tls: + - secretName: grafana-tls + hosts: + - grafana-gpu.example.com + path: / + pathType: Prefix + +#
Resource limits and requests +resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 250m + memory: 256Mi + +# Security context +securityContext: + runAsNonRoot: true + runAsUser: 472 + fsGroup: 472 + +# Pod annotations +podAnnotations: + prometheus.io/scrape: "true" + prometheus.io/port: "3000" + +# Environment variables +env: + GF_SECURITY_ADMIN_PASSWORD__FILE: /etc/secrets/admin-password + GF_INSTALL_PLUGINS: "" + GF_ANALYTICS_REPORTING_ENABLED: "false" + GF_ANALYTICS_CHECK_FOR_UPDATES: "false" + # For anonymous access (optional, not recommended for production) + # GF_AUTH_ANONYMOUS_ENABLED: "true" + # GF_AUTH_ANONYMOUS_ORG_ROLE: "Viewer" + +# ConfigMaps to mount as volumes +extraConfigmapMounts: [] + # - name: gpu-dashboard + # mountPath: /var/lib/grafana/dashboards/gpu-pruner + # configMap: grafana-gpu-dashboard + # readOnly: true + +# Service account configuration +serviceAccount: + create: true + name: grafana + annotations: {} + +# RBAC configuration +rbac: + create: true + pspEnabled: false + # For sidecar dashboard provisioning, additional permissions needed + extraClusterRoleRules: + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + +# Node selector for pod placement (optional) +nodeSelector: {} + # Example: Deploy on specific nodes + # kubernetes.io/hostname: grafana-node + +# Tolerations (optional) +tolerations: [] + +# Affinity rules (optional) +affinity: {} + +# Grafana.ini configuration +grafana.ini: + server: + domain: grafana-gpu.example.com + root_url: "%(protocol)s://%(domain)s/" + analytics: + reporting_enabled: false + check_for_updates: false + log: + mode: console + level: info + auth.anonymous: + enabled: false + security: + admin_user: admin + # Use secret for password in production + dashboards: + default_home_dashboard_path: /var/lib/grafana/dashboards/gpu-pruner/gpu-dashboard.json + +# Sidecar configuration (alternative dashboard provisioning method) +sidecar: + dashboards: + enabled: 
false # Set to true to use ConfigMap-based provisioning + label: grafana_dashboard + labelValue: "1" + folder: /tmp/dashboards + defaultDashboardsEnabled: true + searchNamespace: monitoring + datasources: + enabled: false + +# Plugins to install (optional) +plugins: [] + # - grafana-piechart-panel + # - grafana-worldmap-panel + +# Image renderer for PDF exports (optional) +imageRenderer: + enabled: false diff --git a/import-dashboard.sh b/import-dashboard.sh new file mode 100755 index 0000000..0670e68 --- /dev/null +++ b/import-dashboard.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# +# Import GPU Dashboard to Grafana +# +# Usage: ./import-dashboard.sh [grafana-url] [admin-password] +# +# Example: ./import-dashboard.sh http://localhost:3000 GpuPruner2026! + +GRAFANA_URL="${1:-http://localhost:3000}" +ADMIN_PASSWORD="${2:-GpuPruner2026!}" +DASHBOARD_FILE="gpu-dashboard.json" + +echo "Importing GPU Dashboard to Grafana..." +echo "Grafana URL: $GRAFANA_URL" + +# Test Grafana connectivity +echo -n "Testing Grafana connectivity... " +if curl -s "$GRAFANA_URL/api/health" | grep -q "ok"; then + echo "✓ Connected" +else + echo "✗ Failed to connect to Grafana" + exit 1 +fi + +# Prepare dashboard JSON (wrap in API format) +DASHBOARD_JSON=$(cat "$DASHBOARD_FILE" | jq '{dashboard: ., overwrite: true, folderId: 0}') + +# Import dashboard +echo -n "Importing dashboard... " +RESPONSE=$(curl -s -X POST \ + -H "Content-Type: application/json" \ + -u "admin:$ADMIN_PASSWORD" \ + -d "$DASHBOARD_JSON" \ + "$GRAFANA_URL/api/dashboards/db") + +if echo "$RESPONSE" | jq -e '.status == "success"' > /dev/null 2>&1; then + echo "✓ Success" + DASHBOARD_URL=$(echo "$RESPONSE" | jq -r '.url') + echo "" + echo "Dashboard imported successfully!" + echo "Access it at: $GRAFANA_URL$DASHBOARD_URL" +else + echo "✗ Failed" + echo "" + echo "Error response:" + echo "$RESPONSE" | jq . 
+ exit 1 +fi From e04a235c10fcb9100ee000a4e12c9664bffb7f42 Mon Sep 17 00:00:00 2001 From: fuddin-bit Date: Thu, 4 Jun 2026 14:48:03 -0400 Subject: [PATCH 3/7] Add graphics engine active panel to GPU Health dashboard row. Visualize DCGM_FI_PROF_GR_ENGINE_ACTIVE per node and document PromQL in DASHBOARD.md and GRAFANA_DEPLOYMENT.md. Co-authored-by: Cursor --- DASHBOARD.md | 1 + GRAFANA_DEPLOYMENT.md | 1 + gpu-dashboard.json | 34 +++++++++++++++++++++++---- helm/grafana-dashboard-configmap.yaml | 34 +++++++++++++++++++++++---- 4 files changed, 62 insertions(+), 8 deletions(-) diff --git a/DASHBOARD.md b/DASHBOARD.md index 9965223..154a65c 100644 --- a/DASHBOARD.md +++ b/DASHBOARD.md @@ -153,6 +153,7 @@ Panels in the **GPU Health & DCGM** row use additional dcgm-exporter counters. P | Power draw by node | `sum by (Hostname) (DCGM_FI_DEV_POWER_USAGE)` | | VRAM utilization % | `100 * avg by (Hostname, gpu) (DCGM_FI_DEV_FB_USED / DCGM_FI_DEV_FB_TOTAL)` | | Memory copy utilization | `avg by (Hostname) (DCGM_FI_DEV_MEM_COPY_UTIL)` | +| Graphics/compute engine active by node | `avg by (Hostname) (DCGM_FI_PROF_GR_ENGINE_ACTIVE)` | | XID errors (1h increase) | `sum by (Hostname, gpu) (increase(DCGM_FI_DEV_XID_ERRORS[1h]))` | | SM active by node | `avg by (Hostname) (DCGM_FI_PROF_SM_ACTIVE)` | | Tensor pipe active by node | `avg by (Hostname) (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE)` | diff --git a/GRAFANA_DEPLOYMENT.md b/GRAFANA_DEPLOYMENT.md index 5981ce7..d1ec951 100644 --- a/GRAFANA_DEPLOYMENT.md +++ b/GRAFANA_DEPLOYMENT.md @@ -412,6 +412,7 @@ Check each panel shows data (not "No data"): | Peak power | `max(DCGM_FI_DEV_POWER_USAGE)` | Max per-GPU watts | | XID errors | `sum(DCGM_FI_DEV_XID_ERRORS)` | Should be 0 in healthy clusters | | VRAM utilization % | `100 * avg by (Hostname, gpu) (DCGM_FI_DEV_FB_USED / DCGM_FI_DEV_FB_TOTAL)` | Per-GPU VRAM fill | +| Graphics/compute engine active | `avg by (Hostname) (DCGM_FI_PROF_GR_ENGINE_ACTIVE)` | 0–1; primary idle-detection metric | | 
SM active (profiling) | `avg by (Hostname) (DCGM_FI_PROF_SM_ACTIVE)` | No data if profiling disabled | ### 6. Test Prometheus Queries Manually diff --git a/gpu-dashboard.json b/gpu-dashboard.json index dbd3391..55cf0d6 100644 --- a/gpu-dashboard.json +++ b/gpu-dashboard.json @@ -670,11 +670,37 @@ } } }, + { + "title": "Graphics/compute engine active by node", + "description": "Average graphics/compute engine activity (DCGM_FI_PROF_GR_ENGINE_ACTIVE, 0–1). Primary metric for idle detection in gpu-pruner and engine idle/active overview stats.", + "type": "timeseries", + "gridPos": { "h": 6, "w": 24, "x": 0, "y": 99 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "avg by (Hostname) (DCGM_FI_PROF_GR_ENGINE_ACTIVE)", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 15, + "spanNulls": true + } + } + } + }, { "title": "SM active by node", "description": "DCGM profiling: streaming multiprocessor activity (DCGM_FI_PROF_SM_ACTIVE). No data if DCGM profiling is not enabled on the exporter.", "type": "timeseries", - "gridPos": { "h": 8, "w": 8, "x": 0, "y": 99 }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 105 }, "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "targets": [ { @@ -700,7 +726,7 @@ "title": "Tensor pipe active by node", "description": "DCGM profiling: tensor pipe activity (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE). No data if DCGM profiling is not enabled.", "type": "timeseries", - "gridPos": { "h": 8, "w": 8, "x": 8, "y": 99 }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 105 }, "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "targets": [ { @@ -726,7 +752,7 @@ "title": "DRAM active by node", "description": "DCGM profiling: DRAM activity (DCGM_FI_PROF_DRAM_ACTIVE). 
No data if DCGM profiling is not enabled.", "type": "timeseries", - "gridPos": { "h": 8, "w": 8, "x": 16, "y": 99 }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 105 }, "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "targets": [ { @@ -757,5 +783,5 @@ "timezone": "browser", "title": "Waldorf GPU Usage & Idle Tracker", "uid": "prometheus", - "version": 2 + "version": 3 } diff --git a/helm/grafana-dashboard-configmap.yaml b/helm/grafana-dashboard-configmap.yaml index 3fe4be0..6380fa2 100644 --- a/helm/grafana-dashboard-configmap.yaml +++ b/helm/grafana-dashboard-configmap.yaml @@ -682,11 +682,37 @@ data: } } }, + { + "title": "Graphics/compute engine active by node", + "description": "Average graphics/compute engine activity (DCGM_FI_PROF_GR_ENGINE_ACTIVE, 0–1). Primary metric for idle detection in gpu-pruner and engine idle/active overview stats.", + "type": "timeseries", + "gridPos": { "h": 6, "w": 24, "x": 0, "y": 99 }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "avg by (Hostname) (DCGM_FI_PROF_GR_ENGINE_ACTIVE)", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 15, + "spanNulls": true + } + } + } + }, { "title": "SM active by node", "description": "DCGM profiling: streaming multiprocessor activity (DCGM_FI_PROF_SM_ACTIVE). No data if DCGM profiling is not enabled on the exporter.", "type": "timeseries", - "gridPos": { "h": 8, "w": 8, "x": 0, "y": 99 }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 105 }, "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "targets": [ { @@ -712,7 +738,7 @@ data: "title": "Tensor pipe active by node", "description": "DCGM profiling: tensor pipe activity (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE). 
No data if DCGM profiling is not enabled.", "type": "timeseries", - "gridPos": { "h": 8, "w": 8, "x": 8, "y": 99 }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 105 }, "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "targets": [ { @@ -738,7 +764,7 @@ data: "title": "DRAM active by node", "description": "DCGM profiling: DRAM activity (DCGM_FI_PROF_DRAM_ACTIVE). No data if DCGM profiling is not enabled.", "type": "timeseries", - "gridPos": { "h": 8, "w": 8, "x": 16, "y": 99 }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 105 }, "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "targets": [ { @@ -769,5 +795,5 @@ data: "timezone": "browser", "title": "Waldorf GPU Usage & Idle Tracker", "uid": "prometheus", - "version": 2 + "version": 3 } From bacde097f8a9dd2a3bc6d40cba9cac970c9e97cd Mon Sep 17 00:00:00 2001 From: fuddin-bit Date: Mon, 8 Jun 2026 10:05:42 -0400 Subject: [PATCH 4/7] Update GPU dashboard to include 30-minute metrics for idle and active states. Adjusted legend formats and added refIds for clarity in the Grafana configuration. Ensured consistency across dashboard panels for better monitoring of GPU workloads. 
--- gpu-dashboard.json | 17 +++++++++++------ helm/grafana-dashboard-configmap.yaml | 17 +++++++++++------ 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/gpu-dashboard.json b/gpu-dashboard.json index 55cf0d6..c414ca7 100644 --- a/gpu-dashboard.json +++ b/gpu-dashboard.json @@ -82,7 +82,8 @@ "targets": [ { "expr": "count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m]) == 0)", - "legendFormat": "Idle" + "legendFormat": "Idle (30m)", + "refId": "Idle (30m)" } ], "fieldConfig": { @@ -102,7 +103,8 @@ "targets": [ { "expr": "count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m]) > 0)", - "legendFormat": "Active" + "legendFormat": "Active (30m)", + "refId": "Active (30m)" } ], "fieldConfig": { @@ -233,7 +235,8 @@ "expr": "max by (Hostname, gpu, modelName) (max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m])) == 0", "legendFormat": "", "format": "table", - "instant": true + "instant": true, + "refId": "Idle GPUs (30m)" } ], "transformations": [ @@ -266,8 +269,9 @@ "targets": [ { "expr": "(label_replace(sum by (label_app, namespace) (count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\") ) * on (pod, namespace) group_left(label_app) kube_pod_labels{label_app != \"\"}), \"deployment\", \"$1\", \"label_app\", \"(.+)\") or sum by (deployment, namespace) (label_replace(count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", 
namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\")), \"deployment\", \"$1\", \"pod\", \"^(.*)-[^-]+-[^-]+$\"))) * 0.5", - "legendFormat": "{{deployment}} ({{namespace}})", - "interval": "1m" + "legendFormat": "{{deployment}} ({{namespace}}) (30m)", + "interval": "1m", + "refId": "By deployment (30m)" } ], "fieldConfig": { @@ -322,7 +326,8 @@ "expr": "sort_desc((label_replace(sum by (label_app, namespace) (count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\") ) * on (pod, namespace) group_left(label_app) kube_pod_labels{label_app != \"\"}), \"deployment\", \"$1\", \"label_app\", \"(.+)\") or sum by (deployment, namespace) (label_replace(count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\")), \"deployment\", \"$1\", \"pod\", \"^(.*)-[^-]+-[^-]+$\"))) * 0.5)", 
"legendFormat": "", "format": "table", - "instant": true + "instant": true, + "refId": "By deployment (30m)" } ], "transformations": [ diff --git a/helm/grafana-dashboard-configmap.yaml b/helm/grafana-dashboard-configmap.yaml index 6380fa2..8d52030 100644 --- a/helm/grafana-dashboard-configmap.yaml +++ b/helm/grafana-dashboard-configmap.yaml @@ -94,7 +94,8 @@ data: "targets": [ { "expr": "count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m]) == 0)", - "legendFormat": "Idle" + "legendFormat": "Idle (30m)", + "refId": "Idle (30m)" } ], "fieldConfig": { @@ -114,7 +115,8 @@ data: "targets": [ { "expr": "count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m]) > 0)", - "legendFormat": "Active" + "legendFormat": "Active (30m)", + "refId": "Active (30m)" } ], "fieldConfig": { @@ -245,7 +247,8 @@ data: "expr": "max by (Hostname, gpu, modelName) (max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m])) == 0", "legendFormat": "", "format": "table", - "instant": true + "instant": true, + "refId": "Idle GPUs (30m)" } ], "transformations": [ @@ -278,8 +281,9 @@ data: "targets": [ { "expr": "(label_replace(sum by (label_app, namespace) (count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\") ) * on (pod, namespace) group_left(label_app) kube_pod_labels{label_app != \"\"}), \"deployment\", \"$1\", \"label_app\", \"(.+)\") or sum by (deployment, namespace) (label_replace(count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or 
max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\")), \"deployment\", \"$1\", \"pod\", \"^(.*)-[^-]+-[^-]+$\"))) * 0.5", - "legendFormat": "{{deployment}} ({{namespace}})", - "interval": "1m" + "legendFormat": "{{deployment}} ({{namespace}}) (30m)", + "interval": "1m", + "refId": "By deployment (30m)" } ], "fieldConfig": { @@ -334,7 +338,8 @@ data: "expr": "sort_desc((label_replace(sum by (label_app, namespace) (count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\") ) * on (pod, namespace) group_left(label_app) kube_pod_labels{label_app != \"\"}), \"deployment\", \"$1\", \"label_app\", \"(.+)\") or sum by (deployment, namespace) (label_replace(count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\")), 
\"deployment\", \"$1\", \"pod\", \"^(.*)-[^-]+-[^-]+$\"))) * 0.5)", "legendFormat": "", "format": "table", - "instant": true + "instant": true, + "refId": "By deployment (30m)" } ], "transformations": [ From d91b98758138f693335bcf94c67042b7197b3f87 Mon Sep 17 00:00:00 2001 From: fuddin-bit Date: Tue, 9 Jun 2026 14:56:46 -0400 Subject: [PATCH 5/7] added slack notifies and removed UI --- README.md | 19 ++ gpu-pruner/hack/deployment.yaml | 7 + .../hack/slack-webhook-secret.example.yaml | 14 + gpu-pruner/src/dashboard.html | 296 ------------------ gpu-pruner/src/dashboard.rs | 105 ------- gpu-pruner/src/lib.rs | 96 +++++- gpu-pruner/src/main.rs | 163 ++++++---- gpu-pruner/src/metrics.rs | 47 +++ gpu-pruner/src/slack.rs | 121 +++++++ gpu-pruner/tests/e2e.rs | 4 +- 10 files changed, 397 insertions(+), 475 deletions(-) create mode 100644 gpu-pruner/hack/slack-webhook-secret.example.yaml delete mode 100644 gpu-pruner/src/dashboard.html delete mode 100644 gpu-pruner/src/dashboard.rs create mode 100644 gpu-pruner/src/slack.rs diff --git a/README.md b/README.md index e5d9018..f443f37 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,24 @@ The background for `gpu-pruner` is that in certain environments it is very easy This culler politely pauses workloads that appear idle by scaling them down to 0 replicas. Features may be added in the future for better notifications, but the idea is that a user can simply re-enable the workload when they are ready to test/demo again. +## Acknowledgment System + +**NEW**: Prevent unwanted scale-downs by acknowledging workloads that are intentionally idle. + +Users can acknowledge idle workloads via the web dashboard to prevent gpu-pruner from scaling them down. Use cases: +- Loading large datasets +- Model warm-up / compilation +- Interactive debugging sessions +- Scheduled batch jobs with intermittent GPU usage + +**Quick Start:** +1. Open the web dashboard: `http://dashboard-url:8080` +2. Enter your email address +3. 
Click **4h**, **8h**, or **24h** buttons next to idle workloads +4. Acknowledged workloads won't be scaled down until the acknowledgment expires + +See [ACKNOWLEDGMENT_GUIDE.md](ACKNOWLEDGMENT_GUIDE.md) for complete documentation, API usage, and troubleshooting. + ## Dashboard The gpu-pruner includes both a **web dashboard** and a **Grafana dashboard** for monitoring GPU workloads. @@ -26,6 +44,7 @@ Real-time web interface for monitoring GPU workloads. See [DASHBOARD.md](DASHBOA Features: - Real-time monitoring of idle GPU workloads +- **Acknowledgment system** - prevent scale-downs with duration-based acknowledgments - Resource usage statistics - Modern web UI with auto-refresh - REST API endpoint for programmatic access diff --git a/gpu-pruner/hack/deployment.yaml b/gpu-pruner/hack/deployment.yaml index 58c7b07..797685e 100644 --- a/gpu-pruner/hack/deployment.yaml +++ b/gpu-pruner/hack/deployment.yaml @@ -23,11 +23,18 @@ spec: - '--run-mode=scale-down' - '--prometheus-url=http://thanos-querier.openshift-monitoring.svc.cluster.local' - '--dashboard-port=8080' + - '--slack-channel=#test-pruner' env: - name: RUST_BACKTRACE value: '1' - name: RUST_LOG value: info + - name: SLACK_WEBHOOK_URL + valueFrom: + secretKeyRef: + name: gpu-pruner-slack-webhook + key: webhook-url + optional: true ports: - containerPort: 8080 name: dashboard diff --git a/gpu-pruner/hack/slack-webhook-secret.example.yaml b/gpu-pruner/hack/slack-webhook-secret.example.yaml new file mode 100644 index 0000000..6ab66f3 --- /dev/null +++ b/gpu-pruner/hack/slack-webhook-secret.example.yaml @@ -0,0 +1,14 @@ +# Create the real secret (do not commit webhook URLs): +# +# kubectl create secret generic gpu-pruner-slack-webhook \ +# --namespace=gpu-pruner-system \ +# --from-literal=webhook-url='https://hooks.slack.com/services/T.../B.../...' 
+# +apiVersion: v1 +kind: Secret +metadata: + name: gpu-pruner-slack-webhook + namespace: gpu-pruner-system +type: Opaque +stringData: + webhook-url: https://hooks.slack.com/services/REPLACE/ME/REPLACE diff --git a/gpu-pruner/src/dashboard.html b/gpu-pruner/src/dashboard.html deleted file mode 100644 index e7c4e17..0000000 --- a/gpu-pruner/src/dashboard.html +++ /dev/null @@ -1,296 +0,0 @@ - - - - - - GPU Pruner Dashboard - - - -
-
-

GPU Pruner Dashboard

-

Monitor idle GPU workloads in the cluster

-
- -
-
-
Total Pods Checked
-
-
-
-
-
Idle Workloads
-
-
-
-
-
Wasted GPU Resources
-
-
-
-
- -
-

Idle GPU Workloads

-
-
Loading...
-
-
-
-
- - - - diff --git a/gpu-pruner/src/dashboard.rs b/gpu-pruner/src/dashboard.rs deleted file mode 100644 index e5b0e4a..0000000 --- a/gpu-pruner/src/dashboard.rs +++ /dev/null @@ -1,105 +0,0 @@ -use axum::{ - extract::State, - response::{Html, IntoResponse}, - routing::get, - Json, Router, -}; -use std::sync::Arc; -use tokio::sync::RwLock; -use tower_http::{cors::CorsLayer, trace::TraceLayer}; - -use gpu_pruner::{metrics, Meta, ScaleKind}; - -#[derive(Clone, Debug, serde::Serialize)] -pub struct WorkloadInfo { - pub name: String, - pub namespace: String, - pub kind: String, - pub gpu_model: Option, - pub idle_duration: Option, -} - -#[derive(Clone, Debug, serde::Serialize)] -pub struct DashboardState { - pub idle_workloads: Vec, - pub total_idle_gpus: usize, - pub total_pods_checked: usize, - pub last_update: String, -} - -impl Default for DashboardState { - fn default() -> Self { - Self { - idle_workloads: Vec::new(), - total_idle_gpus: 0, - total_pods_checked: 0, - last_update: chrono::Utc::now().to_rfc3339(), - } - } -} - -pub type SharedDashboardState = Arc>; - -pub async fn update_dashboard_state( - state: SharedDashboardState, - idle_workloads: Vec, - total_pods: usize, -) { - let workloads: Vec = idle_workloads - .iter() - .map(|w| WorkloadInfo { - name: w.name(), - namespace: w.namespace().unwrap_or_default(), - kind: w.kind(), - gpu_model: None, - idle_duration: None, - }) - .collect(); - - let idle_count = idle_workloads.len(); - - let mut state = state.write().await; - state.idle_workloads = workloads; - state.total_idle_gpus = idle_count; - state.total_pods_checked = total_pods; - state.last_update = chrono::Utc::now().to_rfc3339(); - - // Update Prometheus gauges - metrics::IDLE_GPUS.set(idle_count as i64); - metrics::PODS_CHECKED.set(total_pods as i64); -} - -async fn dashboard_html() -> impl IntoResponse { - Html(include_str!("dashboard.html")) -} - -async fn api_status(State(state): State) -> impl IntoResponse { - let state = state.read().await; - 
Json(state.clone()) -} - -async fn metrics_handler() -> impl IntoResponse { - metrics::render() -} - -pub fn create_router(state: SharedDashboardState) -> Router { - Router::new() - .route("/", get(dashboard_html)) - .route("/api/status", get(api_status)) - .route("/metrics", get(metrics_handler)) - .layer(CorsLayer::permissive()) - .layer(TraceLayer::new_for_http()) - .with_state(state) -} - -pub async fn run_server(state: SharedDashboardState, port: u16) -> anyhow::Result<()> { - let app = create_router(state); - let addr = std::net::SocketAddr::from(([0, 0, 0, 0], port)); - - tracing::info!("Dashboard server starting on http://{}", addr); - - let listener = tokio::net::TcpListener::bind(addr).await?; - axum::serve(listener, app).await?; - - Ok(()) -} diff --git a/gpu-pruner/src/lib.rs b/gpu-pruner/src/lib.rs index 02dc93d..c3c20e7 100644 --- a/gpu-pruner/src/lib.rs +++ b/gpu-pruner/src/lib.rs @@ -1,4 +1,5 @@ pub mod metrics; +pub mod slack; use clap::ValueEnum; use k8s_openapi::{ @@ -35,6 +36,13 @@ use kube::{ api::{ObjectMeta, Patch, PatchParams}, }; +#[derive(Debug, Clone)] +pub struct AckStatus { + pub acknowledged: bool, + pub expires_at: Option, + pub by_user: Option, +} + #[derive(Debug, Clone, Serialize)] pub enum ScaleKind { Deployment(Deployment), @@ -197,8 +205,12 @@ pub trait Meta { } pub trait Scaler { - fn scale(&self, client: Client) - -> impl std::future::Future> + Send; + fn scale( + &self, + client: Client, + slack_notifier: Option, + idle_duration_minutes: i64, + ) -> impl std::future::Future> + Send; fn generate_scale_event(&self) -> anyhow::Result; } @@ -337,8 +349,13 @@ impl Meta for ScaleKind { } impl Scaler for ScaleKind { - #[tracing::instrument(skip(self, client))] - async fn scale(&self, client: Client) -> anyhow::Result<()> { + #[tracing::instrument(skip(self, client, slack_notifier))] + async fn scale( + &self, + client: Client, + slack_notifier: Option, + idle_duration_minutes: i64, + ) -> anyhow::Result<()> { if let Some(ns) = 
self.namespace() { let event = self.generate_scale_event()?; let events_api: Api = Api::namespaced(client.clone(), &ns); @@ -348,6 +365,20 @@ impl Scaler for ScaleKind { } else { tracing::debug!("Emitted scale event for: {:?}", event.involved_object); } + + // Send Slack notification if configured + if let Some(notifier) = slack_notifier { + match notifier.send_notification(self, idle_duration_minutes).await { + Ok(_) => { + metrics::SLACK_NOTIFICATIONS_SENT.inc(); + } + Err(e) => { + metrics::SLACK_NOTIFICATION_FAILURES.inc(); + tracing::error!("Failed to send Slack notification: {e}"); + // Continue with scale-down even if notification fails + } + } + } }; match self { @@ -429,6 +460,63 @@ impl Scaler for ScaleKind { } } +/// Check if a workload has an active acknowledgment annotation +#[tracing::instrument(skip(_client))] +pub async fn check_acknowledgment( + _client: KubeClient, + workload: &ScaleKind, +) -> anyhow::Result { + use chrono::DateTime; + + let annotations = match workload { + ScaleKind::Deployment(d) => d.metadata.annotations.clone(), + ScaleKind::ReplicaSet(r) => r.metadata.annotations.clone(), + ScaleKind::StatefulSet(s) => s.metadata.annotations.clone(), + ScaleKind::Notebook(n) => n.metadata.annotations.clone(), + ScaleKind::InferenceService(i) => i.metadata.annotations.clone(), + }; + + let annotations = match annotations { + Some(a) => a, + None => { + return Ok(AckStatus { + acknowledged: false, + expires_at: None, + by_user: None, + }) + } + }; + + let ack_until = annotations.get("gpu-pruner.io/ack-until"); + let ack_by = annotations.get("gpu-pruner.io/ack-by"); + + if let Some(expires_at_str) = ack_until { + // Parse the timestamp and check if it's still valid + if let Ok(expires_at) = DateTime::parse_from_rfc3339(expires_at_str) { + let now = chrono::Utc::now(); + if expires_at.timestamp() > now.timestamp() { + return Ok(AckStatus { + acknowledged: true, + expires_at: Some(expires_at_str.clone()), + by_user: ack_by.cloned(), + }); + } 
else { + tracing::info!( + "Acknowledgment expired for {} in {}", + workload.name(), + workload.namespace().unwrap_or_default() + ); + } + } + } + + Ok(AckStatus { + acknowledged: false, + expires_at: None, + by_user: None, + }) +} + /// Crawl up the owner references to find the root Deployment or StatefulSet /// and allows an action like scaling to be performed. /// diff --git a/gpu-pruner/src/main.rs b/gpu-pruner/src/main.rs index 37fe9eb..1fa8ad3 100644 --- a/gpu-pruner/src/main.rs +++ b/gpu-pruner/src/main.rs @@ -1,7 +1,5 @@ use minijinja::{Environment, context}; -mod dashboard; - #[cfg(feature = "otel")] use std::sync::LazyLock; #[cfg(feature = "otel")] @@ -15,7 +13,7 @@ use { tracing_opentelemetry::{MetricsLayer, OpenTelemetryLayer}, }; -use std::{collections::HashSet, fmt::Debug, sync::atomic::AtomicUsize}; +use std::{collections::HashSet, fmt::Debug, sync::{atomic::AtomicUsize, Arc}}; use tokio::{sync::mpsc::Sender, time}; use tracing_subscriber::EnvFilter; @@ -36,8 +34,9 @@ use kube::{Api, Client as KubeClient, Resource}; use clap::{Parser, ValueEnum}; use gpu_pruner::{ - Meta, PodMetricData, QueryResponse, ScaleKind, Scaler, TlsMode, find_root_object, - get_enabled_resources, get_prom_client, get_prometheus_token, + Meta, PodMetricData, QueryResponse, ScaleKind, Scaler, TlsMode, check_acknowledgment, + find_root_object, get_enabled_resources, get_prom_client, get_prometheus_token, + slack::SlackNotifier, }; /// `gpu-pruner` is a tool to prune idle pods based on GPU utilization. It uses Prometheus to query @@ -119,9 +118,15 @@ struct Cli { #[clap(short, long, default_value = "default")] log_format: LogFormat, - /// Enable the web dashboard on the specified port + /// Slack webhook URL for notifications. Can also be set via SLACK_WEBHOOK_URL env var. + /// Messages will be sent to the configured channel when idle GPUs are detected. 
#[clap(long)] - dashboard_port: Option, + slack_webhook_url: Option, + + /// Slack channel to send notifications to + #[clap(long, default_value = "#test-pruner")] + slack_channel: String, + } #[derive(Debug, Clone, ValueEnum, Default, Serialize)] @@ -287,20 +292,28 @@ async fn main() -> anyhow::Result<()> { let enabled_resources = get_enabled_resources(&args.enabled_resources); tracing::info!("Enabled resources: {enabled_resources:?}"); + // Initialize Slack notifier if webhook URL is provided + let slack_notifier = args + .slack_webhook_url + .clone() + .or_else(|| std::env::var("SLACK_WEBHOOK_URL").ok()) + .map(|url| { + tracing::info!("Slack notifications enabled for channel: {}", args.slack_channel); + Arc::new(SlackNotifier::new(url, args.slack_channel.clone())) + }); + + if slack_notifier.is_none() { + tracing::info!("Slack notifications disabled (no webhook URL configured)"); + } + let env: Environment = Environment::new(); let query = env.render_str(include_str!("query.promql.j2"), context! 
{ args })?; tracing::info!("Running w/ Query: {query}"); let (tx, mut rx) = tokio::sync::mpsc::channel::(100); - // Initialize dashboard state if dashboard is enabled - let dashboard_state = std::sync::Arc::new(tokio::sync::RwLock::new( - dashboard::DashboardState::default(), - )); - let query_task = { let args = args.clone(); - let dashboard_state = dashboard_state.clone(); tokio::spawn(async move { let mut interval = time::interval(tokio::time::Duration::from_secs(args.check_interval)); @@ -310,7 +323,7 @@ async fn main() -> anyhow::Result<()> { } let client = build_prom_client(&args).await; - match run_query_and_scale(client, query.clone(), &args, tx.clone(), dashboard_state.clone()).await { + match run_query_and_scale(client, query.clone(), &args, tx.clone()).await { Ok(qr) => { QUERY_FAILURES.store(0, std::sync::atomic::Ordering::Relaxed); gpu_pruner::metrics::QUERY_SUCCESSES.inc(); @@ -349,29 +362,33 @@ async fn main() -> anyhow::Result<()> { }) }; - let scale_down_task = tokio::spawn(async move { - let kube_client = KubeClient::try_default() - .await - .expect("failed to get kube client"); + let scale_down_task = { + let slack_notifier = slack_notifier.clone(); + let duration = args.duration; + tokio::spawn(async move { + let kube_client = KubeClient::try_default() + .await + .expect("failed to get kube client"); - while let Some(sk) = rx.recv().await { - // Check if the resource is enabled - if !enabled_resources.contains(sk.clone().into()) { - tracing::info!( - "Skipping resource type {kind:?} because it is not enabled", - kind = sk.kind() - ); - continue; - } + while let Some(sk) = rx.recv().await { + // Check if the resource is enabled + if !enabled_resources.contains(sk.clone().into()) { + tracing::info!( + "Skipping resource type {kind:?} because it is not enabled", + kind = sk.kind() + ); + continue; + } - if let Err(e) = sk.scale(kube_client.clone()).await { - gpu_pruner::metrics::SCALE_FAILURES.inc(); - tracing::error!( - 
monotonic_counter.scale_failures = 1, - "Failed to scale resource! {e}" - ); - continue; - } + let notifier = slack_notifier.as_ref().map(|n| (**n).clone()); + if let Err(e) = sk.scale(kube_client.clone(), notifier, duration).await { + gpu_pruner::metrics::SCALE_FAILURES.inc(); + tracing::error!( + monotonic_counter.scale_failures = 1, + "Failed to scale resource! {e}" + ); + continue; + } let kind = sk.kind(); let name = sk.name(); @@ -385,32 +402,15 @@ async fn main() -> anyhow::Result<()> { name = name, namespace = namespace ) - } - }); - - // Start dashboard server if requested - let dashboard_task = if let Some(port) = args.dashboard_port { - let dashboard_state = dashboard_state.clone(); - Some(tokio::spawn(async move { - dashboard::run_server(dashboard_state, port).await - })) - } else { - None + } + }) }; // Wait for all tasks - if let Some(dashboard_task) = dashboard_task { - _ = tokio::try_join! { - query_task, - scale_down_task, - dashboard_task - }?; - } else { - _ = tokio::try_join! { - query_task, - scale_down_task - }?; - } + _ = tokio::try_join! 
{ + query_task, + scale_down_task + }?; Ok(()) } @@ -434,7 +434,6 @@ async fn run_query_and_scale( query: String, args: &Cli, tx: Sender, - dashboard_state: dashboard::SharedDashboardState, ) -> anyhow::Result { let response = match client.query(query).get().await { Ok(response) => response, @@ -577,16 +576,44 @@ async fn run_query_and_scale( let num_shutdown_events = shutdown_events.len(); - // Update dashboard state - dashboard::update_dashboard_state( - dashboard_state, - shutdown_events.iter().cloned().collect(), - num_pods, - ) - .await; + // Check acknowledgment status for all idle workloads + let workloads_with_ack: Vec<(ScaleKind, Option)> = + futures::stream::iter(shutdown_events.iter().cloned()) + .then(|obj| async { + let ack_status = check_acknowledgment(kube_client.clone(), &obj).await.ok(); + (obj, ack_status) + }) + .collect() + .await; + + // Count acknowledged workloads and update metrics + let acknowledged_count = workloads_with_ack + .iter() + .filter(|(_, ack)| ack.as_ref().map(|a| a.acknowledged).unwrap_or(false)) + .count(); + + gpu_pruner::metrics::ACKNOWLEDGED_WORKLOADS.set(acknowledged_count as i64); + + // Filter out acknowledged workloads before scaling + futures::stream::iter(workloads_with_ack) + .filter_map(|(obj, ack_status)| async move { + // Skip acknowledged workloads + if let Some(ack) = &ack_status { + if ack.acknowledged { + tracing::info!( + "Skipping [{}] {}:{} - acknowledged until {} by {}", + obj.kind(), + obj.namespace().unwrap_or_default(), + obj.name(), + ack.expires_at.as_ref().unwrap_or(&"unknown".to_string()), + ack.by_user.as_ref().unwrap_or(&"unknown".to_string()) + ); + gpu_pruner::metrics::SCALEDOWNS_PREVENTED_TOTAL.inc(); + return None; + } + } - futures::stream::iter(shutdown_events) - .filter_map(|obj| async { + // Apply dry-run filter if let Mode::DryRun = args.run_mode { tracing::info!( "Dry-run: Would have sent [{}] {}:{} for scaledown", diff --git a/gpu-pruner/src/metrics.rs b/gpu-pruner/src/metrics.rs index 
3f8b59b..2fdf967 100644 --- a/gpu-pruner/src/metrics.rs +++ b/gpu-pruner/src/metrics.rs @@ -54,6 +54,38 @@ lazy_static! { "Total number of pods analyzed in last query" ) .expect("metric can be created"); + + // Acknowledgment metrics + pub static ref ACKNOWLEDGED_WORKLOADS: IntGauge = IntGauge::new( + "gpu_pruner_acknowledged_workloads", + "Current number of workloads with active acknowledgments" + ) + .expect("metric can be created"); + + pub static ref ACKNOWLEDGMENTS_TOTAL: IntCounter = IntCounter::new( + "gpu_pruner_acknowledgments_total", + "Total number of acknowledgments created" + ) + .expect("metric can be created"); + + pub static ref SCALEDOWNS_PREVENTED_TOTAL: IntCounter = IntCounter::new( + "gpu_pruner_scaledowns_prevented_total", + "Total number of scale-downs prevented by acknowledgments" + ) + .expect("metric can be created"); + + // Slack notification metrics + pub static ref SLACK_NOTIFICATIONS_SENT: IntCounter = IntCounter::new( + "gpu_pruner_slack_notifications_sent_total", + "Total number of Slack notifications successfully sent" + ) + .expect("metric can be created"); + + pub static ref SLACK_NOTIFICATION_FAILURES: IntCounter = IntCounter::new( + "gpu_pruner_slack_notification_failures_total", + "Total number of failed Slack notification attempts" + ) + .expect("metric can be created"); } pub fn init() { @@ -81,6 +113,21 @@ pub fn init() { REGISTRY .register(Box::new(PODS_CHECKED.clone())) .expect("pods_checked can be registered"); + REGISTRY + .register(Box::new(ACKNOWLEDGED_WORKLOADS.clone())) + .expect("acknowledged_workloads can be registered"); + REGISTRY + .register(Box::new(ACKNOWLEDGMENTS_TOTAL.clone())) + .expect("acknowledgments_total can be registered"); + REGISTRY + .register(Box::new(SCALEDOWNS_PREVENTED_TOTAL.clone())) + .expect("scaledowns_prevented_total can be registered"); + REGISTRY + .register(Box::new(SLACK_NOTIFICATIONS_SENT.clone())) + .expect("slack_notifications_sent can be registered"); + REGISTRY + 
.register(Box::new(SLACK_NOTIFICATION_FAILURES.clone())) + .expect("slack_notification_failures can be registered"); } pub fn render() -> String { diff --git a/gpu-pruner/src/slack.rs b/gpu-pruner/src/slack.rs new file mode 100644 index 0000000..4b29e8c --- /dev/null +++ b/gpu-pruner/src/slack.rs @@ -0,0 +1,121 @@ +use anyhow::Result; +use reqwest::Client; +use serde_json::json; + +use crate::Meta; + +#[derive(Clone, Debug)] +pub struct SlackNotifier { + webhook_url: String, + client: Client, + channel: String, +} + +impl SlackNotifier { + pub fn new(webhook_url: String, channel: String) -> Self { + let client = Client::builder() + .timeout(std::time::Duration::from_secs(10)) + .build() + .expect("failed to build slack http client"); + + Self { + webhook_url, + client, + channel, + } + } + + #[tracing::instrument(skip(self, workload))] + pub async fn send_notification( + &self, + workload: &T, + idle_duration_minutes: i64, + ) -> Result<()> { + let resource_type = workload.kind(); + let resource_name = workload.name(); + let namespace = workload.namespace().unwrap_or_else(|| "default".to_string()); + + let payload = json!({ + "channel": self.channel, + "attachments": [{ + "color": "warning", + "title": "🔔 Idle GPU Detected - Scale Down Pending", + "fields": [ + { + "title": "Resource", + "value": format!("{}: {}", resource_type, resource_name), + "short": true + }, + { + "title": "Namespace", + "value": namespace, + "short": true + }, + { + "title": "Reason", + "value": format!("GPU idle for {} minutes", idle_duration_minutes), + "short": false + }, + { + "title": "Action", + "value": "Scaling to 0 replicas", + "short": false + } + ], + "footer": "gpu-pruner", + "ts": std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + }] + }); + + tracing::debug!("Sending Slack notification payload: {:?}", payload); + + let response = self + .client + .post(&self.webhook_url) + .json(&payload) + .send() + .await?; + + if 
!response.status().is_success() { + let status = response.status(); + let body = response.text().await.unwrap_or_default(); + tracing::error!( + "Slack webhook returned error status {}: {}", + status, + body + ); + return Err(anyhow::anyhow!( + "Slack webhook failed with status {}: {}", + status, + body + )); + } + + tracing::info!( + "Sent Slack notification for [{resource_type}] {namespace}:{resource_name}", + ); + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_slack_notifier_creation() { + let notifier = SlackNotifier::new( + "https://hooks.slack.com/services/TEST".to_string(), + "#test-pruner".to_string(), + ); + assert_eq!(notifier.webhook_url, "https://hooks.slack.com/services/TEST"); + assert_eq!(notifier.channel, "#test-pruner"); + } + + // Note: Actual send_notification tests would require mocking the HTTP client + // or using integration tests with a test webhook endpoint +} diff --git a/gpu-pruner/tests/e2e.rs b/gpu-pruner/tests/e2e.rs index 3b2301d..ed28b34 100644 --- a/gpu-pruner/tests/e2e.rs +++ b/gpu-pruner/tests/e2e.rs @@ -268,7 +268,7 @@ async fn scale_deployment_to_zero() { let dep = dep_api.get("e2e-scale").await.unwrap(); let sk = ScaleKind::Deployment(dep); - sk.scale(client.clone()).await.unwrap(); + sk.scale(client.clone(), None, 30).await.unwrap(); // verify it scaled to zero let dep = dep_api.get("e2e-scale").await.unwrap(); @@ -323,7 +323,7 @@ async fn scale_statefulset_to_zero() { let ss = ss_api.get("e2e-scale-ss").await.unwrap(); let sk = ScaleKind::StatefulSet(ss); - sk.scale(client.clone()).await.unwrap(); + sk.scale(client.clone(), None, 30).await.unwrap(); let ss = ss_api.get("e2e-scale-ss").await.unwrap(); let replicas = ss.spec.unwrap().replicas.unwrap_or(1); From de61d6d1e294827f4f3ae559eeacda4c5fc06d2f Mon Sep 17 00:00:00 2001 From: fuddin-bit Date: Wed, 10 Jun 2026 14:19:56 -0400 Subject: [PATCH 6/7] 0.1 threshold --- gpu-pruner/src/main.rs | 21 +++++++++++++++++++++ 
gpu-pruner/src/query.promql.j2 | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/gpu-pruner/src/main.rs b/gpu-pruner/src/main.rs index 1fa8ad3..6e3bf14 100644 --- a/gpu-pruner/src/main.rs +++ b/gpu-pruner/src/main.rs @@ -80,6 +80,11 @@ struct Cli { #[clap(short, long)] model_name: Option, + /// Maximum combined GPU utilization (0.0–1.0) to still consider a GPU idle. + /// Defaults to 0.01 to tolerate DCGM background noise on DCGM_FI_PROF_GR_ENGINE_ACTIVE. + #[clap(long, default_value_t = 0.01)] + idle_threshold: f64, + /// Power draw threshold in watts. When set, GPUs showing peak power usage above this value /// over the lookback window are excluded from idle candidates even if compute utilization is zero. /// Useful as a corroborating signal (e.g. 100 for A10G, 150 for A100/H100). @@ -688,6 +693,22 @@ mod tests { ); } + #[test] + fn query_uses_idle_threshold_not_strict_zero() { + let query = render(json!({ "duration": 30 })); + assert!( + query.contains("< 0.01"), + "default idle threshold should be 0.01, not == 0" + ); + assert!(!query.contains("== 0"), "should not use strict == 0"); + } + + #[test] + fn query_idle_threshold_is_configurable() { + let query = render(json!({ "duration": 30, "idle_threshold": 0.05 })); + assert!(query.contains("< 0.05"), "should use configured idle threshold"); + } + #[test] fn query_without_power_threshold_has_no_unless() { let query = render(json!({ "duration": 30 })); diff --git a/gpu-pruner/src/query.promql.j2 b/gpu-pruner/src/query.promql.j2 index 39db5ed..e819d47 100644 --- a/gpu-pruner/src/query.promql.j2 +++ b/gpu-pruner/src/query.promql.j2 @@ -32,7 +32,7 @@ sum by (Hostname, {{ cl }}, {{ pl }}, {{ nl }}, gpu, modelName) ( or on (Hostname, {{ cl }}, {{ pl }}, {{ nl }}, gpu, modelName) {{ idle_gpus }} ) -== 0 +< {{ args.idle_threshold | default(0.01) }} {%- if args.power_threshold %} unless on ({{ pl }}, {{ nl }}) ( From 3629dfd739fec7405472b16ed3dde8434a6c1f32 Mon Sep 17 00:00:00 2001 From: fuddin-bit 
Date: Wed, 10 Jun 2026 16:03:14 -0400 Subject: [PATCH 7/7] add slack acknowledgement and in-cluster --- Cargo.lock | 1 + GPU_UTILIZATION_QUERIES.md | 402 ++++++++++++++++++ gpu-pruner/Cargo.toml | 1 + gpu-pruner/hack/deployment.yaml | 6 +- gpu-pruner/hack/kustomization.yaml | 1 + gpu-pruner/hack/service.yaml | 6 +- gpu-pruner/hack/servicemonitor.yaml | 2 +- gpu-pruner/hack/slack-interactions-route.yaml | 21 + .../hack/slack-interactions-service-lb.yaml | 26 ++ .../hack/slack-webhook-secret.example.yaml | 7 + gpu-pruner/src/lib.rs | 69 +++ gpu-pruner/src/main.rs | 258 ++++++++++- gpu-pruner/src/slack.rs | 31 +- 13 files changed, 822 insertions(+), 9 deletions(-) create mode 100644 GPU_UTILIZATION_QUERIES.md create mode 100644 gpu-pruner/hack/slack-interactions-route.yaml create mode 100644 gpu-pruner/hack/slack-interactions-service-lb.yaml diff --git a/Cargo.lock b/Cargo.lock index d1e217f..bbac212 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -836,6 +836,7 @@ dependencies = [ "secrecy", "serde", "serde_json", + "serde_urlencoded", "thiserror 2.0.18", "tokio", "tower", diff --git a/GPU_UTILIZATION_QUERIES.md b/GPU_UTILIZATION_QUERIES.md new file mode 100644 index 0000000..c9224db --- /dev/null +++ b/GPU_UTILIZATION_QUERIES.md @@ -0,0 +1,402 @@ +# GPU Utilization Queries + +This document explains every PromQL query `gpu-pruner` uses to detect idle GPUs. The queries are rendered at runtime from `gpu-pruner/src/query.promql.j2` based on CLI flags. + +## Prerequisites + +- **DCGM exporter** running on GPU nodes +- **Prometheus** scraping DCGM metrics +- Port-forward for local testing: + +```bash +kubectl port-forward -n llm-d-monitoring svc/llmd-kube-prometheus-stack-prometheus 9090:9090 +``` + +Run queries at http://localhost:9090/graph or via curl: + +```bash +curl -sG 'http://localhost:9090/api/v1/query' \ + --data-urlencode 'query=' | jq . 
+``` + +## CLI flags that shape the queries + +| Flag | Default | Effect on query | +|------|---------|-----------------| +| `--duration` / `-t` | `30` | Lookback window `[Nm]` in minutes | +| `--honor-labels` | `false` | Use `pod`/`namespace` instead of `exported_pod`/`exported_namespace` | +| `--namespace` / `-n` | none | Regex filter on namespace label | +| `--model-name` / `-m` | none | Regex filter on `modelName` | +| `--idle-threshold` | `0.01` | Max utilization (0.0–1.0) to still count as idle; tolerates DCGM noise | +| `--power-threshold` | none | Exclude idle candidates with high power draw | + +--- + +## 1. Graphics engine active (primary metric) + +Measures the fraction of time the GPU graphics/compute engine was active over the lookback window. Range: **0.0–1.0**. + +### With `--honor-labels` (native DCGM labels) + +```promql +max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{ + pod != "" +}[30m]) +``` + +### Default (`exported_*` labels) + +```promql +max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{ + exported_pod != "" +}[30m]) +``` + +### With namespace filter (`--namespace=llm-d-optimized-baseline`) + +```promql +max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{ + pod != "", + namespace =~ "llm-d-optimized-baseline" +}[30m]) +``` + +### With model filter (`--model-name="NVIDIA H200"`) + +```promql +max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{ + pod != "", + modelName =~ "NVIDIA H200" +}[30m]) +``` + +**Notes:** +- `max_over_time` uses the **peak** value in the window, not the average. +- A value of `0` means the engine was never active during the window. +- Tiny non-zero values (e.g. `0.00007`) are DCGM background noise; gpu-pruner uses `< 0.01` by default (configurable via `--idle-threshold`) instead of strict `== 0`. + +--- + +## 2. GPU utilization % (fallback metric) + +Classic DCGM GPU utilization percentage. Divided by 100 so it matches the 0.0–1.0 scale of engine active. 
+ +### With `--honor-labels` + +```promql +max_over_time(DCGM_FI_DEV_GPU_UTIL{ + pod != "" +}[30m]) / 100 +``` + +### Default (`exported_*` labels) + +```promql +max_over_time(DCGM_FI_DEV_GPU_UTIL{ + exported_pod != "" +}[30m]) / 100 +``` + +**Notes:** +- Used as a fallback when `DCGM_FI_PROF_GR_ENGINE_ACTIVE` is missing for a series. +- When **both** metrics exist, PromQL `or` keeps the **left-hand** (engine active) value. + +--- + +## 3. Combined utilization per GPU + +Aggregates both metrics per GPU, grouped by pod, namespace, and hardware labels. + +### With `--honor-labels` + +```promql +sum by (Hostname, container, pod, namespace, gpu, modelName) ( + max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{ + pod != "" + }[30m]) + or + max_over_time(DCGM_FI_DEV_GPU_UTIL{ + pod != "" + }[30m]) / 100 +) +``` + +### Default (`exported_*` labels) + +```promql +sum by (Hostname, exported_container, exported_pod, exported_namespace, gpu, modelName) ( + max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{ + exported_pod != "" + }[30m]) + or + max_over_time(DCGM_FI_DEV_GPU_UTIL{ + exported_pod != "" + }[30m]) / 100 +) +``` + +**Notes:** +- This is the core "is this GPU busy?" calculation. +- `sum by` collapses duplicate label sets (one series per GPU). +- Result below `--idle-threshold` (default `0.01`) = idle; at or above it = some activity detected. + +--- + +## 4. Node type enrichment (optional join) + +Joins GPU metrics with `node_dmi_info` to attach hardware `node_type`. Falls back to un-enriched results when node info is missing.
+ +```promql +sum by (Hostname, container, pod, namespace, gpu, modelName) ( + max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != ""}[30m]) + or + max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != ""}[30m]) / 100 +) +* on (Hostname) group_left(node_type) ( + label_replace( + label_replace(node_dmi_info, + "Hostname", "$1", "instance", "(.+)" + ), + "node_type", "$1", "product_name", "(.+)" + ) +) +or on (Hostname, container, pod, namespace, gpu, modelName) +sum by (Hostname, container, pod, namespace, gpu, modelName) ( + max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != ""}[30m]) + or + max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != ""}[30m]) / 100 +) +``` + +**Notes:** +- The `or` at the end ensures GPUs still appear even when `node_dmi_info` has no match. +- `node_type` is informational; it does not affect idle detection. + +--- + +## 5. Full gpu-pruner idle detection query + +This is the complete query rendered and sent to Prometheus. Returns GPUs considered **idle** (combined utilization below `--idle-threshold`, default `0.01`). 
+ +### With `--honor-labels`, `--duration=30` + +```promql +( + sum by (Hostname, container, pod, namespace, gpu, modelName) ( + max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{ + pod != "" + }[30m]) + or + max_over_time(DCGM_FI_DEV_GPU_UTIL{ + pod != "" + }[30m]) / 100 + ) + * on (Hostname) group_left(node_type) ( + label_replace( + label_replace(node_dmi_info, + "Hostname", "$1", "instance", "(.+)" + ), + "node_type", "$1", "product_name", "(.+)" + ) + ) + or on (Hostname, container, pod, namespace, gpu, modelName) + sum by (Hostname, container, pod, namespace, gpu, modelName) ( + max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{ + pod != "" + }[30m]) + or + max_over_time(DCGM_FI_DEV_GPU_UTIL{ + pod != "" + }[30m]) / 100 + ) +) < 0.01 +``` + +### Default (`exported_*` labels), `--duration=30` + +```promql +( + sum by (Hostname, exported_container, exported_pod, exported_namespace, gpu, modelName) ( + max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{ + exported_pod != "" + }[30m]) + or + max_over_time(DCGM_FI_DEV_GPU_UTIL{ + exported_pod != "" + }[30m]) / 100 + ) + * on (Hostname) group_left(node_type) ( + label_replace( + label_replace(node_dmi_info, + "Hostname", "$1", "instance", "(.+)" + ), + "node_type", "$1", "product_name", "(.+)" + ) + ) + or on (Hostname, exported_container, exported_pod, exported_namespace, gpu, modelName) + sum by (Hostname, exported_container, exported_pod, exported_namespace, gpu, modelName) ( + max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{ + exported_pod != "" + }[30m]) + or + max_over_time(DCGM_FI_DEV_GPU_UTIL{ + exported_pod != "" + }[30m]) / 100 + ) +) < 0.01 +``` + +**Notes:** +- Any series returned = gpu-pruner treats that GPU as idle. +- Override with `--idle-threshold=0.05` for a looser definition of idle. +- After the query, gpu-pruner resolves each pod to a scalable parent (Deployment, StatefulSet, etc.) in Kubernetes. +- Infrastructure pods (e.g. 
`dcgm-exporter` DaemonSets) may match this query but are skipped because they have no scalable root object. + +--- + +## 6. Power draw exclusion (optional, `--power-threshold`) + +When set, appends an `unless` clause to exclude GPUs that drew at or above the threshold (watts) during the lookback window, even if utilization is zero. + +### Example: `--power-threshold=150` with `--honor-labels` + +Full query becomes the idle query above, plus: + +```promql +unless on (pod, namespace) +( + max_over_time(DCGM_FI_DEV_POWER_USAGE{ + pod != "" + }[30m]) >= 150 +) +``` + +### Example: `--power-threshold=150` with default labels + +```promql +unless on (exported_pod, exported_namespace) +( + max_over_time(DCGM_FI_DEV_POWER_USAGE{ + exported_pod != "" + }[30m]) >= 150 +) +``` + +**Notes:** +- Useful to catch "compute idle but still drawing power" cases. +- Suggested starting points: `100` (A10G), `150` (A100/H100). + +--- + +## 7. Manual testing queries + +Simplified queries for debugging in Prometheus or curl.
+ +### Count idle GPUs (gpu-pruner default, `< 0.01`) + +```promql +( + sum by (pod, namespace, gpu) ( + max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != ""}[5m]) + or + max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != ""}[5m]) / 100 + ) +) < 0.01 +``` + +```bash +curl -sG 'http://localhost:9090/api/v1/query' \ + --data-urlencode 'query=(sum by (pod, namespace, gpu) (max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != ""}[5m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != ""}[5m]) / 100)) < 0.01' \ + | jq '.data.result | length' +``` + +### Inspect raw engine active for a workload + +```promql +max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{ + pod =~ "optimized-baseline.*" +}[5m]) +``` + +### Inspect raw GPU util % for a workload + +```promql +max_over_time(DCGM_FI_DEV_GPU_UTIL{ + pod =~ "optimized-baseline.*" +}[5m]) +``` + +### See the combined value gpu-pruner uses + +```promql +sum by (pod, namespace, gpu) ( + max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{ + pod =~ "optimized-baseline.*" + }[5m]) + or + max_over_time(DCGM_FI_DEV_GPU_UTIL{ + pod =~ "optimized-baseline.*" + }[5m]) / 100 +) +``` + +### Verify DCGM metrics exist + +```promql +DCGM_FI_PROF_GR_ENGINE_ACTIVE +``` + +```promql +DCGM_FI_DEV_GPU_UTIL +``` + +### Check which label convention your cluster uses + +```promql +count(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != ""}) +``` + +```promql +count(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != ""}) +``` + +If the second count is non-zero and the first is zero, use `--honor-labels` with gpu-pruner. + +--- + +## 8. Example: matching gpu-pruner CLI to query + +This command: + +```bash +cargo run --bin gpu-pruner -- \ + --prometheus-url=http://localhost:9090 \ + --run-mode=dry-run \ + --duration=5 \ + --honor-labels \ + --namespace=llm-d-optimized-baseline +``` + +Renders a query equivalent to section 5 with `[5m]`, `pod`/`namespace` labels, and `namespace =~ "llm-d-optimized-baseline"` on both DCGM metric selectors. 
+ +--- + +## Common pitfalls + +| Symptom | Likely cause | +|---------|----------------| +| Query returns 0 series | Wrong label convention; try `--honor-labels` or `exported_*` labels | +| Pods running but not idle | Utilization above `--idle-threshold` (default `0.01`) | +| Idle GPUs found but no scale-down | Pod owner is a DaemonSet or unsupported resource type | +| vLLM pods show `0%` util but not idle | `DCGM_FI_PROF_GR_ENGINE_ACTIVE` ≈ `0.00007` wins over `DCGM_FI_DEV_GPU_UTIL` in `or` | +| New pods never pruned | `--grace-period` (default 300s) adds extra age check in application logic after the query | + +--- + +## Source + +Queries are defined in: + +- Template: `gpu-pruner/src/query.promql.j2` +- Rendered in: `gpu-pruner/src/main.rs` (`Running w/ Query:` log line) +- Tests: `gpu-pruner/src/main.rs` (`query_*` unit tests) diff --git a/gpu-pruner/Cargo.toml b/gpu-pruner/Cargo.toml index 35b4b8d..e22d0f6 100644 --- a/gpu-pruner/Cargo.toml +++ b/gpu-pruner/Cargo.toml @@ -60,6 +60,7 @@ bitflags = "2" axum = "0.8" tower = { version = "0.5", features = ["full"] } tower-http = { version = "0.6", features = ["fs", "trace", "cors"] } +serde_urlencoded = "0.7" # Prometheus metrics prometheus = "0.13" diff --git a/gpu-pruner/hack/deployment.yaml b/gpu-pruner/hack/deployment.yaml index 797685e..bb0600a 100644 --- a/gpu-pruner/hack/deployment.yaml +++ b/gpu-pruner/hack/deployment.yaml @@ -16,13 +16,14 @@ spec: serviceAccountName: gpu-pruner containers: - name: container - image: 'ghcr.io/wseaton/gpu-pruner:latest-otel' + image: 'ghcr.io/fuddin-bit/gpu-pruner:latest-otel' args: - 'gpu-pruner' - '-d' - '--run-mode=scale-down' - '--prometheus-url=http://thanos-querier.openshift-monitoring.svc.cluster.local' - '--dashboard-port=8080' + - '--slack-interaction-port=3002' - '--slack-channel=#test-pruner' env: - name: RUST_BACKTRACE @@ -39,6 +40,9 @@ spec: - containerPort: 8080 name: dashboard protocol: TCP + - containerPort: 3002 + name: slack-interactions + protocol: TCP 
resources: limits: cpu: 500m diff --git a/gpu-pruner/hack/kustomization.yaml b/gpu-pruner/hack/kustomization.yaml index c84bf79..abe4e2d 100644 --- a/gpu-pruner/hack/kustomization.yaml +++ b/gpu-pruner/hack/kustomization.yaml @@ -5,4 +5,5 @@ resources: - clusterrole.yaml - service.yaml - route.yaml +- slack-interactions-route.yaml - servicemonitor.yaml \ No newline at end of file diff --git a/gpu-pruner/hack/service.yaml b/gpu-pruner/hack/service.yaml index 7b2ba7d..202c6eb 100644 --- a/gpu-pruner/hack/service.yaml +++ b/gpu-pruner/hack/service.yaml @@ -10,7 +10,11 @@ spec: selector: app: gpu-pruner ports: - - name: http + - name: dashboard protocol: TCP port: 8080 targetPort: 8080 + - name: slack-interactions + protocol: TCP + port: 3002 + targetPort: 3002 diff --git a/gpu-pruner/hack/servicemonitor.yaml b/gpu-pruner/hack/servicemonitor.yaml index 84c2cd5..82b935f 100644 --- a/gpu-pruner/hack/servicemonitor.yaml +++ b/gpu-pruner/hack/servicemonitor.yaml @@ -10,7 +10,7 @@ spec: matchLabels: app: gpu-pruner endpoints: - - port: http + - port: dashboard path: /metrics interval: 30s scrapeTimeout: 10s diff --git a/gpu-pruner/hack/slack-interactions-route.yaml b/gpu-pruner/hack/slack-interactions-route.yaml new file mode 100644 index 0000000..1471acf --- /dev/null +++ b/gpu-pruner/hack/slack-interactions-route.yaml @@ -0,0 +1,21 @@ +# OpenShift Route exposing Slack interactive component callbacks over HTTPS. 
+# After apply, set Slack App → Interactive Components → Request URL to: +# https://$(kubectl get route gpu-pruner-slack -n gpu-pruner-system -o jsonpath='{.spec.host}')/slack/interactions +kind: Route +apiVersion: route.openshift.io/v1 +metadata: + name: gpu-pruner-slack + namespace: gpu-pruner-system + labels: + app: gpu-pruner +spec: + to: + kind: Service + name: gpu-pruner-dashboard + weight: 100 + port: + targetPort: slack-interactions + tls: + termination: edge + insecureEdgeTerminationPolicy: Redirect + wildcardPolicy: None diff --git a/gpu-pruner/hack/slack-interactions-service-lb.yaml b/gpu-pruner/hack/slack-interactions-service-lb.yaml new file mode 100644 index 0000000..b5515b5 --- /dev/null +++ b/gpu-pruner/hack/slack-interactions-service-lb.yaml @@ -0,0 +1,26 @@ +# Optional: CoreWeave CKS LoadBalancer for Slack interactions (no OpenShift Route). +# Apply standalone when not using slack-interactions-route.yaml: +# kubectl apply -f gpu-pruner/hack/slack-interactions-service-lb.yaml +# +# Requires TLS termination in front of this service (Slack requires HTTPS). +# Set Slack App → Interactive Components → Request URL to: +# https://<EXTERNAL-HOSTNAME>/slack/interactions +apiVersion: v1 +kind: Service +metadata: + name: gpu-pruner-slack + namespace: gpu-pruner-system + labels: + app: gpu-pruner + annotations: + service.beta.kubernetes.io/coreweave-load-balancer-type: "public" + service.beta.kubernetes.io/external-hostname: "gpu-pruner-slack" +spec: + type: LoadBalancer + selector: + app: gpu-pruner + ports: + - name: slack-interactions + protocol: TCP + port: 3002 + targetPort: 3002 diff --git a/gpu-pruner/hack/slack-webhook-secret.example.yaml b/gpu-pruner/hack/slack-webhook-secret.example.yaml index 6ab66f3..bc8899f 100644 --- a/gpu-pruner/hack/slack-webhook-secret.example.yaml +++ b/gpu-pruner/hack/slack-webhook-secret.example.yaml @@ -4,6 +4,13 @@ # --namespace=gpu-pruner-system \ # --from-literal=webhook-url='https://hooks.slack.com/services/T.../B.../...' 
# +# After deploying with kubectl apply -k gpu-pruner/hack/, configure Slack: +# 1. api.slack.com/apps → Your App → Interactive Components → Enable +# 2. Request URL (OpenShift): +# https://$(kubectl get route gpu-pruner-slack -n gpu-pruner-system -o jsonpath='{.spec.host}')/slack/interactions +# 3. Click a button in #test-pruner and verify annotations: +# kubectl get deployment <DEPLOYMENT> -n <NAMESPACE> -o yaml | grep gpu-pruner.io +# apiVersion: v1 kind: Secret metadata: diff --git a/gpu-pruner/src/lib.rs b/gpu-pruner/src/lib.rs index c3c20e7..b133b89 100644 --- a/gpu-pruner/src/lib.rs +++ b/gpu-pruner/src/lib.rs @@ -460,6 +460,75 @@ impl Scaler for ScaleKind { } } +/// Apply acknowledgment annotation to a workload +#[tracing::instrument(skip(client))] +pub async fn acknowledge_workload( + client: KubeClient, + kind: &str, + name: &str, + namespace: &str, + duration_hours: u32, + user: &str, +) -> anyhow::Result<()> { + use chrono::Duration; + + // Calculate expiry timestamp + let now = chrono::Utc::now(); + let expires_at = now + Duration::hours(duration_hours as i64); + let expires_at_rfc3339 = expires_at.to_rfc3339(); + + // Build annotation patch + let patch = serde_json::json!({ + "metadata": { + "annotations": { + "gpu-pruner.io/ack-until": expires_at_rfc3339, + "gpu-pruner.io/ack-by": user, + } + } + }); + + // Apply patch based on resource kind + match kind { + "Deployment" => { + let api: Api = Api::namespaced(client, namespace); + api.patch(name, &PatchParams::default(), &Patch::Merge(&patch)) + .await?; + } + "ReplicaSet" => { + let api: Api = Api::namespaced(client, namespace); + api.patch(name, &PatchParams::default(), &Patch::Merge(&patch)) + .await?; + } + "StatefulSet" => { + let api: Api = Api::namespaced(client, namespace); + api.patch(name, &PatchParams::default(), &Patch::Merge(&patch)) + .await?; + } + "Notebook" => { + let api: Api = Api::namespaced(client, namespace); + api.patch(name, &PatchParams::default(), &Patch::Merge(&patch)) + .await?; + } + 
"InferenceService" => { + let api: Api = Api::namespaced(client, namespace); + api.patch(name, &PatchParams::default(), &Patch::Merge(&patch)) + .await?; + } + _ => { + return Err(anyhow::anyhow!("Unsupported resource kind: {}", kind)); + } + } + + // Increment metrics + metrics::ACKNOWLEDGMENTS_TOTAL.inc(); + + tracing::info!( + "Acknowledged [{kind}] {namespace}:{name} by {user} until {expires_at_rfc3339}" + ); + + Ok(()) +} + /// Check if a workload has an active acknowledgment annotation #[tracing::instrument(skip(_client))] pub async fn check_acknowledgment( diff --git a/gpu-pruner/src/main.rs b/gpu-pruner/src/main.rs index 6e3bf14..fc09b76 100644 --- a/gpu-pruner/src/main.rs +++ b/gpu-pruner/src/main.rs @@ -26,6 +26,7 @@ use futures::stream::StreamExt; use prometheus_http_query::Client; use serde::Serialize; +use serde_json::json; use jiff::{SignedDuration, Timestamp}; use k8s_openapi::api::core::v1::Pod; @@ -34,11 +35,20 @@ use kube::{Api, Client as KubeClient, Resource}; use clap::{Parser, ValueEnum}; use gpu_pruner::{ - Meta, PodMetricData, QueryResponse, ScaleKind, Scaler, TlsMode, check_acknowledgment, + Meta, PodMetricData, QueryResponse, ScaleKind, Scaler, TlsMode, acknowledge_workload, check_acknowledgment, find_root_object, get_enabled_resources, get_prom_client, get_prometheus_token, slack::SlackNotifier, }; +use axum::{ + extract::State, + http::StatusCode, + response::{IntoResponse, Response}, + routing::post, + Router, +}; +use std::net::SocketAddr; + /// `gpu-pruner` is a tool to prune idle pods based on GPU utilization. It uses Prometheus to query /// GPU utilization metrics and scales down pods that have been idle for a certain duration. /// @@ -132,6 +142,11 @@ struct Cli { #[clap(long, default_value = "#test-pruner")] slack_channel: String, + /// Port to listen for Slack interactive component callbacks (button clicks). + /// Required if you want users to acknowledge idle GPUs from Slack messages. 
+ #[clap(long)] + slack_interaction_port: Option, + } #[derive(Debug, Clone, ValueEnum, Default, Serialize)] @@ -151,6 +166,202 @@ enum LogFormat { static QUERY_FAILURES: AtomicUsize = AtomicUsize::new(0); +// Slack interaction handler state +#[derive(Clone)] +struct SlackInteractionState { + kube_client: KubeClient, +} + +// Slack interaction payload structures +#[derive(Debug, serde::Deserialize)] +struct SlackVerificationPayload { + #[serde(rename = "type")] + payload_type: String, + challenge: Option, +} + +#[derive(Debug, serde::Deserialize)] +struct SlackInteractionPayload { + user: SlackUser, + actions: Vec, + response_url: String, +} + +#[derive(Debug, serde::Deserialize)] +struct SlackUser { + name: String, +} + +#[derive(Debug, serde::Deserialize)] +struct SlackAction { + value: String, +} + +// Handler for Slack interactive component callbacks +async fn handle_slack_interaction( + State(state): State, + body: String, +) -> Response { + tracing::info!("Received Slack interaction callback"); + + // Parse the form-encoded payload + let payload_str = match serde_urlencoded::from_str::>(&body) { + Ok(params) => { + // Slack sends the payload in a "payload" field + params + .into_iter() + .find(|(k, _)| k == "payload") + .map(|(_, v)| v) + .unwrap_or_default() + } + Err(e) => { + tracing::error!("Failed to parse form data: {}", e); + return (StatusCode::BAD_REQUEST, "Invalid form data").into_response(); + } + }; + + // Parse the JSON payload + if payload_str.is_empty() { + tracing::info!("Received empty Slack interaction payload (URL verification probe)"); + return (StatusCode::OK, "OK").into_response(); + } + + if let Ok(verification) = serde_json::from_str::(&payload_str) { + if verification.payload_type == "url_verification" { + if let Some(challenge) = verification.challenge { + tracing::info!("Responding to Slack URL verification challenge"); + return (StatusCode::OK, challenge).into_response(); + } + } + } + + let payload: SlackInteractionPayload = match 
serde_json::from_str(&payload_str) { + Ok(p) => p, + Err(e) => { + tracing::error!("Failed to parse Slack payload JSON: {}", e); + return (StatusCode::BAD_REQUEST, "Invalid JSON payload").into_response(); + } + }; + + // Extract action value (format: kind:namespace:name:duration) + let action_value = match payload.actions.first() { + Some(action) => &action.value, + None => { + tracing::error!("No action found in payload"); + return (StatusCode::BAD_REQUEST, "No action found").into_response(); + } + }; + + let parts: Vec<&str> = action_value.split(':').collect(); + if parts.len() != 4 { + tracing::error!("Invalid action value format: {}", action_value); + return (StatusCode::BAD_REQUEST, "Invalid action value").into_response(); + } + + let (kind, namespace, name, duration_str) = (parts[0], parts[1], parts[2], parts[3]); + let duration_hours: u32 = match duration_str.parse() { + Ok(d) => d, + Err(e) => { + tracing::error!("Failed to parse duration: {}", e); + return (StatusCode::BAD_REQUEST, "Invalid duration").into_response(); + } + }; + + let user = &payload.user.name; + + // Apply acknowledgment + match acknowledge_workload( + state.kube_client.clone(), + kind, + name, + namespace, + duration_hours, + user, + ) + .await + { + Ok(_) => { + tracing::info!( + "Successfully acknowledged [{kind}] {namespace}:{name} for {duration_hours}h by {user}" + ); + + // Send response back to Slack to update the message + let response_message = json!({ + "replace_original": true, + "attachments": [{ + "color": "good", + "title": "✓ GPU Idle Acknowledgment Confirmed", + "fields": [ + { + "title": "Resource", + "value": format!("{}: {}", kind, name), + "short": true + }, + { + "title": "Namespace", + "value": namespace, + "short": true + }, + { + "title": "Acknowledged By", + "value": user, + "short": true + }, + { + "title": "Duration", + "value": format!("{} hours", duration_hours), + "short": true + }, + { + "title": "Status", + "value": format!("GPU will not be scaled down for 
the next {} hours", duration_hours), + "short": false + } + ], + "footer": "gpu-pruner", + "ts": std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + }] + }); + + // Send the response to Slack via response_url + if let Err(e) = reqwest::Client::new() + .post(&payload.response_url) + .json(&response_message) + .send() + .await + { + tracing::error!("Failed to send response to Slack: {}", e); + } + + (StatusCode::OK, "Acknowledged").into_response() + } + Err(e) => { + tracing::error!("Failed to acknowledge workload: {}", e); + + // Send error response to Slack + let error_message = json!({ + "replace_original": false, + "text": format!("❌ Failed to acknowledge: {}", e), + "response_type": "ephemeral" + }); + + if let Err(e) = reqwest::Client::new() + .post(&payload.response_url) + .json(&error_message) + .send() + .await + { + tracing::error!("Failed to send error response to Slack: {}", e); + } + + (StatusCode::INTERNAL_SERVER_ERROR, "Failed to acknowledge").into_response() + } + } +} + #[cfg(feature = "otel")] static RESOURCE: LazyLock = LazyLock::new(|| { OTELResource::builder() @@ -411,11 +622,48 @@ async fn main() -> anyhow::Result<()> { }) }; + // Spawn Slack interaction HTTP server if port is configured + let slack_interaction_task = if let Some(port) = args.slack_interaction_port { + let kube_client = KubeClient::try_default() + .await + .expect("failed to get kube client for slack interactions"); + + let state = SlackInteractionState { kube_client }; + + let app = Router::new() + .route("/slack/interactions", post(handle_slack_interaction)) + .with_state(state); + + let addr = SocketAddr::from(([0, 0, 0, 0], port)); + tracing::info!("Starting Slack interaction server on {}", addr); + + Some(tokio::spawn(async move { + let listener = tokio::net::TcpListener::bind(addr) + .await + .expect("failed to bind slack interaction server"); + axum::serve(listener, app) + .await + .expect("failed to start slack 
interaction server"); + Ok::<(), anyhow::Error>(()) + })) + } else { + tracing::info!("Slack interaction server disabled (no --slack-interaction-port set)"); + None + }; + // Wait for all tasks - _ = tokio::try_join! { - query_task, - scale_down_task - }?; + if let Some(interaction_task) = slack_interaction_task { + _ = tokio::try_join! { + query_task, + scale_down_task, + interaction_task + }?; + } else { + _ = tokio::try_join! { + query_task, + scale_down_task + }?; + } Ok(()) } diff --git a/gpu-pruner/src/slack.rs b/gpu-pruner/src/slack.rs index 4b29e8c..91a08f1 100644 --- a/gpu-pruner/src/slack.rs +++ b/gpu-pruner/src/slack.rs @@ -35,9 +35,15 @@ impl SlackNotifier { let resource_name = workload.name(); let namespace = workload.namespace().unwrap_or_else(|| "default".to_string()); + // Encode workload info in button values: kind:namespace:name:duration + let button_value_4h = format!("{}:{}:{}:4", resource_type, namespace, resource_name); + let button_value_8h = format!("{}:{}:{}:8", resource_type, namespace, resource_name); + let button_value_24h = format!("{}:{}:{}:24", resource_type, namespace, resource_name); + let payload = json!({ "channel": self.channel, "attachments": [{ + "callback_id": "ack_idle_gpu", "color": "warning", "title": "🔔 Idle GPU Detected - Scale Down Pending", "fields": [ @@ -58,10 +64,33 @@ impl SlackNotifier { }, { "title": "Action", - "value": "Scaling to 0 replicas", + "value": "Scaling to 0 replicas unless acknowledged", "short": false } ], + "actions": [ + { + "name": "ack", + "text": "Keep 4h", + "type": "button", + "value": button_value_4h, + "style": "primary" + }, + { + "name": "ack", + "text": "Keep 8h", + "type": "button", + "value": button_value_8h, + "style": "primary" + }, + { + "name": "ack", + "text": "Keep 24h", + "type": "button", + "value": button_value_24h, + "style": "primary" + } + ], "footer": "gpu-pruner", "ts": std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH)