From 3e2223d0c71faee0b50f85d76ee5cf9969cc79a9 Mon Sep 17 00:00:00 2001
From: fuddin-bit <fuddin@redhat.com>
Date: Thu, 4 Jun 2026 11:53:40 -0400
Subject: [PATCH 1/7] Add GPU Health & DCGM panels to Grafana dashboard.

Extend gpu-dashboard.json with temperature, power, VRAM %, memory-copy,
XID, and optional profiling metrics; sync Helm ConfigMap and document
PromQL in DASHBOARD.md and GRAFANA_DEPLOYMENT.md.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 DASHBOARD.md                          |  21 +
 GRAFANA_DEPLOYMENT.md                 | 854 ++++++++++++++++++++++++++
 gpu-dashboard.json                    | 299 ++++++++-
 helm/grafana-dashboard-configmap.yaml | 773 +++++++++++++++++++++++
 4 files changed, 1946 insertions(+), 1 deletion(-)
 create mode 100644 GRAFANA_DEPLOYMENT.md
 create mode 100644 helm/grafana-dashboard-configmap.yaml

diff --git a/DASHBOARD.md b/DASHBOARD.md
index fbf2def..9965223 100644
--- a/DASHBOARD.md
+++ b/DASHBOARD.md
@@ -96,6 +96,7 @@ A Grafana dashboard is included in `gpu-dashboard.json` for more detailed GPU mo
 - **Idle GPU Workloads**: GPUs with zero compute activity for 30+ minutes
 - **Idle GPU Time by Deployment**: Deployments producing the most allocated GPU idle time (see [Prometheus Queries](#prometheus-queries) below)
 - **GPU Allocation Leaderboard**: Total GPU requests per namespace
+- **GPU Health & DCGM**: Temperature, power, VRAM %, memory-copy util, XID errors, and optional DCGM profiling metrics
 
 ### Importing the Grafana Dashboard
 
@@ -139,6 +140,26 @@ The overview row uses **two independent partitions** of the same total. Each pai
 
 Equivalently: **Engine active** = Total − Engine idle, and **VRAM free** = Total − VRAM allocated, when the same DCGM time series are counted.
 
+### GPU Health & DCGM
+
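+Panels in the **GPU Health & DCGM** row use additional dcgm-exporter metrics. Profiling panels show no data unless your exporter exposes `DCGM_FI_PROF_*` metrics (same requirement as `DCGM_FI_PROF_GR_ENGINE_ACTIVE`). Note that dcgm-exporter publishes `DCGM_FI_DEV_XID_ERRORS` as a gauge holding the most recent XID code per GPU, not a cumulative error count, so treat the XID panels as "an XID was reported" indicators rather than totals.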
+
+| Panel | PromQL |
+|-------|--------|
+| Peak GPU temperature | `max(DCGM_FI_DEV_GPU_TEMP)` |
+| Peak power (W) | `max(DCGM_FI_DEV_POWER_USAGE)` |
+| XID errors (total) | `sum(DCGM_FI_DEV_XID_ERRORS)` |
+| GPU temperature by node | `avg by (Hostname) (DCGM_FI_DEV_GPU_TEMP)` |
+| Power draw by node | `sum by (Hostname) (DCGM_FI_DEV_POWER_USAGE)` |
+| VRAM utilization % | `100 * avg by (Hostname, gpu) (DCGM_FI_DEV_FB_USED / DCGM_FI_DEV_FB_TOTAL)` |
+| Memory copy utilization | `avg by (Hostname) (DCGM_FI_DEV_MEM_COPY_UTIL)` |
+| XID errors (1h increase) | `sum by (Hostname, gpu) (increase(DCGM_FI_DEV_XID_ERRORS[1h]))` |
+| SM active by node | `avg by (Hostname) (DCGM_FI_PROF_SM_ACTIVE)` |
+| Tensor pipe active by node | `avg by (Hostname) (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE)` |
+| DRAM active by node | `avg by (Hostname) (DCGM_FI_PROF_DRAM_ACTIVE)` |
+
+Note: gpu-pruner idle detection uses [`query.promql.j2`](gpu-pruner/src/query.promql.j2) at runtime; Grafana idle panels use related but simpler PromQL for visualization.
+
 ### Idle GPU Time by Deployment Query
 
 This query identifies which Kubernetes Deployments are producing the most allocated GPU idle time while GPU utilization is at 0%.
diff --git a/GRAFANA_DEPLOYMENT.md b/GRAFANA_DEPLOYMENT.md
new file mode 100644
index 0000000..5981ce7
--- /dev/null
+++ b/GRAFANA_DEPLOYMENT.md
@@ -0,0 +1,854 @@
+# Deploying Grafana with GPU Dashboard using Helm
+
+This guide explains how to deploy a standalone Grafana instance with the GPU Pruner dashboard pre-configured using the official Grafana Helm chart.
+
+## Table of Contents
+
+- [Overview](#overview)
+- [Prerequisites](#prerequisites)
+- [Quick Start](#quick-start)
+- [Installation](#installation)
+- [Configuration](#configuration)
+- [Validation](#validation)
+- [Troubleshooting](#troubleshooting)
+- [Customization](#customization)
+- [Security Considerations](#security-considerations)
+
+## Overview
+
+The GPU Pruner project includes a comprehensive Grafana dashboard (`gpu-dashboard.json`) that visualizes:
+
+- **Cluster GPU Overview**: Total GPUs, VRAM allocation, engine activity
+- **GPU Utilization Heatmap**: Per-node GPU utilization over time
+- **Running GPU Workloads**: All pods with GPU requests
+- **Idle GPU Workloads**: GPUs with zero compute activity for 30+ minutes
+- **Idle GPU Time by Deployment**: Deployments producing the most allocated GPU idle time
+- **GPU Allocation Leaderboard**: Total GPU requests per namespace
+
+This deployment uses the **official Grafana Helm chart** (`grafana/grafana`) to create a dedicated Grafana instance for GPU monitoring, separate from the gpu-pruner deployment.
+
+## Prerequisites
+
+### Required Components
+
+Before deploying Grafana, ensure these components are running in your Kubernetes cluster:
+
+1. **Prometheus** - Collecting metrics from DCGM exporter and kube-state-metrics
+2. **DCGM Exporter** - DaemonSet on GPU nodes exposing NVIDIA GPU metrics
+3. **kube-state-metrics** - With pod labels enabled for deployment-level analysis
+
+### Required Tools
+
+- **Helm 3.x** - [Install Helm](https://helm.sh/docs/intro/install/)
+- **kubectl** - Configured with cluster access
+- **Kubernetes 1.19+** - With GPU nodes
+
+### Validation Commands
+
+Verify prerequisites before proceeding:
+
+```bash
+# Check Prometheus is accessible
+kubectl get svc -A | grep prometheus
+
+# Verify DCGM exporter pods on GPU nodes
+kubectl get pods -A | grep dcgm
+
+# Check kube-state-metrics
+kubectl get deploy -A | grep kube-state-metrics
+
+# Test Prometheus query (requires port-forward)
+kubectl port-forward -n <prometheus-namespace> svc/<prometheus-service> 9090:9090 &
+curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL' | jq '.data.result | length'
+# Should return number of GPUs
+
+# Verify kube_pod_labels metric exists
+curl -s 'http://localhost:9090/api/v1/query?query=kube_pod_labels' | jq '.data.result | length'
+# Should return > 0
+```
+
+### kube-state-metrics Configuration
+
+For the "Idle GPU Time by Deployment" panel to work, kube-state-metrics **must** be configured with:
+
+```yaml
+--metric-labels-allowlist=pods=[*]
+```
+
+This enables the `kube_pod_labels` metric. Verify with:
+
+```bash
+kubectl get deploy kube-state-metrics -n <namespace> -o yaml | grep metric-labels-allowlist
+```
+
+If missing, update the deployment:
+
+```bash
+kubectl patch deployment kube-state-metrics \
+  -n <namespace> --type=json \
+  -p='[{"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--metric-labels-allowlist=pods=[*]"}]'
+```
+
+## Quick Start
+
+For a standard Kubernetes cluster with Prometheus Operator:
+
+```bash
+# Add Grafana Helm repository
+helm repo add grafana https://grafana.github.io/helm-charts
+helm repo update
+
+# Install Grafana with GPU dashboard
+helm install gpu-grafana grafana/grafana \
+  -f helm/grafana-values.yaml \
+  -f helm/grafana-values-vanilla-k8s.yaml \
+  --set adminPassword='YOUR_SECURE_PASSWORD' \
+  --set ingress.hosts[0]='grafana-gpu.example.com' \
+  --set datasources."datasources\.yaml".datasources[0].url='http://prometheus-k8s.monitoring.svc.cluster.local:9090' \
+  -n monitoring --create-namespace
+
+# Get admin password (if not set above)
+kubectl get secret -n monitoring gpu-grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo
+
+# Access Grafana (port-forward)
+kubectl port-forward -n monitoring svc/gpu-grafana 3000:3000
+```
+
+Open http://localhost:3000 and log in with `admin` and the password from above.
+
+## Installation
+
+### Step 1: Add Grafana Helm Repository
+
+```bash
+helm repo add grafana https://grafana.github.io/helm-charts
+helm repo update
+```
+
+### Step 2: Choose Your Environment
+
+Select the appropriate values file for your Kubernetes environment:
+
+#### Option A: OpenShift
+
+```bash
+helm install gpu-grafana grafana/grafana \
+  -f helm/grafana-values.yaml \
+  -f helm/grafana-values-openshift.yaml \
+  --set adminPassword='YOUR_SECURE_PASSWORD' \
+  -n monitoring --create-namespace
+```
+
+**Note**: For OpenShift, you'll need to create a ClusterRoleBinding for Prometheus access:
+
+```bash
+oc adm policy add-cluster-role-to-user cluster-monitoring-view -z grafana -n monitoring
+```
+
+And set the Prometheus token in the datasource configuration (see [Configuration](#configuration)).
+
+#### Option B: Vanilla Kubernetes
+
+```bash
+helm install gpu-grafana grafana/grafana \
+  -f helm/grafana-values.yaml \
+  -f helm/grafana-values-vanilla-k8s.yaml \
+  --set adminPassword='YOUR_SECURE_PASSWORD' \
+  --set ingress.hosts[0]='grafana-gpu.example.com' \
+  --set datasources."datasources\.yaml".datasources[0].url='http://your-prometheus:9090' \
+  -n monitoring --create-namespace
+```
+
+Update the Ingress hostname and Prometheus URL to match your environment.
+
+### Step 3: Verify Deployment
+
+```bash
+# Check pod status
+kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana
+
+# Check logs
+kubectl logs -n monitoring -l app.kubernetes.io/name=grafana
+
+# Verify service
+kubectl get svc -n monitoring gpu-grafana
+```
+
+Expected output of the pod status check (the real pod name carries a generated suffix, e.g. `gpu-grafana-<hash>`):
+```
+NAME          READY   STATUS    RESTARTS   AGE
+gpu-grafana   1/1     Running   0          2m
+```
+
+### Step 4: Access Grafana
+
+#### Via Port Forward (for testing)
+
+```bash
+kubectl port-forward -n monitoring svc/gpu-grafana 3000:3000
+```
+
+Open http://localhost:3000
+
+#### Via Ingress (production)
+
+Access via the configured hostname: https://grafana-gpu.example.com
+
+#### Via OpenShift Route (OpenShift only)
+
+Get the route URL:
+
+```bash
+oc get route -n monitoring grafana -o jsonpath='{.spec.host}'
+```
+
+### Step 5: Login
+
+- **Username**: `admin`
+- **Password**: The password you set via `--set adminPassword`, or retrieve it from the release secret:
+
+```bash
+kubectl get secret -n monitoring gpu-grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo
+```
+
+## Configuration
+
+### Prometheus Datasource URL
+
+The most critical configuration is the Prometheus datasource URL. Update it to match your cluster:
+
+**In `helm/grafana-values.yaml`** or via `--set`:
+
+```yaml
+datasources:
+  datasources.yaml:
+    datasources:
+      - url: http://YOUR_PROMETHEUS_SERVICE:9090
+```
+
+Common patterns:
+
+| Environment | Prometheus URL |
+|-------------|----------------|
+| OpenShift | `https://thanos-querier.openshift-monitoring.svc.cluster.local:9091` |
+| Prometheus Operator | `http://prometheus-k8s.monitoring.svc.cluster.local:9090` |
+| kube-prometheus-stack | `http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090` |
+| Custom | `http://prometheus.prometheus.svc.cluster.local:9090` |
+
+### Datasource UID (Critical)
+
+The dashboard has a **hardcoded datasource UID**: `PBFA97CFB590B2093`
+
+**Do NOT change this UID** in the values file:
+
+```yaml
+datasources:
+  datasources.yaml:
+    datasources:
+      - uid: PBFA97CFB590B2093  # Must match dashboard
+```
+
+If you need a different UID, you'll have to manually edit the dashboard after import (see [Troubleshooting](#datasource-uid-mismatch)).
+
+### Admin Credentials
+
+**Option 1: Set via Helm** (simple, not recommended for production)
+
+```bash
+--set adminPassword='YOUR_PASSWORD'
+```
+
+**Option 2: Use existing secret** (recommended for production)
+
+Create a secret first:
+
+```bash
+kubectl create secret generic grafana-admin-secret \
+  -n monitoring \
+  --from-literal=admin-user=admin \
+  --from-literal=admin-password='YOUR_SECURE_PASSWORD'
+```
+
+Update values:
+
+```yaml
+admin:
+  existingSecret: grafana-admin-secret
+  userKey: admin-user
+  passwordKey: admin-password
+```
+
+### Ingress Configuration
+
+For **Nginx Ingress**:
+
+```yaml
+ingress:
+  enabled: true
+  ingressClassName: nginx
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt-prod
+  hosts:
+    - grafana-gpu.example.com
+  tls:
+    - secretName: grafana-tls
+      hosts:
+        - grafana-gpu.example.com
+```
+
+For **Traefik Ingress**:
+
+```yaml
+ingress:
+  enabled: true
+  ingressClassName: traefik
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt-prod
+    traefik.ingress.kubernetes.io/router.entrypoints: websecure
+```
+
+### OpenShift Route
+
+For OpenShift (already configured in `grafana-values-openshift.yaml`):
+
+```yaml
+route:
+  enabled: true
+  host: grafana-gpu.apps.example.com
+  tls:
+    enabled: true
+    termination: edge
+```
+
+Or create manually:
+
+```bash
+oc create route edge grafana \
+  --service=gpu-grafana \
+  --hostname=grafana-gpu.apps.example.com \
+  -n monitoring
+```
+
+### Resource Limits
+
+Adjust based on your cluster size and dashboard usage:
+
+```yaml
+resources:
+  limits:
+    cpu: 500m      # Increase for large clusters or many concurrent users
+    memory: 512Mi  # Increase if dashboards are slow to load
+  requests:
+    cpu: 250m
+    memory: 256Mi
+```
+
+### Persistence
+
+Enable persistence to retain dashboard edits and datasource configurations:
+
+```yaml
+persistence:
+  enabled: true
+  size: 10Gi
+  storageClassName: default  # Adjust for your cluster
+```
+
+## Validation
+
+### 1. Verify Grafana Pod is Running
+
+```bash
+kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana
+```
+
+Expected: `STATUS: Running`
+
+### 2. Check Grafana Logs
+
+```bash
+kubectl logs -n monitoring -l app.kubernetes.io/name=grafana --tail=50
+```
+
+Look for:
+- `HTTP Server Listen`
+- No error messages about datasource or dashboard provisioning
+
+### 3. Test Prometheus Datasource
+
+Access Grafana UI → Configuration → Data Sources → Prometheus → Save & Test
+
+Expected: **"Data source is working"** (green checkmark)
+
+If it fails, verify:
+- Prometheus URL is correct
+- Network connectivity from Grafana pod to Prometheus service
+- Prometheus is healthy: `kubectl get pods -n <prometheus-namespace>`
+
+### 4. Verify Dashboard is Loaded
+
+Grafana UI → Dashboards → GPU Monitoring → Waldorf GPU Usage & Idle Tracker
+
+Or check via API:
+
+```bash
+kubectl port-forward -n monitoring svc/gpu-grafana 3000:3000 &
+curl -s -u admin:YOUR_PASSWORD 'http://localhost:3000/api/search?query=gpu' | jq .
+```
+
+Expected: JSON response with dashboard UID and title.
+
+### 5. Validate Dashboard Panels
+
+Check each panel shows data (not "No data"):
+
+| Panel | Validation Query | Expected Result |
+|-------|------------------|-----------------|
+| Total GPUs | `count(DCGM_FI_DEV_GPU_UTIL)` | Number of GPUs in cluster |
+| VRAM allocated | `count(DCGM_FI_DEV_FB_USED > 0)` | Number with VRAM in use |
+| Engine idle | `count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m]) == 0)` | Idle GPU count |
+| Running GPU Workloads | `sum by (namespace, pod) (kube_pod_container_resource_requests{resource="nvidia_com_gpu"})` | List of pods with GPUs |
+| Peak GPU temperature | `max(DCGM_FI_DEV_GPU_TEMP)` | Max die temp (°C) |
+| Peak power | `max(DCGM_FI_DEV_POWER_USAGE)` | Max per-GPU watts |
+| XID errors | `sum(DCGM_FI_DEV_XID_ERRORS)` | Should be 0 in healthy clusters |
+| VRAM utilization % | `100 * avg by (Hostname, gpu) (DCGM_FI_DEV_FB_USED / DCGM_FI_DEV_FB_TOTAL)` | Per-GPU VRAM fill |
+| SM active (profiling) | `avg by (Hostname) (DCGM_FI_PROF_SM_ACTIVE)` | No data if profiling disabled |
+
+### 6. Test Prometheus Queries Manually
+
+Port-forward to Prometheus:
+
+```bash
+kubectl port-forward -n <prometheus-namespace> svc/<prometheus-service> 9090:9090
+```
+
+Run test queries:
+
+```bash
+# Check DCGM metrics exist
+curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL' | jq '.data.result | length'
+
+# GPU health metrics (GPU Health & DCGM dashboard row)
+curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_GPU_TEMP' | jq '.data.result | length'
+curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_POWER_USAGE' | jq '.data.result | length'
+curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_MEM_COPY_UTIL' | jq '.data.result | length'
+curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_XID_ERRORS' | jq '.data.result | length'
+
+# Check kube-state-metrics
+curl -sG 'http://localhost:9090/api/v1/query' --data-urlencode 'query=kube_pod_container_resource_requests{resource="nvidia_com_gpu"}' | jq '.data.result | length'
+
+# Check pod labels (for deployment analysis)
+curl -s 'http://localhost:9090/api/v1/query?query=kube_pod_labels' | jq '.data.result | length'
+```
+
+All should return `> 0` results.
+
+## Troubleshooting
+
+### Dashboard Shows "No Data"
+
+**Cause**: Prometheus datasource not configured correctly or metrics not available.
+
+**Solution**:
+
+1. Verify Prometheus datasource connection:
+   - Grafana UI → Configuration → Data Sources → Prometheus → Save & Test
+   - Should show "Data source is working"
+
+2. Check Prometheus has DCGM metrics:
+   ```bash
+   kubectl port-forward -n <prometheus-namespace> svc/<prometheus-service> 9090:9090 &
+   curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL' | jq .
+   ```
+
+3. Verify DCGM exporter is running:
+   ```bash
+   kubectl get pods -A | grep dcgm
+   ```
+
+4. Check Prometheus is scraping DCGM exporter:
+   - Prometheus UI → Status → Targets
+   - Look for `dcgm-exporter` job with state UP
+
+### Datasource UID Mismatch
+
+**Cause**: Dashboard expects datasource UID `PBFA97CFB590B2093` but your datasource has a different UID.
+
+**Symptoms**: Dashboard panels show "Data source not found" or use wrong datasource.
+
+**Solution A: Match the UID** (recommended)
+
+Configure datasource with the exact UID:
+
+```yaml
+datasources:
+  datasources.yaml:
+    datasources:
+      - uid: PBFA97CFB590B2093
+```
+
+Then reinstall or update Grafana:
+
+```bash
+helm upgrade gpu-grafana grafana/grafana -f helm/grafana-values.yaml -n monitoring
+```
+
+**Solution B: Remap Dashboard**
+
+1. Open dashboard in Grafana
+2. Click gear icon (Dashboard settings)
+3. Go to JSON Model
+4. Find and replace all instances of `"uid": "PBFA97CFB590B2093"` with your datasource UID
+5. Save dashboard
+
+### Missing Metrics
+
+**Problem**: Some panels show "No data" but others work.
+
+**Missing `DCGM_FI_PROF_GR_ENGINE_ACTIVE`**:
+
+- Older DCGM versions may not export this metric
+- Fallback: Dashboard uses `DCGM_FI_DEV_GPU_UTIL / 100` as alternative
+- Verify with: `curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_PROF_GR_ENGINE_ACTIVE'`
+
+**Missing `kube_pod_labels`**:
+
+- kube-state-metrics not configured with `--metric-labels-allowlist=pods=[*]`
+- Affects "Idle GPU Time by Deployment" panel only
+- Fix:
+  ```bash
+  kubectl patch deployment kube-state-metrics \
+    -n <namespace> --type=json \
+    -p='[{"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--metric-labels-allowlist=pods=[*]"}]'
+  ```
+
+### Pod Labels Not Showing in Deployment Analysis
+
+**Cause**: Pods don't have the `app` label set to deployment name.
+
+**Solution**: Ensure your GPU workloads have proper labels:
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: my-gpu-workload
+spec:
+  template:
+    metadata:
+      labels:
+        app: my-gpu-workload  # This label is required
+```
+
+### OpenShift: Prometheus 403 Forbidden
+
+**Cause**: Grafana service account doesn't have permission to query Prometheus.
+
+**Solution**:
+
+```bash
+oc adm policy add-cluster-role-to-user cluster-monitoring-view -z grafana -n monitoring
+```
+
+And set the Prometheus token:
+
+```bash
+# Request a token (replaces the removed `oc serviceaccounts get-token`; tokens are time-bound)
+TOKEN=$(oc create token grafana -n monitoring)
+
+# Update datasource secureJsonData
+kubectl edit secret gpu-grafana -n monitoring
+# Add: httpHeaderValue1: 'Bearer <TOKEN>'
+```
+
+Or use `--set` during Helm install:
+
+```bash
+--set datasources."datasources\.yaml".datasources[0].secureJsonData.httpHeaderValue1="Bearer $TOKEN"
+```
+
+### Dashboard Not Auto-Imported
+
+**Cause**: Dashboard provisioning failed or URL is unreachable.
+
+**Solution A: Manual Import**
+
+1. Download dashboard: `curl -O https://raw.githubusercontent.com/wseaton/gpu-pruner/main/gpu-dashboard.json`
+2. Grafana UI → Dashboards → Import → Upload JSON file
+3. Select Prometheus datasource (UID `PBFA97CFB590B2093`)
+4. Click Import
+
+**Solution B: Use ConfigMap Method**
+
+```bash
+# Create ConfigMap
+kubectl apply -f helm/grafana-dashboard-configmap.yaml
+
+# Update Grafana values to use sidecar
+helm upgrade gpu-grafana grafana/grafana \
+  -f helm/grafana-values.yaml \
+  --set sidecar.dashboards.enabled=true \
+  -n monitoring
+```
+
+### Ingress Not Working
+
+**Missing Ingress Controller**:
+
+Verify ingress controller is installed:
+
+```bash
+kubectl get pods -n ingress-nginx  # or kube-system, or traefik-system
+```
+
+**Certificate Issues**:
+
+If using cert-manager:
+
+```bash
+kubectl get certificate -n monitoring
+kubectl describe certificate grafana-tls -n monitoring
+```
+
+**Alternative: Use NodePort or LoadBalancer**:
+
+```bash
+helm upgrade gpu-grafana grafana/grafana \
+  -f helm/grafana-values.yaml \
+  --set service.type=NodePort \
+  --set service.nodePort=30300 \
+  -n monitoring
+```
+
+Access via: `http://<node-ip>:30300`
+
+## Customization
+
+### Adjust Idle Detection Window
+
+The dashboard uses a **30-minute window** to detect idle GPUs. To customize:
+
+1. Open dashboard in Grafana
+2. Edit panel (e.g., "Engine idle (30m)")
+3. Change query from `[30m]` to desired duration:
+   ```promql
+   count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[60m]) == 0)  # 60 minutes
+   ```
+4. Update panel title to reflect new duration
+5. Save dashboard
+
+### Modify GPU Model Assumptions
+
+The "GPU Memory per GPU" panel shows 140 GiB for H200 GPUs. For other models:
+
+1. Edit panel
+2. Update query to use actual DCGM metric:
+   ```promql
+   max(DCGM_FI_DEV_FB_TOTAL) / 1024  # Returns actual GPU memory in GiB
+   ```
+3. Or hardcode for your GPU model:
+   ```promql
+   80  # For A100 80GB
+   ```
+
+### Add Custom Panels
+
+The dashboard includes a **GPU Health & DCGM** row with temperature, power, VRAM %, memory-copy utilization, XID errors, and optional profiling metrics. See [`DASHBOARD.md`](DASHBOARD.md#gpu-health--dcgm) for PromQL reference.
+
+To add more panels:
+
+1. Dashboard → Add panel → Add a new panel
+2. Select Prometheus datasource
+3. Enter PromQL query
+4. Configure visualization (graph, table, stat, etc.)
+5. Save panel
+
+Example metric already on the dashboard:
+
+```promql
+avg by (Hostname) (DCGM_FI_DEV_GPU_TEMP)
+```
+
+### Dashboard Refresh Rate
+
+Change auto-refresh interval:
+
+1. Dashboard settings (gear icon) → Time options
+2. Set refresh interval (e.g., 30s, 1m, 5m)
+3. Save
+
+### Create Alerts
+
+To alert on idle GPUs:
+
+1. Edit "Engine idle (30m)" panel
+2. Click "Alert" tab → Create alert rule
+3. Configure:
+   - **Condition**: `WHEN last() OF query(A) IS ABOVE 5`
+   - **Evaluate every**: 5m
+   - **For**: 10m (grace period)
+4. Add notification channel
+5. Save
+
+## Security Considerations
+
+### 1. Secure Admin Credentials
+
+**Never hardcode passwords in values files**. Use Kubernetes secrets:
+
+```bash
+kubectl create secret generic grafana-admin-secret \
+  -n monitoring \
+  --from-literal=admin-user=admin --from-literal=admin-password="$(openssl rand -base64 32)"
+```
+
+Update values:
+
+```yaml
+admin:
+  existingSecret: grafana-admin-secret
+  passwordKey: admin-password
+```
+
+### 2. RBAC for Prometheus Access
+
+Grant minimal permissions to Grafana service account:
+
+```yaml
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: grafana-prometheus-reader
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: view  # Or create custom role with only Prometheus read access
+subjects:
+- kind: ServiceAccount
+  name: grafana
+  namespace: monitoring
+```
+
+### 3. Network Policies
+
+Restrict Grafana network access:
+
+```yaml
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: grafana-netpol
+  namespace: monitoring
+spec:
+  podSelector:
+    matchLabels:
+      app.kubernetes.io/name: grafana
+  policyTypes:
+  - Ingress
+  - Egress
+  ingress:
+  - from:
+    - namespaceSelector:
+        matchLabels:
+          kubernetes.io/metadata.name: ingress-nginx  # Allow ingress controller
+    ports:
+    - protocol: TCP
+      port: 3000
+  egress:
+  - to:
+    - namespaceSelector:
+        matchLabels:
+          kubernetes.io/metadata.name: monitoring  # Allow Prometheus access
+    ports:
+    - protocol: TCP
+      port: 9090
+  - to:  # Allow DNS
+    - namespaceSelector:
+        matchLabels:
+          kubernetes.io/metadata.name: kube-system
+    ports:
+    - protocol: UDP
+      port: 53
+```
+
+### 4. TLS/HTTPS
+
+Always use TLS for production deployments:
+
+**With Ingress + cert-manager**:
+
+```yaml
+ingress:
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt-prod
+  tls:
+    - secretName: grafana-tls
+      hosts:
+        - grafana-gpu.example.com
+```
+
+**With OpenShift Route**:
+
+```yaml
+route:
+  tls:
+    enabled: true
+    termination: edge  # or reencrypt for end-to-end TLS
+```
+
+### 5. Anonymous Access
+
+Disable anonymous access in production:
+
+```yaml
+grafana.ini:
+  auth.anonymous:
+    enabled: false
+```
+
+### 6. Datasource Token Rotation
+
+For OpenShift or token-based Prometheus auth, rotate tokens regularly:
+
+```bash
+# Generate new token (replaces the removed `oc serviceaccounts get-token`; tokens are time-bound)
+NEW_TOKEN=$(oc create token grafana -n monitoring)
+
+# Update secret
+kubectl patch secret gpu-grafana -n monitoring \
+  -p "{\"data\":{\"prometheus-token\":\"$(printf '%s' "$NEW_TOKEN" | base64 | tr -d '\n')\"}}"
+
+# Restart Grafana
+kubectl rollout restart deployment gpu-grafana -n monitoring
+```
+
+### 7. Audit Logging
+
+Enable Grafana audit logging:
+
+```yaml
+grafana.ini:
+  log:
+    mode: console
+    level: info
+  log.console:
+    format: json
+  security:
+    disable_initial_admin_creation: false
+```
+
+## Additional Resources
+
+- [Grafana Helm Chart Documentation](https://github.com/grafana/helm-charts/tree/main/charts/grafana)
+- [GPU Pruner Dashboard Documentation](DASHBOARD.md)
+- [Prometheus Deployment Guide](PROMETHEUS_DEPLOYMENT.md)
+- [DCGM Exporter Setup](https://docs.nvidia.com/datacenter/cloud-native/gpu-telemetry/dcgm-exporter.html)
+- [kube-state-metrics Configuration](https://github.com/kubernetes/kube-state-metrics/blob/main/docs/cli-arguments.md)
+
+## Support
+
+For issues or questions:
+
+- GitHub Issues: https://github.com/wseaton/gpu-pruner/issues
+- Dashboard Documentation: [DASHBOARD.md](DASHBOARD.md)
+- Grafana Community: https://community.grafana.com/
diff --git a/gpu-dashboard.json b/gpu-dashboard.json
index 6fa1a2f..dbd3391 100644
--- a/gpu-dashboard.json
+++ b/gpu-dashboard.json
@@ -450,6 +450,303 @@
           }
         }
       }
+    },
+    {
+      "title": "GPU Health & DCGM",
+      "type": "row",
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 72 },
+      "collapsed": false
+    },
+    {
+      "title": "Peak GPU temperature",
+      "description": "Highest GPU die temperature across the cluster (DCGM_FI_DEV_GPU_TEMP).",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 4, "x": 0, "y": 73 },
+      "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+      "targets": [
+        {
+          "expr": "max(DCGM_FI_DEV_GPU_TEMP)",
+          "legendFormat": "Max °C"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "celsius",
+          "color": { "mode": "thresholds" },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "yellow", "value": 80 },
+              { "color": "red", "value": 85 }
+            ]
+          }
+        }
+      },
+      "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" }
+    },
+    {
+      "title": "Peak power (W)",
+      "description": "Highest per-GPU power draw in watts (DCGM_FI_DEV_POWER_USAGE). Useful for rack capacity and corroborating idle detection.",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 4, "x": 4, "y": 73 },
+      "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+      "targets": [
+        {
+          "expr": "max(DCGM_FI_DEV_POWER_USAGE)",
+          "legendFormat": "Max W"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "watt",
+          "color": { "mode": "fixed", "fixedColor": "orange" },
+          "thresholds": { "mode": "absolute", "steps": [{ "color": "orange", "value": null }] }
+        }
+      },
+      "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" }
+    },
+    {
+      "title": "XID errors (total)",
+      "description": "Sum of the most recent XID code reported by each GPU (DCGM_FI_DEV_XID_ERRORS is a gauge of the last XID seen, not an error count). Non-zero warrants investigation.",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 4, "x": 8, "y": 73 },
+      "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+      "targets": [
+        {
+          "expr": "sum(DCGM_FI_DEV_XID_ERRORS)",
+          "legendFormat": "XID"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "none",
+          "decimals": 0,
+          "color": { "mode": "thresholds" },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "red", "value": 1 }
+            ]
+          }
+        }
+      },
+      "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" }
+    },
+    {
+      "title": "GPU temperature by node",
+      "description": "Average GPU temperature per host (DCGM_FI_DEV_GPU_TEMP).",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 77 },
+      "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+      "targets": [
+        {
+          "expr": "avg by (Hostname) (DCGM_FI_DEV_GPU_TEMP)",
+          "legendFormat": "{{Hostname}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "celsius",
+          "min": 0,
+          "custom": {
+            "drawStyle": "line",
+            "lineWidth": 2,
+            "fillOpacity": 15,
+            "spanNulls": true
+          },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "yellow", "value": 80 },
+              { "color": "red", "value": 85 }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "title": "Power draw by node",
+      "description": "Sum of per-GPU power draw per host in watts (DCGM_FI_DEV_POWER_USAGE).",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 77 },
+      "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+      "targets": [
+        {
+          "expr": "sum by (Hostname) (DCGM_FI_DEV_POWER_USAGE)",
+          "legendFormat": "{{Hostname}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "watt",
+          "min": 0,
+          "custom": {
+            "drawStyle": "line",
+            "lineWidth": 2,
+            "fillOpacity": 15,
+            "spanNulls": true
+          }
+        }
+      }
+    },
+    {
+      "title": "VRAM utilization %",
+      "description": "Framebuffer used as a percentage of total VRAM per GPU (DCGM_FI_DEV_FB_USED / DCGM_FI_DEV_FB_TOTAL).",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 85 },
+      "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+      "targets": [
+        {
+          "expr": "100 * avg by (Hostname, gpu) (DCGM_FI_DEV_FB_USED / DCGM_FI_DEV_FB_TOTAL)",
+          "legendFormat": "{{Hostname}} GPU {{gpu}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "min": 0,
+          "max": 100,
+          "custom": {
+            "drawStyle": "line",
+            "lineWidth": 1,
+            "fillOpacity": 10,
+            "spanNulls": true
+          }
+        }
+      }
+    },
+    {
+      "title": "Memory copy utilization",
+      "description": "Average memory copy engine utilization per host (DCGM_FI_DEV_MEM_COPY_UTIL).",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 85 },
+      "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+      "targets": [
+        {
+          "expr": "avg by (Hostname) (DCGM_FI_DEV_MEM_COPY_UTIL)",
+          "legendFormat": "{{Hostname}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "min": 0,
+          "max": 100,
+          "custom": {
+            "drawStyle": "line",
+            "lineWidth": 2,
+            "fillOpacity": 15,
+            "spanNulls": true
+          }
+        }
+      }
+    },
+    {
+      "title": "XID errors (1h increase)",
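+      "description": "Increase in DCGM_FI_DEV_XID_ERRORS over the last hour, per GPU. dcgm-exporter reports this metric as a gauge holding the most recent XID code, so bar height is not an error count; any non-zero bar warrants investigation for driver or hardware faults.",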
+      "type": "timeseries",
+      "gridPos": { "h": 6, "w": 24, "x": 0, "y": 93 },
+      "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+      "targets": [
+        {
+          "expr": "sum by (Hostname, gpu) (increase(DCGM_FI_DEV_XID_ERRORS[1h]))",
+          "legendFormat": "{{Hostname}} GPU {{gpu}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "none",
+          "min": 0,
+          "decimals": 0,
+          "custom": {
+            "drawStyle": "bars",
+            "lineWidth": 1,
+            "fillOpacity": 50,
+            "spanNulls": true
+          }
+        }
+      }
+    },
+    {
+      "title": "SM active by node",
+      "description": "DCGM profiling: streaming multiprocessor activity (DCGM_FI_PROF_SM_ACTIVE). No data if DCGM profiling is not enabled on the exporter.",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 8, "x": 0, "y": 99 },
+      "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+      "targets": [
+        {
+          "expr": "avg by (Hostname) (DCGM_FI_PROF_SM_ACTIVE)",
+          "legendFormat": "{{Hostname}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit",
+          "min": 0,
+          "max": 1,
+          "custom": {
+            "drawStyle": "line",
+            "lineWidth": 2,
+            "fillOpacity": 15,
+            "spanNulls": true
+          }
+        }
+      }
+    },
+    {
+      "title": "Tensor pipe active by node",
+      "description": "DCGM profiling: tensor pipe activity (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE). No data if DCGM profiling is not enabled.",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 8, "x": 8, "y": 99 },
+      "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+      "targets": [
+        {
+          "expr": "avg by (Hostname) (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE)",
+          "legendFormat": "{{Hostname}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit",
+          "min": 0,
+          "max": 1,
+          "custom": {
+            "drawStyle": "line",
+            "lineWidth": 2,
+            "fillOpacity": 15,
+            "spanNulls": true
+          }
+        }
+      }
+    },
+    {
+      "title": "DRAM active by node",
+      "description": "DCGM profiling: DRAM activity (DCGM_FI_PROF_DRAM_ACTIVE). No data if DCGM profiling is not enabled.",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 8, "x": 16, "y": 99 },
+      "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+      "targets": [
+        {
+          "expr": "avg by (Hostname) (DCGM_FI_PROF_DRAM_ACTIVE)",
+          "legendFormat": "{{Hostname}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit",
+          "min": 0,
+          "max": 1,
+          "custom": {
+            "drawStyle": "line",
+            "lineWidth": 2,
+            "fillOpacity": 15,
+            "spanNulls": true
+          }
+        }
+      }
     }
   ],
   "schemaVersion": 39,
@@ -460,5 +757,5 @@
   "timezone": "browser",
   "title": "Waldorf GPU Usage & Idle Tracker",
   "uid": "prometheus",
-  "version": 1
+  "version": 2
 }
diff --git a/helm/grafana-dashboard-configmap.yaml b/helm/grafana-dashboard-configmap.yaml
new file mode 100644
index 0000000..3fe4be0
--- /dev/null
+++ b/helm/grafana-dashboard-configmap.yaml
@@ -0,0 +1,773 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-gpu-dashboard
+  namespace: monitoring
+  labels:
+    grafana_dashboard: "1"
+    app: grafana
+  annotations:
+    description: "GPU Pruner dashboard showing GPU utilization, idle workloads, resource allocation, and GPU health (DCGM) metrics"
+data:
+  gpu-dashboard.json: |
+    {
+      "annotations": {
+        "list": []
+      },
+      "editable": true,
+      "fiscalYearStartMonth": 0,
+      "graphTooltip": 1,
+      "links": [],
+      "panels": [
+        {
+          "title": "Cluster GPU Overview",
+          "type": "row",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
+          "collapsed": false
+        },
+        {
+          "title": "Total GPUs",
+          "description": "All GPUs reporting DCGM metrics. Baseline for the two partitions below (engine activity and VRAM).",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "count(DCGM_FI_DEV_GPU_UTIL)",
+              "legendFormat": "Total"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": { "mode": "fixed", "fixedColor": "blue" },
+              "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }
+            }
+          },
+          "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" }
+        },
+        {
+          "title": "VRAM allocated (FB>0)",
+          "description": "GPUs with framebuffer memory in use (CUDA context / VRAM). Complements VRAM free (FB=0); the pair sums to Total. Not the same as compute-active.",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "count(DCGM_FI_DEV_FB_USED > 0)",
+              "legendFormat": "FB>0"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": { "mode": "fixed", "fixedColor": "green" },
+              "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }
+            }
+          },
+          "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" }
+        },
+        {
+          "title": "VRAM free (FB=0)",
+          "description": "GPUs with no framebuffer in use. PromQL: count(DCGM_FI_DEV_FB_USED == 0). Equals Total − VRAM allocated (FB>0).",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "count(DCGM_FI_DEV_FB_USED == 0)",
+              "legendFormat": "FB=0"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": { "mode": "fixed", "fixedColor": "semi-dark-green" },
+              "thresholds": { "mode": "absolute", "steps": [{ "color": "semi-dark-green", "value": null }] }
+            }
+          },
+          "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" }
+        },
+        {
+          "title": "Engine idle (30m)",
+          "description": "No graphics/compute engine activity for the full 30m window. PromQL: count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m]) == 0). Complements Engine active (30m); the pair sums to Total.",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m]) == 0)",
+              "legendFormat": "Idle"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": { "mode": "fixed", "fixedColor": "red" },
+              "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }] }
+            }
+          },
+          "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" }
+        },
+        {
+          "title": "Engine active (30m)",
+          "description": "Had graphics/compute engine activity at least once in 30m. PromQL: count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m]) > 0). Equals Total − Engine idle (30m).",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m]) > 0)",
+              "legendFormat": "Active"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": { "mode": "fixed", "fixedColor": "orange" },
+              "thresholds": { "mode": "absolute", "steps": [{ "color": "orange", "value": null }] }
+            }
+          },
+          "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" }
+        },
+        {
+          "title": "GPU Memory per GPU (H200)",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "max(DCGM_FI_DEV_FB_TOTAL) / 1024",
+              "legendFormat": "GiB"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "decgbytes",
+              "color": { "mode": "fixed", "fixedColor": "purple" },
+              "thresholds": { "mode": "absolute", "steps": [{ "color": "purple", "value": null }] }
+            }
+          },
+          "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" }
+        },
+        {
+          "title": "GPU Utilization Heatmap by Node",
+          "type": "timeseries",
+          "gridPos": { "h": 6, "w": 16, "x": 0, "y": 5 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "avg by (Hostname) (DCGM_FI_DEV_GPU_UTIL)",
+              "legendFormat": "{{Hostname}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent",
+              "min": 0,
+              "max": 100,
+              "custom": {
+                "drawStyle": "line",
+                "lineWidth": 2,
+                "fillOpacity": 15,
+                "spanNulls": true
+              }
+            }
+          }
+        },
+        {
+          "title": "Running GPU Workloads",
+          "type": "row",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 },
+          "collapsed": false
+        },
+        {
+          "title": "Running GPU Workloads",
+          "description": "All pods with GPU requests, grouped by namespace (user)",
+          "type": "table",
+          "gridPos": { "h": 10, "w": 24, "x": 0, "y": 12 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "sum by (namespace, pod) (kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\", namespace!~\"cw-.*\"})",
+              "legendFormat": "",
+              "format": "table",
+              "instant": true
+            }
+          ],
+          "transformations": [
+            {
+              "id": "organize",
+              "options": {
+                "excludeByName": { "Time": true, "__name__": true },
+                "renameByName": {
+                  "namespace": "Namespace (User)",
+                  "pod": "Pod",
+                  "Value": "GPUs Requested"
+                },
+                "indexByName": { "namespace": 0, "pod": 1, "Value": 2 }
+              }
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {},
+            "overrides": [
+              {
+                "matcher": { "id": "byName", "options": "GPUs Requested" },
+                "properties": [
+                  { "id": "custom.width", "value": 130 },
+                  {
+                    "id": "thresholds",
+                    "value": {
+                      "mode": "absolute",
+                      "steps": [
+                        { "color": "green", "value": null },
+                        { "color": "yellow", "value": 4 },
+                        { "color": "red", "value": 8 }
+                      ]
+                    }
+                  },
+                  { "id": "custom.displayMode", "value": "color-background" }
+                ]
+              }
+            ]
+          }
+        },
+        {
+          "title": "Idle GPU Workloads",
+          "type": "row",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 },
+          "collapsed": false
+        },
+        {
+          "title": "Idle GPU Workloads (Zero Compute for 30m+)",
+          "description": "GPUs where max compute engine activity was 0 for the entire 30-minute window. These are candidates for pruning.",
+          "type": "table",
+          "gridPos": { "h": 10, "w": 24, "x": 0, "y": 23 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "max by (Hostname, gpu, modelName) (max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m])) == 0",
+              "legendFormat": "",
+              "format": "table",
+              "instant": true
+            }
+          ],
+          "transformations": [
+            {
+              "id": "organize",
+              "options": {
+                "excludeByName": { "Time": true, "__name__": true, "Value": true },
+                "renameByName": {
+                  "Hostname": "Node",
+                  "gpu": "GPU #",
+                  "modelName": "GPU Model"
+                },
+                "indexByName": { "Hostname": 0, "gpu": 1, "modelName": 2 }
+              }
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "custom": { "displayMode": "color-text" },
+              "color": { "mode": "fixed", "fixedColor": "red" }
+            }
+          }
+        },
+        {
+          "title": "Idle GPU Time by Deployment - Historical Timeline",
+          "description": "Historical timeline showing idle GPU-hours by deployment over time. Each line represents a deployment's idle GPU allocation trend.",
+          "type": "timeseries",
+          "gridPos": { "h": 10, "w": 24, "x": 0, "y": 33 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "(label_replace(sum by (label_app, namespace) (count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\") ) * on (pod, namespace) group_left(label_app) kube_pod_labels{label_app != \"\"}), \"deployment\", \"$1\", \"label_app\", \"(.+)\") or sum by (deployment, namespace) (label_replace(count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\")), \"deployment\", \"$1\", \"pod\", \"^(.*)-[^-]+-[^-]+$\"))) * 0.5",
+              "legendFormat": "{{deployment}} ({{namespace}})",
+              "interval": "1m"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "custom": {
+                "drawStyle": "line",
+                "lineWidth": 2,
+                "fillOpacity": 10,
+                "gradientMode": "opacity",
+                "spanNulls": true,
+                "showPoints": "auto",
+                "pointSize": 5,
+                "stacking": { "mode": "none" },
+                "axisPlacement": "auto",
+                "axisLabel": "Idle GPU-Hours",
+                "scaleDistribution": { "type": "linear" }
+              },
+              "color": { "mode": "palette-classic" },
+              "decimals": 1,
+              "unit": "none",
+              "min": 0,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "yellow", "value": 2 },
+                  { "color": "orange", "value": 5 },
+                  { "color": "red", "value": 10 }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "tooltip": { "mode": "multi", "sort": "desc" },
+            "legend": {
+              "calcs": ["last", "max", "mean"],
+              "displayMode": "table",
+              "placement": "bottom",
+              "showLegend": true
+            }
+          }
+        },
+        {
+          "title": "Idle GPU Time by Deployment (30m)",
+          "description": "Deployments producing the most allocated GPU idle time at 0% utilization. Sorted by total idle GPU-hours (GPU count × 30min window). Uses kube_pod_labels when available; otherwise derives deployment name from pod name.",
+          "type": "table",
+          "gridPos": { "h": 10, "w": 24, "x": 0, "y": 43 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "sort_desc((label_replace(sum by (label_app, namespace) (count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\") ) * on (pod, namespace) group_left(label_app) kube_pod_labels{label_app != \"\"}), \"deployment\", \"$1\", \"label_app\", \"(.+)\") or sum by (deployment, namespace) (label_replace(count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\")), \"deployment\", \"$1\", \"pod\", \"^(.*)-[^-]+-[^-]+$\"))) * 0.5)",
+              "legendFormat": "",
+              "format": "table",
+              "instant": true
+            }
+          ],
+          "transformations": [
+            {
+              "id": "organize",
+              "options": {
+                "excludeByName": { "Time": true, "__name__": true },
+                "renameByName": {
+                  "deployment": "Deployment",
+                  "namespace": "Namespace",
+                  "Value": "Idle GPU-Hours"
+                },
+                "indexByName": { "Value": 2, "deployment": 0, "namespace": 1 }
+              }
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "custom": { "displayMode": "color-background-solid" },
+              "decimals": 1,
+              "unit": "none"
+            },
+            "overrides": [
+              {
+                "matcher": { "id": "byName", "options": "Idle GPU-Hours" },
+                "properties": [
+                  { "id": "custom.width", "value": 150 },
+                  {
+                    "id": "thresholds",
+                    "value": {
+                      "mode": "absolute",
+                      "steps": [
+                        { "color": "green", "value": null },
+                        { "color": "yellow", "value": 2 },
+                        { "color": "orange", "value": 5 },
+                        { "color": "red", "value": 10 }
+                      ]
+                    }
+                  },
+                  { "id": "custom.displayMode", "value": "color-background" }
+                ]
+              }
+            ]
+          }
+        },
+        {
+          "title": "Leaderboard",
+          "type": "row",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 53 },
+          "collapsed": false
+        },
+        {
+          "title": "GPU Allocation Leaderboard (by Namespace)",
+          "description": "Total GPU requests per namespace, sorted descending. Namespace = user ({user}-dev pattern).",
+          "type": "barchart",
+          "gridPos": { "h": 10, "w": 12, "x": 0, "y": 54 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "sort_desc(sum by (namespace) (kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\", namespace!~\"cw-.*\"}))",
+              "legendFormat": "{{namespace}}",
+              "instant": true
+            }
+          ],
+          "options": {
+            "orientation": "horizontal",
+            "showValue": "always",
+            "barWidth": 0.7
+          },
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none",
+              "color": { "mode": "palette-classic" },
+              "displayName": "${__field.labels.namespace}"
+            }
+          }
+        },
+        {
+          "title": "GPU Memory Allocation Leaderboard (GiB)",
+          "description": "Estimated GPU memory allocated per namespace. Calculated as GPUs requested × 140 GiB (H200 FB total).",
+          "type": "barchart",
+          "gridPos": { "h": 10, "w": 12, "x": 12, "y": 54 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "sort_desc(sum by (namespace) (kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\", namespace!~\"cw-.*\"}) * 140)",
+              "legendFormat": "{{namespace}}",
+              "instant": true
+            }
+          ],
+          "options": {
+            "orientation": "horizontal",
+            "showValue": "always",
+            "barWidth": 0.7
+          },
+          "fieldConfig": {
+            "defaults": {
+              "unit": "decgbytes",
+              "color": { "mode": "palette-classic" },
+              "displayName": "${__field.labels.namespace}"
+            }
+          }
+        },
+        {
+          "title": "GPU Memory Usage Over Time (by Node)",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 64 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "sum by (Hostname) (DCGM_FI_DEV_FB_USED) / 1024",
+              "legendFormat": "{{Hostname}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "decgbytes",
+              "min": 0,
+              "custom": {
+                "drawStyle": "line",
+                "lineWidth": 2,
+                "fillOpacity": 20,
+                "spanNulls": true,
+                "stacking": { "mode": "normal" }
+              }
+            }
+          }
+        },
+        {
+          "title": "GPU Health & DCGM",
+          "type": "row",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 72 },
+          "collapsed": false
+        },
+        {
+          "title": "Peak GPU temperature",
+          "description": "Highest GPU die temperature across the cluster (DCGM_FI_DEV_GPU_TEMP).",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 4, "x": 0, "y": 73 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "max(DCGM_FI_DEV_GPU_TEMP)",
+              "legendFormat": "Max °C"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "celsius",
+              "color": { "mode": "thresholds" },
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "yellow", "value": 80 },
+                  { "color": "red", "value": 85 }
+                ]
+              }
+            }
+          },
+          "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" }
+        },
+        {
+          "title": "Peak power (W)",
+          "description": "Highest per-GPU power draw in watts (DCGM_FI_DEV_POWER_USAGE). Useful for rack capacity and corroborating idle detection.",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 4, "x": 4, "y": 73 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "max(DCGM_FI_DEV_POWER_USAGE)",
+              "legendFormat": "Max W"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "watt",
+              "color": { "mode": "fixed", "fixedColor": "orange" },
+              "thresholds": { "mode": "absolute", "steps": [{ "color": "orange", "value": null }] }
+            }
+          },
+          "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" }
+        },
+        {
+          "title": "XID errors (total)",
+          "description": "Sum across GPUs of DCGM_FI_DEV_XID_ERRORS. dcgm-exporter reports this metric as a gauge holding the most recent XID code per GPU, so the value is not an error count. Non-zero warrants investigation.",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 4, "x": 8, "y": 73 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "sum(DCGM_FI_DEV_XID_ERRORS)",
+              "legendFormat": "XID"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none",
+              "decimals": 0,
+              "color": { "mode": "thresholds" },
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "red", "value": 1 }
+                ]
+              }
+            }
+          },
+          "options": { "colorMode": "background", "graphMode": "none", "textMode": "value" }
+        },
+        {
+          "title": "GPU temperature by node",
+          "description": "Average GPU temperature per host (DCGM_FI_DEV_GPU_TEMP).",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 77 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "avg by (Hostname) (DCGM_FI_DEV_GPU_TEMP)",
+              "legendFormat": "{{Hostname}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "celsius",
+              "min": 0,
+              "custom": {
+                "drawStyle": "line",
+                "lineWidth": 2,
+                "fillOpacity": 15,
+                "spanNulls": true
+              },
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "yellow", "value": 80 },
+                  { "color": "red", "value": 85 }
+                ]
+              }
+            }
+          }
+        },
+        {
+          "title": "Power draw by node",
+          "description": "Sum of per-GPU power draw per host in watts (DCGM_FI_DEV_POWER_USAGE).",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 77 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "sum by (Hostname) (DCGM_FI_DEV_POWER_USAGE)",
+              "legendFormat": "{{Hostname}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "watt",
+              "min": 0,
+              "custom": {
+                "drawStyle": "line",
+                "lineWidth": 2,
+                "fillOpacity": 15,
+                "spanNulls": true
+              }
+            }
+          }
+        },
+        {
+          "title": "VRAM utilization %",
+          "description": "Framebuffer used as a percentage of total VRAM per GPU (DCGM_FI_DEV_FB_USED / DCGM_FI_DEV_FB_TOTAL).",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 85 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "100 * avg by (Hostname, gpu) (DCGM_FI_DEV_FB_USED / DCGM_FI_DEV_FB_TOTAL)",
+              "legendFormat": "{{Hostname}} GPU {{gpu}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent",
+              "min": 0,
+              "max": 100,
+              "custom": {
+                "drawStyle": "line",
+                "lineWidth": 1,
+                "fillOpacity": 10,
+                "spanNulls": true
+              }
+            }
+          }
+        },
+        {
+          "title": "Memory copy utilization",
+          "description": "Average memory copy engine utilization per host (DCGM_FI_DEV_MEM_COPY_UTIL).",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 85 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "avg by (Hostname) (DCGM_FI_DEV_MEM_COPY_UTIL)",
+              "legendFormat": "{{Hostname}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent",
+              "min": 0,
+              "max": 100,
+              "custom": {
+                "drawStyle": "line",
+                "lineWidth": 2,
+                "fillOpacity": 15,
+                "spanNulls": true
+              }
+            }
+          }
+        },
+        {
+          "title": "XID errors (1h increase)",
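+          "description": "Increase of DCGM_FI_DEV_XID_ERRORS over the last hour per GPU. The metric is a gauge holding the most recent XID code, so a non-zero bar means a new XID was reported (driver or hardware fault); the bar height is not an error count.",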
+          "type": "timeseries",
+          "gridPos": { "h": 6, "w": 24, "x": 0, "y": 93 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "sum by (Hostname, gpu) (increase(DCGM_FI_DEV_XID_ERRORS[1h]))",
+              "legendFormat": "{{Hostname}} GPU {{gpu}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none",
+              "min": 0,
+              "decimals": 0,
+              "custom": {
+                "drawStyle": "bars",
+                "lineWidth": 1,
+                "fillOpacity": 50,
+                "spanNulls": true
+              }
+            }
+          }
+        },
+        {
+          "title": "SM active by node",
+          "description": "DCGM profiling: streaming multiprocessor activity (DCGM_FI_PROF_SM_ACTIVE). No data if DCGM profiling is not enabled on the exporter.",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 8, "x": 0, "y": 99 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "avg by (Hostname) (DCGM_FI_PROF_SM_ACTIVE)",
+              "legendFormat": "{{Hostname}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percentunit",
+              "min": 0,
+              "max": 1,
+              "custom": {
+                "drawStyle": "line",
+                "lineWidth": 2,
+                "fillOpacity": 15,
+                "spanNulls": true
+              }
+            }
+          }
+        },
+        {
+          "title": "Tensor pipe active by node",
+          "description": "DCGM profiling: tensor pipe activity (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE). No data if DCGM profiling is not enabled.",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 8, "x": 8, "y": 99 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "avg by (Hostname) (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE)",
+              "legendFormat": "{{Hostname}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percentunit",
+              "min": 0,
+              "max": 1,
+              "custom": {
+                "drawStyle": "line",
+                "lineWidth": 2,
+                "fillOpacity": 15,
+                "spanNulls": true
+              }
+            }
+          }
+        },
+        {
+          "title": "DRAM active by node",
+          "description": "DCGM profiling: DRAM activity (DCGM_FI_PROF_DRAM_ACTIVE). No data if DCGM profiling is not enabled.",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 8, "x": 16, "y": 99 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "avg by (Hostname) (DCGM_FI_PROF_DRAM_ACTIVE)",
+              "legendFormat": "{{Hostname}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percentunit",
+              "min": 0,
+              "max": 1,
+              "custom": {
+                "drawStyle": "line",
+                "lineWidth": 2,
+                "fillOpacity": 15,
+                "spanNulls": true
+              }
+            }
+          }
+        }
+      ],
+      "schemaVersion": 39,
+      "tags": ["gpu", "waldorf", "llm-d"],
+      "templating": { "list": [] },
+      "time": { "from": "now-6h", "to": "now" },
+      "timepicker": {},
+      "timezone": "browser",
+      "title": "Waldorf GPU Usage & Idle Tracker",
+      "uid": "prometheus",
+      "version": 2
+    }

From b175ece348a750ae59f4b19fa8cc035d338e0e06 Mon Sep 17 00:00:00 2001
From: fuddin-bit <fuddin@redhat.com>
Date: Thu, 4 Jun 2026 11:54:12 -0400
Subject: [PATCH 2/7] Add Grafana Helm deployment and ingress access docs.

Include values for OpenShift and vanilla Kubernetes, dashboard import
script, CoreWeave ingress guide, and README Helm quick start.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 COREWEAVE_INGRESS_GUIDE.md           | 345 +++++++++++++++++++++++++++
 GRAFANA_ACCESS.md                    | 294 +++++++++++++++++++++++
 README.md                            |  19 ++
 helm/QUICKSTART.md                   | 232 ++++++++++++++++++
 helm/README.md                       | 272 +++++++++++++++++++++
 helm/grafana-values-openshift.yaml   | 151 ++++++++++++
 helm/grafana-values-vanilla-k8s.yaml | 172 +++++++++++++
 helm/grafana-values.yaml             | 234 ++++++++++++++++++
 import-dashboard.sh                  |  48 ++++
 9 files changed, 1767 insertions(+)
 create mode 100644 COREWEAVE_INGRESS_GUIDE.md
 create mode 100644 GRAFANA_ACCESS.md
 create mode 100644 helm/QUICKSTART.md
 create mode 100644 helm/README.md
 create mode 100644 helm/grafana-values-openshift.yaml
 create mode 100644 helm/grafana-values-vanilla-k8s.yaml
 create mode 100644 helm/grafana-values.yaml
 create mode 100755 import-dashboard.sh

diff --git a/COREWEAVE_INGRESS_GUIDE.md b/COREWEAVE_INGRESS_GUIDE.md
new file mode 100644
index 0000000..86f111d
--- /dev/null
+++ b/COREWEAVE_INGRESS_GUIDE.md
@@ -0,0 +1,345 @@
+# CoreWeave Kubernetes (CKS) Ingress Guide
+
+This guide explains how to expose services for external access in CoreWeave Kubernetes Service (CKS).
+
+## Overview
+
+CoreWeave uses a **LoadBalancer + DNS annotation** pattern rather than traditional Ingress controllers. The cluster has **Istio** installed but standard users don't have permissions to create Gateway API resources or VirtualServices.
+
+## Available Methods
+
+### Method 1: LoadBalancer Service with DNS (Recommended for CKS)
+
+CoreWeave provides an **External Hostname Controller** that automatically creates DNS records for LoadBalancer services.
+
+#### How It Works
+
+1. Create a LoadBalancer service
+2. Add the `service.beta.kubernetes.io/external-hostname` annotation
+3. CoreWeave assigns a public IP and creates a DNS record in `.coreweave.app` domain
+4. DNS status is reflected in `.status.conditions` field of the Service
+
+#### Example: Expose Grafana with LoadBalancer
+
+**IMPORTANT**: You must add the `service.beta.kubernetes.io/coreweave-load-balancer-type: public` annotation to get a **public IP**. Without this annotation, CoreWeave assigns an internal VIP only.
+
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: gpu-grafana
+  namespace: fuddin-dev
+  annotations:
+    service.beta.kubernetes.io/coreweave-load-balancer-type: "public"  # REQUIRED for public IP
+    service.beta.kubernetes.io/external-hostname: "gpu-grafana"
+    # This creates: gpu-grafana.<org-id>-<cluster-name>.coreweave.app
+spec:
+  type: LoadBalancer
+  selector:
+    app.kubernetes.io/name: grafana
+    app.kubernetes.io/instance: gpu-grafana
+  ports:
+    - name: http
+      port: 80
+      targetPort: 3000
+      protocol: TCP
+```
+
+Apply and check the assigned hostname:
+
+```bash
+kubectl apply -f grafana-loadbalancer.yaml
+
+# Wait for external IP assignment
+kubectl get svc gpu-grafana -n fuddin-dev -w
+
+# Check the assigned DNS name in status
+kubectl get svc gpu-grafana -n fuddin-dev -o jsonpath='{.status.conditions[?(@.type=="ExternalRecords")].message}'
+```
+
+The service will be accessible at: `http://gpu-grafana.<org-id>-<cluster-name>.coreweave.app`
+
+#### Wildcard DNS
+
+For wildcard DNS records (e.g., for multiple subdomains):
+
+```yaml
+metadata:
+  annotations:
+    service.beta.kubernetes.io/external-hostname: "*"
+    # Creates: *.abc123-mycluster.coreweave.app
+```
+
+### Method 2: Port-Forward (Development/Testing)
+
+For temporary access without exposing services publicly:
+
+```bash
+# Forward local port 3000 to Grafana service
+kubectl port-forward -n fuddin-dev svc/gpu-grafana 3000:80
+
+# Access at http://localhost:3000
+```
+
+**Pros**: 
+- No cluster configuration needed
+- Works immediately
+- No public exposure
+
+**Cons**:
+- Only accessible from your machine
+- Connection breaks when command terminates
+- Not suitable for production
+
+### Method 3: Istio VirtualService (Requires Permissions)
+
+CoreWeave has **Istio** installed, but standard users don't have permissions to create VirtualServices or Gateways. This method requires cluster admin assistance.
+
+If you have permissions, you would create:
+
+```yaml
+apiVersion: networking.istio.io/v1
+kind: VirtualService
+metadata:
+  name: grafana-vs
+  namespace: fuddin-dev
+spec:
+  hosts:
+    - "grafana.example.com"
+  gateways:
+    - istio-system/public-gateway  # Shared cluster gateway
+  http:
+    - match:
+        - uri:
+            prefix: /
+      route:
+        - destination:
+            host: gpu-grafana.fuddin-dev.svc.cluster.local
+            port:
+              number: 80
+```
+
+**Note**: This requires a shared Gateway to exist and permissions to create VirtualServices.
+
+## Comparison of Methods
+
+| Method | Access | Setup Complexity | Cost | Use Case |
+|--------|--------|------------------|------|----------|
+| **LoadBalancer + DNS** | Public internet | Low | Charges for public IP | Production, public dashboards |
+| **Port-Forward** | Local only | Very low | Free | Development, debugging |
+| **Istio VirtualService** | Shared gateway | Medium | Shared cost | Multi-service routing, advanced traffic control |
+
+## Recommended Approach for Grafana
+
+### Option A: LoadBalancer (Public Access)
+
+Best for production Grafana instance that multiple team members need to access.
+
+```bash
+# Update Grafana service to LoadBalancer
+kubectl patch svc gpu-grafana -n fuddin-dev -p '{"spec":{"type":"LoadBalancer"}}'
+
+# Add REQUIRED annotation for public IP
+kubectl annotate svc gpu-grafana -n fuddin-dev \
+  service.beta.kubernetes.io/coreweave-load-balancer-type="public"
+
+# Add DNS annotation
+kubectl annotate svc gpu-grafana -n fuddin-dev \
+  service.beta.kubernetes.io/external-hostname="gpu-grafana"
+
+# Wait for external IP
+kubectl get svc gpu-grafana -n fuddin-dev -w
+
+# Get the public IP
+kubectl get svc gpu-grafana -n fuddin-dev -o jsonpath='{.status.loadBalancer.ingress[0].ip}'
+```
+
+### Option B: Port-Forward (Personal Access)
+
+Best for personal dashboards or development:
+
+```bash
+# Add to your shell profile for automatic port-forward
+alias grafana-forward='kubectl port-forward -n fuddin-dev svc/gpu-grafana 3000:80'
+
+# Run whenever you need access
+grafana-forward
+```
+
+## Current Grafana Setup
+
+Your Grafana is currently deployed with:
+
+- **Service Type**: ClusterIP (internal only)
+- **Namespace**: `fuddin-dev`
+- **Port**: 80 (service) → 3000 (pod)
+- **Access Method**: Port-forward only
+
+### Convert to LoadBalancer
+
+```bash
+# Method 1: kubectl patch
+kubectl patch svc gpu-grafana -n fuddin-dev -p '{"spec":{"type":"LoadBalancer"}}'
+kubectl annotate svc gpu-grafana -n fuddin-dev service.beta.kubernetes.io/coreweave-load-balancer-type="public" \
+  service.beta.kubernetes.io/external-hostname="gpu-grafana-fuddin"
+
+# Method 2: Helm upgrade
+helm upgrade gpu-grafana grafana/grafana \
+  --reuse-values --set service.type=LoadBalancer \
+  --set service.annotations."service\.beta\.kubernetes\.io/coreweave-load-balancer-type"=public \
+  --set service.annotations."service\.beta\.kubernetes\.io/external-hostname"="gpu-grafana-fuddin" \
+  -n fuddin-dev
+```
+
+## Cluster Architecture
+
+CoreWeave Kubernetes (CKS) uses:
+
+- **Istio** for service mesh (installed at cluster level)
+- **Gateway API** (available but restricted permissions)
+- **External Hostname Controller** for automatic DNS provisioning
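+- **LoadBalancer** services get a public IP only when annotated with `service.beta.kubernetes.io/coreweave-load-balancer-type: public`; without it they receive an internal VIP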
+
+### Installed Components
+
+```bash
+# Istio control plane
+kubectl get svc -n istio-system istiod
+# NAME     TYPE        CLUSTER-IP    EXTERNAL-IP   PORT(S)
+# istiod   ClusterIP   10.16.0.170   <none>        15010/TCP,15012/TCP,443/TCP,15014/TCP
+
+# Gateway API CRDs available
+kubectl api-resources | grep gateway
+# httproutes
+# gateways.gateway.networking.k8s.io
+# virtualservices (Istio)
+```
+
+### Permissions
+
+Standard users in CKS can:
+- ✅ Create/modify Services in their namespace
+- ✅ Use LoadBalancer service type
+- ✅ Add DNS annotations
+- ❌ Create Gateway resources
+- ❌ Create HTTPRoute resources
+- ❌ Create VirtualService resources (Istio)
+- ❌ List cluster-wide resources
+
+## Troubleshooting
+
+### LoadBalancer stuck in "Pending"
+
+```bash
+kubectl describe svc gpu-grafana -n fuddin-dev
+
+# Check events for errors
+kubectl get events -n fuddin-dev --sort-by='.lastTimestamp' | grep gpu-grafana
+```
+
+Common causes:
+- Quota limits on public IPs
+- Invalid annotation format
+- Namespace resource limits
+
+### DNS not resolving
+
+```bash
+# Check service status
+kubectl get svc gpu-grafana -n fuddin-dev -o yaml
+
+# Look for ExternalRecords condition
+kubectl get svc gpu-grafana -n fuddin-dev -o jsonpath='{.status.conditions[?(@.type=="ExternalRecords")]}'
+```
+
+The DNS record creation may take 1-2 minutes after the external IP is assigned.
+
+### Port-forward connection refused
+
+```bash
+# Check if pod is running
+kubectl get pods -n fuddin-dev -l app.kubernetes.io/name=grafana
+
+# Check pod logs
+kubectl logs -n fuddin-dev -l app.kubernetes.io/name=grafana --tail=50
+
+# Test service internally
+kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- \
+  curl http://gpu-grafana.fuddin-dev.svc.cluster.local
+```
+
+## Cost Considerations
+
+- **Public IPs**: CoreWeave charges for LoadBalancer public IPs
+- **Bandwidth**: Egress traffic may have costs
+- **Port-Forward**: No additional cost (uses cluster credentials)
+
+For cost-effective access:
+1. Use port-forward for personal/development access
+2. Use LoadBalancer only for production services that need public access
+3. Share one LoadBalancer across multiple services using path-based routing (requires Istio VirtualService with permissions)
+
+## Security Best Practices
+
+### For LoadBalancer Services
+
+1. **Enable authentication** in Grafana (already configured with admin password)
+2. **Use HTTPS**: Add TLS certificate
+3. **Restrict source IPs**: Use `loadBalancerSourceRanges`
+4. **Monitor access logs**: Enable Grafana audit logging
+5. **Use NetworkPolicies**: Restrict pod-to-pod communication
+
+```yaml
+spec:
+  type: LoadBalancer
+  loadBalancerSourceRanges:
+    - "1.2.3.4/32"      # Your office IP
+    - "5.6.7.0/24"      # Your VPN range
+```
+
+### For Port-Forward
+
+- ✅ Automatically secured by Kubernetes RBAC
+- ✅ Requires valid cluster credentials
+- ✅ No public exposure
+- ⚠️ Ensure your local machine is secured
+
+## Next Steps
+
+1. **Decide on access method**:
+   - Public access → Use LoadBalancer with DNS
+   - Personal access → Use port-forward
+
+2. **If using LoadBalancer**:
+   ```bash
+   kubectl patch svc gpu-grafana -n fuddin-dev -p '{"spec":{"type":"LoadBalancer"}}'
+   kubectl annotate svc gpu-grafana -n fuddin-dev service.beta.kubernetes.io/coreweave-load-balancer-type="public" \
+     service.beta.kubernetes.io/external-hostname="gpu-grafana-fuddin"
+   ```
+
+3. **Monitor the service**:
+   ```bash
+   kubectl get svc gpu-grafana -n fuddin-dev -w
+   ```
+
+4. **Access Grafana**:
+   - LoadBalancer: Wait for DNS record, then access via `http://<assigned-dns>.coreweave.app`
+   - Port-forward: `kubectl port-forward -n fuddin-dev svc/gpu-grafana 3000:80`
+
+## References
+
+- [Create a Public DNS Name | CoreWeave](https://docs.coreweave.com/docs/products/networking/how-to/expose-service-dns)
+- [Introduction to CoreWeave Kubernetes Service | CoreWeave](https://docs.coreweave.com/docs/products/cks)
+- [Kubernetes Ingress Documentation](https://kubernetes.io/docs/concepts/services-networking/ingress/)
+- [Exposing Applications for External Access | Kube by Example](https://kubebyexample.com/learning-paths/application-development-kubernetes/lesson-3-networking-kubernetes/exposing-0)
+
+## Summary
+
+**CoreWeave uses LoadBalancer services with DNS annotations, not traditional Ingress controllers.**
+
+For your Grafana deployment:
+- **Quick access**: `kubectl port-forward -n fuddin-dev svc/gpu-grafana 3000:80`
+- **Public access**: Convert service to LoadBalancer with DNS annotation
+- **Advanced routing**: Request VirtualService permissions from cluster admin
+
+The simplest production-ready approach is to use LoadBalancer with the `service.beta.kubernetes.io/external-hostname` annotation.
diff --git a/GRAFANA_ACCESS.md b/GRAFANA_ACCESS.md
new file mode 100644
index 0000000..866136d
--- /dev/null
+++ b/GRAFANA_ACCESS.md
@@ -0,0 +1,294 @@
+# Grafana GPU Dashboard - Access Information
+
+## ✅ Deployment Status: LIVE
+
+Your Grafana instance with the GPU Pruner dashboard is now deployed and publicly accessible!
+
+## 🌐 Access Details
+
+**Public URL (DNS - may take 5-10 minutes to update)**: http://gpu-grafana-fuddin.6787d4-waldorf.coreweave.app
+
+**Direct IP Access (Works immediately)**: http://166.19.16.227
+
+**Credentials**:
+- **Username**: `admin`
+- **Password**: `GpuPruner2026!`
+
+**External IP**: `166.19.16.227` (CoreWeave Public LoadBalancer)
+
+## 📊 Dashboard Import
+
+The GPU dashboard is **not yet imported**. After logging in, you need to import it:
+
+### Option 1: Automated Import Script
+
+```bash
+./import-dashboard.sh http://gpu-grafana-fuddin.6787d4-waldorf.coreweave.app GpuPruner2026!
+```
+
+### Option 2: Manual Import via UI
+
+1. Access http://gpu-grafana-fuddin.6787d4-waldorf.coreweave.app
+2. Login with `admin` / `GpuPruner2026!`
+3. Navigate to **Dashboards** → **Import** → **Upload JSON file**
+4. Select `gpu-dashboard.json` from this repository
+5. Choose **Prometheus** datasource (UID: `PBFA97CFB590B2093`)
+6. Click **Import**
+
+### Option 3: Import via API
+
+```bash
+GRAFANA_URL="http://gpu-grafana-fuddin.6787d4-waldorf.coreweave.app"
+ADMIN_PASSWORD="GpuPruner2026!"
+
+DASHBOARD_JSON=$(cat gpu-dashboard.json | jq '{dashboard: ., overwrite: true, folderId: 0}')
+
+curl -X POST \
+  -H "Content-Type: application/json" \
+  -u "admin:$ADMIN_PASSWORD" \
+  -d "$DASHBOARD_JSON" \
+  "$GRAFANA_URL/api/dashboards/db"
+```
+
+## 🔍 Verify Datasource
+
+After logging in, verify the Prometheus datasource is working:
+
+1. Go to **Configuration** → **Data Sources** → **Prometheus**
+2. Click **Save & Test**
+3. Should show: "Data source is working" ✓
+
+If the datasource test fails:
+- Check the Prometheus URL: `http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090`
+- Verify Prometheus is accessible from the `fuddin-dev` namespace
+
+## 📝 Service Configuration
+
+**Namespace**: `fuddin-dev`  
+**Service Type**: `LoadBalancer`  
+**Service Name**: `gpu-grafana`  
+**Internal Port**: `80` → Pod Port `3000`  
+**NodePort**: `30265`
+
+### DNS Configuration
+
+**Annotation**: `service.beta.kubernetes.io/external-hostname: "gpu-grafana-fuddin"`  
+**Auto-generated FQDN**: `gpu-grafana-fuddin.6787d4-waldorf.coreweave.app`
+
+CoreWeave's External Hostname Controller automatically:
+- Created the DNS record in the `.coreweave.app` domain
+- Appended the cluster identifier `6787d4-waldorf` to prevent conflicts
+- Set the DNS status in the Service `.status.conditions` field
+
+## 🔧 Management Commands
+
+### Check Service Status
+
+```bash
+kubectl get svc gpu-grafana -n fuddin-dev
+```
+
+### View Service Details
+
+```bash
+kubectl describe svc gpu-grafana -n fuddin-dev
+```
+
+### Check DNS Status
+
+```bash
+kubectl get svc gpu-grafana -n fuddin-dev -o jsonpath='{.status.conditions[?(@.type=="ExternalRecords")]}' | jq .
+```
+
+### Check Grafana Pods
+
+```bash
+kubectl get pods -n fuddin-dev -l app.kubernetes.io/name=grafana
+```
+
+### View Grafana Logs
+
+```bash
+kubectl logs -n fuddin-dev -l app.kubernetes.io/name=grafana --tail=100 -f
+```
+
+### Restart Grafana
+
+```bash
+kubectl rollout restart deployment gpu-grafana -n fuddin-dev
+```
+
+## ⚠️ Important Notes
+
+### Persistence
+
+**WARNING**: Persistence is currently **DISABLED**. This means:
+- Dashboard customizations will be **lost** if the pod restarts
+- Datasource changes will be **lost** if the pod restarts
+- User accounts (other than admin) will be **lost** if the pod restarts
+
+To enable persistence:
+
+```bash
+helm upgrade gpu-grafana grafana/grafana \
+  --reuse-values \
+  --set persistence.enabled=true \
+  --set persistence.size=10Gi \
+  -n fuddin-dev
+```
+
+### Security
+
+- ✅ Authentication is enabled (admin password required)
+- ⚠️ HTTP only (no HTTPS/TLS)
+- ⚠️ No IP restrictions (publicly accessible)
+- ⚠️ Admin password is published in this document (rotate it and keep the new value out of the repository)
+
+### Recommended Security Improvements
+
+1. **Change admin password**:
+   ```bash
+   # Login to Grafana UI
+   # Profile → Change Password
+   ```
+
+2. **Enable HTTPS** (requires TLS certificate):
+   ```bash
+   # Add TLS certificate to cluster
+   kubectl create secret tls grafana-tls \
+     --cert=grafana.crt \
+     --key=grafana.key \
+     -n fuddin-dev
+   
+   # Update service annotation
+   kubectl annotate svc gpu-grafana -n fuddin-dev \
+     service.beta.kubernetes.io/external-hostname-tls="grafana-tls"
+   ```
+
+3. **Restrict source IPs** (optional):
+   ```bash
+   kubectl patch svc gpu-grafana -n fuddin-dev -p '{
+     "spec": {
+       "loadBalancerSourceRanges": ["YOUR.IP.ADDRESS/32"]
+     }
+   }'
+   ```
+
+4. **Enable persistence** (as shown above)
+
+## 📊 Expected Dashboard Features
+
+Once imported, the GPU dashboard will show:
+
+- **Cluster GPU Overview**
+  - Total GPUs
+  - VRAM allocation (FB>0 vs FB=0)
+  - Engine activity (idle 30m vs active 30m)
+  - GPU memory per GPU
+
+- **GPU Utilization Heatmap**
+  - Per-node GPU utilization over time
+
+- **Running GPU Workloads**
+  - All pods with GPU requests
+  - Grouped by namespace
+
+- **Idle GPU Workloads**
+  - GPUs with zero compute activity for 30+ minutes
+  - Identifies wasted resources
+
+- **Idle GPU Time by Deployment**
+  - Historical analysis of which deployments waste the most GPU time
+  - Requires `kube_pod_labels` metric from kube-state-metrics
+
+- **GPU Allocation Leaderboard**
+  - Total GPU requests per namespace
+
+## 🐛 Troubleshooting
+
+### Cannot access the URL
+
+**Check DNS propagation**:
+```bash
+nslookup gpu-grafana-fuddin.6787d4-waldorf.coreweave.app
+```
+
+Should return the public LoadBalancer IP: `166.19.16.227` (an internal `10.16.x.x` answer means the record has not updated yet)
+
+**Check from your browser**:
+- Try: http://gpu-grafana-fuddin.6787d4-waldorf.coreweave.app
+- If DNS fails, try the direct public IP: http://166.19.16.227
+
+### Dashboard shows "No Data"
+
+1. **Verify Prometheus datasource**:
+   - Configuration → Data Sources → Prometheus → Save & Test
+
+2. **Check Prometheus is accessible**:
+   ```bash
+   kubectl run curl-test --image=curlimages/curl:latest --rm -i --restart=Never -n fuddin-dev -- \
+     curl -s 'http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090/api/v1/query?query=up'
+   ```
+
+3. **Verify DCGM metrics exist**:
+   ```bash
+   # Port-forward to Prometheus
+   kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 &
+   
+   # Query DCGM metrics
+   curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL' | jq '.data.result | length'
+   ```
+
+### "Idle GPU Time by Deployment" panel empty
+
+This panel requires `kube_pod_labels` metric. Verify kube-state-metrics is configured with:
+
+```bash
+kubectl get deploy -A -l app.kubernetes.io/name=kube-state-metrics -o yaml | grep metric-labels-allowlist
+```
+
+Should show: `--metric-labels-allowlist=pods=[*]`
+
+### Grafana pod not running
+
+```bash
+# Check pod status
+kubectl get pods -n fuddin-dev -l app.kubernetes.io/name=grafana
+
+# Check logs
+kubectl logs -n fuddin-dev -l app.kubernetes.io/name=grafana --tail=50
+
+# Describe pod for events
+kubectl describe pod -n fuddin-dev -l app.kubernetes.io/name=grafana
+```
+
+## 📚 Additional Resources
+
+- **Main Documentation**: [GRAFANA_DEPLOYMENT.md](GRAFANA_DEPLOYMENT.md)
+- **Dashboard Features**: [DASHBOARD.md](DASHBOARD.md)
+- **Helm Configuration**: [helm/README.md](helm/README.md)
+- **CoreWeave Ingress**: [COREWEAVE_INGRESS_GUIDE.md](COREWEAVE_INGRESS_GUIDE.md)
+- **Import Script**: [import-dashboard.sh](import-dashboard.sh)
+
+## 🎯 Next Steps
+
+1. ✅ **Access Grafana**: http://gpu-grafana-fuddin.6787d4-waldorf.coreweave.app
+2. ✅ **Login**: `admin` / `GpuPruner2026!`
+3. ⏳ **Import Dashboard**: Use the import script or manual UI import
+4. ⏳ **Verify Datasource**: Configuration → Data Sources → Prometheus → Save & Test
+5. ⏳ **Enable Persistence**: To prevent data loss on pod restart
+6. ⏳ **Change Password**: For production security
+
+## 📞 Support
+
+For issues or questions:
+- **GitHub Issues**: https://github.com/wseaton/gpu-pruner/issues
+- **CoreWeave Docs**: https://docs.coreweave.com/
+- **Deployment Guide**: [GRAFANA_DEPLOYMENT.md](GRAFANA_DEPLOYMENT.md)
+
+---
+
+**Deployment Date**: 2026-06-04  
+**Deployed By**: fuddin@redhat.com  
+**Cluster**: coreweave-waldorf (6787d4)  
+**Namespace**: fuddin-dev
diff --git a/README.md b/README.md
index bc1b770..e5d9018 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,25 @@ Import `gpu-dashboard.json` into Grafana for advanced analytics and visualizatio
 
 See [DASHBOARD.md](DASHBOARD.md) for import instructions and [IDLE_GPU_QUERY.md](IDLE_GPU_QUERY.md) for querying idle GPU time by deployment.
 
+#### Deploy Grafana with Helm
+
+For a complete standalone Grafana deployment with the GPU dashboard pre-configured:
+
+```bash
+# Add Grafana Helm repository
+helm repo add grafana https://grafana.github.io/helm-charts
+helm repo update
+
+# Install Grafana with GPU dashboard
+helm install gpu-grafana grafana/grafana \
+  -f helm/grafana-values.yaml \
+  --set adminPassword='YOUR_SECURE_PASSWORD' \
+  --set datasources."datasources\.yaml".datasources[0].url='http://prometheus-k8s.monitoring.svc.cluster.local:9090' \
+  -n monitoring --create-namespace
+```
+
+See [GRAFANA_DEPLOYMENT.md](GRAFANA_DEPLOYMENT.md) for complete deployment instructions, configuration options, and troubleshooting.
+
 ## usage 
 
 ```sh
diff --git a/helm/QUICKSTART.md b/helm/QUICKSTART.md
new file mode 100644
index 0000000..26afad8
--- /dev/null
+++ b/helm/QUICKSTART.md
@@ -0,0 +1,232 @@
+# Grafana GPU Dashboard - Quick Start Guide
+
+One-command deployments for common scenarios.
+
+## Prerequisites Check
+
+```bash
+# Verify prerequisites are met
+kubectl get svc -A | grep prometheus      # ✓ Prometheus exists
+kubectl get pods -A | grep dcgm           # ✓ DCGM exporter running
+kubectl get deploy -A | grep kube-state  # ✓ kube-state-metrics deployed
+helm version                              # ✓ Helm 3.x installed
+```
+
+## Scenario 1: OpenShift with Prometheus Operator
+
+```bash
+# One command deployment
+helm repo add grafana https://grafana.github.io/helm-charts
+helm repo update
+
+helm install gpu-grafana grafana/grafana \
+  -f https://raw.githubusercontent.com/wseaton/gpu-pruner/main/helm/grafana-values.yaml \
+  -f https://raw.githubusercontent.com/wseaton/gpu-pruner/main/helm/grafana-values-openshift.yaml \
+  --set adminPassword='ChangeMe123!' \
+  -n monitoring --create-namespace
+
+# Grant Prometheus access
+oc adm policy add-cluster-role-to-user cluster-monitoring-view -z grafana -n monitoring
+
+# Get Route URL
+echo "https://$(oc get route -n monitoring grafana -o jsonpath='{.spec.host}')"
+```
+
+**Login**: `admin` / `ChangeMe123!`
+
+## Scenario 2: Vanilla Kubernetes with nginx Ingress
+
+```bash
+helm repo add grafana https://grafana.github.io/helm-charts
+helm repo update
+
+helm install gpu-grafana grafana/grafana \
+  -f https://raw.githubusercontent.com/wseaton/gpu-pruner/main/helm/grafana-values.yaml \
+  -f https://raw.githubusercontent.com/wseaton/gpu-pruner/main/helm/grafana-values-vanilla-k8s.yaml \
+  --set adminPassword='ChangeMe123!' \
+  --set ingress.hosts[0]='grafana.example.com' \
+  --set datasources."datasources\.yaml".datasources[0].url='http://prometheus-k8s.monitoring.svc.cluster.local:9090' \
+  -n monitoring --create-namespace
+
+# Access via Ingress
+echo "https://grafana.example.com"
+```
+
+**Login**: `admin` / `ChangeMe123!`
+
+## Scenario 3: Local Testing with Port-Forward
+
+```bash
+helm repo add grafana https://grafana.github.io/helm-charts
+helm repo update
+
+helm install gpu-grafana grafana/grafana \
+  -f https://raw.githubusercontent.com/wseaton/gpu-pruner/main/helm/grafana-values.yaml \
+  --set adminPassword='admin' \
+  --set persistence.enabled=false \
+  --set datasources."datasources\.yaml".datasources[0].url='http://prometheus-k8s.monitoring.svc.cluster.local:9090' \
+  -n monitoring --create-namespace
+
+# Port-forward to access
+kubectl port-forward -n monitoring svc/gpu-grafana 3000:3000
+```
+
+**Access**: http://localhost:3000  
+**Login**: `admin` / `admin`
+
+## Scenario 4: Air-Gapped Cluster (ConfigMap Method)
+
+```bash
+# Step 1: Download files
+curl -O https://raw.githubusercontent.com/wseaton/gpu-pruner/main/helm/grafana-dashboard-configmap.yaml
+curl -O https://raw.githubusercontent.com/wseaton/gpu-pruner/main/helm/grafana-values.yaml
+
+# Step 2: Create ConfigMap
+kubectl apply -f grafana-dashboard-configmap.yaml
+
+# Step 3: Deploy Grafana with sidecar
+helm install gpu-grafana grafana/grafana \
+  -f grafana-values.yaml \
+  --set adminPassword='ChangeMe123!' \
+  --set sidecar.dashboards.enabled=true \
+  --set sidecar.dashboards.label=grafana_dashboard \
+  --set dashboards=null \
+  -n monitoring --create-namespace
+```
+
+## Scenario 5: Using LoadBalancer Service
+
+```bash
+helm install gpu-grafana grafana/grafana \
+  -f https://raw.githubusercontent.com/wseaton/gpu-pruner/main/helm/grafana-values.yaml \
+  --set adminPassword='ChangeMe123!' \
+  --set service.type=LoadBalancer \
+  --set datasources."datasources\.yaml".datasources[0].url='http://prometheus:9090' \
+  -n monitoring --create-namespace
+
+# Get LoadBalancer IP
+kubectl get svc -n monitoring gpu-grafana -o jsonpath='{.status.loadBalancer.ingress[0].ip}'
+```
+
+## Post-Installation
+
+### Get Admin Password
+
+```bash
+kubectl get secret -n monitoring gpu-grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo
+```
+
+### Verify Dashboard Loaded
+
+```bash
+kubectl port-forward -n monitoring svc/gpu-grafana 3000:3000 &
+curl -s -u admin:YOUR_PASSWORD 'http://localhost:3000/api/search?query=gpu' | jq .
+```
+
+Expected: JSON with dashboard titled "Waldorf GPU Usage & Idle Tracker"
+
+### Test Prometheus Datasource
+
+Grafana UI → Configuration → Data Sources → Prometheus → Save & Test
+
+Should show: **"Data source is working"** ✓
+
+## Customization
+
+### Change Prometheus URL
+
+```bash
+--set datasources."datasources\.yaml".datasources[0].url='http://YOUR_PROMETHEUS:9090'
+```
+
+### Change Ingress Hostname
+
+```bash
+--set ingress.hosts[0]='grafana-gpu.yourdomain.com'
+```
+
+### Enable Persistence
+
+```bash
+--set persistence.enabled=true \
+--set persistence.size=20Gi \
+--set persistence.storageClassName=fast-ssd
+```
+
+### Increase Resources
+
+```bash
+--set resources.limits.cpu=1000m \
+--set resources.limits.memory=1Gi \
+--set resources.requests.cpu=500m \
+--set resources.requests.memory=512Mi
+```
+
+## Troubleshooting
+
+### Dashboard Shows "No Data"
+
+```bash
+# Test Prometheus connectivity
+kubectl exec -n monitoring deploy/gpu-grafana -- wget -qO- 'http://prometheus-k8s.monitoring.svc.cluster.local:9090/api/v1/query?query=up'
+
+# Check DCGM metrics exist
+kubectl port-forward -n <prometheus-ns> svc/<prometheus-svc> 9090:9090 &
+curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL' | jq .
+```
+
+### Can't Access Grafana
+
+```bash
+# Check pod status
+kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana
+
+# Check logs
+kubectl logs -n monitoring -l app.kubernetes.io/name=grafana --tail=50
+
+# Use port-forward as fallback
+kubectl port-forward -n monitoring svc/gpu-grafana 3000:3000
+```
+
+### Forgot Admin Password
+
+```bash
+# Retrieve existing password
+kubectl get secret -n monitoring gpu-grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo
+
+# Or reset it
+helm upgrade gpu-grafana grafana/grafana \
+  -f helm/grafana-values.yaml \
+  --set adminPassword='NewPassword123!' \
+  --reuse-values \
+  -n monitoring
+```
+
+## Upgrading
+
+```bash
+helm repo update
+helm upgrade gpu-grafana grafana/grafana \
+  -f helm/grafana-values.yaml \
+  -n monitoring
+```
+
+## Uninstalling
+
+```bash
+helm uninstall gpu-grafana -n monitoring
+
+# Optional: Delete persistent data
+kubectl delete pvc -n monitoring -l app.kubernetes.io/name=grafana
+```
+
+## Next Steps
+
+- **Detailed Guide**: [GRAFANA_DEPLOYMENT.md](../GRAFANA_DEPLOYMENT.md)
+- **Dashboard Features**: [DASHBOARD.md](../DASHBOARD.md)
+- **Configuration Reference**: [helm/README.md](README.md)
+
+## Need Help?
+
+- GitHub Issues: https://github.com/wseaton/gpu-pruner/issues
+- Full Documentation: [GRAFANA_DEPLOYMENT.md](../GRAFANA_DEPLOYMENT.md)
diff --git a/helm/README.md b/helm/README.md
new file mode 100644
index 0000000..adb217c
--- /dev/null
+++ b/helm/README.md
@@ -0,0 +1,272 @@
+# Grafana Helm Chart Deployment Files
+
+This directory contains Helm values files and Kubernetes manifests for deploying Grafana with the GPU Pruner dashboard.
+
+## Files
+
+| File | Description |
+|------|-------------|
+| `grafana-values.yaml` | **Base values file** - Core Grafana configuration with dashboard provisioning, Prometheus datasource, and resource settings. Use this as the foundation for all deployments. |
+| `grafana-values-openshift.yaml` | **OpenShift overrides** - Route configuration, token-based Prometheus authentication, and OpenShift-specific security context. Merge with base values. |
+| `grafana-values-vanilla-k8s.yaml` | **Vanilla Kubernetes overrides** - Ingress configuration for nginx/traefik, standard K8s security context. Merge with base values. |
+| `grafana-dashboard-configmap.yaml` | **Dashboard ConfigMap** - Alternative provisioning method using ConfigMap + sidecar instead of direct URL import. |
+
+## Quick Start
+
+### OpenShift Deployment
+
+```bash
+helm repo add grafana https://grafana.github.io/helm-charts
+helm repo update
+
+helm install gpu-grafana grafana/grafana \
+  -f helm/grafana-values.yaml \
+  -f helm/grafana-values-openshift.yaml \
+  --set adminPassword='YOUR_SECURE_PASSWORD' \
+  -n monitoring --create-namespace
+
+# Grant Prometheus access
+oc adm policy add-cluster-role-to-user cluster-monitoring-view -z grafana -n monitoring
+```
+
+Access via Route:
+```bash
+oc get route -n monitoring grafana -o jsonpath='{.spec.host}'
+```
+
+### Vanilla Kubernetes Deployment
+
+```bash
+helm install gpu-grafana grafana/grafana \
+  -f helm/grafana-values.yaml \
+  -f helm/grafana-values-vanilla-k8s.yaml \
+  --set adminPassword='YOUR_SECURE_PASSWORD' \
+  --set 'ingress.hosts[0]=grafana-gpu.example.com' \
+  --set 'datasources.datasources\.yaml.datasources[0].url=http://prometheus-k8s.monitoring.svc.cluster.local:9090' \
+  -n monitoring --create-namespace
+```
+
+Access via Ingress: `https://grafana-gpu.example.com`
+
+## Configuration
+
+### Required Customizations
+
+Before deploying, update these values in `grafana-values.yaml` or via `--set`:
+
+1. **Admin Password**:
+   ```bash
+   --set adminPassword='YOUR_SECURE_PASSWORD'
+   ```
+   Or use a secret (recommended):
+   ```bash
+   kubectl create secret generic grafana-admin-secret \
+     -n monitoring \
+     --from-literal=admin-user=admin --from-literal=admin-password='YOUR_PASSWORD'
+   
+   --set admin.existingSecret=grafana-admin-secret
+   ```
+
+2. **Prometheus URL**:
+   ```bash
+   --set 'datasources.datasources\.yaml.datasources[0].url=http://YOUR_PROMETHEUS:9090'
+   ```
+
+3. **Ingress Hostname** (vanilla K8s):
+   ```bash
+   --set 'ingress.hosts[0]=grafana-gpu.example.com'
+   ```
+
+4. **Route Hostname** (OpenShift):
+   ```bash
+   --set route.host='grafana-gpu.apps.example.com'
+   ```
+
+### Critical: Datasource UID
+
+**Do NOT modify the datasource UID** in the values file. The dashboard has a hardcoded UID:
+
+```yaml
+datasources:
+  datasources.yaml:
+    datasources:
+      - uid: PBFA97CFB590B2093  # Must match gpu-dashboard.json
+```
+
+If you change this UID, the dashboard will not work.
+
+## Dashboard Provisioning Methods
+
+### Method 1: Direct URL Import (Default)
+
+Configured in `grafana-values.yaml`:
+
+```yaml
+dashboards:
+  gpu-pruner:
+    gpu-dashboard:
+      url: https://raw.githubusercontent.com/wseaton/gpu-pruner/main/gpu-dashboard.json
+```
+
+**Pros**: Simple; the latest dashboard is fetched from the repo each time the Grafana pod starts  
+**Cons**: Requires internet access from Grafana pod
+
+### Method 2: ConfigMap + Sidecar
+
+Apply the ConfigMap:
+
+```bash
+kubectl apply -f helm/grafana-dashboard-configmap.yaml
+```
+
+Enable sidecar in values:
+
+```yaml
+sidecar:
+  dashboards:
+    enabled: true
+    label: grafana_dashboard
+```
+
+**Pros**: Works in air-gapped clusters, no external dependencies  
+**Cons**: Requires manual updates when dashboard changes
+
+### Method 3: Manual Import
+
+1. Download dashboard:
+   ```bash
+   curl -O https://raw.githubusercontent.com/wseaton/gpu-pruner/main/gpu-dashboard.json
+   ```
+
+2. Grafana UI → Dashboards → Import → Upload JSON
+
+3. Select Prometheus datasource
+
+**Pros**: Full control over dashboard version  
+**Cons**: Not automated, requires UI access
+
+## Prerequisites
+
+Ensure these components are running before deploying Grafana:
+
+- ✅ **Prometheus** - Accessible at configured URL
+- ✅ **DCGM Exporter** - Running on GPU nodes
+- ✅ **kube-state-metrics** - With `--metric-labels-allowlist=pods=[*]`
+- ✅ **Persistent Storage** (optional) - For dashboard/datasource persistence
+
+Validation:
+
+```bash
+# Check Prometheus
+kubectl get svc -A | grep prometheus
+
+# Check DCGM exporter
+kubectl get pods -A | grep dcgm
+
+# Verify DCGM metrics
+kubectl port-forward -n <prometheus-ns> svc/<prometheus-svc> 9090:9090 &
+curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL' | jq '.data.result | length'
+```
+
+## Upgrading
+
+To upgrade an existing deployment with new values:
+
+```bash
+helm upgrade gpu-grafana grafana/grafana \
+  -f helm/grafana-values.yaml \
+  -f helm/grafana-values-vanilla-k8s.yaml \
+  -n monitoring
+```
+
+To upgrade the Grafana chart version:
+
+```bash
+helm repo update
+helm search repo grafana/grafana --versions | head -5  # Check available versions
+
+helm upgrade gpu-grafana grafana/grafana \
+  -f helm/grafana-values.yaml \
+  --version 8.0.0 \
+  -n monitoring
+```
+
+## Uninstalling
+
+```bash
+helm uninstall gpu-grafana -n monitoring
+```
+
+To also delete persistent data:
+
+```bash
+kubectl delete pvc -n monitoring -l app.kubernetes.io/name=grafana
+```
+
+## Troubleshooting
+
+### Dashboard shows "No data"
+
+1. Verify Prometheus datasource:
+   ```bash
+   kubectl exec -n monitoring -it deploy/gpu-grafana -- \
+     wget -O- http://prometheus-k8s.monitoring.svc.cluster.local:9090/api/v1/query?query=up
+   ```
+
+2. Check Grafana logs:
+   ```bash
+   kubectl logs -n monitoring -l app.kubernetes.io/name=grafana --tail=50
+   ```
+
+### Can't login to Grafana
+
+Get admin password:
+
+```bash
+kubectl get secret -n monitoring gpu-grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo
+```
+
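+Reset admin password (Grafana applies `adminPassword` only when it first initializes its database; if the existing database is kept, e.g. with persistence enabled, run `kubectl exec -n monitoring deploy/gpu-grafana -- grafana-cli admin reset-admin-password 'NEW_PASSWORD'` instead):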
+
+```bash
+kubectl delete secret gpu-grafana -n monitoring
+helm upgrade gpu-grafana grafana/grafana \
+  -f helm/grafana-values.yaml \
+  --set adminPassword='NEW_PASSWORD' \
+  -n monitoring
+```
+
+### Ingress not working
+
+Check ingress controller:
+
+```bash
+kubectl get pods -n ingress-nginx
+```
+
+Verify ingress resource:
+
+```bash
+kubectl get ingress -n monitoring
+kubectl describe ingress gpu-grafana -n monitoring
+```
+
+Alternative: Use port-forward for testing:
+
+```bash
+kubectl port-forward -n monitoring svc/gpu-grafana 3000:3000
+```
+
+## Additional Documentation
+
+- **[GRAFANA_DEPLOYMENT.md](../GRAFANA_DEPLOYMENT.md)** - Complete deployment guide with validation steps and security considerations
+- **[DASHBOARD.md](../DASHBOARD.md)** - Dashboard features and usage guide
+- **[gpu-dashboard.json](../gpu-dashboard.json)** - Dashboard source JSON
+
+## Support
+
+For issues or questions:
+
+- GitHub Issues: https://github.com/wseaton/gpu-pruner/issues
+- Grafana Documentation: https://grafana.com/docs/
+- Helm Chart: https://github.com/grafana/helm-charts/tree/main/charts/grafana
diff --git a/helm/grafana-values-openshift.yaml b/helm/grafana-values-openshift.yaml
new file mode 100644
index 0000000..52372ea
--- /dev/null
+++ b/helm/grafana-values-openshift.yaml
@@ -0,0 +1,151 @@
+# Grafana Helm Chart Values for OpenShift
+#
+# OpenShift-specific configuration with Route and token-based Prometheus auth
+#
+# Usage:
+#   helm install gpu-grafana grafana/grafana \
+#     -f helm/grafana-values.yaml \
+#     -f helm/grafana-values-openshift.yaml \
+#     -n monitoring --create-namespace
+
+# Override Prometheus datasource for OpenShift monitoring
+datasources:
+  datasources.yaml:
+    apiVersion: 1
+    datasources:
+      - name: Prometheus
+        type: prometheus
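+        # OpenShift uses Thanos Querier as the query frontend. NOTE: the in-cluster service normally serves HTTPS on port 9091 and requires a bearer token; confirm the URL below for your cluster.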
+        url: http://thanos-querier.openshift-monitoring.svc.cluster.local:9090
+        access: proxy
+        isDefault: true
+        uid: PBFA97CFB590B2093
+        editable: true
+        jsonData:
+          timeInterval: 30s
+          queryTimeout: 60s
+          httpMethod: POST
+          # Token authentication for OpenShift monitoring
+          httpHeaderName1: 'Authorization'
+        # IMPORTANT: Set this token via --set or use a secret
+        # To get a token:
+        #   oc create token grafana -n monitoring   (OpenShift 4.11+; older releases: oc serviceaccounts get-token grafana -n monitoring)
+        # secureJsonData:
+        #   httpHeaderValue1: 'Bearer YOUR_OPENSHIFT_TOKEN_HERE'
+
+# Service account with additional OpenShift annotations
+serviceAccount:
+  create: true
+  name: grafana
+  annotations:
+    serviceaccounts.openshift.io/oauth-redirectreference.grafana: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"grafana"}}'
+
+# RBAC with cluster-monitoring-view for Prometheus access
+rbac:
+  create: true
+  pspEnabled: false
+  extraClusterRoleRules:
+    - apiGroups:
+        - ""
+      resources:
+        - configmaps
+      verbs:
+        - get
+        - list
+        - watch
+
+# Additional ClusterRoleBinding for monitoring access
+# Note: This requires creating the binding separately:
+#   oc adm policy add-cluster-role-to-user cluster-monitoring-view -z grafana -n monitoring
+# Or apply this separately:
+# ---
+# apiVersion: rbac.authorization.k8s.io/v1
+# kind: ClusterRoleBinding
+# metadata:
+#   name: grafana-cluster-monitoring-view
+# roleRef:
+#   apiGroup: rbac.authorization.k8s.io
+#   kind: ClusterRole
+#   name: cluster-monitoring-view
+# subjects:
+# - kind: ServiceAccount
+#   name: grafana
+#   namespace: monitoring
+
+# OpenShift Route configuration (instead of Ingress)
+route:
+  enabled: true
+  host: grafana-gpu.apps.example.com  # Update to match your OpenShift cluster domain
+  tls:
+    enabled: true
+    termination: edge
+    insecureEdgeTerminationPolicy: Redirect
+  annotations:
+    haproxy.router.openshift.io/timeout: 4m
+    haproxy.router.openshift.io/cookie_name: grafana-session
+
+# Disable standard Ingress
+ingress:
+  enabled: false
+
+# OpenShift-compatible security context
+securityContext:
+  runAsNonRoot: true
+  # OpenShift assigns UIDs dynamically from project range
+  # runAsUser: 472
+  # fsGroup: 472
+
+# Pod security context for OpenShift
+podSecurityContext:
+  runAsNonRoot: true
+
+# Environment variables for OpenShift
+env:
+  GF_SECURITY_ADMIN_PASSWORD__FILE: /etc/secrets/admin-password
+  GF_SERVER_ROOT_URL: https://grafana-gpu.apps.example.com
+  GF_SERVER_DOMAIN: grafana-gpu.apps.example.com
+  GF_ANALYTICS_REPORTING_ENABLED: "false"
+  GF_ANALYTICS_CHECK_FOR_UPDATES: "false"
+  # Optional: Enable OAuth proxy for OpenShift SSO
+  # GF_AUTH_PROXY_ENABLED: "true"
+  # GF_AUTH_PROXY_HEADER_NAME: "X-Forwarded-User"
+  # GF_AUTH_PROXY_AUTO_SIGN_UP: "true"
+
+# Grafana configuration for OpenShift
+grafana.ini:
+  server:
+    domain: grafana-gpu.apps.example.com
+    root_url: https://grafana-gpu.apps.example.com
+    serve_from_sub_path: false
+  analytics:
+    reporting_enabled: false
+    check_for_updates: false
+  log:
+    mode: console
+    level: info
+  security:
+    admin_user: admin
+  dashboards:
+    default_home_dashboard_path: /var/lib/grafana/dashboards/gpu-pruner/gpu-dashboard.json
+
+# Note: If you need to deploy a Route manually, use this YAML:
+# ---
+# apiVersion: route.openshift.io/v1
+# kind: Route
+# metadata:
+#   name: grafana
+#   namespace: monitoring
+#   annotations:
+#     haproxy.router.openshift.io/timeout: 4m
+# spec:
+#   host: grafana-gpu.apps.example.com
+#   to:
+#     kind: Service
+#     name: gpu-grafana
+#     weight: 100
+#   port:
+#     targetPort: 3000
+#   tls:
+#     termination: edge
+#     insecureEdgeTerminationPolicy: Redirect
+#   wildcardPolicy: None
diff --git a/helm/grafana-values-vanilla-k8s.yaml b/helm/grafana-values-vanilla-k8s.yaml
new file mode 100644
index 0000000..343baa2
--- /dev/null
+++ b/helm/grafana-values-vanilla-k8s.yaml
@@ -0,0 +1,172 @@
+# Grafana Helm Chart Values for Vanilla Kubernetes
+#
+# Standard Kubernetes configuration with Ingress and custom Prometheus
+#
+# Usage:
+#   helm install gpu-grafana grafana/grafana \
+#     -f helm/grafana-values.yaml \
+#     -f helm/grafana-values-vanilla-k8s.yaml \
+#     -n monitoring --create-namespace
+
+# Override Prometheus datasource for custom Prometheus deployment
+datasources:
+  datasources.yaml:
+    apiVersion: 1
+    datasources:
+      - name: Prometheus
+        type: prometheus
+        # Update this URL to match your Prometheus service
+        # Common patterns:
+        #   Prometheus Operator: http://prometheus-k8s.monitoring.svc.cluster.local:9090
+        #   Kube-prometheus-stack: http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090
+        #   Custom deployment: http://prometheus.prometheus.svc.cluster.local:9090
+        url: http://prometheus-k8s.monitoring.svc.cluster.local:9090
+        access: proxy
+        isDefault: true
+        uid: PBFA97CFB590B2093
+        editable: true
+        jsonData:
+          timeInterval: 30s
+          queryTimeout: 60s
+          httpMethod: POST
+        # If your Prometheus requires authentication:
+        # basicAuth: true
+        # basicAuthUser: admin
+        # secureJsonData:
+        #   basicAuthPassword: 'YOUR_PASSWORD_HERE'
+
+# Ingress configuration (nginx example)
+ingress:
+  enabled: true
+  ingressClassName: nginx
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt-prod
+    nginx.ingress.kubernetes.io/ssl-redirect: "true"
+    nginx.ingress.kubernetes.io/backend-protocol: "HTTP"
+    # For large dashboards or long queries:
+    nginx.ingress.kubernetes.io/proxy-body-size: "10m"
+    nginx.ingress.kubernetes.io/proxy-read-timeout: "300"
+  hosts:
+    - grafana-gpu.example.com
+  tls:
+    - secretName: grafana-tls
+      hosts:
+        - grafana-gpu.example.com
+  path: /
+  pathType: Prefix
+
+# Alternative: Traefik Ingress configuration
+# ingress:
+#   enabled: true
+#   ingressClassName: traefik
+#   annotations:
+#     cert-manager.io/cluster-issuer: letsencrypt-prod
+#     traefik.ingress.kubernetes.io/router.entrypoints: websecure
+#     traefik.ingress.kubernetes.io/router.tls: "true"
+#   hosts:
+#     - grafana-gpu.example.com
+#   tls:
+#     - secretName: grafana-tls
+#       hosts:
+#         - grafana-gpu.example.com
+
+# Service configuration (ClusterIP with Ingress)
+service:
+  enabled: true
+  type: ClusterIP
+  port: 3000
+  targetPort: 3000
+
+# For LoadBalancer service (alternative to Ingress):
+# service:
+#   type: LoadBalancer
+#   loadBalancerIP: 192.168.1.100  # Optional: static IP
+#   annotations:
+#     service.beta.kubernetes.io/aws-load-balancer-type: "nlb"  # For AWS
+#     cloud.google.com/load-balancer-type: "Internal"  # For GCP
+
+# For NodePort service (alternative to Ingress):
+# service:
+#   type: NodePort
+#   nodePort: 30300
+#   port: 3000
+
+# Security context for standard Kubernetes
+securityContext:
+  runAsNonRoot: true
+  runAsUser: 472
+  fsGroup: 472
+
+# Pod security context
+podSecurityContext:
+  runAsNonRoot: true
+  fsGroup: 472
+
+# Environment variables
+env:
+  GF_SECURITY_ADMIN_PASSWORD__FILE: /etc/secrets/admin-password
+  GF_SERVER_ROOT_URL: https://grafana-gpu.example.com
+  GF_SERVER_DOMAIN: grafana-gpu.example.com
+  GF_ANALYTICS_REPORTING_ENABLED: "false"
+  GF_ANALYTICS_CHECK_FOR_UPDATES: "false"
+
+# Grafana configuration
+grafana.ini:
+  server:
+    domain: grafana-gpu.example.com
+    root_url: https://grafana-gpu.example.com
+  analytics:
+    reporting_enabled: false
+    check_for_updates: false
+  log:
+    mode: console
+    level: info
+  auth.anonymous:
+    enabled: false
+  security:
+    admin_user: admin
+  dashboards:
+    default_home_dashboard_path: /var/lib/grafana/dashboards/gpu-pruner/gpu-dashboard.json
+
+# Service account
+serviceAccount:
+  create: true
+  name: grafana
+  annotations: {}
+
+# RBAC
+rbac:
+  create: true
+  pspEnabled: false
+  extraClusterRoleRules:
+    - apiGroups:
+        - ""
+      resources:
+        - configmaps
+      verbs:
+        - get
+        - list
+        - watch
+
+# Optional: PodDisruptionBudget for high availability
+# podDisruptionBudget:
+#   minAvailable: 1
+
+# Optional: HorizontalPodAutoscaler
+# autoscaling:
+#   enabled: true
+#   minReplicas: 1
+#   maxReplicas: 3
+#   targetCPU: 80
+#   targetMemory: 80
+
+# Optional: Node affinity to schedule on specific nodes
+# nodeSelector:
+#   node-role.kubernetes.io/monitoring: "true"
+
+# Optional: Tolerations for tainted nodes
+# tolerations:
+#   - key: monitoring
+#     operator: Equal
+#     value: "true"
+#     effect: NoSchedule
diff --git a/helm/grafana-values.yaml b/helm/grafana-values.yaml
new file mode 100644
index 0000000..1b3209d
--- /dev/null
+++ b/helm/grafana-values.yaml
@@ -0,0 +1,234 @@
+# Grafana Helm Chart Values for GPU Dashboard
+#
+# This values file deploys Grafana with the GPU Pruner dashboard pre-configured.
+#
+# Usage:
+#   helm repo add grafana https://grafana.github.io/helm-charts
+#   helm install gpu-grafana grafana/grafana -f helm/grafana-values.yaml -n monitoring --create-namespace
+#
+# Prerequisites:
+#   - Prometheus accessible in cluster
+#   - DCGM exporter running on GPU nodes
+#   - kube-state-metrics with pod labels enabled
+
+# Replica count for Grafana
+replicas: 1
+
+# Grafana image configuration
+image:
+  repository: grafana/grafana-oss
+  tag: "10.0.0"
+  pullPolicy: IfNotPresent
+
+# Admin user configuration
+# SECURITY: Use a secret for production deployments
+adminUser: admin
+# adminPassword: "changeme"  # Set via --set or use existingSecret
+
+# Use existing secret for admin credentials (recommended for production)
+# admin:
+#   existingSecret: grafana-admin-secret
+#   userKey: admin-user
+#   passwordKey: admin-password
+
+# Persistence configuration for dashboards and plugins
+persistence:
+  enabled: true
+  type: pvc
+  size: 10Gi
+  # storageClassName: default
+  accessModes:
+    - ReadWriteOnce
+
+# Prometheus datasource configuration
+# CRITICAL: UID must match gpu-dashboard.json (PBFA97CFB590B2093)
+datasources:
+  datasources.yaml:
+    apiVersion: 1
+    datasources:
+      - name: Prometheus
+        type: prometheus
+        # IMPORTANT: Update this URL to match your Prometheus service
+        # Examples:
+        #   OpenShift: https://thanos-querier.openshift-monitoring.svc.cluster.local:9091 (HTTPS, bearer token required)
+        #   Prometheus Operator: http://prometheus-k8s.monitoring.svc.cluster.local:9090
+        #   Custom: http://prometheus.prometheus.svc.cluster.local:9090
+        url: http://prometheus-k8s.monitoring.svc.cluster.local:9090
+        access: proxy
+        isDefault: true
+        # CRITICAL: This UID MUST match the hardcoded UID in gpu-dashboard.json
+        uid: PBFA97CFB590B2093
+        editable: true
+        jsonData:
+          timeInterval: 30s
+          queryTimeout: 60s
+          httpMethod: POST
+        # For OpenShift with token authentication, uncomment:
+        # secureJsonData:
+        #   httpHeaderValue1: 'Bearer YOUR_TOKEN_HERE'
+        # jsonData:
+        #   httpHeaderName1: 'Authorization'
+
+# Dashboard provisioning configuration
+dashboardProviders:
+  dashboardproviders.yaml:
+    apiVersion: 1
+    providers:
+      - name: 'gpu-pruner'
+        orgId: 1
+        folder: 'GPU Monitoring'
+        type: file
+        disableDeletion: false
+        editable: true
+        updateIntervalSeconds: 10
+        allowUiUpdates: true
+        options:
+          path: /var/lib/grafana/dashboards/gpu-pruner
+
+# Dashboard import configuration
+dashboards:
+  gpu-pruner:
+    gpu-dashboard:
+      # Import dashboard from GitHub repository
+      url: https://raw.githubusercontent.com/wseaton/gpu-pruner/main/gpu-dashboard.json
+      datasource: Prometheus
+      # Alternative: Use local file via ConfigMap (see grafana-dashboard-configmap.yaml)
+      # configMapRef:
+      #   name: grafana-gpu-dashboard
+      #   key: gpu-dashboard.json
+
+# Service configuration
+service:
+  enabled: true
+  type: ClusterIP
+  port: 3000
+  targetPort: 3000
+  # Uncomment for NodePort or LoadBalancer
+  # type: NodePort
+  # nodePort: 30300
+  # type: LoadBalancer
+
+# Ingress configuration (for vanilla Kubernetes)
+ingress:
+  enabled: false
+  ingressClassName: nginx
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt-prod
+    # For other ingress controllers, adjust annotations accordingly
+  hosts:
+    - grafana-gpu.example.com
+  tls:
+    - secretName: grafana-tls
+      hosts:
+        - grafana-gpu.example.com
+  path: /
+  pathType: Prefix
+
+# Resource limits and requests
+resources:
+  limits:
+    cpu: 500m
+    memory: 512Mi
+  requests:
+    cpu: 250m
+    memory: 256Mi
+
+# Security context
+securityContext:
+  runAsNonRoot: true
+  runAsUser: 472
+  fsGroup: 472
+
+# Pod annotations
+podAnnotations:
+  prometheus.io/scrape: "true"
+  prometheus.io/port: "3000"
+
+# Environment variables
+env:
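+  GF_SECURITY_ADMIN_PASSWORD__FILE: /etc/secrets/admin-password  # NOTE(review): nothing in this file mounts /etc/secrets; confirm a matching extraSecretMounts entry exists, or drop this so adminPassword / admin.existingSecret is used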
+  GF_INSTALL_PLUGINS: ""
+  GF_ANALYTICS_REPORTING_ENABLED: "false"
+  GF_ANALYTICS_CHECK_FOR_UPDATES: "false"
+  # For anonymous access (optional, not recommended for production)
+  # GF_AUTH_ANONYMOUS_ENABLED: "true"
+  # GF_AUTH_ANONYMOUS_ORG_ROLE: "Viewer"
+
+# ConfigMaps to mount as volumes
+extraConfigmapMounts: []
+  # - name: gpu-dashboard
+  #   mountPath: /var/lib/grafana/dashboards/gpu-pruner
+  #   configMap: grafana-gpu-dashboard
+  #   readOnly: true
+
+# Service account configuration
+serviceAccount:
+  create: true
+  name: grafana
+  annotations: {}
+
+# RBAC configuration
+rbac:
+  create: true
+  pspEnabled: false
+  # For sidecar dashboard provisioning, additional permissions needed
+  extraClusterRoleRules:
+    - apiGroups:
+        - ""
+      resources:
+        - configmaps
+      verbs:
+        - get
+        - list
+        - watch
+
+# Node selector for pod placement (optional)
+nodeSelector: {}
+  # Example: Deploy on specific nodes
+  # kubernetes.io/hostname: grafana-node
+
+# Tolerations (optional)
+tolerations: []
+
+# Affinity rules (optional)
+affinity: {}
+
+# Grafana.ini configuration
+grafana.ini:
+  server:
+    domain: grafana-gpu.example.com
+    root_url: "%(protocol)s://%(domain)s/"
+  analytics:
+    reporting_enabled: false
+    check_for_updates: false
+  log:
+    mode: console
+    level: info
+  auth.anonymous:
+    enabled: false
+  security:
+    admin_user: admin
+    # Use secret for password in production
+  dashboards:
+    default_home_dashboard_path: /var/lib/grafana/dashboards/gpu-pruner/gpu-dashboard.json
+
+# Sidecar configuration (alternative dashboard provisioning method)
+sidecar:
+  dashboards:
+    enabled: false  # Set to true to use ConfigMap-based provisioning
+    label: grafana_dashboard
+    labelValue: "1"
+    folder: /tmp/dashboards
+    defaultDashboardsEnabled: true
+    searchNamespace: monitoring
+  datasources:
+    enabled: false
+
+# Plugins to install (optional)
+plugins: []
+  # - grafana-piechart-panel
+  # - grafana-worldmap-panel
+
+# Image renderer for PDF exports (optional)
+imageRenderer:
+  enabled: false
diff --git a/import-dashboard.sh b/import-dashboard.sh
new file mode 100755
index 0000000..0670e68
--- /dev/null
+++ b/import-dashboard.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+#
+# Import GPU Dashboard to Grafana
+#
+# Usage: ./import-dashboard.sh [grafana-url] [admin-password]
+# Requires curl and jq; run it from the directory that contains gpu-dashboard.json.
+# Example: ./import-dashboard.sh http://localhost:3000 GpuPruner2026!
+
+GRAFANA_URL="${1:-http://localhost:3000}"
+ADMIN_PASSWORD="${2:-GpuPruner2026!}"
+DASHBOARD_FILE="gpu-dashboard.json"
+
+echo "Importing GPU Dashboard to Grafana..."
+echo "Grafana URL: $GRAFANA_URL"
+
+# Test Grafana connectivity
+echo -n "Testing Grafana connectivity... "
+if curl -s "$GRAFANA_URL/api/health" | grep -q "ok"; then
+    echo "✓ Connected"
+else
+    echo "✗ Failed to connect to Grafana"
+    exit 1
+fi
+
+# Prepare dashboard JSON (wrap in API format); stop early if the file is missing or invalid
+DASHBOARD_JSON=$(jq '{dashboard: ., overwrite: true, folderId: 0}' "$DASHBOARD_FILE") \
+    || { echo "✗ Failed to read $DASHBOARD_FILE"; exit 1; }
+# Import dashboard (basic auth as the "admin" user)
+echo -n "Importing dashboard... "
+RESPONSE=$(curl -s -X POST \
+    -H "Content-Type: application/json" \
+    -u "admin:$ADMIN_PASSWORD" \
+    -d "$DASHBOARD_JSON" \
+    "$GRAFANA_URL/api/dashboards/db")
+
+if echo "$RESPONSE" | jq -e '.status == "success"' > /dev/null 2>&1; then
+    echo "✓ Success"
+    DASHBOARD_URL=$(echo "$RESPONSE" | jq -r '.url')
+    echo ""
+    echo "Dashboard imported successfully!"
+    echo "Access it at: $GRAFANA_URL$DASHBOARD_URL"
+else
+    echo "✗ Failed"
+    echo ""
+    echo "Error response:"
+    echo "$RESPONSE" | jq . 2>/dev/null || echo "$RESPONSE"  # show the raw body when it is not JSON
+    exit 1
+fi

From e04a235c10fcb9100ee000a4e12c9664bffb7f42 Mon Sep 17 00:00:00 2001
From: fuddin-bit <fuddin@redhat.com>
Date: Thu, 4 Jun 2026 14:48:03 -0400
Subject: [PATCH 3/7] Add graphics engine active panel to GPU Health dashboard
 row.

Visualize DCGM_FI_PROF_GR_ENGINE_ACTIVE per node and document PromQL
in DASHBOARD.md and GRAFANA_DEPLOYMENT.md.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 DASHBOARD.md                          |  1 +
 GRAFANA_DEPLOYMENT.md                 |  1 +
 gpu-dashboard.json                    | 34 +++++++++++++++++++++++----
 helm/grafana-dashboard-configmap.yaml | 34 +++++++++++++++++++++++----
 4 files changed, 62 insertions(+), 8 deletions(-)

diff --git a/DASHBOARD.md b/DASHBOARD.md
index 9965223..154a65c 100644
--- a/DASHBOARD.md
+++ b/DASHBOARD.md
@@ -153,6 +153,7 @@ Panels in the **GPU Health & DCGM** row use additional dcgm-exporter counters. P
 | Power draw by node | `sum by (Hostname) (DCGM_FI_DEV_POWER_USAGE)` |
 | VRAM utilization % | `100 * avg by (Hostname, gpu) (DCGM_FI_DEV_FB_USED / DCGM_FI_DEV_FB_TOTAL)` |
 | Memory copy utilization | `avg by (Hostname) (DCGM_FI_DEV_MEM_COPY_UTIL)` |
+| Graphics/compute engine active by node | `avg by (Hostname) (DCGM_FI_PROF_GR_ENGINE_ACTIVE)` |
 | XID errors (1h increase) | `sum by (Hostname, gpu) (increase(DCGM_FI_DEV_XID_ERRORS[1h]))` |
 | SM active by node | `avg by (Hostname) (DCGM_FI_PROF_SM_ACTIVE)` |
 | Tensor pipe active by node | `avg by (Hostname) (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE)` |
diff --git a/GRAFANA_DEPLOYMENT.md b/GRAFANA_DEPLOYMENT.md
index 5981ce7..d1ec951 100644
--- a/GRAFANA_DEPLOYMENT.md
+++ b/GRAFANA_DEPLOYMENT.md
@@ -412,6 +412,7 @@ Check each panel shows data (not "No data"):
 | Peak power | `max(DCGM_FI_DEV_POWER_USAGE)` | Max per-GPU watts |
 | XID errors | `sum(DCGM_FI_DEV_XID_ERRORS)` | Should be 0 in healthy clusters |
 | VRAM utilization % | `100 * avg by (Hostname, gpu) (DCGM_FI_DEV_FB_USED / DCGM_FI_DEV_FB_TOTAL)` | Per-GPU VRAM fill |
+| Graphics/compute engine active | `avg by (Hostname) (DCGM_FI_PROF_GR_ENGINE_ACTIVE)` | 0–1; primary idle-detection metric |
 | SM active (profiling) | `avg by (Hostname) (DCGM_FI_PROF_SM_ACTIVE)` | No data if profiling disabled |
 
 ### 6. Test Prometheus Queries Manually
diff --git a/gpu-dashboard.json b/gpu-dashboard.json
index dbd3391..55cf0d6 100644
--- a/gpu-dashboard.json
+++ b/gpu-dashboard.json
@@ -670,11 +670,37 @@
         }
       }
     },
+    {
+      "title": "Graphics/compute engine active by node",
+      "description": "Average graphics/compute engine activity (DCGM_FI_PROF_GR_ENGINE_ACTIVE, 0–1). Primary metric for idle detection in gpu-pruner and engine idle/active overview stats.",
+      "type": "timeseries",
+      "gridPos": { "h": 6, "w": 24, "x": 0, "y": 99 },
+      "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+      "targets": [
+        {
+          "expr": "avg by (Hostname) (DCGM_FI_PROF_GR_ENGINE_ACTIVE)",
+          "legendFormat": "{{Hostname}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit",
+          "min": 0,
+          "max": 1,
+          "custom": {
+            "drawStyle": "line",
+            "lineWidth": 2,
+            "fillOpacity": 15,
+            "spanNulls": true
+          }
+        }
+      }
+    },
     {
       "title": "SM active by node",
       "description": "DCGM profiling: streaming multiprocessor activity (DCGM_FI_PROF_SM_ACTIVE). No data if DCGM profiling is not enabled on the exporter.",
       "type": "timeseries",
-      "gridPos": { "h": 8, "w": 8, "x": 0, "y": 99 },
+      "gridPos": { "h": 8, "w": 8, "x": 0, "y": 105 },
       "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
       "targets": [
         {
@@ -700,7 +726,7 @@
       "title": "Tensor pipe active by node",
       "description": "DCGM profiling: tensor pipe activity (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE). No data if DCGM profiling is not enabled.",
       "type": "timeseries",
-      "gridPos": { "h": 8, "w": 8, "x": 8, "y": 99 },
+      "gridPos": { "h": 8, "w": 8, "x": 8, "y": 105 },
       "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
       "targets": [
         {
@@ -726,7 +752,7 @@
       "title": "DRAM active by node",
       "description": "DCGM profiling: DRAM activity (DCGM_FI_PROF_DRAM_ACTIVE). No data if DCGM profiling is not enabled.",
       "type": "timeseries",
-      "gridPos": { "h": 8, "w": 8, "x": 16, "y": 99 },
+      "gridPos": { "h": 8, "w": 8, "x": 16, "y": 105 },
       "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
       "targets": [
         {
@@ -757,5 +783,5 @@
   "timezone": "browser",
   "title": "Waldorf GPU Usage & Idle Tracker",
   "uid": "prometheus",
-  "version": 2
+  "version": 3
 }
diff --git a/helm/grafana-dashboard-configmap.yaml b/helm/grafana-dashboard-configmap.yaml
index 3fe4be0..6380fa2 100644
--- a/helm/grafana-dashboard-configmap.yaml
+++ b/helm/grafana-dashboard-configmap.yaml
@@ -682,11 +682,37 @@ data:
             }
           }
         },
+        {
+          "title": "Graphics/compute engine active by node",
+          "description": "Average graphics/compute engine activity (DCGM_FI_PROF_GR_ENGINE_ACTIVE, 0–1). Primary metric for idle detection in gpu-pruner and engine idle/active overview stats.",
+          "type": "timeseries",
+          "gridPos": { "h": 6, "w": 24, "x": 0, "y": 99 },
+          "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
+          "targets": [
+            {
+              "expr": "avg by (Hostname) (DCGM_FI_PROF_GR_ENGINE_ACTIVE)",
+              "legendFormat": "{{Hostname}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percentunit",
+              "min": 0,
+              "max": 1,
+              "custom": {
+                "drawStyle": "line",
+                "lineWidth": 2,
+                "fillOpacity": 15,
+                "spanNulls": true
+              }
+            }
+          }
+        },
         {
           "title": "SM active by node",
           "description": "DCGM profiling: streaming multiprocessor activity (DCGM_FI_PROF_SM_ACTIVE). No data if DCGM profiling is not enabled on the exporter.",
           "type": "timeseries",
-          "gridPos": { "h": 8, "w": 8, "x": 0, "y": 99 },
+          "gridPos": { "h": 8, "w": 8, "x": 0, "y": 105 },
           "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
           "targets": [
             {
@@ -712,7 +738,7 @@ data:
           "title": "Tensor pipe active by node",
           "description": "DCGM profiling: tensor pipe activity (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE). No data if DCGM profiling is not enabled.",
           "type": "timeseries",
-          "gridPos": { "h": 8, "w": 8, "x": 8, "y": 99 },
+          "gridPos": { "h": 8, "w": 8, "x": 8, "y": 105 },
           "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
           "targets": [
             {
@@ -738,7 +764,7 @@ data:
           "title": "DRAM active by node",
           "description": "DCGM profiling: DRAM activity (DCGM_FI_PROF_DRAM_ACTIVE). No data if DCGM profiling is not enabled.",
           "type": "timeseries",
-          "gridPos": { "h": 8, "w": 8, "x": 16, "y": 99 },
+          "gridPos": { "h": 8, "w": 8, "x": 16, "y": 105 },
           "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
           "targets": [
             {
@@ -769,5 +795,5 @@ data:
       "timezone": "browser",
       "title": "Waldorf GPU Usage & Idle Tracker",
       "uid": "prometheus",
-      "version": 2
+      "version": 3
     }

From bacde097f8a9dd2a3bc6d40cba9cac970c9e97cd Mon Sep 17 00:00:00 2001
From: fuddin-bit <fuddin@redhat.com>
Date: Mon, 8 Jun 2026 10:05:42 -0400
Subject: [PATCH 4/7] Add (30m) labels and refIds to GPU dashboard queries

Append "(30m)" to the legend formats of the idle, active, and per-deployment
time-series queries, and give every 30-minute target an explicit refId, so the
lookback window is visible in Grafana. The same change is applied to
gpu-dashboard.json and helm/grafana-dashboard-configmap.yaml to keep them in
sync.

---
 gpu-dashboard.json                    | 17 +++++++++++------
 helm/grafana-dashboard-configmap.yaml | 17 +++++++++++------
 2 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/gpu-dashboard.json b/gpu-dashboard.json
index 55cf0d6..c414ca7 100644
--- a/gpu-dashboard.json
+++ b/gpu-dashboard.json
@@ -82,7 +82,8 @@
       "targets": [
         {
           "expr": "count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m]) == 0)",
-          "legendFormat": "Idle"
+          "legendFormat": "Idle (30m)",
+          "refId": "Idle (30m)"
         }
       ],
       "fieldConfig": {
@@ -102,7 +103,8 @@
       "targets": [
         {
           "expr": "count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m]) > 0)",
-          "legendFormat": "Active"
+          "legendFormat": "Active (30m)",
+          "refId": "Active (30m)"
         }
       ],
       "fieldConfig": {
@@ -233,7 +235,8 @@
           "expr": "max by (Hostname, gpu, modelName) (max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m])) == 0",
           "legendFormat": "",
           "format": "table",
-          "instant": true
+          "instant": true,
+          "refId": "Idle GPUs (30m)"
         }
       ],
       "transformations": [
@@ -266,8 +269,9 @@
       "targets": [
         {
           "expr": "(label_replace(sum by (label_app, namespace) (count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\") ) * on (pod, namespace) group_left(label_app) kube_pod_labels{label_app != \"\"}), \"deployment\", \"$1\", \"label_app\", \"(.+)\") or sum by (deployment, namespace) (label_replace(count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\")), \"deployment\", \"$1\", \"pod\", \"^(.*)-[^-]+-[^-]+$\"))) * 0.5",
-          "legendFormat": "{{deployment}} ({{namespace}})",
-          "interval": "1m"
+          "legendFormat": "{{deployment}} ({{namespace}}) (30m)",
+          "interval": "1m",
+          "refId": "By deployment (30m)"
         }
       ],
       "fieldConfig": {
@@ -322,7 +326,8 @@
           "expr": "sort_desc((label_replace(sum by (label_app, namespace) (count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\") ) * on (pod, namespace) group_left(label_app) kube_pod_labels{label_app != \"\"}), \"deployment\", \"$1\", \"label_app\", \"(.+)\") or sum by (deployment, namespace) (label_replace(count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\")), \"deployment\", \"$1\", \"pod\", \"^(.*)-[^-]+-[^-]+$\"))) * 0.5)",
           "legendFormat": "",
           "format": "table",
-          "instant": true
+          "instant": true,
+          "refId": "By deployment (30m)"
         }
       ],
       "transformations": [
diff --git a/helm/grafana-dashboard-configmap.yaml b/helm/grafana-dashboard-configmap.yaml
index 6380fa2..8d52030 100644
--- a/helm/grafana-dashboard-configmap.yaml
+++ b/helm/grafana-dashboard-configmap.yaml
@@ -94,7 +94,8 @@ data:
           "targets": [
             {
               "expr": "count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m]) == 0)",
-              "legendFormat": "Idle"
+              "legendFormat": "Idle (30m)",
+              "refId": "Idle (30m)"
             }
           ],
           "fieldConfig": {
@@ -114,7 +115,8 @@ data:
           "targets": [
             {
               "expr": "count(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m]) > 0)",
-              "legendFormat": "Active"
+              "legendFormat": "Active (30m)",
+              "refId": "Active (30m)"
             }
           ],
           "fieldConfig": {
@@ -245,7 +247,8 @@ data:
               "expr": "max by (Hostname, gpu, modelName) (max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE[30m])) == 0",
               "legendFormat": "",
               "format": "table",
-              "instant": true
+              "instant": true,
+              "refId": "Idle GPUs (30m)"
             }
           ],
           "transformations": [
@@ -278,8 +281,9 @@ data:
           "targets": [
             {
               "expr": "(label_replace(sum by (label_app, namespace) (count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\") ) * on (pod, namespace) group_left(label_app) kube_pod_labels{label_app != \"\"}), \"deployment\", \"$1\", \"label_app\", \"(.+)\") or sum by (deployment, namespace) (label_replace(count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\")), \"deployment\", \"$1\", \"pod\", \"^(.*)-[^-]+-[^-]+$\"))) * 0.5",
-              "legendFormat": "{{deployment}} ({{namespace}})",
-              "interval": "1m"
+              "legendFormat": "{{deployment}} ({{namespace}}) (30m)",
+              "interval": "1m",
+              "refId": "By deployment (30m)"
             }
           ],
           "fieldConfig": {
@@ -334,7 +338,8 @@ data:
               "expr": "sort_desc((label_replace(sum by (label_app, namespace) (count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\") ) * on (pod, namespace) group_left(label_app) kube_pod_labels{label_app != \"\"}), \"deployment\", \"$1\", \"label_app\", \"(.+)\") or sum by (deployment, namespace) (label_replace(count by (pod, namespace) (label_replace(label_replace((max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100 or max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_pod != \"\", namespace!~\"cw-.*\"}[30m]) / 100) == bool 0, \"pod\", \"$1\", \"exported_pod\", \"(.+)\"), \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\")), \"deployment\", \"$1\", \"pod\", \"^(.*)-[^-]+-[^-]+$\"))) * 0.5)",
               "legendFormat": "",
               "format": "table",
-              "instant": true
+              "instant": true,
+              "refId": "By deployment (30m)"
             }
           ],
           "transformations": [

From d91b98758138f693335bcf94c67042b7197b3f87 Mon Sep 17 00:00:00 2001
From: fuddin-bit <fuddin@redhat.com>
Date: Tue, 9 Jun 2026 14:56:46 -0400
Subject: [PATCH 5/7] Add Slack notifications and remove the web dashboard UI

---
 README.md                                     |  19 ++
 gpu-pruner/hack/deployment.yaml               |   7 +
 .../hack/slack-webhook-secret.example.yaml    |  14 +
 gpu-pruner/src/dashboard.html                 | 296 ------------------
 gpu-pruner/src/dashboard.rs                   | 105 -------
 gpu-pruner/src/lib.rs                         |  96 +++++-
 gpu-pruner/src/main.rs                        | 163 ++++++----
 gpu-pruner/src/metrics.rs                     |  47 +++
 gpu-pruner/src/slack.rs                       | 121 +++++++
 gpu-pruner/tests/e2e.rs                       |   4 +-
 10 files changed, 397 insertions(+), 475 deletions(-)
 create mode 100644 gpu-pruner/hack/slack-webhook-secret.example.yaml
 delete mode 100644 gpu-pruner/src/dashboard.html
 delete mode 100644 gpu-pruner/src/dashboard.rs
 create mode 100644 gpu-pruner/src/slack.rs

diff --git a/README.md b/README.md
index e5d9018..f443f37 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,24 @@ The background for `gpu-pruner` is that in certain environments it is very easy
 
 This culler politely pauses workloads that appear idle by scaling them down to 0 replicas. Features may be added in the future for better notifications, but the idea is that a user can simply re-enable the workload when they are ready to test/demo again.
 
+## Acknowledgment System
+
+**NEW**: Prevent unwanted scale-downs by acknowledging workloads that are intentionally idle.
+
+Users can acknowledge idle workloads by annotating them, which prevents gpu-pruner from scaling them down until the acknowledgment expires. Use cases:
+- Loading large datasets
+- Model warm-up / compilation
+- Interactive debugging sessions
+- Scheduled batch jobs with intermittent GPU usage
+
+**Quick Start:**
+1. Choose an expiry time as an RFC 3339 timestamp, e.g. `2026-06-10T18:00:00Z`
+2. Set it on the workload: `kubectl annotate <kind>/<name> -n <namespace> gpu-pruner.io/ack-until=<timestamp> --overwrite`
+3. Optionally record who acknowledged it with the `gpu-pruner.io/ack-by` annotation (e.g. your email address)
+4. Acknowledged workloads won't be scaled down until the acknowledgment expires
+
+See [ACKNOWLEDGMENT_GUIDE.md](ACKNOWLEDGMENT_GUIDE.md) for complete documentation, API usage, and troubleshooting.
+
 ## Dashboard
 
 The gpu-pruner includes both a **web dashboard** and a **Grafana dashboard** for monitoring GPU workloads.
@@ -26,6 +44,7 @@ Real-time web interface for monitoring GPU workloads. See [DASHBOARD.md](DASHBOA
 
 Features:
 - Real-time monitoring of idle GPU workloads
+- **Acknowledgment system** - prevent scale-downs with duration-based acknowledgments
 - Resource usage statistics
 - Modern web UI with auto-refresh
 - REST API endpoint for programmatic access
diff --git a/gpu-pruner/hack/deployment.yaml b/gpu-pruner/hack/deployment.yaml
index 58c7b07..797685e 100644
--- a/gpu-pruner/hack/deployment.yaml
+++ b/gpu-pruner/hack/deployment.yaml
@@ -23,11 +23,18 @@ spec:
             - '--run-mode=scale-down'
             - '--prometheus-url=http://thanos-querier.openshift-monitoring.svc.cluster.local'
             - '--dashboard-port=8080'
+            - '--slack-channel=#test-pruner'
           env:
             - name: RUST_BACKTRACE
               value: '1'
             - name: RUST_LOG
               value: info
+            - name: SLACK_WEBHOOK_URL
+              valueFrom:
+                secretKeyRef:
+                  name: gpu-pruner-slack-webhook
+                  key: webhook-url
+                  optional: true
           ports:
             - containerPort: 8080
               name: dashboard
diff --git a/gpu-pruner/hack/slack-webhook-secret.example.yaml b/gpu-pruner/hack/slack-webhook-secret.example.yaml
new file mode 100644
index 0000000..6ab66f3
--- /dev/null
+++ b/gpu-pruner/hack/slack-webhook-secret.example.yaml
@@ -0,0 +1,14 @@
+# Create the real secret (do not commit webhook URLs):
+#
+#   kubectl create secret generic gpu-pruner-slack-webhook \
+#     --namespace=gpu-pruner-system \
+#     --from-literal=webhook-url='https://hooks.slack.com/services/T.../B.../...'
+#
+apiVersion: v1
+kind: Secret
+metadata:
+  name: gpu-pruner-slack-webhook
+  namespace: gpu-pruner-system
+type: Opaque
+stringData:
+  webhook-url: https://hooks.slack.com/services/REPLACE/ME/REPLACE
diff --git a/gpu-pruner/src/dashboard.html b/gpu-pruner/src/dashboard.html
deleted file mode 100644
index e7c4e17..0000000
--- a/gpu-pruner/src/dashboard.html
+++ /dev/null
@@ -1,296 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>GPU Pruner Dashboard</title>
-    <style>
-        * {
-            margin: 0;
-            padding: 0;
-            box-sizing: border-box;
-        }
-
-        body {
-            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
-            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-            min-height: 100vh;
-            padding: 20px;
-        }
-
-        .container {
-            max-width: 1400px;
-            margin: 0 auto;
-        }
-
-        .header {
-            background: white;
-            border-radius: 12px;
-            padding: 30px;
-            margin-bottom: 20px;
-            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
-        }
-
-        .header h1 {
-            color: #333;
-            font-size: 32px;
-            margin-bottom: 10px;
-        }
-
-        .header p {
-            color: #666;
-            font-size: 16px;
-        }
-
-        .stats-grid {
-            display: grid;
-            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
-            gap: 20px;
-            margin-bottom: 20px;
-        }
-
-        .stat-card {
-            background: white;
-            border-radius: 12px;
-            padding: 25px;
-            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
-            transition: transform 0.2s;
-        }
-
-        .stat-card:hover {
-            transform: translateY(-5px);
-        }
-
-        .stat-label {
-            color: #666;
-            font-size: 14px;
-            text-transform: uppercase;
-            letter-spacing: 1px;
-            margin-bottom: 10px;
-        }
-
-        .stat-value {
-            color: #333;
-            font-size: 36px;
-            font-weight: bold;
-        }
-
-        .stat-card.warning .stat-value {
-            color: #f59e0b;
-        }
-
-        .workloads-section {
-            background: white;
-            border-radius: 12px;
-            padding: 30px;
-            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
-        }
-
-        .section-title {
-            font-size: 24px;
-            color: #333;
-            margin-bottom: 20px;
-            padding-bottom: 10px;
-            border-bottom: 2px solid #667eea;
-        }
-
-        .workload-table {
-            width: 100%;
-            border-collapse: collapse;
-        }
-
-        .workload-table thead {
-            background: #f9fafb;
-        }
-
-        .workload-table th {
-            text-align: left;
-            padding: 12px;
-            color: #666;
-            font-weight: 600;
-            font-size: 14px;
-            text-transform: uppercase;
-            letter-spacing: 0.5px;
-        }
-
-        .workload-table td {
-            padding: 15px 12px;
-            border-top: 1px solid #e5e7eb;
-            color: #333;
-        }
-
-        .workload-table tbody tr:hover {
-            background: #f9fafb;
-        }
-
-        .badge {
-            display: inline-block;
-            padding: 4px 12px;
-            border-radius: 12px;
-            font-size: 12px;
-            font-weight: 600;
-            text-transform: uppercase;
-        }
-
-        .badge.deployment {
-            background: #dbeafe;
-            color: #1e40af;
-        }
-
-        .badge.statefulset {
-            background: #fef3c7;
-            color: #92400e;
-        }
-
-        .badge.replicaset {
-            background: #ddd6fe;
-            color: #5b21b6;
-        }
-
-        .badge.notebook {
-            background: #fce7f3;
-            color: #9f1239;
-        }
-
-        .badge.inferenceservice {
-            background: #d1fae5;
-            color: #065f46;
-        }
-
-        .namespace {
-            color: #667eea;
-            font-family: 'Courier New', monospace;
-        }
-
-        .loading {
-            text-align: center;
-            padding: 40px;
-            color: #666;
-        }
-
-        .last-update {
-            text-align: right;
-            color: #666;
-            font-size: 14px;
-            margin-top: 20px;
-            padding-top: 20px;
-            border-top: 1px solid #e5e7eb;
-        }
-
-        .empty-state {
-            text-align: center;
-            padding: 60px 20px;
-            color: #666;
-        }
-
-        .empty-state svg {
-            width: 80px;
-            height: 80px;
-            margin-bottom: 20px;
-            color: #667eea;
-        }
-    </style>
-</head>
-<body>
-    <div class="container">
-        <div class="header">
-            <h1>GPU Pruner Dashboard</h1>
-            <p>Monitor idle GPU workloads in the cluster</p>
-        </div>
-
-        <div class="stats-grid" id="stats">
-            <div class="stat-card">
-                <div class="stat-label">Total Pods Checked</div>
-                <div class="stat-value" id="total-pods">-</div>
-            </div>
-            <div class="stat-card warning">
-                <div class="stat-label">Idle Workloads</div>
-                <div class="stat-value" id="idle-workloads">-</div>
-            </div>
-            <div class="stat-card warning">
-                <div class="stat-label">Wasted GPU Resources</div>
-                <div class="stat-value" id="idle-gpus">-</div>
-            </div>
-        </div>
-
-        <div class="workloads-section">
-            <h2 class="section-title">Idle GPU Workloads</h2>
-            <div id="workloads-content">
-                <div class="loading">Loading...</div>
-            </div>
-            <div class="last-update" id="last-update"></div>
-        </div>
-    </div>
-
-    <script>
-        let refreshInterval;
-
-        async function fetchData() {
-            try {
-                const response = await fetch('/api/status');
-                const data = await response.json();
-                updateDashboard(data);
-            } catch (error) {
-                console.error('Error fetching data:', error);
-            }
-        }
-
-        function updateDashboard(data) {
-            document.getElementById('total-pods').textContent = data.total_pods_checked.toLocaleString();
-            document.getElementById('idle-workloads').textContent = data.idle_workloads.length.toLocaleString();
-            document.getElementById('idle-gpus').textContent = data.total_idle_gpus.toLocaleString();
-
-            const workloadsContent = document.getElementById('workloads-content');
-
-            if (data.idle_workloads.length === 0) {
-                workloadsContent.innerHTML = `
-                    <div class="empty-state">
-                        <svg fill="none" stroke="currentColor" viewBox="0 0 24 24">
-                            <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2"
-                                d="M9 12l2 2 4-4m6 2a9 9 0 11-18 0 9 9 0 0118 0z"></path>
-                        </svg>
-                        <h3>No Idle Workloads</h3>
-                        <p>All GPU resources are being utilized efficiently!</p>
-                    </div>
-                `;
-            } else {
-                workloadsContent.innerHTML = `
-                    <table class="workload-table">
-                        <thead>
-                            <tr>
-                                <th>Namespace</th>
-                                <th>Name</th>
-                                <th>Type</th>
-                            </tr>
-                        </thead>
-                        <tbody>
-                            ${data.idle_workloads.map(w => `
-                                <tr>
-                                    <td><span class="namespace">${w.namespace}</span></td>
-                                    <td>${w.name}</td>
-                                    <td><span class="badge ${w.kind.toLowerCase()}">${w.kind}</span></td>
-                                </tr>
-                            `).join('')}
-                        </tbody>
-                    </table>
-                `;
-            }
-
-            const lastUpdate = new Date(data.last_update);
-            document.getElementById('last-update').textContent =
-                `Last updated: ${lastUpdate.toLocaleString()}`;
-        }
-
-        // Fetch data immediately and then every 10 seconds
-        fetchData();
-        refreshInterval = setInterval(fetchData, 10000);
-
-        // Cleanup on page unload
-        window.addEventListener('beforeunload', () => {
-            if (refreshInterval) {
-                clearInterval(refreshInterval);
-            }
-        });
-    </script>
-</body>
-</html>
diff --git a/gpu-pruner/src/dashboard.rs b/gpu-pruner/src/dashboard.rs
deleted file mode 100644
index e5b0e4a..0000000
--- a/gpu-pruner/src/dashboard.rs
+++ /dev/null
@@ -1,105 +0,0 @@
-use axum::{
-    extract::State,
-    response::{Html, IntoResponse},
-    routing::get,
-    Json, Router,
-};
-use std::sync::Arc;
-use tokio::sync::RwLock;
-use tower_http::{cors::CorsLayer, trace::TraceLayer};
-
-use gpu_pruner::{metrics, Meta, ScaleKind};
-
-#[derive(Clone, Debug, serde::Serialize)]
-pub struct WorkloadInfo {
-    pub name: String,
-    pub namespace: String,
-    pub kind: String,
-    pub gpu_model: Option<String>,
-    pub idle_duration: Option<String>,
-}
-
-#[derive(Clone, Debug, serde::Serialize)]
-pub struct DashboardState {
-    pub idle_workloads: Vec<WorkloadInfo>,
-    pub total_idle_gpus: usize,
-    pub total_pods_checked: usize,
-    pub last_update: String,
-}
-
-impl Default for DashboardState {
-    fn default() -> Self {
-        Self {
-            idle_workloads: Vec::new(),
-            total_idle_gpus: 0,
-            total_pods_checked: 0,
-            last_update: chrono::Utc::now().to_rfc3339(),
-        }
-    }
-}
-
-pub type SharedDashboardState = Arc<RwLock<DashboardState>>;
-
-pub async fn update_dashboard_state(
-    state: SharedDashboardState,
-    idle_workloads: Vec<ScaleKind>,
-    total_pods: usize,
-) {
-    let workloads: Vec<WorkloadInfo> = idle_workloads
-        .iter()
-        .map(|w| WorkloadInfo {
-            name: w.name(),
-            namespace: w.namespace().unwrap_or_default(),
-            kind: w.kind(),
-            gpu_model: None,
-            idle_duration: None,
-        })
-        .collect();
-
-    let idle_count = idle_workloads.len();
-
-    let mut state = state.write().await;
-    state.idle_workloads = workloads;
-    state.total_idle_gpus = idle_count;
-    state.total_pods_checked = total_pods;
-    state.last_update = chrono::Utc::now().to_rfc3339();
-
-    // Update Prometheus gauges
-    metrics::IDLE_GPUS.set(idle_count as i64);
-    metrics::PODS_CHECKED.set(total_pods as i64);
-}
-
-async fn dashboard_html() -> impl IntoResponse {
-    Html(include_str!("dashboard.html"))
-}
-
-async fn api_status(State(state): State<SharedDashboardState>) -> impl IntoResponse {
-    let state = state.read().await;
-    Json(state.clone())
-}
-
-async fn metrics_handler() -> impl IntoResponse {
-    metrics::render()
-}
-
-pub fn create_router(state: SharedDashboardState) -> Router {
-    Router::new()
-        .route("/", get(dashboard_html))
-        .route("/api/status", get(api_status))
-        .route("/metrics", get(metrics_handler))
-        .layer(CorsLayer::permissive())
-        .layer(TraceLayer::new_for_http())
-        .with_state(state)
-}
-
-pub async fn run_server(state: SharedDashboardState, port: u16) -> anyhow::Result<()> {
-    let app = create_router(state);
-    let addr = std::net::SocketAddr::from(([0, 0, 0, 0], port));
-
-    tracing::info!("Dashboard server starting on http://{}", addr);
-
-    let listener = tokio::net::TcpListener::bind(addr).await?;
-    axum::serve(listener, app).await?;
-
-    Ok(())
-}
diff --git a/gpu-pruner/src/lib.rs b/gpu-pruner/src/lib.rs
index 02dc93d..c3c20e7 100644
--- a/gpu-pruner/src/lib.rs
+++ b/gpu-pruner/src/lib.rs
@@ -1,4 +1,5 @@
 pub mod metrics;
+pub mod slack;
 
 use clap::ValueEnum;
 use k8s_openapi::{
@@ -35,6 +36,13 @@ use kube::{
     api::{ObjectMeta, Patch, PatchParams},
 };
 
+#[derive(Debug, Clone)]
+pub struct AckStatus {
+    pub acknowledged: bool,
+    pub expires_at: Option<String>,
+    pub by_user: Option<String>,
+}
+
 #[derive(Debug, Clone, Serialize)]
 pub enum ScaleKind {
     Deployment(Deployment),
@@ -197,8 +205,12 @@ pub trait Meta {
 }
 
 pub trait Scaler {
-    fn scale(&self, client: Client)
-    -> impl std::future::Future<Output = anyhow::Result<()>> + Send;
+    fn scale(
+        &self,
+        client: Client,
+        slack_notifier: Option<slack::SlackNotifier>,
+        idle_duration_minutes: i64,
+    ) -> impl std::future::Future<Output = anyhow::Result<()>> + Send;
 
     fn generate_scale_event(&self) -> anyhow::Result<Event>;
 }
@@ -337,8 +349,13 @@ impl Meta for ScaleKind {
 }
 
 impl Scaler for ScaleKind {
-    #[tracing::instrument(skip(self, client))]
-    async fn scale(&self, client: Client) -> anyhow::Result<()> {
+    #[tracing::instrument(skip(self, client, slack_notifier))]
+    async fn scale(
+        &self,
+        client: Client,
+        slack_notifier: Option<slack::SlackNotifier>,
+        idle_duration_minutes: i64,
+    ) -> anyhow::Result<()> {
         if let Some(ns) = self.namespace() {
             let event = self.generate_scale_event()?;
             let events_api: Api<Event> = Api::namespaced(client.clone(), &ns);
@@ -348,6 +365,20 @@ impl Scaler for ScaleKind {
             } else {
                 tracing::debug!("Emitted scale event for: {:?}", event.involved_object);
             }
+
+            // Send Slack notification if configured
+            if let Some(notifier) = slack_notifier {
+                match notifier.send_notification(self, idle_duration_minutes).await {
+                    Ok(_) => {
+                        metrics::SLACK_NOTIFICATIONS_SENT.inc();
+                    }
+                    Err(e) => {
+                        metrics::SLACK_NOTIFICATION_FAILURES.inc();
+                        tracing::error!("Failed to send Slack notification: {e}");
+                        // Continue with scale-down even if notification fails
+                    }
+                }
+            }
         };
 
         match self {
@@ -429,6 +460,63 @@ impl Scaler for ScaleKind {
     }
 }
 
+/// Check if a workload has an active acknowledgment annotation
+#[tracing::instrument(skip(_client))]
+pub async fn check_acknowledgment(
+    _client: KubeClient,
+    workload: &ScaleKind,
+) -> anyhow::Result<AckStatus> {
+    use chrono::DateTime;
+
+    let annotations = match workload {
+        ScaleKind::Deployment(d) => d.metadata.annotations.clone(),
+        ScaleKind::ReplicaSet(r) => r.metadata.annotations.clone(),
+        ScaleKind::StatefulSet(s) => s.metadata.annotations.clone(),
+        ScaleKind::Notebook(n) => n.metadata.annotations.clone(),
+        ScaleKind::InferenceService(i) => i.metadata.annotations.clone(),
+    };
+
+    let annotations = match annotations {
+        Some(a) => a,
+        None => {
+            return Ok(AckStatus {
+                acknowledged: false,
+                expires_at: None,
+                by_user: None,
+            })
+        }
+    };
+
+    let ack_until = annotations.get("gpu-pruner.io/ack-until");
+    let ack_by = annotations.get("gpu-pruner.io/ack-by");
+
+    if let Some(expires_at_str) = ack_until {
+        // Parse the timestamp and check if it's still valid
+        if let Ok(expires_at) = DateTime::parse_from_rfc3339(expires_at_str) {
+            let now = chrono::Utc::now();
+            if expires_at.timestamp() > now.timestamp() {
+                return Ok(AckStatus {
+                    acknowledged: true,
+                    expires_at: Some(expires_at_str.clone()),
+                    by_user: ack_by.cloned(),
+                });
+            } else {
+                tracing::info!(
+                    "Acknowledgment expired for {} in {}",
+                    workload.name(),
+                    workload.namespace().unwrap_or_default()
+                );
+            }
+        }
+    }
+
+    Ok(AckStatus {
+        acknowledged: false,
+        expires_at: None,
+        by_user: None,
+    })
+}
+
 /// Crawl up the owner references to find the root Deployment or StatefulSet
 /// and allows an action like scaling to be performed.
 ///
diff --git a/gpu-pruner/src/main.rs b/gpu-pruner/src/main.rs
index 37fe9eb..1fa8ad3 100644
--- a/gpu-pruner/src/main.rs
+++ b/gpu-pruner/src/main.rs
@@ -1,7 +1,5 @@
 use minijinja::{Environment, context};
 
-mod dashboard;
-
 #[cfg(feature = "otel")]
 use std::sync::LazyLock;
 #[cfg(feature = "otel")]
@@ -15,7 +13,7 @@ use {
     tracing_opentelemetry::{MetricsLayer, OpenTelemetryLayer},
 };
 
-use std::{collections::HashSet, fmt::Debug, sync::atomic::AtomicUsize};
+use std::{collections::HashSet, fmt::Debug, sync::{atomic::AtomicUsize, Arc}};
 use tokio::{sync::mpsc::Sender, time};
 
 use tracing_subscriber::EnvFilter;
@@ -36,8 +34,9 @@ use kube::{Api, Client as KubeClient, Resource};
 use clap::{Parser, ValueEnum};
 
 use gpu_pruner::{
-    Meta, PodMetricData, QueryResponse, ScaleKind, Scaler, TlsMode, find_root_object,
-    get_enabled_resources, get_prom_client, get_prometheus_token,
+    Meta, PodMetricData, QueryResponse, ScaleKind, Scaler, TlsMode, check_acknowledgment,
+    find_root_object, get_enabled_resources, get_prom_client, get_prometheus_token,
+    slack::SlackNotifier,
 };
 
 /// `gpu-pruner` is a tool to prune idle pods based on GPU utilization. It uses Prometheus to query
@@ -119,9 +118,15 @@ struct Cli {
     #[clap(short, long, default_value = "default")]
     log_format: LogFormat,
 
-    /// Enable the web dashboard on the specified port
+    /// Slack webhook URL for notifications. Can also be set via SLACK_WEBHOOK_URL env var.
+    /// Messages will be sent to the configured channel when idle GPUs are detected.
     #[clap(long)]
-    dashboard_port: Option<u16>,
+    slack_webhook_url: Option<String>,
+
+    /// Slack channel to send notifications to
+    #[clap(long, default_value = "#test-pruner")]
+    slack_channel: String,
+
 }
 
 #[derive(Debug, Clone, ValueEnum, Default, Serialize)]
@@ -287,20 +292,28 @@ async fn main() -> anyhow::Result<()> {
     let enabled_resources = get_enabled_resources(&args.enabled_resources);
     tracing::info!("Enabled resources: {enabled_resources:?}");
 
+    // Initialize Slack notifier if webhook URL is provided
+    let slack_notifier = args
+        .slack_webhook_url
+        .clone()
+        .or_else(|| std::env::var("SLACK_WEBHOOK_URL").ok())
+        .map(|url| {
+            tracing::info!("Slack notifications enabled for channel: {}", args.slack_channel);
+            Arc::new(SlackNotifier::new(url, args.slack_channel.clone()))
+        });
+
+    if slack_notifier.is_none() {
+        tracing::info!("Slack notifications disabled (no webhook URL configured)");
+    }
+
     let env: Environment = Environment::new();
     let query = env.render_str(include_str!("query.promql.j2"), context! { args })?;
     tracing::info!("Running w/ Query: {query}");
 
     let (tx, mut rx) = tokio::sync::mpsc::channel::<ScaleKind>(100);
 
-    // Initialize dashboard state if dashboard is enabled
-    let dashboard_state = std::sync::Arc::new(tokio::sync::RwLock::new(
-        dashboard::DashboardState::default(),
-    ));
-
     let query_task = {
         let args = args.clone();
-        let dashboard_state = dashboard_state.clone();
         tokio::spawn(async move {
             let mut interval =
                 time::interval(tokio::time::Duration::from_secs(args.check_interval));
@@ -310,7 +323,7 @@ async fn main() -> anyhow::Result<()> {
                 }
 
                 let client = build_prom_client(&args).await;
-                match run_query_and_scale(client, query.clone(), &args, tx.clone(), dashboard_state.clone()).await {
+                match run_query_and_scale(client, query.clone(), &args, tx.clone()).await {
                     Ok(qr) => {
                         QUERY_FAILURES.store(0, std::sync::atomic::Ordering::Relaxed);
                         gpu_pruner::metrics::QUERY_SUCCESSES.inc();
@@ -349,29 +362,33 @@ async fn main() -> anyhow::Result<()> {
         })
     };
 
-    let scale_down_task = tokio::spawn(async move {
-        let kube_client = KubeClient::try_default()
-            .await
-            .expect("failed to get kube client");
+    let scale_down_task = {
+        let slack_notifier = slack_notifier.clone();
+        let duration = args.duration;
+        tokio::spawn(async move {
+            let kube_client = KubeClient::try_default()
+                .await
+                .expect("failed to get kube client");
 
-        while let Some(sk) = rx.recv().await {
-            // Check if the resource is enabled
-            if !enabled_resources.contains(sk.clone().into()) {
-                tracing::info!(
-                    "Skipping resource type {kind:?} because it is not enabled",
-                    kind = sk.kind()
-                );
-                continue;
-            }
+            while let Some(sk) = rx.recv().await {
+                // Check if the resource is enabled
+                if !enabled_resources.contains(sk.clone().into()) {
+                    tracing::info!(
+                        "Skipping resource type {kind:?} because it is not enabled",
+                        kind = sk.kind()
+                    );
+                    continue;
+                }
 
-            if let Err(e) = sk.scale(kube_client.clone()).await {
-                gpu_pruner::metrics::SCALE_FAILURES.inc();
-                tracing::error!(
-                    monotonic_counter.scale_failures = 1,
-                    "Failed to scale resource! {e}"
-                );
-                continue;
-            }
+                let notifier = slack_notifier.as_ref().map(|n| (**n).clone());
+                if let Err(e) = sk.scale(kube_client.clone(), notifier, duration).await {
+                    gpu_pruner::metrics::SCALE_FAILURES.inc();
+                    tracing::error!(
+                        monotonic_counter.scale_failures = 1,
+                        "Failed to scale resource! {e}"
+                    );
+                    continue;
+                }
 
             let kind = sk.kind();
             let name = sk.name();
@@ -385,32 +402,15 @@ async fn main() -> anyhow::Result<()> {
                 name = name,
                 namespace = namespace
             )
-        }
-    });
-
-    // Start dashboard server if requested
-    let dashboard_task = if let Some(port) = args.dashboard_port {
-        let dashboard_state = dashboard_state.clone();
-        Some(tokio::spawn(async move {
-            dashboard::run_server(dashboard_state, port).await
-        }))
-    } else {
-        None
+            }
+        })
     };
 
     // Wait for all tasks
-    if let Some(dashboard_task) = dashboard_task {
-        _ = tokio::try_join! {
-            query_task,
-            scale_down_task,
-            dashboard_task
-        }?;
-    } else {
-        _ = tokio::try_join! {
-            query_task,
-            scale_down_task
-        }?;
-    }
+    _ = tokio::try_join! {
+        query_task,
+        scale_down_task
+    }?;
 
     Ok(())
 }
@@ -434,7 +434,6 @@ async fn run_query_and_scale(
     query: String,
     args: &Cli,
     tx: Sender<ScaleKind>,
-    dashboard_state: dashboard::SharedDashboardState,
 ) -> anyhow::Result<QueryResponse> {
     let response = match client.query(query).get().await {
         Ok(response) => response,
@@ -577,16 +576,44 @@ async fn run_query_and_scale(
 
     let num_shutdown_events = shutdown_events.len();
 
-    // Update dashboard state
-    dashboard::update_dashboard_state(
-        dashboard_state,
-        shutdown_events.iter().cloned().collect(),
-        num_pods,
-    )
-    .await;
+    // Check acknowledgment status for all idle workloads
+    let workloads_with_ack: Vec<(ScaleKind, Option<gpu_pruner::AckStatus>)> =
+        futures::stream::iter(shutdown_events.iter().cloned())
+            .then(|obj| async {
+                let ack_status = check_acknowledgment(kube_client.clone(), &obj).await.ok();
+                (obj, ack_status)
+            })
+            .collect()
+            .await;
+
+    // Count acknowledged workloads and update metrics
+    let acknowledged_count = workloads_with_ack
+        .iter()
+        .filter(|(_, ack)| ack.as_ref().map(|a| a.acknowledged).unwrap_or(false))
+        .count();
+
+    gpu_pruner::metrics::ACKNOWLEDGED_WORKLOADS.set(acknowledged_count as i64);
+
+    // Filter out acknowledged workloads before scaling
+    futures::stream::iter(workloads_with_ack)
+        .filter_map(|(obj, ack_status)| async move {
+            // Skip acknowledged workloads
+            if let Some(ack) = &ack_status {
+                if ack.acknowledged {
+                    tracing::info!(
+                        "Skipping [{}] {}:{} - acknowledged until {} by {}",
+                        obj.kind(),
+                        obj.namespace().unwrap_or_default(),
+                        obj.name(),
+                        ack.expires_at.as_ref().unwrap_or(&"unknown".to_string()),
+                        ack.by_user.as_ref().unwrap_or(&"unknown".to_string())
+                    );
+                    gpu_pruner::metrics::SCALEDOWNS_PREVENTED_TOTAL.inc();
+                    return None;
+                }
+            }
 
-    futures::stream::iter(shutdown_events)
-        .filter_map(|obj| async {
+            // Apply dry-run filter
             if let Mode::DryRun = args.run_mode {
                 tracing::info!(
                     "Dry-run: Would have sent [{}] {}:{} for scaledown",
diff --git a/gpu-pruner/src/metrics.rs b/gpu-pruner/src/metrics.rs
index 3f8b59b..2fdf967 100644
--- a/gpu-pruner/src/metrics.rs
+++ b/gpu-pruner/src/metrics.rs
@@ -54,6 +54,38 @@ lazy_static! {
         "Total number of pods analyzed in last query"
     )
     .expect("metric can be created");
+
+    // Acknowledgment metrics
+    pub static ref ACKNOWLEDGED_WORKLOADS: IntGauge = IntGauge::new(
+        "gpu_pruner_acknowledged_workloads",
+        "Current number of workloads with active acknowledgments"
+    )
+    .expect("metric can be created");
+
+    pub static ref ACKNOWLEDGMENTS_TOTAL: IntCounter = IntCounter::new(
+        "gpu_pruner_acknowledgments_total",
+        "Total number of acknowledgments created"
+    )
+    .expect("metric can be created");
+
+    pub static ref SCALEDOWNS_PREVENTED_TOTAL: IntCounter = IntCounter::new(
+        "gpu_pruner_scaledowns_prevented_total",
+        "Total number of scale-downs prevented by acknowledgments"
+    )
+    .expect("metric can be created");
+
+    // Slack notification metrics
+    pub static ref SLACK_NOTIFICATIONS_SENT: IntCounter = IntCounter::new(
+        "gpu_pruner_slack_notifications_sent_total",
+        "Total number of Slack notifications successfully sent"
+    )
+    .expect("metric can be created");
+
+    pub static ref SLACK_NOTIFICATION_FAILURES: IntCounter = IntCounter::new(
+        "gpu_pruner_slack_notification_failures_total",
+        "Total number of failed Slack notification attempts"
+    )
+    .expect("metric can be created");
 }
 
 pub fn init() {
@@ -81,6 +113,21 @@ pub fn init() {
     REGISTRY
         .register(Box::new(PODS_CHECKED.clone()))
         .expect("pods_checked can be registered");
+    REGISTRY
+        .register(Box::new(ACKNOWLEDGED_WORKLOADS.clone()))
+        .expect("acknowledged_workloads can be registered");
+    REGISTRY
+        .register(Box::new(ACKNOWLEDGMENTS_TOTAL.clone()))
+        .expect("acknowledgments_total can be registered");
+    REGISTRY
+        .register(Box::new(SCALEDOWNS_PREVENTED_TOTAL.clone()))
+        .expect("scaledowns_prevented_total can be registered");
+    REGISTRY
+        .register(Box::new(SLACK_NOTIFICATIONS_SENT.clone()))
+        .expect("slack_notifications_sent can be registered");
+    REGISTRY
+        .register(Box::new(SLACK_NOTIFICATION_FAILURES.clone()))
+        .expect("slack_notification_failures can be registered");
 }
 
 pub fn render() -> String {
diff --git a/gpu-pruner/src/slack.rs b/gpu-pruner/src/slack.rs
new file mode 100644
index 0000000..4b29e8c
--- /dev/null
+++ b/gpu-pruner/src/slack.rs
@@ -0,0 +1,121 @@
+use anyhow::Result;
+use reqwest::Client;
+use serde_json::json;
+
+use crate::Meta;
+
+#[derive(Clone, Debug)]
+pub struct SlackNotifier {
+    webhook_url: String,
+    client: Client,
+    channel: String,
+}
+
+impl SlackNotifier {
+    pub fn new(webhook_url: String, channel: String) -> Self {
+        let client = Client::builder()
+            .timeout(std::time::Duration::from_secs(10))
+            .build()
+            .expect("failed to build slack http client");
+
+        Self {
+            webhook_url,
+            client,
+            channel,
+        }
+    }
+
+    #[tracing::instrument(skip(self, workload))]
+    pub async fn send_notification<T: Meta + std::fmt::Debug>(
+        &self,
+        workload: &T,
+        idle_duration_minutes: i64,
+    ) -> Result<()> {
+        let resource_type = workload.kind();
+        let resource_name = workload.name();
+        let namespace = workload.namespace().unwrap_or_else(|| "default".to_string());
+
+        let payload = json!({
+            "channel": self.channel,
+            "attachments": [{
+                "color": "warning",
+                "title": "🔔 Idle GPU Detected - Scale Down Pending",
+                "fields": [
+                    {
+                        "title": "Resource",
+                        "value": format!("{}: {}", resource_type, resource_name),
+                        "short": true
+                    },
+                    {
+                        "title": "Namespace",
+                        "value": namespace,
+                        "short": true
+                    },
+                    {
+                        "title": "Reason",
+                        "value": format!("GPU idle for {} minutes", idle_duration_minutes),
+                        "short": false
+                    },
+                    {
+                        "title": "Action",
+                        "value": "Scaling to 0 replicas",
+                        "short": false
+                    }
+                ],
+                "footer": "gpu-pruner",
+                "ts": std::time::SystemTime::now()
+                    .duration_since(std::time::UNIX_EPOCH)
+                    .unwrap_or_default()
+                    .as_secs()
+            }]
+        });
+
+        tracing::debug!("Sending Slack notification payload: {:?}", payload);
+
+        let response = self
+            .client
+            .post(&self.webhook_url)
+            .json(&payload)
+            .send()
+            .await?;
+
+        if !response.status().is_success() {
+            let status = response.status();
+            let body = response.text().await.unwrap_or_default();
+            tracing::error!(
+                "Slack webhook returned error status {}: {}",
+                status,
+                body
+            );
+            return Err(anyhow::anyhow!(
+                "Slack webhook failed with status {}: {}",
+                status,
+                body
+            ));
+        }
+
+        tracing::info!(
+            "Sent Slack notification for [{resource_type}] {namespace}:{resource_name}",
+        );
+
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_slack_notifier_creation() {
+        let notifier = SlackNotifier::new(
+            "https://hooks.slack.com/services/TEST".to_string(),
+            "#test-pruner".to_string(),
+        );
+        assert_eq!(notifier.webhook_url, "https://hooks.slack.com/services/TEST");
+        assert_eq!(notifier.channel, "#test-pruner");
+    }
+
+    // Note: Actual send_notification tests would require mocking the HTTP client
+    // or using integration tests with a test webhook endpoint
+}
diff --git a/gpu-pruner/tests/e2e.rs b/gpu-pruner/tests/e2e.rs
index 3b2301d..ed28b34 100644
--- a/gpu-pruner/tests/e2e.rs
+++ b/gpu-pruner/tests/e2e.rs
@@ -268,7 +268,7 @@ async fn scale_deployment_to_zero() {
 
     let dep = dep_api.get("e2e-scale").await.unwrap();
     let sk = ScaleKind::Deployment(dep);
-    sk.scale(client.clone()).await.unwrap();
+    sk.scale(client.clone(), None, 30).await.unwrap();
 
     // verify it scaled to zero
     let dep = dep_api.get("e2e-scale").await.unwrap();
@@ -323,7 +323,7 @@ async fn scale_statefulset_to_zero() {
 
     let ss = ss_api.get("e2e-scale-ss").await.unwrap();
     let sk = ScaleKind::StatefulSet(ss);
-    sk.scale(client.clone()).await.unwrap();
+    sk.scale(client.clone(), None, 30).await.unwrap();
 
     let ss = ss_api.get("e2e-scale-ss").await.unwrap();
     let replicas = ss.spec.unwrap().replicas.unwrap_or(1);

From de61d6d1e294827f4f3ae559eeacda4c5fc06d2f Mon Sep 17 00:00:00 2001
From: fuddin-bit <fuddin@redhat.com>
Date: Wed, 10 Jun 2026 14:19:56 -0400
Subject: [PATCH 6/7] 0.01 threshold

---
 gpu-pruner/src/main.rs         | 21 +++++++++++++++++++++
 gpu-pruner/src/query.promql.j2 |  2 +-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/gpu-pruner/src/main.rs b/gpu-pruner/src/main.rs
index 1fa8ad3..6e3bf14 100644
--- a/gpu-pruner/src/main.rs
+++ b/gpu-pruner/src/main.rs
@@ -80,6 +80,11 @@ struct Cli {
     #[clap(short, long)]
     model_name: Option<String>,
 
+    /// Maximum combined GPU utilization (0.0–1.0) to still consider a GPU idle.
+    /// Defaults to 0.01 to tolerate DCGM background noise on DCGM_FI_PROF_GR_ENGINE_ACTIVE.
+    #[clap(long, default_value_t = 0.01)]
+    idle_threshold: f64,
+
     /// Power draw threshold in watts. When set, GPUs showing peak power usage above this value
     /// over the lookback window are excluded from idle candidates even if compute utilization is zero.
     /// Useful as a corroborating signal (e.g. 100 for A10G, 150 for A100/H100).
@@ -688,6 +693,22 @@ mod tests {
         );
     }
 
+    #[test]
+    fn query_uses_idle_threshold_not_strict_zero() {
+        let query = render(json!({ "duration": 30 }));
+        assert!(
+            query.contains("< 0.01"),
+            "default idle threshold should be 0.01, not == 0"
+        );
+        assert!(!query.contains("== 0"), "should not use strict == 0");
+    }
+
+    #[test]
+    fn query_idle_threshold_is_configurable() {
+        let query = render(json!({ "duration": 30, "idle_threshold": 0.05 }));
+        assert!(query.contains("< 0.05"), "should use configured idle threshold");
+    }
+
     #[test]
     fn query_without_power_threshold_has_no_unless() {
         let query = render(json!({ "duration": 30 }));
diff --git a/gpu-pruner/src/query.promql.j2 b/gpu-pruner/src/query.promql.j2
index 39db5ed..e819d47 100644
--- a/gpu-pruner/src/query.promql.j2
+++ b/gpu-pruner/src/query.promql.j2
@@ -32,7 +32,7 @@ sum by (Hostname, {{ cl }}, {{ pl }}, {{ nl }}, gpu, modelName) (
   or on (Hostname, {{ cl }}, {{ pl }}, {{ nl }}, gpu, modelName)
   {{ idle_gpus }}
 )
-== 0
+< {{ args.idle_threshold | default(0.01) }}
 {%- if args.power_threshold %}
 unless on ({{ pl }}, {{ nl }})
 (

From 3629dfd739fec7405472b16ed3dde8434a6c1f32 Mon Sep 17 00:00:00 2001
From: fuddin-bit <fuddin@redhat.com>
Date: Wed, 10 Jun 2026 16:03:14 -0400
Subject: [PATCH 7/7] add slack acknowledgement and in-cluster

---
 Cargo.lock                                    |   1 +
 GPU_UTILIZATION_QUERIES.md                    | 402 ++++++++++++++++++
 gpu-pruner/Cargo.toml                         |   1 +
 gpu-pruner/hack/deployment.yaml               |   6 +-
 gpu-pruner/hack/kustomization.yaml            |   1 +
 gpu-pruner/hack/service.yaml                  |   6 +-
 gpu-pruner/hack/servicemonitor.yaml           |   2 +-
 gpu-pruner/hack/slack-interactions-route.yaml |  21 +
 .../hack/slack-interactions-service-lb.yaml   |  26 ++
 .../hack/slack-webhook-secret.example.yaml    |   7 +
 gpu-pruner/src/lib.rs                         |  69 +++
 gpu-pruner/src/main.rs                        | 258 ++++++++++-
 gpu-pruner/src/slack.rs                       |  31 +-
 13 files changed, 822 insertions(+), 9 deletions(-)
 create mode 100644 GPU_UTILIZATION_QUERIES.md
 create mode 100644 gpu-pruner/hack/slack-interactions-route.yaml
 create mode 100644 gpu-pruner/hack/slack-interactions-service-lb.yaml

diff --git a/Cargo.lock b/Cargo.lock
index d1e217f..bbac212 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -836,6 +836,7 @@ dependencies = [
  "secrecy",
  "serde",
  "serde_json",
+ "serde_urlencoded",
  "thiserror 2.0.18",
  "tokio",
  "tower",
diff --git a/GPU_UTILIZATION_QUERIES.md b/GPU_UTILIZATION_QUERIES.md
new file mode 100644
index 0000000..c9224db
--- /dev/null
+++ b/GPU_UTILIZATION_QUERIES.md
@@ -0,0 +1,402 @@
+# GPU Utilization Queries
+
+This document explains every PromQL query `gpu-pruner` uses to detect idle GPUs. The queries are rendered at runtime from `gpu-pruner/src/query.promql.j2` based on CLI flags.
+
+## Prerequisites
+
+- **DCGM exporter** running on GPU nodes
+- **Prometheus** scraping DCGM metrics
+- Port-forward for local testing:
+
+```bash
+kubectl port-forward -n llm-d-monitoring svc/llmd-kube-prometheus-stack-prometheus 9090:9090
+```
+
+Run queries at http://localhost:9090/graph or via curl:
+
+```bash
+curl -sG 'http://localhost:9090/api/v1/query' \
+  --data-urlencode 'query=<PROMQL_HERE>' | jq .
+```
+
+## CLI flags that shape the queries
+
+| Flag | Default | Effect on query |
+|------|---------|-----------------|
+| `--duration` / `-t` | `30` | Lookback window `[Nm]` in minutes |
+| `--honor-labels` | `false` | Use `pod`/`namespace` instead of `exported_pod`/`exported_namespace` |
+| `--namespace` / `-n` | none | Regex filter on namespace label |
+| `--model-name` / `-m` | none | Regex filter on `modelName` |
+| `--idle-threshold` | `0.01` | Max utilization (0.0–1.0) to still count as idle; tolerates DCGM noise |
+| `--power-threshold` | none | Exclude idle candidates with high power draw |
+
+---
+
+## 1. Graphics engine active (primary metric)
+
+Reports the peak fraction of time the GPU graphics/compute engine was active among the samples in the lookback window. Range: **0.0–1.0**.
+
+### With `--honor-labels` (native DCGM labels)
+
+```promql
+max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{
+  pod != ""
+}[30m])
+```
+
+### Default (`exported_*` labels)
+
+```promql
+max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{
+  exported_pod != ""
+}[30m])
+```
+
+### With namespace filter (`--namespace=llm-d-optimized-baseline`)
+
+```promql
+max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{
+  pod != "",
+  namespace =~ "llm-d-optimized-baseline"
+}[30m])
+```
+
+### With model filter (`--model-name="NVIDIA H200"`)
+
+```promql
+max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{
+  pod != "",
+  modelName =~ "NVIDIA H200"
+}[30m])
+```
+
+**Notes:**
+- `max_over_time` uses the **peak** value in the window, not the average.
+- A value of `0` means the engine was never active during the window.
+- Tiny non-zero values (e.g. `0.00007`) are DCGM background noise; gpu-pruner uses `< 0.01` by default (configurable via `--idle-threshold`) instead of strict `== 0`.
+
+---
+
+## 2. GPU utilization % (fallback metric)
+
+Classic DCGM GPU utilization percentage. Divided by 100 so it matches the 0.0–1.0 scale of engine active.
+
+### With `--honor-labels`
+
+```promql
+max_over_time(DCGM_FI_DEV_GPU_UTIL{
+  pod != ""
+}[30m]) / 100
+```
+
+### Default (`exported_*` labels)
+
+```promql
+max_over_time(DCGM_FI_DEV_GPU_UTIL{
+  exported_pod != ""
+}[30m]) / 100
+```
+
+**Notes:**
+- Used as a fallback when `DCGM_FI_PROF_GR_ENGINE_ACTIVE` is missing for a series.
+- When **both** metrics exist, PromQL `or` keeps the **left-hand** (engine active) value.
+
+---
+
+## 3. Combined utilization per GPU
+
+Aggregates both metrics per GPU, grouped by pod, namespace, and hardware labels.
+
+### With `--honor-labels`
+
+```promql
+sum by (Hostname, container, pod, namespace, gpu, modelName) (
+  max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{
+    pod != ""
+  }[30m])
+  or
+  max_over_time(DCGM_FI_DEV_GPU_UTIL{
+    pod != ""
+  }[30m]) / 100
+)
+```
+
+### Default (`exported_*` labels)
+
+```promql
+sum by (Hostname, exported_container, exported_pod, exported_namespace, gpu, modelName) (
+  max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{
+    exported_pod != ""
+  }[30m])
+  or
+  max_over_time(DCGM_FI_DEV_GPU_UTIL{
+    exported_pod != ""
+  }[30m]) / 100
+)
+```
+
+**Notes:**
+- This is the core "is this GPU busy?" calculation.
+- `sum by` collapses duplicate label sets (one series per GPU).
+- A result below `--idle-threshold` (default `0.01`) is treated as idle; anything at or above it counts as activity.
+
+---
+
+## 4. Node type enrichment (optional join)
+
+Joins GPU metrics with `node_dmi_info` to attach hardware `node_type`. Falls back to un-enriched results when node info is missing.
+
+```promql
+sum by (Hostname, container, pod, namespace, gpu, modelName) (
+  max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != ""}[30m])
+  or
+  max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != ""}[30m]) / 100
+)
+* on (Hostname) group_left(node_type) (
+  label_replace(
+    label_replace(node_dmi_info,
+      "Hostname", "$1", "instance", "(.+)"
+    ),
+    "node_type", "$1", "product_name", "(.+)"
+  )
+)
+or on (Hostname, container, pod, namespace, gpu, modelName)
+sum by (Hostname, container, pod, namespace, gpu, modelName) (
+  max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != ""}[30m])
+  or
+  max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != ""}[30m]) / 100
+)
+```
+
+**Notes:**
+- The `or` at the end ensures GPUs still appear even when `node_dmi_info` has no match.
+- `node_type` is informational; it does not affect idle detection.
+
+---
+
+## 5. Full gpu-pruner idle detection query
+
+This is the complete query rendered and sent to Prometheus. Returns GPUs considered **idle** (combined utilization below `--idle-threshold`, default `0.01`).
+
+### With `--honor-labels`, `--duration=30`
+
+```promql
+(
+  sum by (Hostname, container, pod, namespace, gpu, modelName) (
+    max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{
+      pod != ""
+    }[30m])
+    or
+    max_over_time(DCGM_FI_DEV_GPU_UTIL{
+      pod != ""
+    }[30m]) / 100
+  )
+  * on (Hostname) group_left(node_type) (
+    label_replace(
+      label_replace(node_dmi_info,
+        "Hostname", "$1", "instance", "(.+)"
+      ),
+      "node_type", "$1", "product_name", "(.+)"
+    )
+  )
+  or on (Hostname, container, pod, namespace, gpu, modelName)
+  sum by (Hostname, container, pod, namespace, gpu, modelName) (
+    max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{
+      pod != ""
+    }[30m])
+    or
+    max_over_time(DCGM_FI_DEV_GPU_UTIL{
+      pod != ""
+    }[30m]) / 100
+  )
+) < 0.01
+```
+
+### Default (`exported_*` labels), `--duration=30`
+
+```promql
+(
+  sum by (Hostname, exported_container, exported_pod, exported_namespace, gpu, modelName) (
+    max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{
+      exported_pod != ""
+    }[30m])
+    or
+    max_over_time(DCGM_FI_DEV_GPU_UTIL{
+      exported_pod != ""
+    }[30m]) / 100
+  )
+  * on (Hostname) group_left(node_type) (
+    label_replace(
+      label_replace(node_dmi_info,
+        "Hostname", "$1", "instance", "(.+)"
+      ),
+      "node_type", "$1", "product_name", "(.+)"
+    )
+  )
+  or on (Hostname, exported_container, exported_pod, exported_namespace, gpu, modelName)
+  sum by (Hostname, exported_container, exported_pod, exported_namespace, gpu, modelName) (
+    max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{
+      exported_pod != ""
+    }[30m])
+    or
+    max_over_time(DCGM_FI_DEV_GPU_UTIL{
+      exported_pod != ""
+    }[30m]) / 100
+  )
+) < 0.01
+```
+
+**Notes:**
+- Any series returned = gpu-pruner treats that GPU as idle.
+- Override with `--idle-threshold=0.05` for a looser definition of idle.
+- After the query, gpu-pruner resolves each pod to a scalable parent (Deployment, StatefulSet, etc.) in Kubernetes.
+- Infrastructure pods (e.g. `dcgm-exporter` DaemonSets) may match this query but are skipped because they have no scalable root object.
+
+---
+
+## 6. Power draw exclusion (optional, `--power-threshold`)
+
+When set, appends an `unless` clause that excludes idle candidates (matched on pod and namespace) whose power draw reached or exceeded the threshold (watts) during the lookback window, even if utilization is below the idle threshold.
+
+### Example: `--power-threshold=150` with `--honor-labels`
+
+Full query becomes the idle query above, plus:
+
+```promql
+unless on (pod, namespace)
+(
+  max_over_time(DCGM_FI_DEV_POWER_USAGE{
+    pod != ""
+  }[30m]) >= 150
+)
+```
+
+### Example: `--power-threshold=150` with default labels
+
+```promql
+unless on (exported_pod, exported_namespace)
+(
+  max_over_time(DCGM_FI_DEV_POWER_USAGE{
+    exported_pod != ""
+  }[30m]) >= 150
+)
+```
+
+**Notes:**
+- Useful to catch "compute idle but still drawing power" cases.
+- Suggested starting points: `100` (A10G), `150` (A100/H100).
+
+---
+
+## 7. Manual testing queries
+
+Simplified queries for debugging in Prometheus or curl.
+
+### Count idle GPUs (gpu-pruner default, `< 0.01`)
+
+```promql
+(
+  sum by (pod, namespace, gpu) (
+    max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != ""}[5m])
+    or
+    max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != ""}[5m]) / 100
+  )
+) < 0.01
+```
+
+```bash
+curl -sG 'http://localhost:9090/api/v1/query' \
+  --data-urlencode 'query=(sum by (pod, namespace, gpu) (max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != ""}[5m]) or max_over_time(DCGM_FI_DEV_GPU_UTIL{pod != ""}[5m]) / 100)) < 0.01' \
+  | jq '.data.result | length'
+```
+
+### Inspect raw engine active for a workload
+
+```promql
+max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{
+  pod =~ "optimized-baseline.*"
+}[5m])
+```
+
+### Inspect raw GPU util % for a workload
+
+```promql
+max_over_time(DCGM_FI_DEV_GPU_UTIL{
+  pod =~ "optimized-baseline.*"
+}[5m])
+```
+
+### See the combined value gpu-pruner uses
+
+```promql
+sum by (pod, namespace, gpu) (
+  max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{
+    pod =~ "optimized-baseline.*"
+  }[5m])
+  or
+  max_over_time(DCGM_FI_DEV_GPU_UTIL{
+    pod =~ "optimized-baseline.*"
+  }[5m]) / 100
+)
+```
+
+### Verify DCGM metrics exist
+
+```promql
+DCGM_FI_PROF_GR_ENGINE_ACTIVE
+```
+
+```promql
+DCGM_FI_DEV_GPU_UTIL
+```
+
+### Check which label convention your cluster uses
+
+```promql
+count(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod != ""})
+```
+
+```promql
+count(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod != ""})
+```
+
+If the second count is non-zero and the first is zero, use `--honor-labels` with gpu-pruner.
+
+---
+
+## 8. Example: matching gpu-pruner CLI to query
+
+This command:
+
+```bash
+cargo run --bin gpu-pruner -- \
+  --prometheus-url=http://localhost:9090 \
+  --run-mode=dry-run \
+  --duration=5 \
+  --honor-labels \
+  --namespace=llm-d-optimized-baseline
+```
+
+Renders a query equivalent to section 5 with `[5m]`, `pod`/`namespace` labels, and `namespace =~ "llm-d-optimized-baseline"` on both DCGM metric selectors.
+
+---
+
+## Common pitfalls
+
+| Symptom | Likely cause |
+|---------|----------------|
+| Query returns 0 series | Wrong label convention; try `--honor-labels` or `exported_*` labels |
+| Pods running but not reported as idle | Peak utilization in the lookback window is at or above `--idle-threshold` (default `0.01`) |
+| Idle GPUs found but no scale-down | Pod owner is a DaemonSet or unsupported resource type |
+| vLLM pods show `0%` util but not idle | `DCGM_FI_PROF_GR_ENGINE_ACTIVE` ≈ `0.00007` wins over `DCGM_FI_DEV_GPU_UTIL` in `or` |
+| New pods never pruned | `--grace-period` (default 300s) adds extra age check in application logic after the query |
+
+---
+
+## Source
+
+Queries are defined in:
+
+- Template: `gpu-pruner/src/query.promql.j2`
+- Rendered in: `gpu-pruner/src/main.rs` (`Running w/ Query:` log line)
+- Tests: `gpu-pruner/src/main.rs` (`query_*` unit tests)
diff --git a/gpu-pruner/Cargo.toml b/gpu-pruner/Cargo.toml
index 35b4b8d..e22d0f6 100644
--- a/gpu-pruner/Cargo.toml
+++ b/gpu-pruner/Cargo.toml
@@ -60,6 +60,7 @@ bitflags = "2"
 axum = "0.8"
 tower = { version = "0.5", features = ["full"] }
 tower-http = { version = "0.6", features = ["fs", "trace", "cors"] }
+serde_urlencoded = "0.7"
 
 # Prometheus metrics
 prometheus = "0.13"
diff --git a/gpu-pruner/hack/deployment.yaml b/gpu-pruner/hack/deployment.yaml
index 797685e..bb0600a 100644
--- a/gpu-pruner/hack/deployment.yaml
+++ b/gpu-pruner/hack/deployment.yaml
@@ -16,13 +16,14 @@ spec:
       serviceAccountName: gpu-pruner
       containers:
         - name: container
-          image: 'ghcr.io/wseaton/gpu-pruner:latest-otel'
+          image: 'ghcr.io/fuddin-bit/gpu-pruner:latest-otel'
           args:
             - 'gpu-pruner'
             - '-d'
             - '--run-mode=scale-down'
             - '--prometheus-url=http://thanos-querier.openshift-monitoring.svc.cluster.local'
             - '--dashboard-port=8080'
+            - '--slack-interaction-port=3002'
             - '--slack-channel=#test-pruner'
           env:
             - name: RUST_BACKTRACE
@@ -39,6 +40,9 @@ spec:
             - containerPort: 8080
               name: dashboard
               protocol: TCP
+            - containerPort: 3002
+              name: slack-interactions
+              protocol: TCP
           resources:
             limits:
               cpu: 500m
diff --git a/gpu-pruner/hack/kustomization.yaml b/gpu-pruner/hack/kustomization.yaml
index c84bf79..abe4e2d 100644
--- a/gpu-pruner/hack/kustomization.yaml
+++ b/gpu-pruner/hack/kustomization.yaml
@@ -5,4 +5,5 @@ resources:
 - clusterrole.yaml
 - service.yaml
 - route.yaml
+- slack-interactions-route.yaml
 - servicemonitor.yaml
\ No newline at end of file
diff --git a/gpu-pruner/hack/service.yaml b/gpu-pruner/hack/service.yaml
index 7b2ba7d..202c6eb 100644
--- a/gpu-pruner/hack/service.yaml
+++ b/gpu-pruner/hack/service.yaml
@@ -10,7 +10,11 @@ spec:
   selector:
     app: gpu-pruner
   ports:
-    - name: http
+    - name: dashboard
       protocol: TCP
       port: 8080
       targetPort: 8080
+    - name: slack-interactions
+      protocol: TCP
+      port: 3002
+      targetPort: 3002
diff --git a/gpu-pruner/hack/servicemonitor.yaml b/gpu-pruner/hack/servicemonitor.yaml
index 84c2cd5..82b935f 100644
--- a/gpu-pruner/hack/servicemonitor.yaml
+++ b/gpu-pruner/hack/servicemonitor.yaml
@@ -10,7 +10,7 @@ spec:
     matchLabels:
       app: gpu-pruner
   endpoints:
-    - port: http
+    - port: dashboard
       path: /metrics
       interval: 30s
       scrapeTimeout: 10s
diff --git a/gpu-pruner/hack/slack-interactions-route.yaml b/gpu-pruner/hack/slack-interactions-route.yaml
new file mode 100644
index 0000000..1471acf
--- /dev/null
+++ b/gpu-pruner/hack/slack-interactions-route.yaml
@@ -0,0 +1,21 @@
+# OpenShift Route exposing Slack interactive component callbacks over HTTPS.
+# After apply, set Slack App → Interactive Components → Request URL to:
+#   https://$(kubectl get route gpu-pruner-slack -n gpu-pruner-system -o jsonpath='{.spec.host}')/slack/interactions
+kind: Route
+apiVersion: route.openshift.io/v1
+metadata:
+  name: gpu-pruner-slack
+  namespace: gpu-pruner-system
+  labels:
+    app: gpu-pruner
+spec:
+  to:
+    kind: Service
+    name: gpu-pruner-dashboard
+    weight: 100
+  port:
+    targetPort: slack-interactions
+  tls:
+    termination: edge
+    insecureEdgeTerminationPolicy: Redirect
+  wildcardPolicy: None
diff --git a/gpu-pruner/hack/slack-interactions-service-lb.yaml b/gpu-pruner/hack/slack-interactions-service-lb.yaml
new file mode 100644
index 0000000..b5515b5
--- /dev/null
+++ b/gpu-pruner/hack/slack-interactions-service-lb.yaml
@@ -0,0 +1,26 @@
+# Optional: CoreWeave CKS LoadBalancer for Slack interactions (no OpenShift Route).
+# Apply standalone when not using slack-interactions-route.yaml:
+#   kubectl apply -f gpu-pruner/hack/slack-interactions-service-lb.yaml
+#
+# Requires TLS termination in front of this service (Slack requires HTTPS).
+# Set Slack App → Interactive Components → Request URL to:
+#   https://<external-hostname>/slack/interactions
+apiVersion: v1
+kind: Service
+metadata:
+  name: gpu-pruner-slack
+  namespace: gpu-pruner-system
+  labels:
+    app: gpu-pruner
+  annotations:
+    service.beta.kubernetes.io/coreweave-load-balancer-type: "public"
+    service.beta.kubernetes.io/external-hostname: "gpu-pruner-slack"
+spec:
+  type: LoadBalancer
+  selector:
+    app: gpu-pruner
+  ports:
+    - name: slack-interactions
+      protocol: TCP
+      port: 3002
+      targetPort: 3002
diff --git a/gpu-pruner/hack/slack-webhook-secret.example.yaml b/gpu-pruner/hack/slack-webhook-secret.example.yaml
index 6ab66f3..bc8899f 100644
--- a/gpu-pruner/hack/slack-webhook-secret.example.yaml
+++ b/gpu-pruner/hack/slack-webhook-secret.example.yaml
@@ -4,6 +4,13 @@
 #     --namespace=gpu-pruner-system \
 #     --from-literal=webhook-url='https://hooks.slack.com/services/T.../B.../...'
 #
+# After deploying with kubectl apply -k gpu-pruner/hack/, configure Slack:
+#   1. api.slack.com/apps → Your App → Interactive Components → Enable
+#   2. Request URL (OpenShift):
+#        https://$(kubectl get route gpu-pruner-slack -n gpu-pruner-system -o jsonpath='{.spec.host}')/slack/interactions
+#   3. Click a button in #test-pruner and verify annotations:
+#        kubectl get deployment <name> -n <namespace> -o yaml | grep gpu-pruner.io
+#
 apiVersion: v1
 kind: Secret
 metadata:
diff --git a/gpu-pruner/src/lib.rs b/gpu-pruner/src/lib.rs
index c3c20e7..b133b89 100644
--- a/gpu-pruner/src/lib.rs
+++ b/gpu-pruner/src/lib.rs
@@ -460,6 +460,75 @@ impl Scaler for ScaleKind {
     }
 }
 
+/// Record an acknowledgment on a workload by annotating it with an expiry and a user.
+///
+/// Merge-patches the `gpu-pruner.io/ack-until` (RFC 3339 timestamp, now plus
+/// `duration_hours`) and `gpu-pruner.io/ack-by` annotations onto the named resource,
+/// then increments `metrics::ACKNOWLEDGMENTS_TOTAL` once the patch succeeds.
+///
+/// # Errors
+/// Returns an error when `kind` is not a supported resource kind, when the expiry
+/// would fall outside the representable date range, or when the patch request fails.
+#[tracing::instrument(skip(client))]
+pub async fn acknowledge_workload(
+    client: KubeClient,
+    kind: &str,
+    name: &str,
+    namespace: &str,
+    duration_hours: u32,
+    user: &str,
+) -> anyhow::Result<()> {
+    // `DateTime + Duration` panics on overflow, and `duration_hours` can come from an
+    // HTTP callback, so the expiry is computed with a checked add instead.
+    let expires_at_rfc3339 = chrono::Utc::now()
+        .checked_add_signed(chrono::Duration::hours(i64::from(duration_hours)))
+        .ok_or_else(|| anyhow::anyhow!("acknowledgment of {duration_hours}h is out of range"))?
+        .to_rfc3339();
+
+    let patch = serde_json::json!({
+        "metadata": {
+            "annotations": {
+                "gpu-pruner.io/ack-until": expires_at_rfc3339,
+                "gpu-pruner.io/ack-by": user,
+            }
+        }
+    });
+    let params = PatchParams::default();
+    let merge = Patch::Merge(&patch);
+
+    // Each kind needs its own typed `Api`; the patch body is the same for all of them.
+    match kind {
+        "Deployment" => {
+            let api: Api<Deployment> = Api::namespaced(client, namespace);
+            api.patch(name, &params, &merge).await?;
+        }
+        "ReplicaSet" => {
+            let api: Api<ReplicaSet> = Api::namespaced(client, namespace);
+            api.patch(name, &params, &merge).await?;
+        }
+        "StatefulSet" => {
+            let api: Api<StatefulSet> = Api::namespaced(client, namespace);
+            api.patch(name, &params, &merge).await?;
+        }
+        "Notebook" => {
+            let api: Api<Notebook> = Api::namespaced(client, namespace);
+            api.patch(name, &params, &merge).await?;
+        }
+        "InferenceService" => {
+            let api: Api<InferenceService> = Api::namespaced(client, namespace);
+            api.patch(name, &params, &merge).await?;
+        }
+        _ => return Err(anyhow::anyhow!("Unsupported resource kind: {}", kind)),
+    }
+
+    metrics::ACKNOWLEDGMENTS_TOTAL.inc();
+    tracing::info!(
+        "Acknowledged [{kind}] {namespace}:{name} by {user} until {expires_at_rfc3339}"
+    );
+
+    Ok(())
+}
+
 /// Check if a workload has an active acknowledgment annotation
 #[tracing::instrument(skip(_client))]
 pub async fn check_acknowledgment(
diff --git a/gpu-pruner/src/main.rs b/gpu-pruner/src/main.rs
index 6e3bf14..fc09b76 100644
--- a/gpu-pruner/src/main.rs
+++ b/gpu-pruner/src/main.rs
@@ -26,6 +26,7 @@ use futures::stream::StreamExt;
 
 use prometheus_http_query::Client;
 use serde::Serialize;
+use serde_json::json;
 
 use jiff::{SignedDuration, Timestamp};
 use k8s_openapi::api::core::v1::Pod;
@@ -34,11 +35,20 @@ use kube::{Api, Client as KubeClient, Resource};
 use clap::{Parser, ValueEnum};
 
 use gpu_pruner::{
-    Meta, PodMetricData, QueryResponse, ScaleKind, Scaler, TlsMode, check_acknowledgment,
+    Meta, PodMetricData, QueryResponse, ScaleKind, Scaler, TlsMode, acknowledge_workload, check_acknowledgment,
     find_root_object, get_enabled_resources, get_prom_client, get_prometheus_token,
     slack::SlackNotifier,
 };
 
+use axum::{
+    extract::State,
+    http::StatusCode,
+    response::{IntoResponse, Response},
+    routing::post,
+    Router,
+};
+use std::net::SocketAddr;
+
 /// `gpu-pruner` is a tool to prune idle pods based on GPU utilization. It uses Prometheus to query
 /// GPU utilization metrics and scales down pods that have been idle for a certain duration.
 ///
@@ -132,6 +142,11 @@ struct Cli {
     #[clap(long, default_value = "#test-pruner")]
     slack_channel: String,
 
+    /// Port to listen for Slack interactive component callbacks (button clicks).
+    /// Required if you want users to acknowledge idle GPUs from Slack messages.
+    #[clap(long)]
+    slack_interaction_port: Option<u16>,
+
 }
 
 #[derive(Debug, Clone, ValueEnum, Default, Serialize)]
@@ -151,6 +166,202 @@ enum LogFormat {
 
 static QUERY_FAILURES: AtomicUsize = AtomicUsize::new(0);
 
+// Slack interaction handler state (extracted via `State` in `handle_slack_interaction`)
+#[derive(Clone)]
+struct SlackInteractionState {
+    kube_client: KubeClient, // cloned and passed to `acknowledge_workload` per callback
+}
+
+// Slack interaction payload structures. This one is only used to spot a `url_verification` challenge.
+#[derive(Debug, serde::Deserialize)]
+struct SlackVerificationPayload {
+    #[serde(rename = "type")]
+    payload_type: String, // JSON `type` field; the handler compares it to "url_verification"
+    challenge: Option<String>, // returned verbatim as the response body when present
+}
+
+#[derive(Debug, serde::Deserialize)]
+struct SlackInteractionPayload { // the parts of the callback JSON that the handler reads
+    user: SlackUser, // its `name` is recorded as the acknowledging user
+    actions: Vec<SlackAction>, // only the first entry is used
+    response_url: String, // the handler POSTs its confirmation or error message here
+}
+
+#[derive(Debug, serde::Deserialize)]
+struct SlackUser {
+    name: String, // passed to `acknowledge_workload` as `user`
+}
+
+#[derive(Debug, serde::Deserialize)]
+struct SlackAction {
+    value: String, // expected format: kind:namespace:name:duration (duration in hours)
+}
+
+// Handler for Slack interactive component callbacks (`POST /slack/interactions`).
+//
+// Expects a form-encoded body whose `payload` field holds JSON. The first action's
+// `value` must be `kind:namespace:name:duration_hours`. On success the workload is
+// annotated through `acknowledge_workload` and a confirmation that replaces the
+// original message is posted to the payload's `response_url`.
+//
+// Returns 200 for an empty payload, an echoed `url_verification` challenge, or a
+// successful acknowledgment; 400 for malformed input; 500 if the acknowledgment fails.
+//
+// NOTE(review): nothing here authenticates the caller. Slack signs its callbacks
+// (`X-Slack-Signature` header); that signature should be verified before this
+// endpoint is reachable from outside the cluster.
+async fn handle_slack_interaction(
+    State(state): State<SlackInteractionState>,
+    body: String,
+) -> Response {
+    // Largest "Keep" duration offered by the notification buttons. The duration is
+    // read from the request body, so it is bounded here instead of being trusted.
+    const MAX_ACK_HOURS: u32 = 24;
+
+    tracing::info!("Received Slack interaction callback");
+
+    // Parse the form-encoded payload
+    let payload_str = match serde_urlencoded::from_str::<Vec<(String, String)>>(&body) {
+        Ok(params) => {
+            // Slack sends the payload in a "payload" field
+            params
+                .into_iter()
+                .find(|(k, _)| k == "payload")
+                .map(|(_, v)| v)
+                .unwrap_or_default()
+        }
+        Err(e) => {
+            tracing::error!("Failed to parse form data: {}", e);
+            return (StatusCode::BAD_REQUEST, "Invalid form data").into_response();
+        }
+    };
+
+    // A missing or empty `payload` field is answered with 200 rather than an error
+    if payload_str.is_empty() {
+        tracing::info!("Received empty Slack interaction payload (URL verification probe)");
+        return (StatusCode::OK, "OK").into_response();
+    }
+
+    // Echo the challenge back when the payload is a `url_verification` request
+    if let Ok(verification) = serde_json::from_str::<SlackVerificationPayload>(&payload_str) {
+        if verification.payload_type == "url_verification" {
+            if let Some(challenge) = verification.challenge {
+                tracing::info!("Responding to Slack URL verification challenge");
+                return (StatusCode::OK, challenge).into_response();
+            }
+        }
+    }
+
+    let payload: SlackInteractionPayload = match serde_json::from_str(&payload_str) {
+        Ok(p) => p,
+        Err(e) => {
+            tracing::error!("Failed to parse Slack payload JSON: {}", e);
+            return (StatusCode::BAD_REQUEST, "Invalid JSON payload").into_response();
+        }
+    };
+
+    // Extract action value (format: kind:namespace:name:duration)
+    let action_value = match payload.actions.first() {
+        Some(action) => &action.value,
+        None => {
+            tracing::error!("No action found in payload");
+            return (StatusCode::BAD_REQUEST, "No action found").into_response();
+        }
+    };
+
+    let parts: Vec<&str> = action_value.split(':').collect();
+    if parts.len() != 4 {
+        tracing::error!("Invalid action value format: {}", action_value);
+        return (StatusCode::BAD_REQUEST, "Invalid action value").into_response();
+    }
+
+    let (kind, namespace, name, duration_str) = (parts[0], parts[1], parts[2], parts[3]);
+    let duration_hours: u32 = match duration_str.parse() {
+        Ok(d) => d,
+        Err(e) => {
+            tracing::error!("Failed to parse duration: {}", e);
+            return (StatusCode::BAD_REQUEST, "Invalid duration").into_response();
+        }
+    };
+    if duration_hours == 0 || duration_hours > MAX_ACK_HOURS {
+        tracing::error!("Duration out of range: {}h (allowed 1-{}h)", duration_hours, MAX_ACK_HOURS);
+        return (StatusCode::BAD_REQUEST, "Invalid duration").into_response();
+    }
+
+    let user = &payload.user.name;
+
+    // Apply acknowledgment
+    match acknowledge_workload(
+        state.kube_client.clone(),
+        kind,
+        name,
+        namespace,
+        duration_hours,
+        user,
+    )
+    .await
+    {
+        Ok(_) => {
+            tracing::info!(
+                "Successfully acknowledged [{kind}] {namespace}:{name} for {duration_hours}h by {user}"
+            );
+            // Send response back to Slack to update the message
+            let response_message = json!({
+                "replace_original": true,
+                "attachments": [{
+                    "color": "good",
+                    "title": "✓ GPU Idle Acknowledgment Confirmed",
+                    "fields": [
+                        { "title": "Resource", "value": format!("{}: {}", kind, name), "short": true },
+                        { "title": "Namespace", "value": namespace, "short": true },
+                        { "title": "Acknowledged By", "value": user, "short": true },
+                        { "title": "Duration", "value": format!("{} hours", duration_hours), "short": true },
+                        { "title": "Status", "value": format!("GPU will not be scaled down for the next {} hours", duration_hours), "short": false }
+                    ],
+                    "footer": "gpu-pruner",
+                    "ts": std::time::SystemTime::now()
+                        .duration_since(std::time::UNIX_EPOCH)
+                        .unwrap_or_default()
+                        .as_secs()
+                }]
+            });
+
+            // Send the response to Slack via response_url
+            if let Err(e) = reqwest::Client::new()
+                .post(&payload.response_url)
+                .json(&response_message)
+                .send()
+                .await
+            {
+                tracing::error!("Failed to send response to Slack: {}", e);
+            }
+
+            (StatusCode::OK, "Acknowledged").into_response()
+        }
+        Err(e) => {
+            tracing::error!("Failed to acknowledge workload: {}", e);
+
+            // Send error response to Slack
+            let error_message = json!({
+                "replace_original": false,
+                "text": format!("❌ Failed to acknowledge: {}", e),
+                "response_type": "ephemeral"
+            });
+
+            if let Err(e) = reqwest::Client::new()
+                .post(&payload.response_url)
+                .json(&error_message)
+                .send()
+                .await
+            {
+                tracing::error!("Failed to send error response to Slack: {}", e);
+            }
+
+            (StatusCode::INTERNAL_SERVER_ERROR, "Failed to acknowledge").into_response()
+        }
+    }
+}
+
 #[cfg(feature = "otel")]
 static RESOURCE: LazyLock<OTELResource> = LazyLock::new(|| {
     OTELResource::builder()
@@ -411,11 +622,48 @@ async fn main() -> anyhow::Result<()> {
         })
     };
 
+    // Spawn Slack interaction HTTP server if port is configured
+    let slack_interaction_task = if let Some(port) = args.slack_interaction_port {
+        let kube_client = KubeClient::try_default()
+            .await
+            .expect("failed to get kube client for slack interactions");
+
+        let state = SlackInteractionState { kube_client };
+
+        let app = Router::new()
+            .route("/slack/interactions", post(handle_slack_interaction))
+            .with_state(state);
+
+        let addr = SocketAddr::from(([0, 0, 0, 0], port));
+        tracing::info!("Starting Slack interaction server on {}", addr);
+
+        Some(tokio::spawn(async move {
+            let listener = tokio::net::TcpListener::bind(addr)
+                .await
+                .expect("failed to bind slack interaction server");
+            axum::serve(listener, app)
+                .await
+                .expect("failed to start slack interaction server");
+            Ok::<(), anyhow::Error>(())
+        }))
+    } else {
+        tracing::info!("Slack interaction server disabled (no --slack-interaction-port set)");
+        None
+    };
+
     // Wait for all tasks
-    _ = tokio::try_join! {
-        query_task,
-        scale_down_task
-    }?;
+    if let Some(interaction_task) = slack_interaction_task {
+        _ = tokio::try_join! {
+            query_task,
+            scale_down_task,
+            interaction_task
+        }?;
+    } else {
+        _ = tokio::try_join! {
+            query_task,
+            scale_down_task
+        }?;
+    }
 
     Ok(())
 }
diff --git a/gpu-pruner/src/slack.rs b/gpu-pruner/src/slack.rs
index 4b29e8c..91a08f1 100644
--- a/gpu-pruner/src/slack.rs
+++ b/gpu-pruner/src/slack.rs
@@ -35,9 +35,15 @@ impl SlackNotifier {
         let resource_name = workload.name();
         let namespace = workload.namespace().unwrap_or_else(|| "default".to_string());
 
+        // Encode workload info in button values: kind:namespace:name:duration
+        let button_value_4h = format!("{}:{}:{}:4", resource_type, namespace, resource_name);
+        let button_value_8h = format!("{}:{}:{}:8", resource_type, namespace, resource_name);
+        let button_value_24h = format!("{}:{}:{}:24", resource_type, namespace, resource_name);
+
         let payload = json!({
             "channel": self.channel,
             "attachments": [{
+                "callback_id": "ack_idle_gpu",
                 "color": "warning",
                 "title": "🔔 Idle GPU Detected - Scale Down Pending",
                 "fields": [
@@ -58,10 +64,33 @@ impl SlackNotifier {
                     },
                     {
                         "title": "Action",
-                        "value": "Scaling to 0 replicas",
+                        "value": "Scaling to 0 replicas unless acknowledged",
                         "short": false
                     }
                 ],
+                "actions": [
+                    {
+                        "name": "ack",
+                        "text": "Keep 4h",
+                        "type": "button",
+                        "value": button_value_4h,
+                        "style": "primary"
+                    },
+                    {
+                        "name": "ack",
+                        "text": "Keep 8h",
+                        "type": "button",
+                        "value": button_value_8h,
+                        "style": "primary"
+                    },
+                    {
+                        "name": "ack",
+                        "text": "Keep 24h",
+                        "type": "button",
+                        "value": button_value_24h,
+                        "style": "primary"
+                    }
+                ],
                 "footer": "gpu-pruner",
                 "ts": std::time::SystemTime::now()
                     .duration_since(std::time::UNIX_EPOCH)