diff --git a/ansible/inventory/demo/wiab-staging.yml b/ansible/inventory/demo/wiab-staging.yml index a3fda05b7..102cf9986 100644 --- a/ansible/inventory/demo/wiab-staging.yml +++ b/ansible/inventory/demo/wiab-staging.yml @@ -6,6 +6,6 @@ wiab-staging: ansible_user: 'demo' ansible_ssh_private_key_file: "~/.ssh/id_ed25519" vars: - artifact_hash: 8cd7cf27c149f990a9bca54f196e21fc326cde04 + artifact_hash: 468e8637cfa318e0f9596a4e78e87820b63c4e1a # when enabled, disable WAN SNAT/masquerading for VMs on the private network private_deployment: true diff --git a/bin/grafana-vm.sh b/bin/grafana-vm.sh index a4662c6ba..34aa6ae96 100644 --- a/bin/grafana-vm.sh +++ b/bin/grafana-vm.sh @@ -1,8 +1,19 @@ #!/usr/bin/env bash -# This script creates a VM named 'grafananode' with IP 192.168.122.100 where a grafana instance will be deployed. -# It uses cloud-init for initial setup and requires libvirt and virt-install to be installed on the host machine. -# It is intended for testing or development purposes and should not be used for production setups. - +# +# Non-interactive script for deploying the Wire standard set of Ubuntu Server VMs +# on a single dedicated server using libvirt. It is a fork for offline-vm-setup.sh +# +# Script will create VMs with a sudo user "demo" and PW auth disabled. +# All VMs are created with DHCP IPs from default libvirt subnet (192.168.122.0/24). +# IPs and hostnames are automatically appended to /etc/hosts once VMs receive their addresses. +# +# The script will exit gracefully if VMs already exist. 
+# +# | hostname | RAM | VCPUs | disk space (thin provisioned) | +# -------------------------------------------------------------- +# | grafananode | 8 GiB | 2 | 100 GB | +# -------------------------------------------------------------- +# | total | 8 GiB | 2 | 100 GB | set -Eeuo pipefail @@ -10,6 +21,10 @@ msg() { echo >&2 -e "${1-}" } +cleanup() { + trap - SIGINT SIGTERM ERR EXIT +} + if [[ $EUID -eq 0 ]]; then msg "Please don't run me as root" 1>&2 exit 1 @@ -17,143 +32,253 @@ fi trap cleanup SIGINT SIGTERM ERR EXIT -cleanup() { - trap - SIGINT SIGTERM ERR EXIT - pkill -f "http.server" || true - rm -r "$DEPLOY_DIR"/nocloud/* 2>/dev/null || true +die() { + local msg=$1 + local code=${2-1} # default exit status 1 + msg "$msg" + exit "$code" } SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd -P) DEPLOY_DIR="$(cd "$SCRIPT_DIR/../" && pwd)" NOCLOUD_DIR=$DEPLOY_DIR/nocloud +BASE_IMAGE_DIR="$DEPLOY_DIR/" +BASE_IMAGE="$BASE_IMAGE_DIR/ubuntu-22.04-base.qcow2" +IMAGE_URL="https://cloud-images.ubuntu.com/jammy/current/jammy-server-cloudimg-amd64.img" + mkdir -p "$NOCLOUD_DIR" +mkdir -p "$BASE_IMAGE_DIR" -VM_NAME="grafananode" -VM_IP="192.168.122.100" -VM_VCPU=4 -VM_RAM=8192 -VM_DISK=100 +# Download base Ubuntu cloud image if not present +if [ ! -f "$BASE_IMAGE" ]; then + msg "Downloading Ubuntu 22.04 cloud image to $BASE_IMAGE ..." + curl -fL -o "$BASE_IMAGE" "$IMAGE_URL" || die "Failed to download Ubuntu cloud image" + msg "Base image downloaded successfully" +fi -while grep -Fq "$VM_IP" /etc/hosts; do - VM_IP="192.168.122.$(shuf -i100-240 -n1)" -done +SSH_DIR="$DEPLOY_DIR/ssh" +mkdir -p "$SSH_DIR" -if [[ -f "$HOME"/.ssh/authorized_keys && -s "$HOME"/.ssh/authorized_keys ]]; then - SSHKEY_HUMAN=$(head -n 1 ~/.ssh/authorized_keys) -else - read -r -p "No existing SSH keys found in ~/.ssh/authorized_keys for the current user ($USER). 
Please enter a valid SSH public key to proceed: " SSHKEY_HUMAN +# SSH key paths +SSH_PRIVKEY="$SSH_DIR/id_ed25519" +SSH_PUBKEY="$SSH_DIR/id_ed25519.pub" + +# Create SSH keypair if it doesn't exist +if [ ! -f "$SSH_PRIVKEY" ]; then + msg "Generating SSH keypair in $SSH_DIR..." + ssh-keygen -t ed25519 -q -N '' -f "$SSH_PRIVKEY" + msg "SSH keypair generated successfully" fi -if [[ -f "$HOME"/.ssh/id_ed25519 && -f "$HOME"/.ssh/id_ed25519.pub ]]; then - SSHKEY_DEMO=$(cat "$HOME"/.ssh/id_ed25519.pub) -elif [[ -f "$HOME"/.ssh/id_ed25519 ]]; then - # Public key missing, generate it from private key - ssh-keygen -y -f "$HOME"/.ssh/id_ed25519 > "$HOME"/.ssh/id_ed25519.pub - SSHKEY_DEMO=$(cat "$HOME"/.ssh/id_ed25519.pub) -else - ssh-keygen -t ed25519 -q -N '' -f "$HOME"/.ssh/id_ed25519 - SSHKEY_DEMO=$(cat "$HOME"/.ssh/id_ed25519.pub) +# Check and fix SSH private key permissions +if [ -f "$SSH_PRIVKEY" ]; then + current_perms=$(stat -c %a "$SSH_PRIVKEY" 2>/dev/null || stat -f %A "$SSH_PRIVKEY" 2>/dev/null) + if [ "$current_perms" != "400" ]; then + msg "Fixing SSH private key permissions from $current_perms to 400" + chmod 400 "$SSH_PRIVKEY" + fi fi +# Read the public key +SSHKEY_DEMO=$(cat "$SSH_PUBKEY") + +VM_NAME=(grafananode) +VM_VCPU=(2) +VM_RAM=(8192) +VM_DISK=(100) +VM_NETWORK='wirebox' + +# Check if VM_NETWORK exists, if not fall back to 'default' +if ! 
sudo virsh net-list --all 2>/dev/null | grep -Fq "$VM_NETWORK"; then + msg "Network $VM_NETWORK not found, switching to default network" + VM_NETWORK='default' +fi + +msg "" +msg "Including the following SSH Keys for VM deployment:" msg "" -msg "Preparing to use the following SSH keys for VM deployment:" -msg "Existing key from ~/.ssh/authorized_keys: $SSHKEY_HUMAN" -msg "Local keypair key from ~/.ssh/id_ed25519: $SSHKEY_DEMO" +msg "SSH keys stored in: $SSH_DIR" +msg "Public key: $SSHKEY_DEMO" msg "" -nohup python3 -m http.server 3003 -d "$NOCLOUD_DIR" /dev/null & prepare_config() { - VM_DIR=$NOCLOUD_DIR/$VM_NAME + VM_DIR=$NOCLOUD_DIR/${VM_NAME[i]} mkdir -p "$VM_DIR" - touch "$VM_DIR"/{vendor-data,meta-data} + cat >"$VM_DIR/user-data"<"$VM_DIR/meta-data"</dev/null || \ + die "Failed to create cloud-init seed ISO for ${VM_NAME[i]}" +} + +get_vm_ip() { + local vm_name=$1 + local max_wait=${2:-300} + local elapsed=0 + + while [ "$elapsed" -lt "$max_wait" ]; do + # Get MAC address of VM + local mac + mac=$(sudo virsh domiflist "$vm_name" 2>/dev/null | grep -oP '(?<= )[0-9a-f:]{17}' | head -1) + + if [ -n "$mac" ]; then + # Query DHCP leases for this MAC address + local ip + ip=$(sudo virsh net-dhcp-leases "$VM_NETWORK" 2>/dev/null | grep "$mac" | awk '{print $5}' | cut -d'/' -f1) + + if [ -n "$ip" ]; then + echo "$ip" + return 0 + fi + fi + + sleep 30 + elapsed=$((elapsed + 30)) + done + + return 1 } create_vm () { - prepare_config + # Check if VM already exists + if sudo virsh list --all | grep -Fq "${VM_NAME[i]}"; then + msg "VM ${VM_NAME[i]} already exists, skipping creation" + return 0 + fi + + prepare_config "${VM_NAME[i]}" + + VM_DISK_PATH="/var/lib/libvirt/images/${VM_NAME[i]}.qcow2" + SEED_ISO="$NOCLOUD_DIR/${VM_NAME[i]}/seed.iso" + + # Create qcow2 backing file from base image + sudo qemu-img create -f qcow2 -b "$BASE_IMAGE" -F qcow2 "$VM_DISK_PATH" || \ + die "Failed to create backing file for ${VM_NAME[i]}" + + # Resize backing file to desired size + sudo 
qemu-img resize "$VM_DISK_PATH" "${VM_DISK[i]}G" || \ + die "Failed to resize disk for ${VM_NAME[i]}" sudo virt-install \ - --name "$VM_NAME" \ - --ram "$VM_RAM" \ - --disk path=/var/lib/libvirt/images/"$VM_NAME".qcow2,size="$VM_DISK" \ - --vcpus "$VM_VCPU" \ - --network bridge=virbr0 \ + --name "${VM_NAME[i]}" \ + --ram "${VM_RAM[i]}" \ + --disk "path=$VM_DISK_PATH,format=qcow2,bus=virtio" \ + --disk "path=$SEED_ISO,device=cdrom" \ + --vcpus "${VM_VCPU[i]}" \ + --network "bridge=virbr0,model=virtio" \ --graphics none \ - --osinfo detect=on,require=off \ + --osinfo ubuntu22.04 \ --noautoconsole \ - --location "$DEPLOY_DIR"/ubuntu.iso,kernel=casper/vmlinuz,initrd=casper/initrd \ - --extra-args "console=ttyS0,115200n8 autoinstall ds=nocloud-net;s=http://192.168.122.1:3003/$VM_NAME" + --import \ + --console pty,target_type=serial } -if sudo virsh list --all | grep -Fq "$VM_NAME"; then - msg "" - msg "ATTENTION - VM $VM_NAME already exists" - msg "" - exit 0 -else - set -u - msg "" - msg "Creating VM $VM_NAME ..." - msg "IP: $VM_IP" - msg "VCPUs: $VM_VCPU" - msg "RAM: $VM_RAM MiB" - msg "DISK: $VM_DISK GB" - create_vm - if grep -Fq "$VM_NAME" /etc/hosts; then +for (( i=0; i<${#VM_NAME[@]}; i++ )); do + if sudo virsh list --all | grep -Fq "${VM_NAME[i]}"; then + msg "" + msg "ATTENTION - VM ""${VM_NAME[i]}"" already exists" msg "" - msg "Updating existing record in /etc/hosts for $VM_NAME with IP $VM_IP" - sudo sed -i -e "/$VM_NAME/c\\$VM_IP $VM_NAME" /etc/hosts + continue else + set -u msg "" - msg "Writing IP and hostname to /etc/hosts ..." - echo "$VM_IP $VM_NAME" | sudo tee -a /etc/hosts + msg "Creating VM ""${VM_NAME[i]}"" ..." + msg "VCPUs: ""${VM_VCPU[i]}""" + msg "RAM: ""${VM_RAM[i]}"" MiB" + msg "DISK: ""${VM_DISK[i]}"" GB" + create_vm "${VM_NAME[i]}" + fi +done + +msg "" +msg "Waiting for VMs to complete cloud-init provisioning..." 
+msg "" + +# Create environment file to store VM IPs +ENV_FILE="$DEPLOY_DIR/.vm-env" +: > "$ENV_FILE" # Clear the file + +for (( i=0; i<${#VM_NAME[@]}; i++ )); do + # Skip if VM already existed + if sudo virsh list --all | grep -Fq "${VM_NAME[i]}"; then msg "" + msg "Waiting for ${VM_NAME[i]} to acquire DHCP IP address..." + + # Wait for VM to get IP address from DHCP + if vm_ip=$(get_vm_ip "${VM_NAME[i]}" 120); then + msg "${VM_NAME[i]} acquired IP: $vm_ip" + + # Set environment variable for this VM + env_var_name="${VM_NAME[i]}_ip" + export "${env_var_name}=$vm_ip" + echo "export ${env_var_name}=$vm_ip" >> "$ENV_FILE" + + # Update /etc/hosts with the actual DHCP IP + if grep -Fq "${VM_NAME[i]}" /etc/hosts; then + msg "Updating /etc/hosts for ${VM_NAME[i]} with IP $vm_ip" + sudo sed -i -e "/${VM_NAME[i]}/c\\$vm_ip ${VM_NAME[i]}" /etc/hosts + else + msg "Writing ${VM_NAME[i]} ($vm_ip) to /etc/hosts" + echo "$vm_ip ${VM_NAME[i]}" | sudo tee -a /etc/hosts >/dev/null + fi + + # Wait for SSH connectivity + msg "Waiting for SSH connectivity on ${VM_NAME[i]} ($vm_ip)..." + max_attempts=10 + attempt=0 + + while ! ssh -i "$SSH_PRIVKEY" -o ConnectTimeout=2 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ + "demo@$vm_ip" "exit" 2>/dev/null; do + attempt=$((attempt + 1)) + if [ $attempt -gt $max_attempts ]; then + msg "WARNING: ${VM_NAME[i]} ($vm_ip) did not become reachable after $max_attempts attempts" + break + fi + sleep 30 + done + + # Wait for cloud-init to complete + if [ $attempt -le $max_attempts ]; then + msg "Waiting for cloud-init to complete on ${VM_NAME[i]}..." 
+ ssh -i "$SSH_PRIVKEY" -o ConnectTimeout=2 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ + "demo@$vm_ip" "cloud-init status --wait" 2>/dev/null || true + msg "VM ${VM_NAME[i]} is ready at $vm_ip" + fi + else + msg "ERROR: ${VM_NAME[i]} did not acquire an IP address within timeout period" + fi fi - sleep 20 -fi +done + +msg "" +msg "Environment variables saved to: $ENV_FILE" +msg "Source with: source $ENV_FILE" +msg "VM IPs:" +grep export "$ENV_FILE" | sed 's/export / /' -while sudo virsh list --state-running --name | grep -Fxq "$VM_NAME"; do - sleep 20 - msg "INFO: $VM_NAME deployment still in progress ..." -done \ No newline at end of file diff --git a/bin/helm-operations.sh b/bin/helm-operations.sh index d60a1805a..f2a7fce80 100755 --- a/bin/helm-operations.sh +++ b/bin/helm-operations.sh @@ -11,6 +11,10 @@ CERT_MASTER_EMAIL="${CERT_MASTER_EMAIL:-certmaster@example.com}" # default is set to TRUE to deploy it unless changed DEPLOY_CERT_MANAGER="${DEPLOY_CERT_MANAGER:-TRUE}" +# DEPLOY_CALLING_SERVICES env variable is used to decide if sftd and coturn should get deployed +# default is set to TRUE to deploy them unless changed +DEPLOY_CALLING_SERVICES="${DEPLOY_CALLING_SERVICES:-TRUE}" + # DUMP_LOGS_ON_FAIL to dump logs on failure # it is false by default DUMP_LOGS_ON_FAIL="${DUMP_LOGS_ON_FAIL:-FALSE}" @@ -19,9 +23,7 @@ DUMP_LOGS_ON_FAIL="${DUMP_LOGS_ON_FAIL:-FALSE}" # assuming it to be the public address used by clients to reach public Address HOST_IP="${HOST_IP:-}" -if [ -z "$HOST_IP" ]; then -HOST_IP=$(wget -qO- https://api.ipify.org) -fi +CALLING_NODE="" function dump_debug_logs { local exit_code=$? 
@@ -32,12 +34,28 @@ function dump_debug_logs { } trap dump_debug_logs ERR -# picking a node for calling traffic (3rd kube worker node) -CALLING_NODE=$(kubectl get nodes --no-headers | tail -n 1 | awk '{print $1}') -if [[ -z "$CALLING_NODE" ]]; then - echo "Error: could not determine the last kube worker node via kubectl" - exit 1 -fi +configure_calling_environment() { + + if [[ "$DEPLOY_CALLING_SERVICES" != "TRUE" ]]; then + return 0 + fi + + if [[ -z "$HOST_IP" ]]; then + HOST_IP=$(wget -qO- https://api.ipify.org) + fi + + if [[ -z "$HOST_IP" ]]; then + echo "Error: could not determine HOST_IP automatically" + exit 1 + fi + + # picking a node for calling traffic (3rd kube worker node) + CALLING_NODE=$(kubectl get nodes --no-headers | tail -n 1 | awk '{print $1}') + if [[ -z "$CALLING_NODE" ]]; then + echo "Error: could not determine the last kube worker node via kubectl" + exit 1 + fi +} sync_pg_secrets() { echo "Retrieving PostgreSQL password from databases-ephemeral for wire-server deployment..." @@ -60,7 +78,15 @@ process_values() { ENV=$1 TYPE=$2 - charts=(fake-aws demo-smtp rabbitmq databases-ephemeral reaper wire-server webapp account-pages team-settings ingress-nginx-controller nginx-ingress-services coturn sftd cert-manager) + charts=(fake-aws demo-smtp rabbitmq databases-ephemeral reaper wire-server webapp account-pages team-settings ingress-nginx-controller) + + if [[ "$DEPLOY_CERT_MANAGER" == "TRUE" ]]; then + charts+=(nginx-ingress-services cert-manager) + fi + + if [[ "$DEPLOY_CALLING_SERVICES" == "TRUE" ]]; then + charts+=(coturn sftd) + fi if [[ "$ENV" != "prod" ]] || [[ -z "$TYPE" ]] ; then echo "Error: This function only supports prod deployments with TYPE as values or secrets. 
ENV must be 'prod', got: '$ENV' and '$TYPE'" @@ -92,41 +118,55 @@ configure_values() { TEMP_DIR=$(mktemp -d) trap 'rm -rf $TEMP_DIR' EXIT - # to find IP address of calling NODE - CALLING_NODE_IP=$(kubectl get node "$CALLING_NODE" -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}') - # Fixing the hosts with TARGET_SYSTEM and setting the turn server sed -e "s/example.com/$TARGET_SYSTEM/g" \ "$BASE_DIR/values/wire-server/values.yaml" > "$TEMP_DIR/wire-server-values.yaml" - # fixing the turnStatic values - yq eval -i ".brig.turnStatic.v2 = [\"turn:$HOST_IP:3478\", \"turn:$HOST_IP:3478?transport=tcp\"]" "$TEMP_DIR/wire-server-values.yaml" - # Fixing the hosts in webapp team-settings and account-pages charts for chart in webapp team-settings account-pages; do sed "s/example.com/$TARGET_SYSTEM/g" "$BASE_DIR/values/$chart/values.yaml" > "$TEMP_DIR/$chart-values.yaml" done - # Setting certManager and DNS records - sed -e 's/useCertManager: false/useCertManager: true/g' \ - -e "/certmasterEmail:$/s/certmasterEmail:/certmasterEmail: $CERT_MASTER_EMAIL/" \ - -e "s/example.com/$TARGET_SYSTEM/" \ - "$BASE_DIR/values/nginx-ingress-services/values.yaml" > "$TEMP_DIR/nginx-ingress-services-values.yaml" + files=(wire-server-values.yaml webapp-values.yaml team-settings-values.yaml account-pages-values.yaml) + + if [[ "$DEPLOY_CERT_MANAGER" == "TRUE" ]]; then + # Setting certManager and DNS records for Let's Encrypt based certificate management + sed -e 's/useCertManager: false/useCertManager: true/g' \ + -e "/certmasterEmail:$/s/certmasterEmail:/certmasterEmail: $CERT_MASTER_EMAIL/" \ + -e "s/example.com/$TARGET_SYSTEM/" \ + "$BASE_DIR/values/nginx-ingress-services/values.yaml" > "$TEMP_DIR/nginx-ingress-services-values.yaml" + + files+=(nginx-ingress-services-values.yaml) + fi + + if [[ "$DEPLOY_CALLING_SERVICES" == "TRUE" ]]; then + # to find IP address of calling NODE + CALLING_NODE_IP=$(kubectl get node "$CALLING_NODE" -o 
jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}') + + # fixing the turnStatic values + yq eval -i ".brig.turnStatic.v2 = [\"turn:$HOST_IP:3478\", \"turn:$HOST_IP:3478?transport=tcp\"]" "$TEMP_DIR/wire-server-values.yaml" - # Fixing SFTD hosts and setting the cert-manager to http01 - sed -e "s/webapp.example.com/webapp.$TARGET_SYSTEM/" \ - -e "s/sftd.example.com/sftd.$TARGET_SYSTEM/" \ - -e 's/name: letsencrypt-prod/name: letsencrypt-http01/' \ - "$BASE_DIR/values/sftd/values.yaml" > "$TEMP_DIR/sftd-values.yaml" + # Fix SFTD hostnames, and only enable Let's Encrypt specific issuer changes when cert-manager is enabled. + sed -e "s/webapp.example.com/webapp.$TARGET_SYSTEM/" \ + -e "s/sftd.example.com/sftd.$TARGET_SYSTEM/" \ + "$BASE_DIR/values/sftd/values.yaml" > "$TEMP_DIR/sftd-values.yaml" - # Setting coturn node IP values - yq eval -i ".coturnTurnListenIP = \"$CALLING_NODE_IP\"" "$BASE_DIR/values/coturn/values.yaml" - yq eval -i ".coturnTurnRelayIP = \"$CALLING_NODE_IP\"" "$BASE_DIR/values/coturn/values.yaml" - yq eval -i ".coturnTurnExternalIP = \"$HOST_IP\"" "$BASE_DIR/values/coturn/values.yaml" + cp "$BASE_DIR/values/coturn/values.yaml" "$TEMP_DIR/coturn-values.yaml" + + if [[ "$DEPLOY_CERT_MANAGER" == "TRUE" ]]; then + yq eval -i '.tls.issuerRef.name = "letsencrypt-http01"' "$TEMP_DIR/sftd-values.yaml" + fi + + # Setting coturn node IP values + yq eval -i ".coturnTurnListenIP = \"$CALLING_NODE_IP\"" "$TEMP_DIR/coturn-values.yaml" + yq eval -i ".coturnTurnRelayIP = \"$CALLING_NODE_IP\"" "$TEMP_DIR/coturn-values.yaml" + yq eval -i ".coturnTurnExternalIP = \"$HOST_IP\"" "$TEMP_DIR/coturn-values.yaml" + + files+=(sftd-values.yaml coturn-values.yaml) + fi # Compare and copy files if different - for file in wire-server-values.yaml webapp-values.yaml team-settings-values.yaml account-pages-values.yaml \ - nginx-ingress-services-values.yaml sftd-values.yaml; do + for file in "${files[@]}"; do if ! 
cmp -s "$TEMP_DIR/$file" "$BASE_DIR/values/${file%-values.yaml}/values.yaml"; then cp "$TEMP_DIR/$file" "$BASE_DIR/values/${file%-values.yaml}/values.yaml" echo "Updating $BASE_DIR/values/${file%-values.yaml}/values.yaml" @@ -188,6 +228,11 @@ deploy_cert_manager() { deploy_calling_services() { + if [[ "$DEPLOY_CALLING_SERVICES" != "TRUE" ]]; then + echo "Skipping sftd and coturn deployment because DEPLOY_CALLING_SERVICES=$DEPLOY_CALLING_SERVICES" + return 0 + fi + echo "Deploying sftd and coturn" # select the node to deploy sftd kubectl annotate node "$CALLING_NODE" wire.com/external-ip="$HOST_IP" --overwrite @@ -202,6 +247,9 @@ deploy_calling_services() { main() { +# initialize calling-service specific values only when enabled +configure_calling_environment + # Create prod-values.example.yaml to values.yaml and take backup process_values "prod" "values" # Create prod-secrets.example.yaml to secrets.yaml and take backup @@ -228,7 +276,7 @@ if [[ "$DEPLOY_CERT_MANAGER" == "TRUE" ]]; then kubectl get certificate fi -# deploying sft and coturn services +# deploying sft and coturn services when enabled deploy_calling_services } diff --git a/dashboards/api_upload/CPU_and_Memory.json b/dashboards/api_upload/CPU_and_Memory.json index ebe3e7f9f..bb5d5935b 100644 --- a/dashboards/api_upload/CPU_and_Memory.json +++ b/dashboards/api_upload/CPU_and_Memory.json @@ -81,7 +81,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "fieldConfig": { "defaults": { @@ -357,7 +357,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "exemplar": false, @@ -385,7 +385,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "exemplar": false, @@ -495,7 +495,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "description": "", "fieldConfig": { @@ -611,7 
+611,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "description": "", "fieldConfig": { @@ -727,7 +727,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "fieldConfig": { "defaults": { @@ -1000,7 +1000,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "exemplar": false, @@ -1028,7 +1028,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "exemplar": false, @@ -1138,7 +1138,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "description": "", "fieldConfig": { @@ -1254,7 +1254,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "description": "", "fieldConfig": { @@ -1383,7 +1383,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "description": "", "fieldConfig": { @@ -1565,7 +1565,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "exemplar": false, @@ -1595,7 +1595,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "exemplar": false, @@ -1705,7 +1705,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "expr": "sum(irate(container_cpu_usage_seconds_total{job=\"kubelet\"}[5m])) by (namespace)", @@ -1805,7 +1805,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "expr": "sum(container_memory_working_set_bytes{job=\"kubelet\"}) by (namespace)", @@ -1909,7 +1909,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", 
"expr": "sum(increase(container_cpu_cfs_throttled_periods_total{job=\"cadvisor\", container!=\"\"}[$__rate_interval])) by (namespace, pod, container) /sum(increase(container_cpu_cfs_periods_total{job=\"cadvisor\", container!=\"\"}[$__rate_interval])) by (namespace, pod, container) > 0", @@ -1933,7 +1933,7 @@ { "current": { "text": "wire-cluster-prometheus-operator", - "value": "cenv3r35m756oe" + "value": "${PROMETHEUS_DS}" }, "includeAll": false, "name": "PROMETHEUS_DS", diff --git a/dashboards/api_upload/Kubernetes__Views__Nodes.json b/dashboards/api_upload/Kubernetes__Views__Nodes.json index da969cff1..72d4d8e95 100644 --- a/dashboards/api_upload/Kubernetes__Views__Nodes.json +++ b/dashboards/api_upload/Kubernetes__Views__Nodes.json @@ -3923,13 +3923,13 @@ "type": "prometheus", "uid": "${datasource}" }, - "definition": "label_values(node_uname_info{nodename=~\"(?i:($node)(\\.[a-z0-9.]+)?)\"}, instance)", + "definition": "label_values(node_uname_info, instance)", "hide": 2, "includeAll": false, "name": "instance", "options": [], "query": { - "query": "label_values(node_uname_info{nodename=~\"(?i:($node)(\\.[a-z0-9.]+)?)\"}, instance)", + "query": "label_values(node_uname_info, instance)", "refId": "StandardVariableQuery" }, "refresh": 2, diff --git a/dashboards/api_upload/NGINX_Ingress_controller.json b/dashboards/api_upload/NGINX_Ingress_controller.json index d89e23abb..c409e65be 100644 --- a/dashboards/api_upload/NGINX_Ingress_controller.json +++ b/dashboards/api_upload/NGINX_Ingress_controller.json @@ -179,7 +179,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "exemplar": false, @@ -281,7 +281,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "exemplar": false, @@ -383,7 +383,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "exemplar": 
false, @@ -485,7 +485,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": "round(sum(irate(nginx_ingress_controller_requests{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\"}[2m])), 0.001)", "format": "time_series", @@ -565,7 +565,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": "sum(avg_over_time(nginx_ingress_controller_nginx_process_connections{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m]))", "format": "time_series", @@ -650,7 +650,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": "sum(rate(nginx_ingress_controller_requests{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",status!~\"5.*\"}[2m])) / sum(rate(nginx_ingress_controller_requests{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\"}[2m]))", "format": "time_series", @@ -731,7 +731,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": "avg(nginx_ingress_controller_success{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"})", "format": "time_series", @@ -813,7 +813,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": "count(nginx_ingress_controller_config_last_reload_successful{controller_pod=~\"$controller\",controller_namespace=~\"$namespace\"} == 0)", "format": "time_series", @@ -936,7 +936,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": 
"round(sum(irate(nginx_ingress_controller_requests{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\"}[2m])) by (ingress), 0.001)", "format": "time_series", @@ -1087,7 +1087,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": "sum(rate(nginx_ingress_controller_requests{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",ingress=~\"$ingress\",status!~\"5.*\"}[2m])) by (ingress) / sum(rate(nginx_ingress_controller_requests{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",ingress=~\"$ingress\"}[2m])) by (ingress)", "format": "time_series", @@ -1193,7 +1193,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": "sum (irate (nginx_ingress_controller_request_size_sum{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m]))", "format": "time_series", @@ -1227,7 +1227,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "fieldConfig": { "defaults": { @@ -1379,7 +1379,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "fieldConfig": { "defaults": { @@ -1513,7 +1513,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "fieldConfig": { "defaults": { @@ -1801,7 +1801,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": "histogram_quantile(0.90, sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\"}[2m])) by (le, ingress))", "format": "table", @@ -1827,7 +1827,7 @@ { "datasource": 
{ "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": "sum(irate(nginx_ingress_controller_request_size_sum{ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\"}[2m])) by (ingress)", "format": "table", @@ -1857,7 +1857,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "fieldConfig": { "defaults": { @@ -2002,7 +2002,7 @@ { "current": { "text": "wire-cluster-prometheus-operator", - "value": "cenv3r35m756oe" + "value": "${PROMETHEUS_DS}" }, "includeAll": false, "name": "PROMETHEUS_DS", @@ -2020,7 +2020,7 @@ }, "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "definition": "", "includeAll": true, @@ -2066,7 +2066,7 @@ }, "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "definition": "", "includeAll": true, diff --git a/dashboards/api_upload/Node_Exporter_Full.json b/dashboards/api_upload/Node_Exporter_Full.json index 06f106cf0..1f60ca0b5 100644 --- a/dashboards/api_upload/Node_Exporter_Full.json +++ b/dashboards/api_upload/Node_Exporter_Full.json @@ -98,7 +98,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Resource pressure via PSI", "fieldConfig": { @@ -168,7 +168,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -184,7 +184,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -201,7 +201,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -218,7 +218,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": 
"code", "exemplar": false, @@ -239,7 +239,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Overall CPU busy percentage (averaged across all cores)", "fieldConfig": { @@ -308,7 +308,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -328,7 +328,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "System load over all CPU cores together", "fieldConfig": { @@ -397,7 +397,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -417,7 +417,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Real RAM usage excluding cache and reclaimable memory", "fieldConfig": { @@ -476,7 +476,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -496,7 +496,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Percentage of swap space currently used by the system", "fieldConfig": { @@ -565,7 +565,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -583,7 +583,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Used Root FS", "fieldConfig": { @@ -652,7 +652,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -671,7 +671,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "", "fieldConfig": { @@ -732,7 +732,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" 
+ "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -749,7 +749,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "", "fieldConfig": { @@ -811,7 +811,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -829,7 +829,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "", "fieldConfig": { @@ -895,7 +895,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -913,7 +913,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "", "fieldConfig": { @@ -983,7 +983,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -1003,7 +1003,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "", "fieldConfig": { @@ -1069,7 +1069,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -1087,7 +1087,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "", "fieldConfig": { @@ -1153,7 +1153,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -1184,7 +1184,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "CPU time spent busy vs idle, split by activity type", "fieldConfig": { @@ -1342,7 +1342,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -1359,7 +1359,7 @@ { "datasource": 
{ "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"user\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -1374,7 +1374,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"iowait\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -1388,7 +1388,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=~\".*irq\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -1402,7 +1402,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode!='idle',mode!='user',mode!='system',mode!='iowait',mode!='irq',mode!='softirq'}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -1416,7 +1416,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"idle\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -1434,7 +1434,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "RAM and swap usage overview, including caches", "fieldConfig": { @@ -1588,7 +1588,7 @@ { "datasource": { "type": "prometheus", - "uid": 
"cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", @@ -1603,7 +1603,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - (node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"})", @@ -1618,7 +1618,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}", @@ -1632,7 +1632,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", @@ -1646,7 +1646,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", @@ -1664,7 +1664,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Per-interface network traffic (receive and transmit) in bits per second", "fieldConfig": { @@ -1757,7 +1757,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", @@ -1771,7 +1771,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, 
"editorMode": "code", "expr": "rate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", @@ -1789,7 +1789,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Percentage of filesystem space used for each mounted device", "fieldConfig": { @@ -1871,7 +1871,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "((node_filesystem_size_bytes{instance=\"$node\", job=\"$job\", device!~\"rootfs\"} - node_filesystem_avail_bytes{instance=\"$node\", job=\"$job\", device!~\"rootfs\"}) / node_filesystem_size_bytes{instance=\"$node\", job=\"$job\", device!~\"rootfs\"}) * 100", @@ -1899,7 +1899,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "CPU time usage split by state, normalized across all CPU cores", "fieldConfig": { @@ -2136,7 +2136,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{mode=\"system\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -2151,7 +2151,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{mode=\"user\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -2165,7 +2165,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{mode=\"nice\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -2179,7 +2179,7 @@ { 
"datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{mode=\"iowait\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -2193,7 +2193,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{mode=\"irq\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -2207,7 +2207,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{mode=\"softirq\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -2221,7 +2221,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{mode=\"steal\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -2235,7 +2235,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{mode=\"idle\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -2250,7 +2250,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum by(instance) (irate(node_cpu_guest_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])) / on(instance) group_left sum by 
(instance)((irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]))) > 0", @@ -2269,7 +2269,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Breakdown of physical memory and swap usage. Hardware-detected memory errors are also displayed", "fieldConfig": { @@ -2657,7 +2657,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"} - node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", @@ -2672,7 +2672,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"}", @@ -2687,7 +2687,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", @@ -2701,7 +2701,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"}", @@ -2716,7 +2716,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"}", @@ -2731,7 +2731,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"}", @@ -2746,7 +2746,7 @@ { 
"datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", @@ -2761,7 +2761,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", @@ -2776,7 +2776,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_HardwareCorrupted_bytes{instance=\"$node\",job=\"$job\"}", @@ -2795,7 +2795,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Incoming and outgoing network traffic per interface", "fieldConfig": { @@ -2893,7 +2893,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", @@ -2907,7 +2907,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", @@ -2925,7 +2925,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Network interface utilization as a percentage of its maximum capacity", "fieldConfig": { @@ -3023,7 +3023,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "(rate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])\n / ignoring(speed) node_network_speed_bytes{instance=\"$node\",job=\"$job\", speed!=\"-1\"}) * 100", @@ -3037,7 +3037,7 @@ { "datasource": { "type": "prometheus", - "uid": 
"cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "(rate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])\n / ignoring(speed) node_network_speed_bytes{instance=\"$node\",job=\"$job\", speed!=\"-1\"}) * 100", @@ -3056,7 +3056,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Disk I/O operations per second for each device", "fieldConfig": { @@ -3154,7 +3154,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", @@ -3167,7 +3167,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", @@ -3184,7 +3184,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Disk I/O throughput per device", "fieldConfig": { @@ -3282,7 +3282,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", @@ -3297,7 +3297,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", @@ -3316,7 +3316,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Amount of available disk space per mounted filesystem, excluding rootfs. 
Based on block availability to non-root users", "fieldConfig": { @@ -3402,7 +3402,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", @@ -3418,7 +3418,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filesystem_free_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", @@ -3433,7 +3433,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", @@ -3452,7 +3452,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Disk usage (used = total - available) per mountpoint", "fieldConfig": { @@ -3538,7 +3538,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} - node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", @@ -3556,7 +3556,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Percentage of time the disk was actively processing I/O operations", "fieldConfig": { @@ -3642,7 +3642,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"} [$__rate_interval])", @@ -3662,7 +3662,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "How often tasks experience CPU, memory, or I/O delays. “Some” indicates partial slowdown; “Full” indicates all tasks are stalled. 
Based on Linux PSI metrics:\nhttps://docs.kernel.org/accounting/psi.html", "fieldConfig": { @@ -3772,7 +3772,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_pressure_cpu_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -3786,7 +3786,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_pressure_memory_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -3801,7 +3801,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_pressure_memory_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -3816,7 +3816,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_pressure_io_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -3831,7 +3831,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_pressure_io_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -3846,7 +3846,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_pressure_irq_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -3879,7 +3879,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Displays committed memory usage versus the system's commit limit. 
Exceeding the limit is allowed under Linux overcommit policies but may increase OOM risks under high load", "fieldConfig": { @@ -3986,7 +3986,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Committed_AS_bytes{instance=\"$node\",job=\"$job\"}", @@ -4000,7 +4000,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_CommitLimit_bytes{instance=\"$node\",job=\"$job\"}", @@ -4018,7 +4018,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Memory currently dirty (modified but not yet written to disk), being actively written back, or held by writeback buffers. High dirty or writeback memory may indicate disk I/O pressure or delayed flushing", "fieldConfig": { @@ -4104,7 +4104,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Writeback_bytes{instance=\"$node\",job=\"$job\"}", @@ -4118,7 +4118,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_WritebackTmp_bytes{instance=\"$node\",job=\"$job\"}", @@ -4132,7 +4132,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Dirty_bytes{instance=\"$node\",job=\"$job\"}", @@ -4146,7 +4146,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_NFS_Unstable_bytes{instance=\"$node\",job=\"$job\"}", @@ -4165,7 +4165,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Kernel slab memory usage, separated into reclaimable and non-reclaimable categories. 
Reclaimable memory can be freed under memory pressure (e.g., caches), while unreclaimable memory is locked by the kernel for core functions", "fieldConfig": { @@ -4251,7 +4251,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_SUnreclaim_bytes{instance=\"$node\",job=\"$job\"}", @@ -4265,7 +4265,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}", @@ -4283,7 +4283,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Memory used for mapped files (such as libraries) and shared memory (shmem and tmpfs), including variants backed by huge pages", "fieldConfig": { @@ -4370,7 +4370,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Mapped_bytes{instance=\"$node\",job=\"$job\"}", @@ -4384,7 +4384,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Shmem_bytes{instance=\"$node\",job=\"$job\"}", @@ -4398,7 +4398,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_ShmemHugePages_bytes{instance=\"$node\",job=\"$job\"}", @@ -4413,7 +4413,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_ShmemPmdMapped_bytes{instance=\"$node\",job=\"$job\"}", @@ -4432,7 +4432,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Proportion of memory pages in the kernel's active and inactive LRU lists relative to total RAM. 
Active pages have been recently used, while inactive pages are less recently accessed but still resident in memory", "fieldConfig": { @@ -4550,7 +4550,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "(node_memory_Inactive_bytes{instance=\"$node\",job=\"$job\"}) \n/ \n(node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"})", @@ -4564,7 +4564,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "(node_memory_Active_bytes{instance=\"$node\",job=\"$job\"}) \n/ \n(node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"})\n", @@ -4582,7 +4582,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Breakdown of memory pages in the kernel's active and inactive LRU lists, separated by anonymous (heap, tmpfs) and file-backed (caches, mmap) pages.", "fieldConfig": { @@ -4669,7 +4669,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Inactive_file_bytes{instance=\"$node\",job=\"$job\"}", @@ -4684,7 +4684,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Inactive_anon_bytes{instance=\"$node\",job=\"$job\"}", @@ -4699,7 +4699,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Active_file_bytes{instance=\"$node\",job=\"$job\"}", @@ -4714,7 +4714,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Active_anon_bytes{instance=\"$node\",job=\"$job\"}", @@ -4733,7 +4733,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Tracks kernel memory 
used for CPU-local structures, per-thread stacks, and bounce buffers used for I/O on DMA-limited devices. These areas are typically small but critical for low-level operations", "fieldConfig": { @@ -4820,7 +4820,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_KernelStack_bytes{instance=\"$node\",job=\"$job\"}", @@ -4834,7 +4834,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Percpu_bytes{instance=\"$node\",job=\"$job\"}", @@ -4849,7 +4849,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Bounce_bytes{instance=\"$node\",job=\"$job\"}", @@ -4869,7 +4869,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Usage of the kernel's vmalloc area, which provides virtual memory allocations for kernel modules and drivers. 
Includes total, used, and largest free block sizes", "fieldConfig": { @@ -4985,7 +4985,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_VmallocChunk_bytes{instance=\"$node\",job=\"$job\"}", @@ -5000,7 +5000,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_VmallocTotal_bytes{instance=\"$node\",job=\"$job\"}", @@ -5015,7 +5015,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_VmallocUsed_bytes{instance=\"$node\",job=\"$job\"}", @@ -5034,7 +5034,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Memory used by anonymous pages (not backed by files), including standard and huge page allocations. Includes heap, stack, and memory-mapped anonymous regions", "fieldConfig": { @@ -5120,7 +5120,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_AnonHugePages_bytes{instance=\"$node\",job=\"$job\"}", @@ -5134,7 +5134,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_AnonPages_bytes{instance=\"$node\",job=\"$job\"}", @@ -5152,7 +5152,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Memory that is locked in RAM and cannot be swapped out. 
Includes both kernel-unevictable memory and user-level memory locked with mlock()", "fieldConfig": { @@ -5255,7 +5255,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Unevictable_bytes{instance=\"$node\",job=\"$job\"}", @@ -5269,7 +5269,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Mlocked_bytes{instance=\"$node\",job=\"$job\"}", @@ -5287,7 +5287,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "How much memory is directly mapped in the kernel using different page sizes (4K, 2M, 1G). Helps monitor large page utilization in the direct map region", "fieldConfig": { @@ -5648,7 +5648,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_DirectMap1G_bytes{instance=\"$node\",job=\"$job\"}", @@ -5662,7 +5662,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_DirectMap2M_bytes{instance=\"$node\",job=\"$job\"}", @@ -5677,7 +5677,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_DirectMap4k_bytes{instance=\"$node\",job=\"$job\"}", @@ -5696,7 +5696,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Displays HugePages memory usage in bytes, including allocated, free, reserved, and surplus memory. 
All values are calculated based on the number of huge pages multiplied by their configured size", "fieldConfig": { @@ -5782,7 +5782,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_HugePages_Free{instance=\"$node\",job=\"$job\"} * node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", @@ -5796,7 +5796,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_HugePages_Rsvd{instance=\"$node\",job=\"$job\"} * node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", @@ -5810,7 +5810,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_HugePages_Surp{instance=\"$node\",job=\"$job\"} * node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", @@ -5824,7 +5824,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_HugePages_Total{instance=\"$node\",job=\"$job\"} * node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", @@ -5857,7 +5857,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of memory pages being read from or written to disk (page-in and page-out operations). 
High page-out may indicate memory pressure or swapping activity", "fieldConfig": { @@ -5955,7 +5955,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_vmstat_pgpgin{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -5969,7 +5969,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_vmstat_pgpgout{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -5987,7 +5987,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate at which memory pages are being swapped in from or out to disk. High swap-out activity may indicate memory pressure", "fieldConfig": { @@ -6085,7 +6085,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_vmstat_pswpin{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -6099,7 +6099,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_vmstat_pswpout{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -6117,7 +6117,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of memory page faults, split into total, major (disk-backed), and derived minor (non-disk) faults. 
High major fault rates may indicate memory pressure or insufficient RAM", "fieldConfig": { @@ -6241,7 +6241,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -6256,7 +6256,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -6271,7 +6271,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[$__rate_interval]) - irate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -6290,7 +6290,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of Out-of-Memory (OOM) kill events. A non-zero value indicates the kernel has terminated one or more processes due to memory exhaustion", "fieldConfig": { @@ -6392,7 +6392,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_vmstat_oom_kill{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -6425,7 +6425,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Tracks the system clock's estimated and maximum error, as well as its offset from the reference clock (e.g., via NTP). 
Useful for detecting synchronization drift", "fieldConfig": { @@ -6510,7 +6510,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_estimated_error_seconds{instance=\"$node\",job=\"$job\"}", @@ -6526,7 +6526,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_offset_seconds{instance=\"$node\",job=\"$job\"}", @@ -6542,7 +6542,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_maxerror_seconds{instance=\"$node\",job=\"$job\"}", @@ -6562,7 +6562,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "NTP phase-locked loop (PLL) time constant used by the kernel to control time adjustments. Lower values mean faster correction but less stability", "fieldConfig": { @@ -6647,7 +6647,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_loop_time_constant{instance=\"$node\",job=\"$job\"}", @@ -6666,7 +6666,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Shows whether the system clock is synchronized to a reliable time source, and the current frequency correction ratio applied by the kernel to maintain synchronization", "fieldConfig": { @@ -6751,7 +6751,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_sync_status{instance=\"$node\",job=\"$job\"}", @@ -6766,7 +6766,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_frequency_adjustment_ratio{instance=\"$node\",job=\"$job\"}", @@ -6781,7 +6781,7 @@ { "datasource": { "type": 
"prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_tick_seconds{instance=\"$node\",job=\"$job\"}", @@ -6797,7 +6797,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_tai_offset_seconds{instance=\"$node\",job=\"$job\"}", @@ -6817,7 +6817,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Displays the PPS signal's frequency offset and stability (jitter) in hertz. Useful for monitoring high-precision time sources like GPS or atomic clocks", "fieldConfig": { @@ -6902,7 +6902,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_pps_frequency_hertz{instance=\"$node\",job=\"$job\"}", @@ -6917,7 +6917,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_pps_stability_hertz{instance=\"$node\",job=\"$job\"}", @@ -6936,7 +6936,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Tracks PPS signal timing jitter and shift compared to system clock", "fieldConfig": { @@ -7021,7 +7021,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_pps_jitter_seconds{instance=\"$node\",job=\"$job\"}", @@ -7036,7 +7036,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_pps_shift_seconds{instance=\"$node\",job=\"$job\"}", @@ -7055,7 +7055,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of PPS synchronization diagnostics including calibration events, jitter violations, errors, and frequency stability 
exceedances", "fieldConfig": { @@ -7144,7 +7144,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_timex_pps_calibration_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -7159,7 +7159,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_timex_pps_error_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -7174,7 +7174,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_timex_pps_stability_exceeded_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -7190,7 +7190,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_timex_pps_jitter_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -7224,7 +7224,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Processes currently in runnable or blocked states. Helps identify CPU contention or I/O wait bottlenecks.", "fieldConfig": { @@ -7314,7 +7314,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_procs_blocked{instance=\"$node\",job=\"$job\"}", @@ -7328,7 +7328,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_procs_running{instance=\"$node\",job=\"$job\"}", @@ -7346,7 +7346,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Current number of processes in each state (e.g., running, sleeping, zombie). 
Requires --collector.processes to be enabled in node_exporter", "fieldConfig": { @@ -7517,7 +7517,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_processes_state{instance=\"$node\",job=\"$job\"}", @@ -7536,7 +7536,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of new processes being created on the system (forks/sec).", "fieldConfig": { @@ -7622,7 +7622,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_forks_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -7641,7 +7641,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Shows CPU saturation per core, calculated as the proportion of time spent waiting to run relative to total time demanded (running + waiting).", "fieldConfig": { @@ -7743,7 +7743,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_schedstat_running_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -7759,7 +7759,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_schedstat_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -7775,7 +7775,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_schedstat_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])\n/\n(irate(node_schedstat_running_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) + irate(node_schedstat_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]))\n", @@ -7795,7 +7795,7 @@ { "datasource": { "type": 
"prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of active PIDs on the system and the configured maximum allowed. Useful for detecting PID exhaustion risk. Requires --collector.processes in node_exporter", "fieldConfig": { @@ -7911,7 +7911,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_processes_pids{instance=\"$node\",job=\"$job\"}", @@ -7926,7 +7926,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_processes_max_processes{instance=\"$node\",job=\"$job\"}", @@ -7945,7 +7945,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of active threads on the system and the configured thread limit. Useful for monitoring thread pressure. Requires --collector.processes in node_exporter", "fieldConfig": { @@ -8061,7 +8061,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_processes_threads{instance=\"$node\",job=\"$job\"}", @@ -8076,7 +8076,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_processes_max_threads{instance=\"$node\",job=\"$job\"}", @@ -8109,7 +8109,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Per-second rate of context switches and hardware interrupts. 
High values may indicate intense CPU or I/O activity", "fieldConfig": { @@ -8195,7 +8195,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_context_switches_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -8209,7 +8209,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_intr_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -8228,7 +8228,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "System load average over 1, 5, and 15 minutes. Reflects the number of active or waiting processes. Values above CPU core count may indicate overload", "fieldConfig": { @@ -8344,7 +8344,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_load1{instance=\"$node\",job=\"$job\"}", @@ -8358,7 +8358,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_load5{instance=\"$node\",job=\"$job\"}", @@ -8372,7 +8372,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_load15{instance=\"$node\",job=\"$job\"}", @@ -8386,7 +8386,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", @@ -8405,7 +8405,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Real-time CPU frequency scaling per core, including average minimum and maximum allowed scaling frequencies", "fieldConfig": { @@ -8557,7 +8557,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": 
"${datasource}" }, "editorMode": "code", "expr": "node_cpu_scaling_frequency_hertz{instance=\"$node\",job=\"$job\"}", @@ -8573,7 +8573,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "avg(node_cpu_scaling_frequency_max_hertz{instance=\"$node\",job=\"$job\"})", @@ -8589,7 +8589,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "avg(node_cpu_scaling_frequency_min_hertz{instance=\"$node\",job=\"$job\"})", @@ -8609,7 +8609,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of scheduling timeslices executed per CPU. Reflects how frequently the scheduler switches tasks on each core", "fieldConfig": { @@ -8694,7 +8694,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_schedstat_timeslices_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -8713,7 +8713,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Breaks down hardware interrupts by type and device. Useful for diagnosing IRQ load on network, disk, or CPU interfaces. Requires --collector.interrupts to be enabled in node_exporter", "fieldConfig": { @@ -8799,7 +8799,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_interrupts_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -8818,7 +8818,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of bits of entropy currently available to the system's random number generators (e.g., /dev/random). 
Low values may indicate that random number generation could block or degrade performance of cryptographic operations", "fieldConfig": { @@ -8934,7 +8934,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_entropy_available_bits{instance=\"$node\",job=\"$job\"}", @@ -8948,7 +8948,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_entropy_pool_size_bits{instance=\"$node\",job=\"$job\"}", @@ -8981,7 +8981,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Monitors hardware sensor temperatures and critical thresholds as exposed by Linux hwmon. Includes CPU, GPU, and motherboard sensors where available", "fieldConfig": { @@ -9087,7 +9087,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_hwmon_temp_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", @@ -9102,7 +9102,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "expr": "node_hwmon_temp_crit_alarm_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", "format": "time_series", @@ -9116,7 +9116,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_hwmon_temp_crit_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", @@ -9131,7 +9131,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "expr": "node_hwmon_temp_crit_hyst_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) 
node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", "format": "time_series", @@ -9145,7 +9145,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "expr": "node_hwmon_temp_max_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", "format": "time_series", @@ -9163,7 +9163,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Shows how hard each cooling device (fan/throttle) is working relative to its maximum capacity", "fieldConfig": { @@ -9268,7 +9268,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "100 * node_cooling_device_cur_state{instance=\"$node\",job=\"$job\"} / node_cooling_device_max_state{instance=\"$node\",job=\"$job\"}", @@ -9288,7 +9288,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Shows the online status of power supplies (e.g., AC, battery). 
A value of 1-Yes indicates the power supply is active/online", "fieldConfig": { @@ -9377,7 +9377,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_power_supply_online{instance=\"$node\",job=\"$job\"}", @@ -9397,7 +9397,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Displays the current fan speeds (RPM) from hardware sensors via the hwmon interface", "fieldConfig": { @@ -9483,7 +9483,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_hwmon_fan_rpm{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", @@ -9498,7 +9498,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_hwmon_fan_min_rpm{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", @@ -9532,7 +9532,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Current number of systemd units in each operational state, such as active, failed, inactive, or transitioning", "fieldConfig": { @@ -9693,7 +9693,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"activating\"}", @@ -9708,7 +9708,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"active\"}", @@ -9723,7 +9723,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": 
"node_systemd_units{instance=\"$node\",job=\"$job\",state=\"deactivating\"}", @@ -9738,7 +9738,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"failed\"}", @@ -9753,7 +9753,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"inactive\"}", @@ -9772,7 +9772,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Current number of active connections per systemd socket, as reported by the Node Exporter systemd collector", "fieldConfig": { @@ -9858,7 +9858,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_systemd_socket_current_connections{instance=\"$node\",job=\"$job\"}", @@ -9877,7 +9877,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of accepted connections per second for each systemd socket", "fieldConfig": { @@ -9963,7 +9963,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_systemd_socket_accepted_connections_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -9982,7 +9982,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of systemd socket connection refusals per second, typically due to service unavailability or backlog overflow", "fieldConfig": { @@ -10068,7 +10068,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_systemd_socket_refused_connections_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10101,7 
+10101,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of I/O operations completed per second for the device (after merges), including both reads and writes", "fieldConfig": { @@ -10214,7 +10214,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10227,7 +10227,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10244,7 +10244,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of bytes read from or written to the device per second", "fieldConfig": { @@ -10357,7 +10357,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10371,7 +10371,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -10391,7 +10391,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Average time for requests issued to the device to be served. 
This includes the time spent by the requests in queue and the time spent servicing them.", "fieldConfig": { @@ -10504,7 +10504,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_read_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) / irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10519,7 +10519,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_write_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) / irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10538,7 +10538,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Average queue length of the requests that were issued to the device", "fieldConfig": { @@ -10640,7 +10640,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_io_time_weighted_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10658,7 +10658,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of read and write requests merged per second that were queued to the device", "fieldConfig": { @@ -10771,7 +10771,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_reads_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10784,7 +10784,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_writes_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10801,7 +10801,7 @@ { 
"datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Percentage of time the disk spent actively processing I/O operations, including general I/O, discards (TRIM), and write cache flushes", "fieldConfig": { @@ -10903,7 +10903,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10917,7 +10917,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_discard_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10931,7 +10931,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_flush_requests_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10950,7 +10950,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Per-second rate of discard (TRIM) and flush (write cache) operations. 
Useful for monitoring low-level disk activity on SSDs and advanced storage", "fieldConfig": { @@ -11051,7 +11051,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_discards_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -11065,7 +11065,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_discards_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -11079,7 +11079,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_flush_requests_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -11098,7 +11098,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Shows how many disk sectors are discarded (TRIMed) per second. 
Useful for monitoring SSD behavior and storage efficiency", "fieldConfig": { @@ -11199,7 +11199,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_discarded_sectors_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -11217,7 +11217,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of in-progress I/O requests at the time of sampling (active requests in the disk queue)", "fieldConfig": { @@ -11319,7 +11319,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_disk_io_now{instance=\"$node\",job=\"$job\"}", @@ -11351,7 +11351,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of file descriptors currently allocated system-wide versus the system limit. Important for detecting descriptor exhaustion risks", "fieldConfig": { @@ -11467,7 +11467,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filefd_maximum{instance=\"$node\",job=\"$job\"}", @@ -11481,7 +11481,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filefd_allocated{instance=\"$node\",job=\"$job\"}", @@ -11499,7 +11499,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of free file nodes (inodes) available per mounted filesystem. 
A low count may prevent file creation even if disk space is available", "fieldConfig": { @@ -11585,7 +11585,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filesystem_files_free{instance=\"$node\",job=\"$job\",device!~'rootfs'}", @@ -11604,7 +11604,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Indicates filesystems mounted in read-only mode or reporting device-level I/O errors.", "fieldConfig": { @@ -11691,7 +11691,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filesystem_readonly{instance=\"$node\",job=\"$job\",device!~'rootfs'}", @@ -11705,7 +11705,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filesystem_device_error{instance=\"$node\",job=\"$job\",device!~'rootfs',fstype!~'tmpfs'}", @@ -11724,7 +11724,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of file nodes (inodes) available per mounted filesystem. 
Reflects maximum file capacity regardless of disk size", "fieldConfig": { @@ -11810,7 +11810,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filesystem_files{instance=\"$node\",job=\"$job\",device!~'rootfs'}", @@ -11843,7 +11843,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of network packets received and transmitted per second, by interface.", "fieldConfig": { @@ -11942,7 +11942,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_packets_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -11957,7 +11957,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_transmit_packets_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -11976,7 +11976,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of packet-level errors for each network interface. Receive errors may indicate physical or driver issues; transmit errors may reflect collisions or hardware faults", "fieldConfig": { @@ -12075,7 +12075,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_errs_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12089,7 +12089,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_transmit_errs_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12107,7 +12107,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of dropped packets per network interface. 
Receive drops can indicate buffer overflow or driver issues; transmit drops may result from outbound congestion or queuing limits", "fieldConfig": { @@ -12206,7 +12206,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_drop_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12220,7 +12220,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_transmit_drop_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12238,7 +12238,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of compressed network packets received and transmitted per interface. These are common in low-bandwidth or special interfaces like PPP or SLIP", "fieldConfig": { @@ -12337,7 +12337,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_compressed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12351,7 +12351,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_transmit_compressed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12369,7 +12369,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of incoming multicast packets received per network interface. 
Multicast is used by protocols such as mDNS, SSDP, and some streaming or cluster services", "fieldConfig": { @@ -12468,7 +12468,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_multicast_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12486,7 +12486,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of received packets that could not be processed due to missing protocol or handler in the kernel. May indicate unsupported traffic or misconfiguration", "fieldConfig": { @@ -12585,7 +12585,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_nohandler_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12603,7 +12603,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of frame errors on received packets, typically caused by physical layer issues such as bad cables, duplex mismatches, or hardware problems", "fieldConfig": { @@ -12702,7 +12702,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_frame_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12721,7 +12721,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Tracks FIFO buffer overrun errors on network interfaces. 
These occur when incoming or outgoing packets are dropped due to queue or buffer overflows, often indicating congestion or hardware limits", "fieldConfig": { @@ -12820,7 +12820,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_fifo_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12834,7 +12834,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_transmit_fifo_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12852,7 +12852,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of packet collisions detected during transmission. Mostly relevant on half-duplex or legacy Ethernet networks", "fieldConfig": { @@ -12951,7 +12951,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_transmit_colls_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12969,7 +12969,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of carrier errors during transmission. These typically indicate physical layer issues like faulty cabling or duplex mismatches", "fieldConfig": { @@ -13055,7 +13055,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_transmit_carrier_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -13073,7 +13073,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of ARP entries per interface. 
Useful for detecting excessive ARP traffic or table growth due to scanning or misconfiguration", "fieldConfig": { @@ -13159,7 +13159,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_arp_entries{instance=\"$node\",job=\"$job\"}", @@ -13177,7 +13177,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Current and maximum connection tracking entries used by Netfilter (nf_conntrack). High usage approaching the limit may cause packet drops or connection issues", "fieldConfig": { @@ -13293,7 +13293,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_nf_conntrack_entries{instance=\"$node\",job=\"$job\"}", @@ -13307,7 +13307,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_nf_conntrack_entries_limit{instance=\"$node\",job=\"$job\"}", @@ -13325,7 +13325,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Operational and physical link status of each network interface. Values are Yes for 'up' or link present, and No for 'down' or no carrier.\"", "fieldConfig": { @@ -13411,7 +13411,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_network_up{operstate=\"up\",instance=\"$node\",job=\"$job\"}", @@ -13426,7 +13426,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_network_carrier{instance=\"$node\",job=\"$job\"}", @@ -13442,7 +13442,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Maximum speed of each network interface as reported by the operating system. 
This is a static hardware capability, not current throughput", "fieldConfig": { @@ -13503,7 +13503,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_network_speed_bytes{instance=\"$node\",job=\"$job\"} * 8", @@ -13521,7 +13521,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "MTU (Maximum Transmission Unit) in bytes for each network interface. Affects packet size and transmission efficiency", "fieldConfig": { @@ -13581,7 +13581,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_network_mtu_bytes{instance=\"$node\",job=\"$job\"}", @@ -13613,7 +13613,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Tracks TCP socket usage and memory per node", "fieldConfig": { @@ -13700,7 +13700,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_TCP_alloc{instance=\"$node\",job=\"$job\"}", @@ -13715,7 +13715,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_TCP_inuse{instance=\"$node\",job=\"$job\"}", @@ -13730,7 +13730,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_TCP_orphan{instance=\"$node\",job=\"$job\"}", @@ -13745,7 +13745,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_TCP_tw{instance=\"$node\",job=\"$job\"}", @@ -13764,7 +13764,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of UDP and UDPLite sockets currently in use", "fieldConfig": 
{ @@ -13851,7 +13851,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_UDPLITE_inuse{instance=\"$node\",job=\"$job\"}", @@ -13866,7 +13866,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_UDP_inuse{instance=\"$node\",job=\"$job\"}", @@ -13885,7 +13885,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Total number of sockets currently in use across all protocols (TCP, UDP, UNIX, etc.), as reported by /proc/net/sockstat", "fieldConfig": { @@ -13972,7 +13972,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_sockets_used{instance=\"$node\",job=\"$job\"}", @@ -13991,7 +13991,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of FRAG and RAW sockets currently in use. 
RAW sockets are used for custom protocols or tools like ping; FRAG sockets are used internally for IP packet defragmentation", "fieldConfig": { @@ -14078,7 +14078,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_FRAG_inuse{instance=\"$node\",job=\"$job\"}", @@ -14093,7 +14093,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_RAW_inuse{instance=\"$node\",job=\"$job\"}", @@ -14112,7 +14112,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "TCP/UDP socket memory usage in kernel (in pages)", "fieldConfig": { @@ -14199,7 +14199,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_TCP_mem{instance=\"$node\",job=\"$job\"}", @@ -14214,7 +14214,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_UDP_mem{instance=\"$node\",job=\"$job\"}", @@ -14233,7 +14233,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Kernel memory used by TCP, UDP, and IP fragmentation buffers", "fieldConfig": { @@ -14320,7 +14320,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_TCP_mem_bytes{instance=\"$node\",job=\"$job\"}", @@ -14335,7 +14335,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_UDP_mem_bytes{instance=\"$node\",job=\"$job\"}", @@ -14350,7 +14350,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": 
"node_sockstat_FRAG_memory{instance=\"$node\",job=\"$job\"}", @@ -14367,7 +14367,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Packets processed and dropped by the softnet network stack per CPU. Drops may indicate CPU saturation or network driver limitations", "fieldConfig": { @@ -14466,7 +14466,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_softnet_processed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -14481,7 +14481,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_softnet_dropped_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -14500,7 +14500,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "How often the kernel was unable to process all packets in the softnet queue before time ran out. 
Frequent squeezes may indicate CPU contention or driver inefficiency", "fieldConfig": { @@ -14590,7 +14590,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_softnet_times_squeezed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -14609,7 +14609,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Tracks the number of packets processed or dropped by Receive Packet Steering (RPS), a mechanism to distribute packet processing across CPUs", "fieldConfig": { @@ -14715,7 +14715,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_softnet_received_rps_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -14730,7 +14730,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_softnet_flow_limit_count_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -14764,7 +14764,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of octets sent and received at the IP layer, as reported by /proc/net/netstat", "fieldConfig": { @@ -14863,7 +14863,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_IpExt_InOctets{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -14878,7 +14878,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_IpExt_OutOctets{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -14896,7 +14896,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of TCP segments sent and received per 
second, including data and control segments", "fieldConfig": { @@ -15005,7 +15005,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Tcp_InSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15020,7 +15020,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Tcp_OutSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15039,7 +15039,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of UDP datagrams sent and received per second, based on /proc/net/netstat", "fieldConfig": { @@ -15137,7 +15137,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Udp_InDatagrams{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15152,7 +15152,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Udp_OutDatagrams{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15171,7 +15171,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of ICMP messages sent and received per second, including error and control messages", "fieldConfig": { @@ -15273,7 +15273,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Icmp_InMsgs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15288,7 +15288,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Icmp_OutMsgs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15307,7 +15307,7 @@ { "datasource": { 
"type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Tracks various TCP error and congestion-related events, including retransmissions, timeouts, dropped connections, and buffer issues", "fieldConfig": { @@ -15393,7 +15393,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_TcpExt_ListenOverflows{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15409,7 +15409,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_TcpExt_ListenDrops{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15425,7 +15425,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_TcpExt_TCPSynRetrans{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15440,7 +15440,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Tcp_RetransSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15452,7 +15452,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Tcp_InErrs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15464,7 +15464,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Tcp_OutRsts{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15476,7 +15476,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_TcpExt_TCPRcvQDrop{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15489,7 +15489,7 @@ { "datasource": { "type": "prometheus", - "uid": 
"cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_TcpExt_TCPOFOQueue{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15502,7 +15502,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_TcpExt_TCPTimeouts{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15519,7 +15519,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of UDP and UDPLite datagram delivery errors, including missing listeners, buffer overflows, and protocol-specific issues", "fieldConfig": { @@ -15608,7 +15608,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Udp_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15623,7 +15623,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Udp_NoPorts{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15638,7 +15638,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_UdpLite_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15650,7 +15650,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Udp_RcvbufErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15665,7 +15665,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Udp_SndbufErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15684,7 +15684,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, 
"description": "Rate of incoming ICMP messages that contained protocol-specific errors, such as bad checksums or invalid lengths", "fieldConfig": { @@ -15782,7 +15782,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Icmp_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15801,7 +15801,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of TCP SYN cookies sent, validated, and failed. These are used to protect against SYN flood attacks and manage TCP handshake resources under load", "fieldConfig": { @@ -15902,7 +15902,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_TcpExt_SyncookiesFailed{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15918,7 +15918,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_TcpExt_SyncookiesRecv{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15934,7 +15934,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_TcpExt_SyncookiesSent{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15954,7 +15954,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of currently established TCP connections and the system's max supported limit. 
On Linux, MaxConn may return -1 to indicate a dynamic/unlimited configuration", "fieldConfig": { @@ -16070,7 +16070,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_netstat_Tcp_CurrEstab{instance=\"$node\",job=\"$job\"}", @@ -16086,7 +16086,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_netstat_Tcp_MaxConn{instance=\"$node\",job=\"$job\"}", @@ -16106,7 +16106,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of UDP packets currently queued in the receive (RX) and transmit (TX) buffers. A growing queue may indicate a bottleneck", "fieldConfig": { @@ -16191,7 +16191,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_udp_queues{instance=\"$node\",job=\"$job\",ip=\"v4\",queue=\"rx\"}", @@ -16207,7 +16207,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_udp_queues{instance=\"$node\",job=\"$job\",ip=\"v4\",queue=\"tx\"}", @@ -16227,7 +16227,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of TCP connection initiations per second. 'Active' opens are initiated by this host. 
'Passive' opens are accepted from incoming connections", "fieldConfig": { @@ -16313,7 +16313,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Tcp_ActiveOpens{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -16328,7 +16328,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Tcp_PassiveOpens{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -16347,7 +16347,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of TCP sockets in key connection states. Requires the --collector.tcpstat flag on node_exporter", "fieldConfig": { @@ -16434,7 +16434,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_tcp_connection_states{state=\"established\",instance=\"$node\",job=\"$job\"}", @@ -16449,7 +16449,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_tcp_connection_states{state=\"fin_wait2\",instance=\"$node\",job=\"$job\"}", @@ -16465,7 +16465,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_tcp_connection_states{state=\"listen\",instance=\"$node\",job=\"$job\"}", @@ -16481,7 +16481,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_tcp_connection_states{state=\"time_wait\",instance=\"$node\",job=\"$job\"}", @@ -16497,7 +16497,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_tcp_connection_states{state=\"close_wait\", instance=\"$node\", job=\"$job\"}", @@ -16531,7 +16531,7 @@ { 
"datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Duration of each individual collector executed during a Node Exporter scrape. Useful for identifying slow or failing collectors", "fieldConfig": { @@ -16620,7 +16620,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_scrape_collector_duration_seconds{instance=\"$node\",job=\"$job\"}", @@ -16640,7 +16640,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of CPU time used by the process exposing this metric (user + system mode)", "fieldConfig": { @@ -16725,7 +16725,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(process_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -16744,7 +16744,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Tracks the memory usage of the process exposing this metric (e.g., node_exporter), including current virtual memory and maximum virtual memory limit", "fieldConfig": { @@ -16884,7 +16884,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "process_virtual_memory_bytes{instance=\"$node\",job=\"$job\"}", @@ -16899,7 +16899,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "process_virtual_memory_max_bytes{instance=\"$node\",job=\"$job\"}", @@ -16918,7 +16918,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of file descriptors used by the exporter process versus its configured limit", "fieldConfig": { @@ -17058,7 +17058,7 @@ { "datasource": { "type": "prometheus", - "uid": 
"cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "process_max_fds{instance=\"$node\",job=\"$job\"}", @@ -17072,7 +17072,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "process_open_fds{instance=\"$node\",job=\"$job\"}", @@ -17090,7 +17090,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Shows whether each Node Exporter collector scraped successfully (1 = success, 0 = failure), and whether the textfile collector returned an error.", "fieldConfig": { @@ -17156,7 +17156,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_scrape_collector_success{instance=\"$node\",job=\"$job\"}", @@ -17172,7 +17172,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "1 - node_textfile_scrape_error{instance=\"$node\",job=\"$job\"}", @@ -17205,7 +17205,7 @@ { "current": { "text": "wire-cluster-prometheus-operator", - "value": "cenv3r35m756oe" + "value": "${datasource}" }, "includeAll": false, "label": "Datasource", @@ -17223,7 +17223,7 @@ }, "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "definition": "", "includeAll": false, @@ -17246,7 +17246,7 @@ }, "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "definition": "label_values(node_uname_info{job=\"$job\"}, nodename)", "includeAll": false, @@ -17269,7 +17269,7 @@ }, "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "definition": "label_values(node_uname_info{job=\"$job\", nodename=\"$nodename\"}, instance)", "includeAll": false, diff --git a/dashboards/api_upload/Wire_Services.json b/dashboards/api_upload/Wire_Services.json index 07ddae75b..2456301c0 100644 --- 
a/dashboards/api_upload/Wire_Services.json +++ b/dashboards/api_upload/Wire_Services.json @@ -279,7 +279,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$service.*\"}[$__rate_interval])) by (pod)", @@ -307,7 +307,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$service.*\", resource=\"cpu\"}) by (pod)\n", @@ -459,7 +459,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -494,7 +494,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -627,7 +627,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(increase(http_request_duration_seconds_count{status_code=~\"5..\", namespace=\"$namespace\"}[5m])) by (service)", @@ -701,7 +701,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(increase(http_request_duration_seconds_count{status_code=~\"5..\", namespace=\"$namespace\"}[$__range])) by (service, method, handler)", @@ -800,7 +800,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "builder", "expr": "sum by(service) (increase(kube_deployment_metadata_generation{namespace=~\"$namespace\", deployment=\"$service\"}[$__rate_interval]))", @@ -921,7 +921,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(increase(http_request_duration_seconds_count{status_code=~\"2..\", service=\"$service\", 
namespace=\"$namespace\"}[1m]))", @@ -949,7 +949,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(increase(http_request_duration_seconds_count{status_code=~\"5..\", service=\"$service\", namespace=\"$namespace\"}[1m]))", @@ -1048,7 +1048,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "net_connections{service=\"$service\", namespace=\"$namespace\"}", @@ -1156,7 +1156,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(rate(http_request_duration_seconds_count{service=\"$service\", handler!=\"/i/status\", namespace=\"$namespace\"}[1m])) by (method, handler)", @@ -1259,7 +1259,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(increase(http_request_duration_seconds_count{status_code=~\"4..\", service=\"$service\", namespace=\"$namespace\"}[$__range])) by (handler, status_code)", @@ -1371,7 +1371,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(increase(http_request_duration_seconds_count{status_code=~\"5..\", service=\"$service\", namespace=\"$namespace\"}[$__range])) by (handler)", @@ -1477,7 +1477,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "topk(5, avg(rate(http_request_duration_seconds_sum{service=\"$service\", handler!~\"/i/status|prometheus\", namespace=\"$namespace\"}[5m])) by (handler))", @@ -1575,7 +1575,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(container_network_receive_bytes_total{namespace=\"$namespace\", pod=~\"$service.*\"}[$__rate_interval])) by (pod)", @@ -1603,7 
+1603,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "fieldConfig": { "defaults": { @@ -1697,7 +1697,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "-sum(irate(container_fs_writes_bytes_total{namespace=\"$namespace\",pod=~\"$service.*\"}[1m])) by (pod)", @@ -1722,7 +1722,7 @@ { "current": { "text": "wire-cluster-prometheus-operator", - "value": "cenv3r35m756oe" + "value": "${datasource}" }, "includeAll": false, "name": "datasource", @@ -1787,7 +1787,7 @@ }, "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "definition": "label_values(namespace)", "includeAll": false, diff --git a/dashboards/manual_upload/cpu_and_memory.json b/dashboards/manual_upload/cpu_and_memory.json index cec47580c..978ebe416 100644 --- a/dashboards/manual_upload/cpu_and_memory.json +++ b/dashboards/manual_upload/cpu_and_memory.json @@ -43,7 +43,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "fieldConfig": { "defaults": { @@ -319,7 +319,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "exemplar": false, @@ -347,7 +347,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "exemplar": false, @@ -457,7 +457,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "description": "", "fieldConfig": { @@ -573,7 +573,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "description": "", "fieldConfig": { @@ -689,7 +689,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "fieldConfig": { "defaults": { @@ -962,7 +962,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + 
"uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "exemplar": false, @@ -990,7 +990,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "exemplar": false, @@ -1100,7 +1100,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "description": "", "fieldConfig": { @@ -1216,7 +1216,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "description": "", "fieldConfig": { @@ -1345,7 +1345,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "description": "", "fieldConfig": { @@ -1527,7 +1527,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "exemplar": false, @@ -1557,7 +1557,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "exemplar": false, @@ -1667,7 +1667,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "expr": "sum(irate(container_cpu_usage_seconds_total{job=\"kubelet\"}[5m])) by (namespace)", @@ -1767,7 +1767,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "expr": "sum(container_memory_working_set_bytes{job=\"kubelet\"}) by (namespace)", @@ -1871,7 +1871,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{job=\"cadvisor\", container!=\"\"}[$__rate_interval])) by (namespace, pod, container) /sum(increase(container_cpu_cfs_periods_total{job=\"cadvisor\", container!=\"\"}[$__rate_interval])) by (namespace, pod, container) > 0", @@ -1895,7 +1895,7 @@ { "current": { "text": "wire-cluster-prometheus-operator", - "value": 
"cenv3r35m756oe" + "value": "${PROMETHEUS_DS}" }, "includeAll": false, "name": "PROMETHEUS_DS", diff --git a/dashboards/manual_upload/k8s_node_view.json b/dashboards/manual_upload/k8s_node_view.json index ea11e2c8f..33c1766e7 100644 --- a/dashboards/manual_upload/k8s_node_view.json +++ b/dashboards/manual_upload/k8s_node_view.json @@ -3885,13 +3885,13 @@ "type": "prometheus", "uid": "${datasource}" }, - "definition": "label_values(node_uname_info{nodename=~\"(?i:($node)(\\.[a-z0-9.]+)?)\"}, instance)", + "definition": "label_values(node_uname_info, instance)", "hide": 2, "includeAll": false, "name": "instance", "options": [], "query": { - "query": "label_values(node_uname_info{nodename=~\"(?i:($node)(\\.[a-z0-9.]+)?)\"}, instance)", + "query": "label_values(node_uname_info, instance)", "refId": "StandardVariableQuery" }, "refresh": 2, diff --git a/dashboards/manual_upload/nginx_ingrerss_controller.json b/dashboards/manual_upload/nginx_ingrerss_controller.json index 511ce2357..55063bfb8 100644 --- a/dashboards/manual_upload/nginx_ingrerss_controller.json +++ b/dashboards/manual_upload/nginx_ingrerss_controller.json @@ -141,7 +141,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "exemplar": false, @@ -243,7 +243,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "exemplar": false, @@ -345,7 +345,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "editorMode": "code", "exemplar": false, @@ -447,7 +447,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": "round(sum(irate(nginx_ingress_controller_requests{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\"}[2m])), 0.001)", "format": "time_series", @@ -527,7 +527,7 @@ { "datasource": { "type": "prometheus", - "uid": 
"cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": "sum(avg_over_time(nginx_ingress_controller_nginx_process_connections{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m]))", "format": "time_series", @@ -612,7 +612,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": "sum(rate(nginx_ingress_controller_requests{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",status!~\"5.*\"}[2m])) / sum(rate(nginx_ingress_controller_requests{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\"}[2m]))", "format": "time_series", @@ -693,7 +693,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": "avg(nginx_ingress_controller_success{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"})", "format": "time_series", @@ -775,7 +775,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": "count(nginx_ingress_controller_config_last_reload_successful{controller_pod=~\"$controller\",controller_namespace=~\"$namespace\"} == 0)", "format": "time_series", @@ -898,7 +898,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": "round(sum(irate(nginx_ingress_controller_requests{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\"}[2m])) by (ingress), 0.001)", "format": "time_series", @@ -1049,7 +1049,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": 
"sum(rate(nginx_ingress_controller_requests{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",ingress=~\"$ingress\",status!~\"5.*\"}[2m])) by (ingress) / sum(rate(nginx_ingress_controller_requests{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",ingress=~\"$ingress\"}[2m])) by (ingress)", "format": "time_series", @@ -1155,7 +1155,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": "sum (irate (nginx_ingress_controller_request_size_sum{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m]))", "format": "time_series", @@ -1189,7 +1189,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "fieldConfig": { "defaults": { @@ -1341,7 +1341,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "fieldConfig": { "defaults": { @@ -1475,7 +1475,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "fieldConfig": { "defaults": { @@ -1763,7 +1763,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": "histogram_quantile(0.90, sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\"}[2m])) by (le, ingress))", "format": "table", @@ -1789,7 +1789,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "expr": "sum(irate(nginx_ingress_controller_request_size_sum{ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\"}[2m])) by (ingress)", "format": "table", @@ -1819,7 +1819,7 @@ { 
"datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "fieldConfig": { "defaults": { @@ -1964,7 +1964,7 @@ { "current": { "text": "wire-cluster-prometheus-operator", - "value": "cenv3r35m756oe" + "value": "${PROMETHEUS_DS}" }, "includeAll": false, "name": "PROMETHEUS_DS", @@ -1982,7 +1982,7 @@ }, "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "definition": "", "includeAll": true, @@ -2028,7 +2028,7 @@ }, "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${PROMETHEUS_DS}" }, "definition": "", "includeAll": true, diff --git a/dashboards/manual_upload/node_exporter_full.json b/dashboards/manual_upload/node_exporter_full.json index e29b5fa42..8700b0a3e 100644 --- a/dashboards/manual_upload/node_exporter_full.json +++ b/dashboards/manual_upload/node_exporter_full.json @@ -60,7 +60,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Resource pressure via PSI", "fieldConfig": { @@ -130,7 +130,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -146,7 +146,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -163,7 +163,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -180,7 +180,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -201,7 +201,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Overall CPU busy percentage (averaged across all cores)", "fieldConfig": { @@ -270,7 +270,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, 
"editorMode": "code", "exemplar": false, @@ -290,7 +290,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "System load over all CPU cores together", "fieldConfig": { @@ -359,7 +359,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -379,7 +379,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Real RAM usage excluding cache and reclaimable memory", "fieldConfig": { @@ -438,7 +438,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -458,7 +458,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Percentage of swap space currently used by the system", "fieldConfig": { @@ -527,7 +527,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -545,7 +545,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Used Root FS", "fieldConfig": { @@ -614,7 +614,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -633,7 +633,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "", "fieldConfig": { @@ -694,7 +694,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -711,7 +711,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "", "fieldConfig": { @@ -773,7 +773,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, 
"editorMode": "code", "exemplar": false, @@ -791,7 +791,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "", "fieldConfig": { @@ -857,7 +857,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -875,7 +875,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "", "fieldConfig": { @@ -945,7 +945,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -965,7 +965,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "", "fieldConfig": { @@ -1031,7 +1031,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -1049,7 +1049,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "", "fieldConfig": { @@ -1115,7 +1115,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -1146,7 +1146,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "CPU time spent busy vs idle, split by activity type", "fieldConfig": { @@ -1304,7 +1304,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -1321,7 +1321,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"user\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -1336,7 +1336,7 @@ { 
"datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"iowait\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -1350,7 +1350,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=~\".*irq\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -1364,7 +1364,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode!='idle',mode!='user',mode!='system',mode!='iowait',mode!='irq',mode!='softirq'}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -1378,7 +1378,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"idle\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -1396,7 +1396,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "RAM and swap usage overview, including caches", "fieldConfig": { @@ -1550,7 +1550,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", @@ -1565,7 +1565,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": 
"node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - (node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"})", @@ -1580,7 +1580,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}", @@ -1594,7 +1594,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", @@ -1608,7 +1608,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", @@ -1626,7 +1626,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Per-interface network traffic (receive and transmit) in bits per second", "fieldConfig": { @@ -1719,7 +1719,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", @@ -1733,7 +1733,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", @@ -1751,7 +1751,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Percentage of filesystem space 
used for each mounted device", "fieldConfig": { @@ -1833,7 +1833,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "((node_filesystem_size_bytes{instance=\"$node\", job=\"$job\", device!~\"rootfs\"} - node_filesystem_avail_bytes{instance=\"$node\", job=\"$job\", device!~\"rootfs\"}) / node_filesystem_size_bytes{instance=\"$node\", job=\"$job\", device!~\"rootfs\"}) * 100", @@ -1861,7 +1861,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "CPU time usage split by state, normalized across all CPU cores", "fieldConfig": { @@ -2098,7 +2098,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{mode=\"system\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -2113,7 +2113,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{mode=\"user\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -2127,7 +2127,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{mode=\"nice\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -2141,7 +2141,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{mode=\"iowait\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / 
scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -2155,7 +2155,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{mode=\"irq\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -2169,7 +2169,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{mode=\"softirq\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -2183,7 +2183,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{mode=\"steal\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -2197,7 +2197,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(node_cpu_seconds_total{mode=\"idle\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", @@ -2212,7 +2212,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum by(instance) (irate(node_cpu_guest_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])) / on(instance) group_left sum by (instance)((irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]))) > 0", @@ -2231,7 +2231,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Breakdown of physical memory and 
swap usage. Hardware-detected memory errors are also displayed", "fieldConfig": { @@ -2619,7 +2619,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"} - node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", @@ -2634,7 +2634,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"}", @@ -2649,7 +2649,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", @@ -2663,7 +2663,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"}", @@ -2678,7 +2678,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"}", @@ -2693,7 +2693,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"}", @@ -2708,7 +2708,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", @@ -2723,7 +2723,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + 
"uid": "${datasource}" }, "editorMode": "code", "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", @@ -2738,7 +2738,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_HardwareCorrupted_bytes{instance=\"$node\",job=\"$job\"}", @@ -2757,7 +2757,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Incoming and outgoing network traffic per interface", "fieldConfig": { @@ -2855,7 +2855,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", @@ -2869,7 +2869,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", @@ -2887,7 +2887,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Network interface utilization as a percentage of its maximum capacity", "fieldConfig": { @@ -2985,7 +2985,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "(rate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])\n / ignoring(speed) node_network_speed_bytes{instance=\"$node\",job=\"$job\", speed!=\"-1\"}) * 100", @@ -2999,7 +2999,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "(rate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])\n / ignoring(speed) node_network_speed_bytes{instance=\"$node\",job=\"$job\", speed!=\"-1\"}) * 100", @@ 
-3018,7 +3018,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Disk I/O operations per second for each device", "fieldConfig": { @@ -3116,7 +3116,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", @@ -3129,7 +3129,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", @@ -3146,7 +3146,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Disk I/O throughput per device", "fieldConfig": { @@ -3244,7 +3244,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", @@ -3259,7 +3259,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", @@ -3278,7 +3278,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Amount of available disk space per mounted filesystem, excluding rootfs. 
Based on block availability to non-root users", "fieldConfig": { @@ -3364,7 +3364,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", @@ -3380,7 +3380,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filesystem_free_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", @@ -3395,7 +3395,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", @@ -3414,7 +3414,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Disk usage (used = total - available) per mountpoint", "fieldConfig": { @@ -3500,7 +3500,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} - node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", @@ -3518,7 +3518,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Percentage of time the disk was actively processing I/O operations", "fieldConfig": { @@ -3604,7 +3604,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"} [$__rate_interval])", @@ -3624,7 +3624,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "How often tasks experience CPU, memory, or I/O delays. “Some” indicates partial slowdown; “Full” indicates all tasks are stalled. 
Based on Linux PSI metrics:\nhttps://docs.kernel.org/accounting/psi.html", "fieldConfig": { @@ -3734,7 +3734,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_pressure_cpu_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -3748,7 +3748,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_pressure_memory_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -3763,7 +3763,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_pressure_memory_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -3778,7 +3778,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_pressure_io_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -3793,7 +3793,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_pressure_io_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -3808,7 +3808,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_pressure_irq_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -3841,7 +3841,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Displays committed memory usage versus the system's commit limit. 
Exceeding the limit is allowed under Linux overcommit policies but may increase OOM risks under high load", "fieldConfig": { @@ -3948,7 +3948,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Committed_AS_bytes{instance=\"$node\",job=\"$job\"}", @@ -3962,7 +3962,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_CommitLimit_bytes{instance=\"$node\",job=\"$job\"}", @@ -3980,7 +3980,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Memory currently dirty (modified but not yet written to disk), being actively written back, or held by writeback buffers. High dirty or writeback memory may indicate disk I/O pressure or delayed flushing", "fieldConfig": { @@ -4066,7 +4066,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Writeback_bytes{instance=\"$node\",job=\"$job\"}", @@ -4080,7 +4080,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_WritebackTmp_bytes{instance=\"$node\",job=\"$job\"}", @@ -4094,7 +4094,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Dirty_bytes{instance=\"$node\",job=\"$job\"}", @@ -4108,7 +4108,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_NFS_Unstable_bytes{instance=\"$node\",job=\"$job\"}", @@ -4127,7 +4127,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Kernel slab memory usage, separated into reclaimable and non-reclaimable categories. 
Reclaimable memory can be freed under memory pressure (e.g., caches), while unreclaimable memory is locked by the kernel for core functions", "fieldConfig": { @@ -4213,7 +4213,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_SUnreclaim_bytes{instance=\"$node\",job=\"$job\"}", @@ -4227,7 +4227,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}", @@ -4245,7 +4245,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Memory used for mapped files (such as libraries) and shared memory (shmem and tmpfs), including variants backed by huge pages", "fieldConfig": { @@ -4332,7 +4332,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Mapped_bytes{instance=\"$node\",job=\"$job\"}", @@ -4346,7 +4346,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Shmem_bytes{instance=\"$node\",job=\"$job\"}", @@ -4360,7 +4360,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_ShmemHugePages_bytes{instance=\"$node\",job=\"$job\"}", @@ -4375,7 +4375,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_ShmemPmdMapped_bytes{instance=\"$node\",job=\"$job\"}", @@ -4394,7 +4394,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Proportion of memory pages in the kernel's active and inactive LRU lists relative to total RAM. 
Active pages have been recently used, while inactive pages are less recently accessed but still resident in memory", "fieldConfig": { @@ -4512,7 +4512,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "(node_memory_Inactive_bytes{instance=\"$node\",job=\"$job\"}) \n/ \n(node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"})", @@ -4526,7 +4526,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "(node_memory_Active_bytes{instance=\"$node\",job=\"$job\"}) \n/ \n(node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"})\n", @@ -4544,7 +4544,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Breakdown of memory pages in the kernel's active and inactive LRU lists, separated by anonymous (heap, tmpfs) and file-backed (caches, mmap) pages.", "fieldConfig": { @@ -4631,7 +4631,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Inactive_file_bytes{instance=\"$node\",job=\"$job\"}", @@ -4646,7 +4646,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Inactive_anon_bytes{instance=\"$node\",job=\"$job\"}", @@ -4661,7 +4661,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Active_file_bytes{instance=\"$node\",job=\"$job\"}", @@ -4676,7 +4676,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Active_anon_bytes{instance=\"$node\",job=\"$job\"}", @@ -4695,7 +4695,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Tracks kernel memory 
used for CPU-local structures, per-thread stacks, and bounce buffers used for I/O on DMA-limited devices. These areas are typically small but critical for low-level operations", "fieldConfig": { @@ -4782,7 +4782,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_KernelStack_bytes{instance=\"$node\",job=\"$job\"}", @@ -4796,7 +4796,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Percpu_bytes{instance=\"$node\",job=\"$job\"}", @@ -4811,7 +4811,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Bounce_bytes{instance=\"$node\",job=\"$job\"}", @@ -4831,7 +4831,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Usage of the kernel's vmalloc area, which provides virtual memory allocations for kernel modules and drivers. 
Includes total, used, and largest free block sizes", "fieldConfig": { @@ -4947,7 +4947,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_VmallocChunk_bytes{instance=\"$node\",job=\"$job\"}", @@ -4962,7 +4962,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_VmallocTotal_bytes{instance=\"$node\",job=\"$job\"}", @@ -4977,7 +4977,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_VmallocUsed_bytes{instance=\"$node\",job=\"$job\"}", @@ -4996,7 +4996,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Memory used by anonymous pages (not backed by files), including standard and huge page allocations. Includes heap, stack, and memory-mapped anonymous regions", "fieldConfig": { @@ -5082,7 +5082,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_AnonHugePages_bytes{instance=\"$node\",job=\"$job\"}", @@ -5096,7 +5096,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_AnonPages_bytes{instance=\"$node\",job=\"$job\"}", @@ -5114,7 +5114,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Memory that is locked in RAM and cannot be swapped out. 
Includes both kernel-unevictable memory and user-level memory locked with mlock()", "fieldConfig": { @@ -5217,7 +5217,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Unevictable_bytes{instance=\"$node\",job=\"$job\"}", @@ -5231,7 +5231,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_Mlocked_bytes{instance=\"$node\",job=\"$job\"}", @@ -5249,7 +5249,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "How much memory is directly mapped in the kernel using different page sizes (4K, 2M, 1G). Helps monitor large page utilization in the direct map region", "fieldConfig": { @@ -5610,7 +5610,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_DirectMap1G_bytes{instance=\"$node\",job=\"$job\"}", @@ -5624,7 +5624,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_DirectMap2M_bytes{instance=\"$node\",job=\"$job\"}", @@ -5639,7 +5639,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_DirectMap4k_bytes{instance=\"$node\",job=\"$job\"}", @@ -5658,7 +5658,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Displays HugePages memory usage in bytes, including allocated, free, reserved, and surplus memory. 
All values are calculated based on the number of huge pages multiplied by their configured size", "fieldConfig": { @@ -5744,7 +5744,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_HugePages_Free{instance=\"$node\",job=\"$job\"} * node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", @@ -5758,7 +5758,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_HugePages_Rsvd{instance=\"$node\",job=\"$job\"} * node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", @@ -5772,7 +5772,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_HugePages_Surp{instance=\"$node\",job=\"$job\"} * node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", @@ -5786,7 +5786,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_memory_HugePages_Total{instance=\"$node\",job=\"$job\"} * node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", @@ -5819,7 +5819,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of memory pages being read from or written to disk (page-in and page-out operations). 
High page-out may indicate memory pressure or swapping activity", "fieldConfig": { @@ -5917,7 +5917,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_vmstat_pgpgin{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -5931,7 +5931,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_vmstat_pgpgout{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -5949,7 +5949,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate at which memory pages are being swapped in from or out to disk. High swap-out activity may indicate memory pressure", "fieldConfig": { @@ -6047,7 +6047,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_vmstat_pswpin{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -6061,7 +6061,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_vmstat_pswpout{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -6079,7 +6079,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of memory page faults, split into total, major (disk-backed), and derived minor (non-disk) faults. 
High major fault rates may indicate memory pressure or insufficient RAM", "fieldConfig": { @@ -6203,7 +6203,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -6218,7 +6218,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -6233,7 +6233,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[$__rate_interval]) - irate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -6252,7 +6252,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of Out-of-Memory (OOM) kill events. A non-zero value indicates the kernel has terminated one or more processes due to memory exhaustion", "fieldConfig": { @@ -6354,7 +6354,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_vmstat_oom_kill{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -6387,7 +6387,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Tracks the system clock's estimated and maximum error, as well as its offset from the reference clock (e.g., via NTP). 
Useful for detecting synchronization drift", "fieldConfig": { @@ -6472,7 +6472,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_estimated_error_seconds{instance=\"$node\",job=\"$job\"}", @@ -6488,7 +6488,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_offset_seconds{instance=\"$node\",job=\"$job\"}", @@ -6504,7 +6504,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_maxerror_seconds{instance=\"$node\",job=\"$job\"}", @@ -6524,7 +6524,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "NTP phase-locked loop (PLL) time constant used by the kernel to control time adjustments. Lower values mean faster correction but less stability", "fieldConfig": { @@ -6609,7 +6609,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_loop_time_constant{instance=\"$node\",job=\"$job\"}", @@ -6628,7 +6628,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Shows whether the system clock is synchronized to a reliable time source, and the current frequency correction ratio applied by the kernel to maintain synchronization", "fieldConfig": { @@ -6713,7 +6713,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_sync_status{instance=\"$node\",job=\"$job\"}", @@ -6728,7 +6728,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_frequency_adjustment_ratio{instance=\"$node\",job=\"$job\"}", @@ -6743,7 +6743,7 @@ { "datasource": { "type": 
"prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_tick_seconds{instance=\"$node\",job=\"$job\"}", @@ -6759,7 +6759,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_tai_offset_seconds{instance=\"$node\",job=\"$job\"}", @@ -6779,7 +6779,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Displays the PPS signal's frequency offset and stability (jitter) in hertz. Useful for monitoring high-precision time sources like GPS or atomic clocks", "fieldConfig": { @@ -6864,7 +6864,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_pps_frequency_hertz{instance=\"$node\",job=\"$job\"}", @@ -6879,7 +6879,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_pps_stability_hertz{instance=\"$node\",job=\"$job\"}", @@ -6898,7 +6898,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Tracks PPS signal timing jitter and shift compared to system clock", "fieldConfig": { @@ -6983,7 +6983,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_pps_jitter_seconds{instance=\"$node\",job=\"$job\"}", @@ -6998,7 +6998,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_timex_pps_shift_seconds{instance=\"$node\",job=\"$job\"}", @@ -7017,7 +7017,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of PPS synchronization diagnostics including calibration events, jitter violations, errors, and frequency stability 
exceedances", "fieldConfig": { @@ -7106,7 +7106,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_timex_pps_calibration_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -7121,7 +7121,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_timex_pps_error_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -7136,7 +7136,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_timex_pps_stability_exceeded_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -7152,7 +7152,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_timex_pps_jitter_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -7186,7 +7186,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Processes currently in runnable or blocked states. Helps identify CPU contention or I/O wait bottlenecks.", "fieldConfig": { @@ -7276,7 +7276,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_procs_blocked{instance=\"$node\",job=\"$job\"}", @@ -7290,7 +7290,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_procs_running{instance=\"$node\",job=\"$job\"}", @@ -7308,7 +7308,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Current number of processes in each state (e.g., running, sleeping, zombie). 
Requires --collector.processes to be enabled in node_exporter", "fieldConfig": { @@ -7479,7 +7479,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_processes_state{instance=\"$node\",job=\"$job\"}", @@ -7498,7 +7498,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of new processes being created on the system (forks/sec).", "fieldConfig": { @@ -7584,7 +7584,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_forks_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -7603,7 +7603,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Shows CPU saturation per core, calculated as the proportion of time spent waiting to run relative to total time demanded (running + waiting).", "fieldConfig": { @@ -7705,7 +7705,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_schedstat_running_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -7721,7 +7721,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_schedstat_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -7737,7 +7737,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_schedstat_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])\n/\n(irate(node_schedstat_running_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) + irate(node_schedstat_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]))\n", @@ -7757,7 +7757,7 @@ { "datasource": { "type": 
"prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of active PIDs on the system and the configured maximum allowed. Useful for detecting PID exhaustion risk. Requires --collector.processes in node_exporter", "fieldConfig": { @@ -7873,7 +7873,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_processes_pids{instance=\"$node\",job=\"$job\"}", @@ -7888,7 +7888,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_processes_max_processes{instance=\"$node\",job=\"$job\"}", @@ -7907,7 +7907,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of active threads on the system and the configured thread limit. Useful for monitoring thread pressure. Requires --collector.processes in node_exporter", "fieldConfig": { @@ -8023,7 +8023,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_processes_threads{instance=\"$node\",job=\"$job\"}", @@ -8038,7 +8038,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_processes_max_threads{instance=\"$node\",job=\"$job\"}", @@ -8071,7 +8071,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Per-second rate of context switches and hardware interrupts. 
High values may indicate intense CPU or I/O activity", "fieldConfig": { @@ -8157,7 +8157,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_context_switches_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -8171,7 +8171,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_intr_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -8190,7 +8190,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "System load average over 1, 5, and 15 minutes. Reflects the number of active or waiting processes. Values above CPU core count may indicate overload", "fieldConfig": { @@ -8306,7 +8306,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_load1{instance=\"$node\",job=\"$job\"}", @@ -8320,7 +8320,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_load5{instance=\"$node\",job=\"$job\"}", @@ -8334,7 +8334,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_load15{instance=\"$node\",job=\"$job\"}", @@ -8348,7 +8348,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", @@ -8367,7 +8367,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Real-time CPU frequency scaling per core, including average minimum and maximum allowed scaling frequencies", "fieldConfig": { @@ -8519,7 +8519,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": 
"${datasource}" }, "editorMode": "code", "expr": "node_cpu_scaling_frequency_hertz{instance=\"$node\",job=\"$job\"}", @@ -8535,7 +8535,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "avg(node_cpu_scaling_frequency_max_hertz{instance=\"$node\",job=\"$job\"})", @@ -8551,7 +8551,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "avg(node_cpu_scaling_frequency_min_hertz{instance=\"$node\",job=\"$job\"})", @@ -8571,7 +8571,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of scheduling timeslices executed per CPU. Reflects how frequently the scheduler switches tasks on each core", "fieldConfig": { @@ -8656,7 +8656,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_schedstat_timeslices_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -8675,7 +8675,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Breaks down hardware interrupts by type and device. Useful for diagnosing IRQ load on network, disk, or CPU interfaces. Requires --collector.interrupts to be enabled in node_exporter", "fieldConfig": { @@ -8761,7 +8761,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_interrupts_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -8780,7 +8780,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of bits of entropy currently available to the system's random number generators (e.g., /dev/random). 
Low values may indicate that random number generation could block or degrade performance of cryptographic operations", "fieldConfig": { @@ -8896,7 +8896,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_entropy_available_bits{instance=\"$node\",job=\"$job\"}", @@ -8910,7 +8910,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_entropy_pool_size_bits{instance=\"$node\",job=\"$job\"}", @@ -8943,7 +8943,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Monitors hardware sensor temperatures and critical thresholds as exposed by Linux hwmon. Includes CPU, GPU, and motherboard sensors where available", "fieldConfig": { @@ -9049,7 +9049,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_hwmon_temp_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", @@ -9064,7 +9064,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "expr": "node_hwmon_temp_crit_alarm_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", "format": "time_series", @@ -9078,7 +9078,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_hwmon_temp_crit_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", @@ -9093,7 +9093,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "expr": "node_hwmon_temp_crit_hyst_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) 
node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", "format": "time_series", @@ -9107,7 +9107,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "expr": "node_hwmon_temp_max_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", "format": "time_series", @@ -9125,7 +9125,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Shows how hard each cooling device (fan/throttle) is working relative to its maximum capacity", "fieldConfig": { @@ -9230,7 +9230,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "100 * node_cooling_device_cur_state{instance=\"$node\",job=\"$job\"} / node_cooling_device_max_state{instance=\"$node\",job=\"$job\"}", @@ -9250,7 +9250,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Shows the online status of power supplies (e.g., AC, battery). 
A value of 1-Yes indicates the power supply is active/online", "fieldConfig": { @@ -9339,7 +9339,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_power_supply_online{instance=\"$node\",job=\"$job\"}", @@ -9359,7 +9359,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Displays the current fan speeds (RPM) from hardware sensors via the hwmon interface", "fieldConfig": { @@ -9445,7 +9445,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_hwmon_fan_rpm{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", @@ -9460,7 +9460,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_hwmon_fan_min_rpm{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", @@ -9494,7 +9494,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Current number of systemd units in each operational state, such as active, failed, inactive, or transitioning", "fieldConfig": { @@ -9655,7 +9655,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"activating\"}", @@ -9670,7 +9670,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"active\"}", @@ -9685,7 +9685,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": 
"node_systemd_units{instance=\"$node\",job=\"$job\",state=\"deactivating\"}", @@ -9700,7 +9700,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"failed\"}", @@ -9715,7 +9715,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"inactive\"}", @@ -9734,7 +9734,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Current number of active connections per systemd socket, as reported by the Node Exporter systemd collector", "fieldConfig": { @@ -9820,7 +9820,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_systemd_socket_current_connections{instance=\"$node\",job=\"$job\"}", @@ -9839,7 +9839,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of accepted connections per second for each systemd socket", "fieldConfig": { @@ -9925,7 +9925,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_systemd_socket_accepted_connections_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -9944,7 +9944,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of systemd socket connection refusals per second, typically due to service unavailability or backlog overflow", "fieldConfig": { @@ -10030,7 +10030,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_systemd_socket_refused_connections_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10063,7 
+10063,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of I/O operations completed per second for the device (after merges), including both reads and writes", "fieldConfig": { @@ -10176,7 +10176,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10189,7 +10189,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10206,7 +10206,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of bytes read from or written to the device per second", "fieldConfig": { @@ -10319,7 +10319,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10333,7 +10333,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -10353,7 +10353,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Average time for requests issued to the device to be served. 
This includes the time spent by the requests in queue and the time spent servicing them.", "fieldConfig": { @@ -10466,7 +10466,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_read_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) / irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10481,7 +10481,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_write_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) / irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10500,7 +10500,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Average queue length of the requests that were issued to the device", "fieldConfig": { @@ -10602,7 +10602,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_io_time_weighted_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10620,7 +10620,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of read and write requests merged per second that were queued to the device", "fieldConfig": { @@ -10733,7 +10733,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_reads_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10746,7 +10746,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_writes_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10763,7 +10763,7 @@ { 
"datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Percentage of time the disk spent actively processing I/O operations, including general I/O, discards (TRIM), and write cache flushes", "fieldConfig": { @@ -10865,7 +10865,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10879,7 +10879,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_discard_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10893,7 +10893,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_flush_requests_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -10912,7 +10912,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Per-second rate of discard (TRIM) and flush (write cache) operations. 
Useful for monitoring low-level disk activity on SSDs and advanced storage", "fieldConfig": { @@ -11013,7 +11013,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_discards_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -11027,7 +11027,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_discards_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -11041,7 +11041,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_flush_requests_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -11060,7 +11060,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Shows how many disk sectors are discarded (TRIMed) per second. 
Useful for monitoring SSD behavior and storage efficiency", "fieldConfig": { @@ -11161,7 +11161,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_disk_discarded_sectors_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -11179,7 +11179,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of in-progress I/O requests at the time of sampling (active requests in the disk queue)", "fieldConfig": { @@ -11281,7 +11281,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_disk_io_now{instance=\"$node\",job=\"$job\"}", @@ -11313,7 +11313,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of file descriptors currently allocated system-wide versus the system limit. Important for detecting descriptor exhaustion risks", "fieldConfig": { @@ -11429,7 +11429,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filefd_maximum{instance=\"$node\",job=\"$job\"}", @@ -11443,7 +11443,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filefd_allocated{instance=\"$node\",job=\"$job\"}", @@ -11461,7 +11461,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of free file nodes (inodes) available per mounted filesystem. 
A low count may prevent file creation even if disk space is available", "fieldConfig": { @@ -11547,7 +11547,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filesystem_files_free{instance=\"$node\",job=\"$job\",device!~'rootfs'}", @@ -11566,7 +11566,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Indicates filesystems mounted in read-only mode or reporting device-level I/O errors.", "fieldConfig": { @@ -11653,7 +11653,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filesystem_readonly{instance=\"$node\",job=\"$job\",device!~'rootfs'}", @@ -11667,7 +11667,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filesystem_device_error{instance=\"$node\",job=\"$job\",device!~'rootfs',fstype!~'tmpfs'}", @@ -11686,7 +11686,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of file nodes (inodes) available per mounted filesystem. 
Reflects maximum file capacity regardless of disk size", "fieldConfig": { @@ -11772,7 +11772,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_filesystem_files{instance=\"$node\",job=\"$job\",device!~'rootfs'}", @@ -11805,7 +11805,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of network packets received and transmitted per second, by interface.", "fieldConfig": { @@ -11904,7 +11904,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_packets_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -11919,7 +11919,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_transmit_packets_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -11938,7 +11938,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of packet-level errors for each network interface. Receive errors may indicate physical or driver issues; transmit errors may reflect collisions or hardware faults", "fieldConfig": { @@ -12037,7 +12037,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_errs_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12051,7 +12051,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_transmit_errs_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12069,7 +12069,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of dropped packets per network interface. 
Receive drops can indicate buffer overflow or driver issues; transmit drops may result from outbound congestion or queuing limits", "fieldConfig": { @@ -12168,7 +12168,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_drop_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12182,7 +12182,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_transmit_drop_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12200,7 +12200,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of compressed network packets received and transmitted per interface. These are common in low-bandwidth or special interfaces like PPP or SLIP", "fieldConfig": { @@ -12299,7 +12299,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_compressed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12313,7 +12313,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_transmit_compressed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12331,7 +12331,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of incoming multicast packets received per network interface. 
Multicast is used by protocols such as mDNS, SSDP, and some streaming or cluster services", "fieldConfig": { @@ -12430,7 +12430,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_multicast_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12448,7 +12448,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of received packets that could not be processed due to missing protocol or handler in the kernel. May indicate unsupported traffic or misconfiguration", "fieldConfig": { @@ -12547,7 +12547,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_nohandler_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12565,7 +12565,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of frame errors on received packets, typically caused by physical layer issues such as bad cables, duplex mismatches, or hardware problems", "fieldConfig": { @@ -12664,7 +12664,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_frame_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12683,7 +12683,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Tracks FIFO buffer overrun errors on network interfaces. 
These occur when incoming or outgoing packets are dropped due to queue or buffer overflows, often indicating congestion or hardware limits", "fieldConfig": { @@ -12782,7 +12782,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_receive_fifo_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12796,7 +12796,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_transmit_fifo_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12814,7 +12814,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of packet collisions detected during transmission. Mostly relevant on half-duplex or legacy Ethernet networks", "fieldConfig": { @@ -12913,7 +12913,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_transmit_colls_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -12931,7 +12931,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of carrier errors during transmission. These typically indicate physical layer issues like faulty cabling or duplex mismatches", "fieldConfig": { @@ -13017,7 +13017,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "rate(node_network_transmit_carrier_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -13035,7 +13035,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of ARP entries per interface. 
Useful for detecting excessive ARP traffic or table growth due to scanning or misconfiguration", "fieldConfig": { @@ -13121,7 +13121,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_arp_entries{instance=\"$node\",job=\"$job\"}", @@ -13139,7 +13139,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Current and maximum connection tracking entries used by Netfilter (nf_conntrack). High usage approaching the limit may cause packet drops or connection issues", "fieldConfig": { @@ -13255,7 +13255,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_nf_conntrack_entries{instance=\"$node\",job=\"$job\"}", @@ -13269,7 +13269,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_nf_conntrack_entries_limit{instance=\"$node\",job=\"$job\"}", @@ -13287,7 +13287,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Operational and physical link status of each network interface. Values are Yes for 'up' or link present, and No for 'down' or no carrier.\"", "fieldConfig": { @@ -13373,7 +13373,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_network_up{operstate=\"up\",instance=\"$node\",job=\"$job\"}", @@ -13388,7 +13388,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_network_carrier{instance=\"$node\",job=\"$job\"}", @@ -13404,7 +13404,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Maximum speed of each network interface as reported by the operating system. 
This is a static hardware capability, not current throughput", "fieldConfig": { @@ -13465,7 +13465,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_network_speed_bytes{instance=\"$node\",job=\"$job\"} * 8", @@ -13483,7 +13483,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "MTU (Maximum Transmission Unit) in bytes for each network interface. Affects packet size and transmission efficiency", "fieldConfig": { @@ -13543,7 +13543,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_network_mtu_bytes{instance=\"$node\",job=\"$job\"}", @@ -13575,7 +13575,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Tracks TCP socket usage and memory per node", "fieldConfig": { @@ -13662,7 +13662,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_TCP_alloc{instance=\"$node\",job=\"$job\"}", @@ -13677,7 +13677,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_TCP_inuse{instance=\"$node\",job=\"$job\"}", @@ -13692,7 +13692,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_TCP_orphan{instance=\"$node\",job=\"$job\"}", @@ -13707,7 +13707,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_TCP_tw{instance=\"$node\",job=\"$job\"}", @@ -13726,7 +13726,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of UDP and UDPLite sockets currently in use", "fieldConfig": 
{ @@ -13813,7 +13813,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_UDPLITE_inuse{instance=\"$node\",job=\"$job\"}", @@ -13828,7 +13828,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_UDP_inuse{instance=\"$node\",job=\"$job\"}", @@ -13847,7 +13847,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Total number of sockets currently in use across all protocols (TCP, UDP, UNIX, etc.), as reported by /proc/net/sockstat", "fieldConfig": { @@ -13934,7 +13934,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_sockets_used{instance=\"$node\",job=\"$job\"}", @@ -13953,7 +13953,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of FRAG and RAW sockets currently in use. 
RAW sockets are used for custom protocols or tools like ping; FRAG sockets are used internally for IP packet defragmentation", "fieldConfig": { @@ -14040,7 +14040,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_FRAG_inuse{instance=\"$node\",job=\"$job\"}", @@ -14055,7 +14055,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_RAW_inuse{instance=\"$node\",job=\"$job\"}", @@ -14074,7 +14074,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "TCP/UDP socket memory usage in kernel (in pages)", "fieldConfig": { @@ -14161,7 +14161,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_TCP_mem{instance=\"$node\",job=\"$job\"}", @@ -14176,7 +14176,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_UDP_mem{instance=\"$node\",job=\"$job\"}", @@ -14195,7 +14195,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Kernel memory used by TCP, UDP, and IP fragmentation buffers", "fieldConfig": { @@ -14282,7 +14282,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_TCP_mem_bytes{instance=\"$node\",job=\"$job\"}", @@ -14297,7 +14297,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_sockstat_UDP_mem_bytes{instance=\"$node\",job=\"$job\"}", @@ -14312,7 +14312,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": 
"node_sockstat_FRAG_memory{instance=\"$node\",job=\"$job\"}", @@ -14329,7 +14329,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Packets processed and dropped by the softnet network stack per CPU. Drops may indicate CPU saturation or network driver limitations", "fieldConfig": { @@ -14428,7 +14428,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_softnet_processed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -14443,7 +14443,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_softnet_dropped_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -14462,7 +14462,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "How often the kernel was unable to process all packets in the softnet queue before time ran out. 
Frequent squeezes may indicate CPU contention or driver inefficiency", "fieldConfig": { @@ -14552,7 +14552,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_softnet_times_squeezed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -14571,7 +14571,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Tracks the number of packets processed or dropped by Receive Packet Steering (RPS), a mechanism to distribute packet processing across CPUs", "fieldConfig": { @@ -14677,7 +14677,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_softnet_received_rps_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -14692,7 +14692,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_softnet_flow_limit_count_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -14726,7 +14726,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of octets sent and received at the IP layer, as reported by /proc/net/netstat", "fieldConfig": { @@ -14825,7 +14825,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_IpExt_InOctets{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -14840,7 +14840,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_IpExt_OutOctets{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -14858,7 +14858,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of TCP segments sent and received per 
second, including data and control segments", "fieldConfig": { @@ -14967,7 +14967,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Tcp_InSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -14982,7 +14982,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Tcp_OutSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15001,7 +15001,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of UDP datagrams sent and received per second, based on /proc/net/netstat", "fieldConfig": { @@ -15099,7 +15099,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Udp_InDatagrams{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15114,7 +15114,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Udp_OutDatagrams{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15133,7 +15133,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of ICMP messages sent and received per second, including error and control messages", "fieldConfig": { @@ -15235,7 +15235,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Icmp_InMsgs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15250,7 +15250,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Icmp_OutMsgs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15269,7 +15269,7 @@ { "datasource": { 
"type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Tracks various TCP error and congestion-related events, including retransmissions, timeouts, dropped connections, and buffer issues", "fieldConfig": { @@ -15355,7 +15355,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_TcpExt_ListenOverflows{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15371,7 +15371,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_TcpExt_ListenDrops{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15387,7 +15387,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_TcpExt_TCPSynRetrans{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15402,7 +15402,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Tcp_RetransSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15414,7 +15414,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Tcp_InErrs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15426,7 +15426,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Tcp_OutRsts{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15438,7 +15438,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_TcpExt_TCPRcvQDrop{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15451,7 +15451,7 @@ { "datasource": { "type": "prometheus", - "uid": 
"cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_TcpExt_TCPOFOQueue{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15464,7 +15464,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_TcpExt_TCPTimeouts{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15481,7 +15481,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of UDP and UDPLite datagram delivery errors, including missing listeners, buffer overflows, and protocol-specific issues", "fieldConfig": { @@ -15570,7 +15570,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Udp_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15585,7 +15585,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Udp_NoPorts{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15600,7 +15600,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_UdpLite_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15612,7 +15612,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Udp_RcvbufErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15627,7 +15627,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Udp_SndbufErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15646,7 +15646,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, 
"description": "Rate of incoming ICMP messages that contained protocol-specific errors, such as bad checksums or invalid lengths", "fieldConfig": { @@ -15744,7 +15744,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Icmp_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15763,7 +15763,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of TCP SYN cookies sent, validated, and failed. These are used to protect against SYN flood attacks and manage TCP handshake resources under load", "fieldConfig": { @@ -15864,7 +15864,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_TcpExt_SyncookiesFailed{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15880,7 +15880,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_TcpExt_SyncookiesRecv{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15896,7 +15896,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_TcpExt_SyncookiesSent{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -15916,7 +15916,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of currently established TCP connections and the system's max supported limit. 
On Linux, MaxConn may return -1 to indicate a dynamic/unlimited configuration", "fieldConfig": { @@ -16032,7 +16032,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_netstat_Tcp_CurrEstab{instance=\"$node\",job=\"$job\"}", @@ -16048,7 +16048,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_netstat_Tcp_MaxConn{instance=\"$node\",job=\"$job\"}", @@ -16068,7 +16068,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of UDP packets currently queued in the receive (RX) and transmit (TX) buffers. A growing queue may indicate a bottleneck", "fieldConfig": { @@ -16153,7 +16153,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_udp_queues{instance=\"$node\",job=\"$job\",ip=\"v4\",queue=\"rx\"}", @@ -16169,7 +16169,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_udp_queues{instance=\"$node\",job=\"$job\",ip=\"v4\",queue=\"tx\"}", @@ -16189,7 +16189,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of TCP connection initiations per second. 'Active' opens are initiated by this host. 
'Passive' opens are accepted from incoming connections", "fieldConfig": { @@ -16275,7 +16275,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Tcp_ActiveOpens{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -16290,7 +16290,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(node_netstat_Tcp_PassiveOpens{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -16309,7 +16309,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of TCP sockets in key connection states. Requires the --collector.tcpstat flag on node_exporter", "fieldConfig": { @@ -16396,7 +16396,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_tcp_connection_states{state=\"established\",instance=\"$node\",job=\"$job\"}", @@ -16411,7 +16411,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_tcp_connection_states{state=\"fin_wait2\",instance=\"$node\",job=\"$job\"}", @@ -16427,7 +16427,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_tcp_connection_states{state=\"listen\",instance=\"$node\",job=\"$job\"}", @@ -16443,7 +16443,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_tcp_connection_states{state=\"time_wait\",instance=\"$node\",job=\"$job\"}", @@ -16459,7 +16459,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_tcp_connection_states{state=\"close_wait\", instance=\"$node\", job=\"$job\"}", @@ -16493,7 +16493,7 @@ { 
"datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Duration of each individual collector executed during a Node Exporter scrape. Useful for identifying slow or failing collectors", "fieldConfig": { @@ -16582,7 +16582,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_scrape_collector_duration_seconds{instance=\"$node\",job=\"$job\"}", @@ -16602,7 +16602,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Rate of CPU time used by the process exposing this metric (user + system mode)", "fieldConfig": { @@ -16687,7 +16687,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "irate(process_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", @@ -16706,7 +16706,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Tracks the memory usage of the process exposing this metric (e.g., node_exporter), including current virtual memory and maximum virtual memory limit", "fieldConfig": { @@ -16846,7 +16846,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "process_virtual_memory_bytes{instance=\"$node\",job=\"$job\"}", @@ -16861,7 +16861,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "process_virtual_memory_max_bytes{instance=\"$node\",job=\"$job\"}", @@ -16880,7 +16880,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Number of file descriptors used by the exporter process versus its configured limit", "fieldConfig": { @@ -17020,7 +17020,7 @@ { "datasource": { "type": "prometheus", - "uid": 
"cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "process_max_fds{instance=\"$node\",job=\"$job\"}", @@ -17034,7 +17034,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "process_open_fds{instance=\"$node\",job=\"$job\"}", @@ -17052,7 +17052,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "description": "Shows whether each Node Exporter collector scraped successfully (1 = success, 0 = failure), and whether the textfile collector returned an error.", "fieldConfig": { @@ -17118,7 +17118,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "node_scrape_collector_success{instance=\"$node\",job=\"$job\"}", @@ -17134,7 +17134,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "1 - node_textfile_scrape_error{instance=\"$node\",job=\"$job\"}", @@ -17167,7 +17167,7 @@ { "current": { "text": "wire-cluster-prometheus-operator", - "value": "cenv3r35m756oe" + "value": "${datasource}" }, "includeAll": false, "label": "Datasource", @@ -17185,7 +17185,7 @@ }, "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "definition": "", "includeAll": false, @@ -17208,7 +17208,7 @@ }, "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "definition": "label_values(node_uname_info{job=\"$job\"}, nodename)", "includeAll": false, @@ -17231,7 +17231,7 @@ }, "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "definition": "label_values(node_uname_info{job=\"$job\", nodename=\"$nodename\"}, instance)", "includeAll": false, diff --git a/dashboards/manual_upload/wire_service.json b/dashboards/manual_upload/wire_service.json index a978b3325..574be1f21 100644 --- 
a/dashboards/manual_upload/wire_service.json +++ b/dashboards/manual_upload/wire_service.json @@ -241,7 +241,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$service.*\"}[$__rate_interval])) by (pod)", @@ -269,7 +269,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$service.*\", resource=\"cpu\"}) by (pod)\n", @@ -421,7 +421,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -456,7 +456,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -589,7 +589,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(increase(http_request_duration_seconds_count{status_code=~\"5..\", namespace=\"$namespace\"}[5m])) by (service)", @@ -663,7 +663,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(increase(http_request_duration_seconds_count{status_code=~\"5..\", namespace=\"$namespace\"}[$__range])) by (service, method, handler)", @@ -762,7 +762,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "builder", "expr": "sum by(service) (increase(kube_deployment_metadata_generation{namespace=~\"$namespace\", deployment=\"$service\"}[$__rate_interval]))", @@ -883,7 +883,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(increase(http_request_duration_seconds_count{status_code=~\"2..\", 
service=\"$service\", namespace=\"$namespace\"}[1m]))", @@ -911,7 +911,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(increase(http_request_duration_seconds_count{status_code=~\"5..\", service=\"$service\", namespace=\"$namespace\"}[1m]))", @@ -1010,7 +1010,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "net_connections{service=\"$service\", namespace=\"$namespace\"}", @@ -1118,7 +1118,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(rate(http_request_duration_seconds_count{service=\"$service\", handler!=\"/i/status\", namespace=\"$namespace\"}[1m])) by (method, handler)", @@ -1221,7 +1221,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(increase(http_request_duration_seconds_count{status_code=~\"4..\", service=\"$service\", namespace=\"$namespace\"}[$__range])) by (handler, status_code)", @@ -1333,7 +1333,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(increase(http_request_duration_seconds_count{status_code=~\"5..\", service=\"$service\", namespace=\"$namespace\"}[$__range])) by (handler)", @@ -1439,7 +1439,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "topk(5, avg(rate(http_request_duration_seconds_sum{service=\"$service\", handler!~\"/i/status|prometheus\", namespace=\"$namespace\"}[5m])) by (handler))", @@ -1537,7 +1537,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(irate(container_network_receive_bytes_total{namespace=\"$namespace\", pod=~\"$service.*\"}[$__rate_interval])) 
by (pod)", @@ -1565,7 +1565,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "fieldConfig": { "defaults": { @@ -1659,7 +1659,7 @@ { "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "editorMode": "code", "expr": "-sum(irate(container_fs_writes_bytes_total{namespace=\"$namespace\",pod=~\"$service.*\"}[1m])) by (pod)", @@ -1684,7 +1684,7 @@ { "current": { "text": "wire-cluster-prometheus-operator", - "value": "cenv3r35m756oe" + "value": "${datasource}" }, "includeAll": false, "name": "datasource", @@ -1749,7 +1749,7 @@ }, "datasource": { "type": "prometheus", - "uid": "cenv3r35m756oe" + "uid": "${datasource}" }, "definition": "label_values(namespace)", "includeAll": false, diff --git a/offline/instrument_monitoring.md b/offline/instrument_monitoring.md index 353199a9a..4e4bef14f 100644 --- a/offline/instrument_monitoring.md +++ b/offline/instrument_monitoring.md @@ -2,96 +2,262 @@ This document explains how to instrument the Wire server Kubernetes deployment with Prometheus and Grafana monitoring. -Follow these guidelines to instrument your deployed wire cluster for monitoring. These instructions bring you through setting up the prometheus operator (with the kube-prometheus-helm stack) to scrape metrics, exposing those metrics as a datasource for Grafana. Additionally, if you are using our wire-in-a-box setup, we setup a grafana VM, with dashboards. +Follow these guidelines to instrument your deployed Wire cluster for monitoring. These instructions walk you through setting up the Prometheus Operator with the kube-prometheus-stack Helm chart to scrape metrics and exposing those metrics as a data source for Grafana. The steps below assume that the user has already deployed the Wire Backend using our instructions at [Wire in a Box (WIAB) Staging](./wiab-staging.md) or [How to install wire (offline cluster)](./docs_ubuntu_22.04.md). 
## Instrumentation Overview -- Setup Grafana (optional as the section describes how to setup grafana on a VM for test purpose) -- Configure Prometheus with customized kube-prometheus-stack helm chart -- Configure Prometheus scrape job for wire services -- Importing dashboards into Grafana +- Verify prerequisites on the adminhost and cluster nodes +- Set up Grafana (optional for test environments) +- Configure Prometheus with the customized kube-prometheus-stack Helm chart +- Enable ServiceMonitors for ingress-nginx, wire services, and SFTD +- Verify that Prometheus is scraping targets and Grafana can query them +- Import dashboards into Grafana +## Prerequisites -## Setup Grafana: -We do not provide grafana instrumentation for the production environment. We expect the customers/clients will bring their own grafana instance and can connect the prometheus datasource which will get shipped to the production environment. +Run the commands in this document from the root of the extracted `wire-server-deploy` bundle on the adminhost unless the section explicitly says to run a command on another machine. -If there is an exiting grafana instance or a new instance needs to be configured for the production environment, we encourage to follow the upstream [grafana installation document](https://grafana.com/docs/grafana/latest/setup-grafana/installation/). +Before you start, make sure the following are available: -In a test environment if there is no existing grafana then configuring a grafana instance on a VM will be good enough. 
Here is how to do it by running couple of scripts, in a virsh (wire-in-a-box) environment: +- A deployed offline Wire cluster as described in [Wire in a Box (WIAB) Staging](./wiab-staging.md) or [How to install wire (offline cluster)](./docs_ubuntu_22.04.md) +- Access to the adminhost with the `d` helper loaded, so `d helm ...`, `d kubectl ...`, `d yq ...`, and `d bash` work +- A reachable Kubernetes node that will host the Prometheus local PV, for example `kubenode3` +- A Grafana instance, or a dedicated VM if you want to install Grafana for test purposes +- The values files that will be updated in this guide: + - `charts/kube-prometheus-stack/values.yaml` + - `values/ingress-nginx-controller/values.yaml` + - `values/wire-server/values.yaml` + - `charts/sftd/values.yaml` if SFTD monitoring is required -### Configure a VM for grafana +If the bundle does not already contain the required Prometheus chart or container images, prepare them before continuing: -Make sure the `/bin` directory contains both `grafana-vm.sh` and `install-grafana.sh` scripts. +- If `charts/kube-prometheus-stack` is missing, follow [Getting the helm chart](#getting-the-helm-chart). +- If the Prometheus-related container images are not already present on the target node, follow [Download and load the dependent images](#download-and-load-the-dependent-images). -Run `grafana-vm.sh` +### Back up the values files before editing + +Take a backup of each values file that you will modify so you can revert individual changes if needed. 
```bash -$ chmod +x .bin/grafana-vm.sh -$ .bin/grafana-vm.sh +timestamp="$(date +%F-%H%M%S)" + +[ -f charts/kube-prometheus-stack/values.yaml ] && cp charts/kube-prometheus-stack/values.yaml "charts/kube-prometheus-stack/values.yaml.bak.${timestamp}" +[ -f values/ingress-nginx-controller/values.yaml ] && cp values/ingress-nginx-controller/values.yaml "values/ingress-nginx-controller/values.yaml.bak.${timestamp}" +[ -f values/wire-server/values.yaml ] && cp values/wire-server/values.yaml "values/wire-server/values.yaml.bak.${timestamp}" + +# Only if you enable SFTD metrics later in this guide. +[ -f charts/sftd/values.yaml ] && cp charts/sftd/values.yaml "charts/sftd/values.yaml.bak.${timestamp}" ``` -This script will setup a VM with ip address `192.168.122.100` and name `grafananode`. This may take up to 30 minutes depending on your hardware. When it's done the VM state will be `Shut Off` and then it's need to started manually. -#### Check VM state and restart +## Set Up Grafana +We do not provide Grafana instrumentation for the production environment. We expect customers to bring their own Grafana instance and connect it to the Prometheus data source that will be shipped to the production environment. + +If there is an existing Grafana instance, or if a new instance needs to be configured for the production environment, follow the upstream [Grafana installation document](https://grafana.com/docs/grafana/latest/setup-grafana/installation/). If you already have Grafana set up, continue with the [Prometheus instructions](#configure-prometheus). + +In a test environment, if there is no existing Grafana instance, configuring Grafana on a VM is sufficient. + +### Configure a VM for Grafana + +Note: Skip this section if you have your own hypervisor to set up VMs and continue with [installing Grafana](#install-grafana-on-the-grafananode-vm). 
+ +Make sure the `wire-server-deploy/bin` directory on your adminhost contains the `grafana-vm.sh` script; if it does not, copy or download it from [grafana-vm.sh](../bin/grafana-vm.sh). The script uses `sudo` internally, so make sure the user running it has `sudo` access. + +Run `grafana-vm.sh` ```bash -sudo virsh list --all -sudo virsh start grafananode +$ chmod +x bin/grafana-vm.sh +$ bin/grafana-vm.sh ``` -When the VM is ready, you will be able to `ssh` to the VM. Now we can start installing Grafana. -#### Install Grafana on the grafananode VM +This script will set up a VM with a dynamic IP from `192.168.122.0/24`, user `demo`, and hostname `grafananode`. Expect the IP address to be displayed in the output and the SSH key to be present at `wire-server-deploy/ssh`. + +### Install Grafana on the grafananode VM + +Make sure the `wire-server-deploy/bin` directory on your adminhost contains the `install-grafana.sh` script; if it does not, copy or download it from [install-grafana.sh](../bin/install-grafana.sh). Run `install-grafana.sh` on the `grafananode` VM. The script needs internet access to download the Grafana packages. -Run `install-grafana.sh` on grafananode VM. You can copy the file from `/bin` directory to the grafananode and can run from the host machine as following: +You can copy the file from the `bin/` directory to `grafananode` and run it from the adminhost as follows: ```bash -scp -i ~/.ssh/id_ed25519 ./bin/install-grafana.sh demo@192.168.122.100:/tmp/ -ssh demo@192.168.122.100 'bash /tmp/install-grafana.sh' +scp -i ssh/id_ed25519 bin/install-grafana.sh demo@grafananode:install-grafana.sh +ssh -i ssh/id_ed25519 demo@grafananode 'bash install-grafana.sh' ``` -This script will install grafana on the VM, however that VM is not accessible outside of the host machine. To make it accessible, we need to update the `iptables` rule of the host machine: + +This script installs Grafana on the VM and starts the service.
However, Grafana is only accessible on the VM network. To make it accessible on the adminhost network, add `nft` rules on the `adminhost` machine as follows: ```bash -sudo iptables -t nat -A PREROUTING -p tcp --dport 3000 -j DNAT --to-destination 192.168.122.100:3000 -sudo iptables -A FORWARD -p tcp -d 192.168.122.100 --dport 3000 -j ACCEPT +# Host WAN interface name +INF_WAN=enp9s0 +sudo nft insert rule ip nat PREROUTING position 0 iifname $INF_WAN tcp dport 3000 dnat to grafananode:3000 ``` -Now the grafana can be accessed via a Web browser with the address: `http://:3000`. +Grafana can now be accessed from a web browser at `http://ADMINHOST_IP:3000`, where `ADMINHOST_IP` is the address of the adminhost on its WAN interface. Note: exposing Grafana to the network may have security implications and users should secure their instance (change default password, use firewalls, etc.) To log in to Grafana for the first time, use the default credentials provided by Grafana. After logging in, immediately change the credentials as recommended in [the grafana document](https://grafana.com/docs/grafana/latest/setup-grafana/sign-in-to-grafana/). ## Configure Prometheus -Prometheus operator will be configured to scrape metrics from k8s cluster and wire services by installing the [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/README.md) helm chart. We have configured this chart with overridden values which will setup the following: +The Prometheus Operator will be configured to scrape metrics from the Kubernetes cluster and Wire services by installing the [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/README.md) Helm chart.
We have configured this chart with overridden values that set up the following: -- An `ingress` to expose the prometheus endpoint if enabled -- Basic authentication to the endpoint -- Automatic certificate creation with cert-manager (Assuming cert-manager is already present in the k8s cluster) - Setup a local persistent volume to use as prometheus data storage on a certain node -- Disable both Alertmanager and grafana operator which is part of the helm stack. +- Disable both Alertmanager and the Grafana component that is part of the Helm stack -Before we can install the helm chart, there are some items we need to take care of. First make sure the `wire-server-deploy` bundle has the `kube-prometheus-stack` chart. If the chart is not there, get it from one of the latest bundle and copy it to the current `charts` directory of your `wire-server-deploy` bundle. In case the `kube-prometheus-stack` chart needs to be copied in the running `wire-server-deploy` bundle there are some extra configurations needs to be made to have a successful deployment. The following sections cover both cases. +Before we proceed with installation, make sure the `wire-server-deploy` bundle has the `kube-prometheus-stack` chart and helm chart values are configured. -### Instrument prometheus to scrape metrics +### Getting the helm chart +If the chart is not present, then download it in this step. If the directory `charts/kube-prometheus-stack` already exists then please continue with [Instrument prometheus to scrape metrics](#instrument-prometheus-to-scrape-metrics). -All the configuration values are defined in the `values.yaml` file in the chart. Before running install/upgrade of the helm chart, please carefully check those values by following the comments in the file. 
+```bash +mkdir -p charts +curl -O https://s3-eu-west-1.amazonaws.com/public.wire.com/charts/kube-prometheus-stack-0.1.5.tgz +tar -xf kube-prometheus-stack-0.1.5.tgz -C charts +``` -Get the `kube-prometheus-stack` helm charts in the `/charts` directory, then modify the `kube-prometheus-stack/values.yaml`. Here is the step by step guidelines: +If the chart was missing from the `charts/` directory, also download all dependent images for the Helm chart as follows. + +### Download and load the dependent images + +```bash +mkdir -p prometheus-images-tars -Open the `values.yaml` file and read the configurations. +images=( + "quay.io/prometheus/node-exporter:v1.9.1" + "quay.io/prometheus-operator/prometheus-operator:v0.83.0" + "quay.io/prometheus/prometheus:v3.4.2" + "registry.k8s.io/ingress-nginx/kube-webhook-certgen:v1.5.4" + "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.16.0" + "quay.io/prometheus-operator/prometheus-config-reloader:v0.83.0" +) + +# logic to find the above images +# d helm template test charts/kube-prometheus-stack | yq eval '.. | select(has("image")) | .image' | grep -i "/" | sort | uniq +# https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/templates/prometheus-operator/deployment.yaml#L97 +# The prometheus-config-reloader image is passed as an argument to the operator and is therefore not visible in the templated output.
+ +for image in "${images[@]}"; do + docker pull "$image" + + tar_name="$(echo "$image" | sed 's#[/:]#_#g').tar" + docker save -o "prometheus-images-tars/$tar_name" "$image" +done +``` + +Copy the images to every kubenode; the step below shows an example for kubenode3: +```bash +scp -i ssh/id_ed25519 -r prometheus-images-tars demo@kubenode3:/home/demo/prometheus-images-tars +``` +Load all images to ctr: ```bash -cat charts/kube-prometheus-stack/values.yaml +ssh -i ssh/id_ed25519 demo@kubenode3 ' + for tar_file in /home/demo/prometheus-images-tars/*.tar; do + sudo ctr -n k8s.io images import "$tar_file" + done + + sudo ctr -n k8s.io images list | grep -E "node-exporter|prometheus-operator|prometheus|kube-webhook-certgen|kube-state-metrics" +' ``` -There are several configurable parts in values file -- The global part where we define the values to create a local persistent volume in a fixed k8s node. -- Then in the `prometheus:` field we set up the ingress (default value is `false`), certification and basic-auth +### Instrument prometheus to scrape metrics + +All the configuration values are defined in the `charts/kube-prometheus-stack/values.yaml` file in the chart. Before running install or upgrade of the Helm chart, carefully review those values by following the comments in the file. + +```yaml +# Variables to set the local PVC on the kubenode for Prometheus storage +# If these values are modified, please adjust the `nodeName`, `storageSize` and `storageClassName` in the prometheusSpec: +nodeName: kubenode3 +storageSize: 50Gi +storageClassName: local-prometheus-storage +volumeMountPath: /mnt/prometheus-data + +# This is the custom values.yaml file for the Prometheus stack Helm chart.
+kube-prometheus-stack: + prometheus: + ingress: + enabled: false + service: + type: NodePort + nodePort: 30090 + + prometheusSpec: + serviceMonitorSelector: {} + serviceMonitorNamespaceSelector: {} + serviceMonitorSelectorNilUsesHelmValues: false + podMonitorSelector: {} + podMonitorNamespaceSelector: {} + podMonitorSelectorNilUsesHelmValues: false + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - kubenode3 + storageSpec: + volumeClaimTemplate: + spec: + storageClassName: local-prometheus-storage + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 50Gi + + retention: 15d + retentionSize: 45GiB + # Enable the Prometheus Operator to use the fallback scrape protocol only for the coturn service. + # This is useful for services that do not expose Prometheus metrics in the standard format. + additionalScrapeConfigs: + - job_name: 'coturn-with-fallback' + fallback_scrape_protocol: "PrometheusText0.0.4" + scrape_interval: 30s + scrape_timeout: 10s + metrics_path: /metrics + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - default + relabel_configs: + # Keep only coturn service endpoints + - source_labels: [__meta_kubernetes_service_name] + action: keep + regex: coturn + # Keep only the status-http port + - source_labels: [__meta_kubernetes_endpoint_port_name] + action: keep + regex: status-http + # Set the target address + - source_labels: [__address__] + target_label: __address__ + # Add service name as a label + - source_labels: [__meta_kubernetes_service_name] + target_label: service + # Add namespace as a label + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace + # Add pod name as instance + - source_labels: [__meta_kubernetes_pod_name] + target_label: pod + # Both Grafana and Alertmanager is disabled in this configuration. 
+ grafana: + enabled: false + alertmanager: + enabled: false +``` + +There are several configurable parts in the values file: + +- The global part where we define the values to create a local persistent volume on a fixed k8s node, i.e. `kubenode3`. If you have your own storage class then skip configuring the `storageClassName`, `storageSize`, `volumeMountPath` and `nodeName`. +- Then in the `prometheus:` field we keep ingress disabled and configure the service as a `NodePort` - In the `prometheusSpec:` field we first configure the operator to scrape metrics from all the service and pod monitors from any namespace -- In the `prometheusSpec.affinity:` field we configure the prometheus to be pinned on the node where the PV got created. +- In the `prometheusSpec.affinity:` field we pin Prometheus to the node where the PV was created and where the container images were loaded. - In the `storageSpec:` field we configure the storage for prometheus data. -All the sections below described how and what to modify to have a successful prometheus instrumentation. +The sections below describe what to modify to complete the Prometheus instrumentation successfully. #### Define values to create a Local PV @@ -109,7 +275,7 @@ volumeMountPath: /mnt/prometheus-data ``` - nodeName: The specific node where the PV will be created, if the nodeName gets changed please update the nodeName in the `nodeAffinity` field too - storageSize: Give a volume size to the PV -- storageClassName: This class will be used by prometheus to claim the the volume +- storageClassName: This class will be used by Prometheus to claim the volume - volumeMountPath: Node local disk directory where prometheus will store the data If any of the values get changed, please adjust the corresponding values in the `kube-prometheus-stack.prometheusSpec:` fields and `kube-prometheus-stack.storageSpec:` fields.
@@ -121,10 +287,7 @@ With the default values, the chart will create a persistent volume in the `kuben Create the volume mount path in the kubenode3 VM and provide necessary permissions for prometheus to access it. Here is how you do it. ```bash -ssh kubenode3 -sudo mkdir -p /mnt/prometheus-data -sudo chown -R 65534:65534 /mnt/prometheus-data -sudo chmod 755 /mnt/prometheus-data +ssh -i ssh/id_ed25519 demo@kubenode3 "sudo mkdir -p /mnt/prometheus-data && sudo chown -R 65534:65534 /mnt/prometheus-data && sudo chmod 755 /mnt/prometheus-data" ``` - ssh to kubenode3 @@ -132,19 +295,34 @@ sudo chmod 755 /mnt/prometheus-data - sets the Ownership to UID 65534 (nobody). Prometheus runs as a non-root user inside the container for security reasons. In prometheusSpec.securityContext, unless overridden, it runs as 65534 - sets the permissions of the directory so that Prometheus (running as a non-root user) can access and write to it. -#### Ingress and Basic auth credentials +#### Exposing Prometheus via ingress (optional, not recommended) + +By default, `ingress` is disabled for Prometheus. The Prometheus NodePort (`30090`) is sufficient for Grafana to reach Prometheus **when Grafana can directly reach the Kubernetes node IP on port 30090** — for example when Grafana runs on the adminhost or on a VM in the same L2/L3 network as the cluster nodes. -By default the `ingress` is disabled for prometheus, Ingress needs to be enabled for prometheus to use as datasource outside the k8s cluster. To enable the `ingress` update the `kube-prometheus-stack.prometheus.ingress:` field. +**When is ingress actually necessary?** If Grafana is on a separate machine or subnet that has no IP routing to the cluster node IPs (for example a different VLAN or a different network segment), then the NodePort is unreachable and you need to expose Prometheus via ingress. In that case, continue reading this section. -Prometheus ingress is configured with `basic-auth` for authetication. 
Basic auth secrets got created with `offline-secrets.sh` in the `values/kube-prometheus-stack/secrets.yaml` file during the preparation phase of the deployment. Please check the existence of the `values/kube-prometheus-stack/secrets.yaml` file +**Preferred deployment model (if possible):** Deploy Grafana inside the Kubernetes cluster when persistent storage is available (using an existing StorageClass or a new one). In this model, keep Prometheus internal and connect Grafana to the Prometheus Kubernetes service in the `monitoring` namespace (typically a `ClusterIP` service). This keeps Prometheus private within the cluster while Grafana can be exposed through ingress as needed. -If there is no `values/kube-prometheus-stack/secrets.yaml` file that means wire-server-deploy bundle does not have the necessary prometheus configuration components in it. To resolve it, either get the `offline-secrets.sh` from the latest bundle and create the secrets or create a `values/kube-prometheus-stack/secrets.yaml` file manually and add the basic auth credentials there as following: +We recommend keeping Prometheus ingress disabled whenever NodePort is reachable. Enabling ingress exposes Prometheus outside the cluster network boundary. + +> **Warning:** Exposing Prometheus via ingress makes it reachable outside the Kubernetes cluster. If you choose to enable ingress, configure authentication at the ingress layer here. Basic auth can also be configured for Prometheus itself if required, and you should still consider the TLS certificate implications described below. + +If you absolutely need to expose Prometheus via ingress, enable it by updating `kube-prometheus-stack.prometheus.ingress.enabled` to `true` in `charts/kube-prometheus-stack/values.yaml`. + +> **Recommended combination if ingress is required:** Use ingress only when Grafana cannot reach Prometheus on the NodePort from the same subnet. 
In that case, configure TLS and authentication together so the endpoint is encrypted and protected. The sections below cover each piece. Read all three before making changes — they should be configured together. + +##### Basic auth credentials + +Prometheus ingress is configured with `basic-auth` for authentication. Basic auth secrets are created by `offline-secrets.sh` and stored in `values/kube-prometheus-stack/prod-secrets.example.yaml` during the preparation phase. Verify that this file exists before proceeding. + +If `values/kube-prometheus-stack/prod-secrets.example.yaml` is missing, the bundle does not contain the necessary Prometheus configuration. Either obtain `offline-secrets.sh` from the latest bundle and regenerate the secrets, or create the file manually: ```bash -touch values/kube-prometheus-stack/secrets.yaml -nano values/kube-prometheus-stack/secrets.yaml +touch values/kube-prometheus-stack/prod-secrets.example.yaml +nano values/kube-prometheus-stack/prod-secrets.example.yaml ``` -Add prometheus auth credentials in the secrets.yaml + +Add the Prometheus auth credentials: ```yaml prometheus: @@ -153,29 +331,42 @@ prometheus: password: ``` -#### Get the domain name and certificate for the prometheus ingress +##### DNS hostname for the Prometheus ingress -- hosts: Assuming that the sub domain name for prometheus starts with `prometheus`. So the sub domain would be `prometheus.`. Put the right domain in the `hosts` and `tls.hosts` field. +When choosing a hostname for the Prometheus ingress, use the hostname that matches how Grafana and users will reach it in your environment. Set the hostname in the `hosts` and `tls.hosts` fields of the ingress block. The recommended subdomain is `prometheus.`. -- secretName: pick a secretName for certificate, for example it could be `prometheus-tls-cert`. 
After applying this chart cert-manager will create a certificate named `prometheus-tls-cert` and the issuer will be `clusterIssuer` +##### Certificate options for the Prometheus ingress -Cert-manager will facilitate creating managing the TLS signed Certificate resource for the prometheus ingress automatically as we are annotating cert-manager with the ingress-shim for prometheus ingress. It is defined in the `values.yaml` as following: +**Default Kubernetes certificate** + +If you do not need a certificate issued by cert-manager, you can omit cert-manager annotations and let the ingress controller use its default certificate. No additional cert-manager configuration is required. + +> **Grafana datasource note:** When using a self-signed or private CA certificate, you must configure the Grafana Prometheus datasource to either skip TLS verification or supply the CA certificate. In the Grafana datasource settings, under **TLS settings**, either enable **Skip TLS certificate validation** or upload the CA certificate under **CA cert**. + +**Certificate issued by cert-manager (for public DNS or a trusted private CA)** + +Cert-manager can automatically issue and renew a TLS certificate for the Prometheus ingress via the ingress-shim annotation defined in `values.yaml`: ```yaml -... annotations: cert-manager.io/cluster-issuer: letsencrypt-http01 ``` -We are using cluster-issuer to acquire the certificate required for this Ingress. It does not matter which namespace your Ingress resides, as ClusterIssuers are non-namespaced resources. -**Get the issuer from k8s env** +- `hosts`: set to `prometheus.` +- `tls.hosts`: set to the same hostname +- `tls.secretName`: choose a name for the TLS secret, for example `prometheus-tls-cert`. Cert-manager will create a `Certificate` resource with this name. + +ClusterIssuers are non-namespaced, so the namespace the ingress resides in does not matter. 
+ +**Verify the cluster issuer** ```bash d kubectl get clusterissuer ``` -Make sure the `clusterIssuer` present in the k8s environment and if it does not match what we have in the `values.yaml`, replace it with the right one. -If the clusterIssuer does not exist and you only have namespaced scoped `issuer` then convert the `issuer` to `clusterIssuer` by updating the `issuer` `kind` in the `values/nginx-ingress-services/values.yaml` +If the issuer name in the cluster does not match the one in `values.yaml`, update `values.yaml` to use the correct name. + +If no `ClusterIssuer` exists and you only have a namespaced `Issuer`, promote it by updating `values/nginx-ingress-services/values.yaml`: ```yaml tls: @@ -187,13 +378,14 @@ tls: issuer: kind: ClusterIssuer ``` -Save the file and upgrade the nginx-ingress-service helm chart with: + +Then upgrade the nginx-ingress-services chart: ```bash d helm upgrade --install nginx-ingress-services ./charts/nginx-ingress-services --values ./values/nginx-ingress-services/values.yaml --values ./values/nginx-ingress-services/secrets.yaml ``` -Now the check the issuer again to make sure there is a clusterIssuer in the environment. Also check existing certificates are no have `clusterIssuer` as `issueRef`. +Verify the issuer again and confirm that existing certificates reference the `ClusterIssuer` as their `issuerRef`. #### Install the helm chart @@ -203,85 +395,58 @@ Before proceeding to this step, make sure the values.yaml file has been updated d helm upgrade --install prometheus \ ./charts/kube-prometheus-stack/ \ -f charts/kube-prometheus-stack/values.yaml \ - -f values/kube-prometheus-stack/secrets.yaml \ + -f values/kube-prometheus-stack/prod-secrets.example.yaml \ --namespace monitoring \ --create-namespace ``` - This command installs (or upgrades) the kube-prometheus-stack Helm chart with the release name `prometheus` in the `monitoring` namespace, using custom values.yaml. 
- Overrides the values of the upstream chart `kube-prometheus-stack` with custom values defined in the `charts/kube-prometheus-stack/values.yaml` -- Sets the auth secret for basic auth defined in the `values/kube-prometheus-stack/secrets.yaml`. - The `--create-namespace` flag will create the namespace if it does not exist. -After a successful deployment of the Chart, the output will show all the configured resources including basic auth info. -You should be able to browse the prometheus endpoint with `https://prometheus.`. Check the targets health once prometheus is ready: `https://prometheus./targets`. +After a successful deployment of the chart, the output will show all configured resources and some useful commands that can be issued inside `d`. +You should be able to reach the Prometheus endpoint locally at `http://:30090`. -Check the output with helm status command `$ helm status prometheus -n monitoring` +## Configure Wire services Helm charts to enable metrics -**Test the issuer after applying the chart** +### Scrape Metrics from ingress-nginx-controller -```bash -d kubectl get certificate prometheus-tls-cert -n -o yaml -``` +To scrape ingress-nginx-controller metrics, `metrics.enabled` and `metrics.serviceMonitor.enabled` must be enabled in `values/ingress-nginx-controller/values.yaml`. -The spec of the certificate will look like the following: +Run the following command to configure ingress-nginx metrics scraping. It works both when the block already exists and when it is missing. -```yaml -... -spec: - dnsNames: - - prometheus. - issuerRef: - group: cert-manager.io - kind: ClusterIssuer - name: letsencrypt-http01 - secretName: prometheus-tls-cert - .... +```bash +d yq eval -i '."ingress-nginx".controller.metrics.enabled = true | ."ingress-nginx".controller.metrics.serviceMonitor.enabled = true' values/ingress-nginx-controller/values.yaml ``` -The certificate should also be in the `Ready` state. 
-#### Scrape the metric from ingress-nginx - -To scrape ingress-nginx metrics, `serviceMonitor` needs to be enabled in the `values/ingress-nginx-controller/values.yaml` file. If the `metrics.serviceMonitor` enablement block is not present in the file, it needs to be manually added in the file. - -First take a look if the values have the `metrics.serviceMonitor` enablement block. If the block is present then ingress-nginx is ready to get scraped. +Verify that the values were set: ```bash -cat values/ingress-nginx-controller/values.yaml +d yq eval '{"metricsEnabled": ."ingress-nginx".controller.metrics.enabled, "serviceMonitorEnabled": ."ingress-nginx".controller.metrics.serviceMonitor.enabled}' values/ingress-nginx-controller/values.yaml ``` -If the metrics block is not in the values file then add the following block to the end of the file within `ingress-nginx.controller:` field +The output should look like this: ```yaml -ingress-nginx: - controller: - ..... - # Enable prometheus operator to scrape metrics from the ingress-nginx controller with servicemonitor. - metrics: - enabled: true - serviceMonitor: - enabled: true +metricsEnabled: true +serviceMonitorEnabled: true ``` -Save the file and upgrade the ingress-nginx helm chart. - -Before and after running the helm upgrade, find out on which node the ingress-nginx-controller pod is running. -```bash -d kubectl get pods -l app.kubernetes.io/name=ingress-nginx -o=custom-columns=NAME:.metadata.name,NODE:.spec.nodeName,IP:.status.hostIP -``` +Then upgrade the ingress-nginx helm chart. ```bash d helm upgrade --install ingress-nginx-controller ./charts/ingress-nginx-controller --values ./values/ingress-nginx-controller/values.yaml ``` -Note: After the helm upgrade it might happen that the ingress is scheduled to a different node which may cause the drop of the outbound traffic and you will get a 503 error. To resolve that please follow the [Incoming SSL Traffic section](./docs_ubuntu_22.04.md#incoming-ssl-traffic). 
-#### Scrape the metrics from the wire services +### Scrape the metrics from the wire services + +After the kube-prometheus-stack Helm installation, Kubernetes metrics will be scraped by the Prometheus Operator, but Wire service metrics will not. To scrape Wire service metrics with Prometheus, `ServiceMonitor` resources must be enabled for the Wire services. -After the kube-prometheus-stack helm install, the k8s metrics will be scraped by the prometheus operator but not the wire service metrics. To scrape wire service metrics with prometheus, `ServiceMonitor` CRD needs to be enabled for wire services. +If the Wire server was configured with a bundle that contains the kube-prometheus-stack Helm chart in the `charts` directory, enable `ServiceMonitor` for the Wire services in `values/wire-server/values.yaml`. -If the wire server was configured with the bundle which has kube-prometheus-stack helm chart in the `charts` directory, then enable `ServiceMonitor` for all the wire services in the `values/wire-server/values.yaml` file. +Run the following command to configure `metrics.serviceMonitor.enabled: true` for all required services. It works both when the `serviceMonitor` block already exists and when it is missing. -If the `values/wire-server/values.yaml` contains metrics value like: +A service entry in `values/wire-server/values.yaml` may already contain values like: ```yaml brig: # as like brig all the services will have the serviceMonitor value in the file. 
@@ -291,26 +456,39 @@ brig: # as like brig all the services will have the serviceMonitor value in the enabled: false ``` -You can run the following command to enable serviceMonitor for all the services +```bash +d yq eval -i ' + .brig.metrics.serviceMonitor.enabled = true | + .proxy.metrics.serviceMonitor.enabled = true | + .cannon.metrics.serviceMonitor.enabled = true | + .cargohold.metrics.serviceMonitor.enabled = true | + .galley.metrics.serviceMonitor.enabled = true | + .gundeck.metrics.serviceMonitor.enabled = true | + .nginz.metrics.serviceMonitor.enabled = true | + .spar.metrics.serviceMonitor.enabled = true +' values/wire-server/values.yaml +``` + +Verify that the values were set: ```bash -sed -i '/serviceMonitor:/ {n; s/enabled: .*/enabled: true/;}' values/wire-server/values.yaml +d yq eval '{"brig": .brig.metrics.serviceMonitor.enabled, "proxy": .proxy.metrics.serviceMonitor.enabled, "cannon": .cannon.metrics.serviceMonitor.enabled, "cargohold": .cargohold.metrics.serviceMonitor.enabled, "galley": .galley.metrics.serviceMonitor.enabled, "gundeck": .gundeck.metrics.serviceMonitor.enabled, "nginz": .nginz.metrics.serviceMonitor.enabled, "spar": .spar.metrics.serviceMonitor.enabled}' values/wire-server/values.yaml ``` -Incase the `values/wire-server/values.yaml` file does not contain the `serviceMonitor` enablement block then it needs to be manually added. As shown above, add the `serviceMonitor` enablement block with `metrics.serviceMonitor.enabled: true` setting for each wire services: `brig, proxy, cannon, cargohold, galley, gundeck, nginz, spar, legalhold, federator, background-worker`. 
As an example it will look like: +The output should look like this: ```yaml -background-worker: - config: - cassandra: - host: cassandra-external - # Enable for federation - enableFederation: false - metrics: - serviceMonitor: - enabled: true +brig: true +proxy: true +cannon: true +cargohold: true +galley: true +gundeck: true +nginz: true +spar: true ``` -Add the metrics block to all the above-mentioned services. + +If your deployment requirements also include `federator` or `background-worker`, enable their `metrics.serviceMonitor.enabled` values separately and include them in your verification output only when those components are part of the deployment. When `serviceMonitor` enablement block is enabled, please upgrade the wire-server helm chart like: @@ -319,27 +497,28 @@ d helm upgrade --install wire-server ./charts/wire-server --timeout=15m0s --valu ``` After a successful run, it will create `ServiceMonitor` CRD for each wire service which will get scraped by the prometheus operator. -Now the prometheus targets `https://prometheus./targets` will find the ServiceMonitors of wire services for scraping. Also check any particular metric with labels in the within prometheus query window by providing a metric name, such as: `http_request_duration_seconds_bucket` and run execute. -### Troubleshoot - -If the prometheus datasource/query endpoint does not return 200 rather a 503 which means there is something wrong with the configurations. Check the prometheus pod status first. 
+Verify that the ServiceMonitors were created in the `default` namespace: ```bash -d kubectl get pods -n monitoring -owide +d kubectl get servicemonitors -n default ``` -if the pod `prometheus-prometheus-kube-prometheus-prometheus-*` is not in the `Running` state and still in the initializing phase then take a look at the k8s events + +If you want to verify only the core Wire service monitors, you can filter them: ```bash -d kubectl describe pod prometheus-prometheus-kube-prometheus-prometheus-o -n monitoring -oyaml +d kubectl get servicemonitors -n default | grep -E "brig|proxy|cannon|cargohold|galley|gundeck|nginz|spar" ``` -The k8s events will provide enough hints to figure out whats the real issue, if it could not find/attach the storageclass and the volume, just got created via the helm chart. In that case check if the PVC is bound to the right storageclass +If you also enabled optional services such as `background-worker` or `federator`, extend the filter accordingly. + +Query the Prometheus HTTP API from a machine that can reach `http://<NODE_IP>:30090` (replace `<NODE_IP>` with the IP of a Kubernetes node): ```bash -d kubectl get pvc -n monitoring +curl -s "http://<NODE_IP>:30090/api/v1/targets?state=active" | jq '.data.activeTargets[] | select(.labels.namespace == "default") | {job: .labels.job, service: .labels.service, pod: .labels.pod, health: .health}' ``` -If the status is not `Bound` then it might require to remove the stale PV and create a new one by rerunning the helm. + +This returns the active targets being scraped from the `default` namespace, including the targets discovered through ServiceMonitors. If the Wire service targets appear here with `health` set to `up`, Prometheus is scraping them. ### Metrics Collection via Prometheus Operator @@ -359,13 +538,13 @@ These metrics are discovered and scraped based on label selectors defined in the **COTURN Metrics** -Coturn metrics are scraped by prometheus operator with a `scrapeConfig` job defined in the charts values.yaml file. 
So, when the chart is installed it will automatically configure the `coturn-with-fallback` job. It's defined this way to add `fallback_scrape_protocol: "PrometheusText0.0.4"` content-type for prometheus operator to scrape metrics. By default the content-type is blank and prometheus rejects to scrape. +Coturn metrics are scraped by the Prometheus Operator with a `scrapeConfig` job defined in the chart values file. When the chart is installed, it automatically configures the `coturn-with-fallback` job. It is defined this way to add `fallback_scrape_protocol: "PrometheusText0.0.4"` so the Prometheus Operator can scrape the metrics. By default, the content type is blank and Prometheus rejects the scrape. -**SFTD metrics* +**SFTD metrics** To enable SFTD metrics, you need to enable the SFTD `serviceMonitor` in the `charts/sftd/values.yaml` file. -Open the values.yaml in the edit mode and update to `metrics.serviceMonitor.enabled` field to `true`. +Open `values.yaml` and update `metrics.serviceMonitor.enabled` to `true`. ```bash nano charts/sftd/values.yaml @@ -381,24 +560,75 @@ Then run the `sftd` helm upgrade command d helm upgrade --install sftd ./charts/sftd --set 'nodeSelector.wire\.com/role=sftd' --values values/sftd/values.yaml ``` -### Setup prometheus as datasource for grafana +### Set Up Prometheus as a Datasource for Grafana + +1. Open Grafana in a browser. +2. Go to **Connections** -> **Data sources**. +3. Click **Add data source** and select **Prometheus**. +4. In **Connection**, set the Prometheus URL based on how Prometheus is exposed: + - NodePort (default setup, no ingress): `http://:30090` + - Ingress with TLS enabled: `https://prometheus.` + - Ingress without TLS: `http://prometheus.` -Now open the grafana with the browser and click the Data sources tab. -- Choose Prometheus as data source and put the prometheus ingress endpoint as connection parameter. 
-- Select Basic Authentication in the Authentication part and provide the prometheus credentials -- Skip TLS Client Authentication or choose it if you have all the certificate info at hand. +5. Configure authentication based on your Prometheus ingress setup: + - If ingress `basic-auth` is enabled, turn on **Basic auth** in Grafana datasource settings and set: + - **User**: `prometheus.auth.username` + - **Password**: `prometheus.auth.password` + (values from `values/kube-prometheus-stack/prod-secrets.example.yaml`) + - If basic auth is not enabled on Prometheus, keep **Basic auth** disabled. -Test your datasource by clicking the Metrics in the Drilldown section. By choosing the configured datasource you should be able to see the metrics. +6. Configure TLS options when using `https://`: + - If the certificate is publicly trusted, keep the default TLS settings. + - If the certificate is self-signed or signed by a private CA, under **TLS settings** either: + - enable **Skip TLS certificate validation**, or + - provide the CA certificate in **CA cert**. +7. Click **Save & test**. +8. Confirm the success message in Grafana (for example: data source is working / HTTP 200). +9. Optional validation: click **Explore**, select the Prometheus datasource, and run `up` to verify live metrics are returned. -### Importing dashboards into Grafana +### Verify Metrics in Grafana Explore -In the artifacts dashboards directory, there is a script `dashboards/grafana_sync.sh` which will take care of the uploading all the dashboards from `dashboards/api_upload` directory. This directory contains the JSON formatted dashboards which are tailored to upload via API. Dashboards JSON's comes with two different flavour, one for manual upload and one for api upload. The following sections describe both options: +After the Prometheus data source is configured, verify the scrape status in Grafana: + +1. Open Grafana and go to **Explore**. +2. Select the Prometheus datasource. +3. 
Run a simple query such as `up` to confirm that Prometheus is returning time series. +4. Run `prometheus_target_scrape_pool_targets` to see the number of targets in each scrape pool. +5. Run `sum(prometheus_target_scrape_pool_targets)` to plot the total number of endpoints currently configured for scraping. + +If `prometheus_target_scrape_pool_targets` does not return data, check Prometheus itself at `http://<NODE_IP>:30090/targets` and confirm the Prometheus server is healthy and scraping its own internal metrics. + +### Troubleshoot + +If the Prometheus data source or query endpoint returns `503` instead of `200`, there is likely a configuration issue. Check the Prometheus pod status first. + +```bash +d kubectl get pods -n monitoring -owide +``` + +If the pod `prometheus-prometheus-kube-prometheus-prometheus-*` is not in the `Running` state and is still initializing, inspect the Kubernetes events. + +```bash +d kubectl describe pod prometheus-prometheus-kube-prometheus-prometheus-0 -n monitoring +``` + +The Kubernetes events usually provide enough detail to identify the issue. If Prometheus cannot find or attach the storage class or volume that was created by the Helm chart, check whether the PVC is bound to the correct storage class. + +```bash +d kubectl get pvc -n monitoring +``` + +If the status is not `Bound`, you may need to remove the stale PV and create a new one by rerunning the Helm chart. + +### Import Dashboards into Grafana + +In the artifacts dashboards directory, there is a script `dashboards/grafana_sync.sh` that uploads all dashboards from the `dashboards/api_upload` directory. This directory contains JSON dashboards tailored for API upload. The dashboards come in two variants, one for manual upload and one for API upload. The following sections describe both options. #### Upload via API -Before proceeding to run the script, it requires an API token and Grafana url where the dashboards will be uploaded. 
+Before running the script, make sure you have an API token and the Grafana URL where the dashboards will be uploaded. **How to get the API token** @@ -408,7 +638,7 @@ On the left side panel of Grafana, find the `Administration` link, then extend t - Add a new service account (provide a display name and Role as either `Editor` or `Admin`) - Proceed to create the account and then create the token (do not forget to copy the token to a safe place) -Replace the `` and `` with the granafa instance URL where the dashboards will be uploaded and the token you just created. Make sure you can ping the grafana url from the machine where with script will run. +Replace `` and `` with the Grafana instance URL and the token you just created. Make sure you can reach the Grafana URL from the machine where the script will run. ```bash cat dashboards/grafana_sync.sh @@ -417,9 +647,11 @@ Then run the script ```bash chmod +x dashboards/grafana_sync.sh -./dashboards/grafana_sync.sh +d ./dashboards/grafana_sync.sh ``` +Note: GRAFANA_URL must be reachable from the `adminhost-wire-server-deploy` container i.e. `d`. + #### Manual Upload `dashboards/manual_upload` directory consists the dashboard JSON's which can be uploaded manually. To upload manually, @@ -430,4 +662,4 @@ chmod +x dashboards/grafana_sync.sh - Click "Import" -All the dashboards should be uploaded. If the dashboard does not show any graph, refresh the dashboard or open the individual dashboard panel in the `edit` mode and refresh the `Query inspector`. \ No newline at end of file +All the dashboards should be uploaded. If the dashboard does not show any graph, refresh the dashboard or open the individual dashboard panel in the `edit` mode and refresh the `Query inspector`. 
diff --git a/offline/multi-ingress.md b/offline/multi-ingress.md new file mode 100644 index 000000000..92fb92c0d --- /dev/null +++ b/offline/multi-ingress.md @@ -0,0 +1,215 @@ +# How to set up Multi-ingress for Wire backend <5.14 + +Instructions for `Wire Backend >= 5.14` can be found at https://docs.wire.com/latest/how-to/install/multi-ingress.html. + +## Take backups before modifying the current Helm values +```bash +d bash +cp values/wire-server/values.yaml values/wire-server/values.yaml-pre-multi-ingress +cp values/webapp/values.yaml values/webapp/values.yaml-pre-multi-ingress +``` + +## Instructions for required changes in wire-server values + +Wire-server backend values can be found at `values/wire-server/values.yaml`. Apart from the values already configured for the `green.example.org` domain, find each component in the file and update only the fields mentioned below: + +### Galley + +```yaml +galley: + config: + settings: + conversationCodeURI: https://account.green.example.org/conversation-join/ + multiIngress: + red.example.com: https://account.red.example.com/conversation-join/ +``` + +### Cargohold + +Comment out `s3DownloadEndpoint`, and place all endpoints under `multiIngress`: + +```yaml +cargohold: + config: + aws: + #s3DownloadEndpoint: https://assets.green.example.org + multiIngress: + nginz-https.green.example.org: https://assets.green.example.org + nginz-https.red.example.com: https://assets.red.example.com +``` + +### Cannon + +```yaml +cannon: + nginx_conf: + additional_external_env_domains: + - red.example.com +``` + +### Nginz + +```yaml +nginz: + nginx_conf: + env: prod + external_env_domain: green.example.org + deeplink: + endpoints: + backendURL: "https://nginz-https.green.example.org" + backendWSURL: "https://nginz-ssl.green.example.org" + teamsURL: "https://teams.green.example.org" + accountsURL: "https://account.green.example.org" + blackListURL: "https://clientblacklist.green.example.org/prod" + websiteURL: 
"https://wire.com" + title: "My Custom Wire Backend" + additional_external_env_domains: + - red.example.com +``` + +### Deploy wire-server chart + +After making the above changes in `values/wire-server/values.yaml`, the wire-server helm chart should be **redeployed** as: + +```bash +helm upgrade --install wire-server ./charts/wire-server --timeout=15m0s --values ./values/wire-server/values.yaml --values ./values/wire-server/secrets.yaml +``` + +## Instructions for required changes in webapp values + +Webapp values can be found at `values/webapp/values.yaml`, Override the whole file with the following: + +```yaml +replicaCount: 3 +# image: +# tag: some-tag (only override if you want a newer/different version than what is in the chart) +config: + externalUrls: + backendRest: "nginz-https.[[hostname]]" + backendWebsocket: "nginz-ssl.[[hostname]]" + backendDomain: "[[hostname]]" + backendTeamSettings: "teams.[[hostname]]" + appHost: "webapp.[[hostname]]" +# See full list of available environment variables: https://github.com/wireapp/wire-web-config-default/blob/master/wire-webapp/.env.defaults +envVars: + APP_NAME: "Webapp" + ENFORCE_HTTPS: "true" + FEATURE_CHECK_CONSENT: "false" + ENABLE_DYNAMIC_HOSTNAME: "true" + # Note: disabling showing the user creation is not the same thing as user creation being disabled. + # To disable user/team creation completely from backend, update the brig configuration in wire-server + FEATURE_ENABLE_ACCOUNT_REGISTRATION: "true" + FEATURE_ENABLE_DEBUG: "false" + FEATURE_ENABLE_PHONE_LOGIN: "false" + FEATURE_ENABLE_SSO: "false" + FEATURE_SHOW_LOADING_INFORMATION: "false" + URL_ACCOUNT_BASE: "https://account.[[hostname]]" + #URL_MOBILE_BASE: "https://wire-pwa-staging.zinfra.io" # TODO: is this needed? 
+ URL_PRIVACY_POLICY: "https://www.[[hostname]]/terms-conditions" + URL_SUPPORT_BASE: "https://www.[[hostname]]/support" + URL_TEAMS_BASE: "https://teams.[[hostname]]" + URL_TEAMS_CREATE: "https://teams.[[hostname]]" + URL_TERMS_OF_USE_PERSONAL: "https://www.[[hostname]]/terms-conditions" + URL_TERMS_OF_USE_TEAMS: "https://www.[[hostname]]/terms-conditions" + URL_WEBSITE_BASE: "https://www.[[hostname]]" + CSP_EXTRA_CONNECT_SRC: "https://*.[[hostname]], wss://*.[[hostname]], https://sft.calling-prod-v01.wire.com" + CSP_EXTRA_IMG_SRC: "https://*.[[hostname]]" + CSP_EXTRA_SCRIPT_SRC: "https://*.[[hostname]]" + CSP_EXTRA_DEFAULT_SRC: "https://*.[[hostname]]" + CSP_EXTRA_FONT_SRC: "https://*.[[hostname]]" + CSP_EXTRA_FRAME_SRC: "https://*.[[hostname]]" + CSP_EXTRA_MANIFEST_SRC: "https://*.[[hostname]]" + CSP_EXTRA_OBJECT_SRC: "https://*.[[hostname]]" + CSP_EXTRA_MEDIA_SRC: "https://*.[[hostname]]" + CSP_EXTRA_PREFETCH_SRC: "https://*.[[hostname]]" + CSP_EXTRA_STYLE_SRC: "https://*.[[hostname]]" + CSP_EXTRA_WORKER_SRC: "https://*.[[hostname]]" +``` + +### Deploy webapp helm chart + +**Re-deploy** the webapp helm chart as follows: + +```bash +helm upgrade --install webapp ./charts/webapp --timeout=15m0s --values ./values/webapp/values.yaml +``` + +## Instructions for required changes in nginx-ingress-services values + +The `nginx-ingress-services` chart should be deployed **multiple times**, once for each multi-ingress domain. 
+ +For each additional domain (e.g., `red.example.com`), you must deploy the `nginx-ingress-services` chart with: + +- **Unique release names** (e.g., `nginx-ingress-services-example-com`) +- **Domain-specific values files** with distinct configurations +- **Separate TLS certificates** (e.g., `values/nginx-ingress-services/example-com-key.pem`, `values/nginx-ingress-services/example-com-cert.pem`) + +### Prepare values for example-com domain + +Prepare a unique helm values file for the `red.example.com` domain as `values/nginx-ingress-services/example-com-values.yaml`: + +```yaml +ingressName: example-com +nameOverride: nginx-multi-ingress-example-com +teamSettings: + enabled: true +accountPages: + enabled: true +tls: + enabled: true + # NOTE: enable to automate certificate issuing with jetstack/cert-manager instead of + # providing your own certs in secrets.yaml. Cert-manager is not installed automatically, + # it needs to be installed beforehand (see ./../../charts/certificate-manager/README.md) + useCertManager: false + issuer: + kind: ClusterIssuer + +config: + dns: + https: nginz-https.red.example.com + base: red.example.com + ssl: nginz-ssl.red.example.com + webapp: webapp.red.example.com + fakeS3: assets.red.example.com + teamSettings: teams.red.example.com + accountPages: account.red.example.com + # uncomment below to activate cert acquisition for federator ingress + # federator: federator.red.example.com + renderCSPInIngress: true + isAdditionalIngress: true + +# Redirection configuration for fake-aws-s3 +service: + useFakeS3: true + s3: + externalPort: 9000 + serviceName: minio-external +``` + +### Deploy example-com domain chart + +Deploy this chart as follows: + +```bash +helm upgrade --install nginx-ingress-services-example-com charts/nginx-ingress-services -f values/nginx-ingress-services/example-com-values.yaml --set-file secrets.tlsWildcardCert=values/nginx-ingress-services/example-com-cert.pem --set-file 
secrets.tlsWildcardKey=values/nginx-ingress-services/example-com-key.pem +``` + +### Patch the CSP (Content security policy) for each multi-ingress domain + +The below patch is only required when the Webapp is used for calling via multi-ingress. + +```bash +d bash +kubectl get ingress nginx-ingress-example-com -o yaml > nginx-ingress-example-com.yaml +MULTI_DOMAIN="red.example.com" +SFT_DOMAIN="sft.calling-prod-v01.wire.com" +sed -i "s|} https://\\*\\.${MULTI_DOMAIN};|} https://*.${MULTI_DOMAIN} https://${SFT_DOMAIN};|" nginx-ingress-example-com.yaml +# debug command to verify +kubectl diff -f nginx-ingress-example-com.yaml +kubectl apply -f nginx-ingress-example-com.yaml +``` + +### How do you verify whether the `red.example.com` domain is working? +- Open the webapp at `https://webapp.red.example.com` +- Log in with any user and try messaging, file uploads, downloads, calling etc +- A Deeplink can be used if the domain-specific deeplinks (applicable for version 5.5) are managed. diff --git a/offline/wiab-staging.md b/offline/wiab-staging.md index 8c3d68486..a342f75fa 100644 --- a/offline/wiab-staging.md +++ b/offline/wiab-staging.md @@ -52,7 +52,6 @@ Our deployment will be into 7 VMs with [Ubuntu 22](https://releases.ubuntu.com/j - **kubenodes (kubenode1, kubenode2, kubenode3):** Run the Kubernetes cluster and host Wire backend services - **datanodes (datanode1, datanode2, datanode3):** Run distributed data services: - Cassandra - - PostgreSQL - Elasticsearch - Minio - RabbitMQ @@ -117,7 +116,7 @@ cd wire-server-deploy **Step 2: Configure your Ansible inventory for your physical machine** -A sample inventory is available at [ansible/inventory/demo/wiab-staging.yml](https://github.com/wireapp/wire-server-deploy/blob/master/ansible/inventory/demo/wiab-staging.yml). +A sample inventory is available at [ansible/inventory/demo/wiab-staging.yml](../ansible/inventory/demo/wiab-staging.yml). 
Replace example.com with your physical machine (`adminhost`) address where KVM is available and adjust other variables like `ansible_user` and `ansible_ssh_private_key_file`. The SSH user for ansible `ansible_user` should have password-less `sudo` access. The adminhost should be running Ubuntu 22.04. From here on, we would refer the physical machine as `adminhost`. The `private_deployment` variable determines whether the VMs created below will have internet access. When set to `true` (default value), no internet access is available to VMs. Check [Network Traffic Configuration](#network-traffic-configuration) to understand more about it. @@ -130,13 +129,82 @@ ansible-playbook -i ansible/inventory/demo/wiab-staging.yml ansible/wiab-staging *Note: Ansible core version 2.16.3 or compatible is required for this step* -## Ensure secondary ansible inventory for VMs +## When VMs are ready Now you should have 7 VMs running on your `adminhost`. If you have used the ansible playbook, you should also have a directory `/home/ansible_user/wire-server-deploy` with all resources required for further deployment. If you didn't use the above playbook, download the `wire-server-deploy` artifact shared by Wire support and extract it with tar. +```bash +wget https://s3-eu-west-1.amazonaws.com/public.wire.com/artifacts/wire-server-deploy-static-${ARTIFACT_HASH}.tgz +tar xvzf wire-server-deploy-static-${ARTIFACT_HASH}.tgz +cd wire-server-deploy +``` + Ensure the inventory file `ansible/inventory/offline/inventory.yml` in the directory `/home/ansible_user/wire-server-deploy` contains values corresponding to your VMs. If you have already used the [Ansible playbook above](#getting-started-with-ansible-playbook) to set up VMs, this file should have been prepared for you. -The purpose of secondary ansible inventory is to interact only with the VMs. All the operations concerning the secondary inventory are meant to install datastores and k8s services. 
+The purpose of this secondary Ansible inventory is to interact only with the 7 VMs after they have been created. It is used by the offline deployment steps to install Kubernetes and the stateful services. Our Kubernetes solution uses `Calico` as the default `Container Network Interface (CNI)` plugin for cluster networking; ensure the [kernel requirements](https://docs.tigera.io/calico/latest/getting-started/kubernetes/requirements#kernel-dependencies) are met on the VMs before deploying Kubernetes. + +If the provisioning playbook did not generate the inventory for you, create it from the template [ansible/inventory/offline/staging.yml](../ansible/inventory/offline/staging.yml): + +```bash +cp ansible/inventory/offline/staging.yml ansible/inventory/offline/inventory.yml +``` + +Then edit `ansible/inventory/offline/inventory.yml` and replace all placeholder values. + +**Critical values to review in the inventory:** + +- `all.vars.ansible_user`: the SSH user present on every VM; it should be part of the `sudoers` list. +- `all.vars.ansible_ssh_private_key_file`: uncomment and set this if you authenticate with a private key, for example `ssh/id_ed25519`. +- `assethost.hosts.assethost.ansible_host`: IP address of the asset host VM. +- `kube-node.hosts.kubenode1|2|3.ansible_host`: IP addresses of the three Kubernetes VMs. +- `datanodes.hosts.datanode1|2|3.ansible_host`: IP addresses of the three data VMs. +- `cassandra.vars.cassandra_network_interface`, `elasticsearch.vars.elasticsearch_network_interface`, `minio.vars.minio_network_interface`, `rmq-cluster.vars.rabbitmq_network_interface`: the network interface name used by those services inside each data VM, for example `enp1s0`. Do not assume this value; verify it on your machines. +- `rmq-cluster.vars.rabbitmq_cluster_master`: the RabbitMQ primary node. Keep this aligned with the hostname of one of the data nodes, typically `datanode1`. + +> **Note:** If your environment uses a non-standard MTU (e.g. 
cloud providers, VPNs, or overlay networks), you must [configure the MTU](https://github.com/kubernetes-sigs/kubespray/blob/master/docs/CNI/calico.md#configuring-interface-mtu) for Calico in `k8s-cluster.vars`. Ensure all VMs have the same MTU on their primary interface: +> ```bash +> ip link show +> ``` +> Then set: +> ```yaml +> # k8s-cluster.vars. +> calico_mtu: +> calico_veth_mtu: +> ``` +> As a rule of thumb: +> - `calico_mtu = underlying network MTU - encapsulation overhead` +> - `calico_veth_mtu` ≤ `calico_mtu` + +**Hostnames matter:** + +- The inventory hostnames `assethost`, `kubenode1`, `kubenode2`, `kubenode3`, `datanode1`, `datanode2`, and `datanode3` should match the actual hostnames configured inside the VMs. +- This is especially important for RabbitMQ, because the nodes in `rmq-cluster` must match each VM's real hostname. +- `datanode1` is also referenced as the Cassandra seed and as the default RabbitMQ cluster master in the template, so change those only if your topology differs. + +**SSH authentication options:** + +- If the VMs are reachable with a private key, set `ansible_ssh_private_key_file` in the inventory and run Ansible normally. +- If you rely on an SSH agent, keep `ansible_ssh_private_key_file` commented out and ensure the agent on the `adminhost` can reach all VMs. +- If you do not use a private key entry in the inventory and password authentication is enabled on the VMs, add `--ask-pass` when running ansible-playbooks manually and `--ask-become-pass` for sudo access. +- Our installation scripts are non-interactive, so define `ansible_password` and `ansible_become_pass` in the inventory instead of relying on interactive password prompts. + +Before running the offline deployment scripts, verify that the inventory resolves to the expected machines. The commands below assume you are running them from `/home/ansible_user/wire-server-deploy` on the `adminhost`. 
+ +```bash +# confirm the inventory hostnames match the actual VM hostnames +ansible all -i ansible/inventory/offline/inventory.yml -m shell -a 'hostname' + +# verify the default IPv4 interface and address reported by Ansible +ansible all -i ansible/inventory/offline/inventory.yml -m setup -a 'filter=ansible_default_ipv4' + +# verify time and timezone consistency across the machines +ansible all -i ansible/inventory/offline/inventory.yml -m shell -a 'date' + +# verify if the MTU is consistent across all the VMs +d ansible all -i ansible/inventory/offline/inventory.yml -m shell -a "ip link show | grep mtu" +``` + +If any hostname, IP address, SSH setting, or interface name is wrong at this stage, correct `ansible/inventory/offline/inventory.yml` before continuing. The next deployment steps assume this inventory is accurate. ## Next steps @@ -153,6 +221,7 @@ Once the inventory is ready, please continue with the following steps: ``` - You can always use this alias `d` later to interact with the ansible playbooks, k8s cluster and the helm charts. - The docker container mounts everything here from the `wire-server-deploy` directory, hence this acts an entry point for all the future interactions with ansible, k8s and helm charts. + - Please ensure that this environment doesn't contain `quay.io/wire/wire-server-deploy` docker image from previous installations, if it does then such images need to be removed. - **[Generating secrets](docs_ubuntu_22.04.md#generating-secrets)** - Run `bin/offline-secrets.sh` to generate fresh secrets for Minio and coturn services. It uses the docker container images shipped inside the `wire-server-deploy` directory. @@ -170,34 +239,50 @@ Once the inventory is ready, please continue with the following steps: ```bash d ./bin/offline-cluster.sh ``` - - Run the above command to deploy Kubernetes and stateful services (Cassandra, PostgreSQL, Elasticsearch, Minio, RabbitMQ). This script deploys all infrastructure needed for Wire backend operations. 
+ - Run the above command to deploy Kubernetes and stateful services (Cassandra, Elasticsearch, Minio, RabbitMQ). This script deploys all infrastructure needed for Wire backend operations. + +To confirm that the Kubernetes cluster has been set up correctly, list the pods in the `kube-system` namespace. All pods should be in `Running` or `Completed` state; any `CrashLoopBackOff`, `Error`, or `Pending` state indicates a problem: +```bash +d kubectl -n kube-system get pods +``` ### Helm Operations to install wire services and supporting helm charts **Helm chart deployment (automated):** The script `bin/helm-operations.sh` will deploy the charts for you. It prepares `values.yaml`/`secrets.yaml`, customizes them for your domain/IPs, then runs Helm installs/upgrades in the correct order. Prepare the values before running it. **User-provided inputs (set these before running):** -- `TARGET_SYSTEM`: your domain (e.g., `wire.example.com` or `example.dev`). -- `CERT_MASTER_EMAIL`: email used by cert-manager for ACME registration. -- `HOST_IP`: public IP that matches your DNS A record (auto-detected if empty). +- `TARGET_SYSTEM`: your domain (e.g., `wire.example.com` or `example.dev`) under which you have created the subdomains; see [How to set up DNS records](https://docs.wire.com/latest/how-to/install/demo-wiab.html#dns-requirements). +- `CERT_MASTER_EMAIL`: email used by cert-manager for ACME registration (applies when `DEPLOY_CERT_MANAGER=TRUE`, which is the default). +- `DEPLOY_CALLING_SERVICES`: set to `TRUE` or `FALSE` to control deployment of the calling services (`sftd` and `coturn`). Default is `TRUE`. +- `HOST_IP`: the IP address on which traffic for Wire calling services is expected to arrive. This should match your public DNS A record, since Wire and the calling services are expected to be deployed behind a single firewall. The calling traffic configuration is described in [Network Traffic Configuration](#network-traffic-configuration) and [Configure the port redirection in Nftables](coturn.md#configure-the-port-redirection-in-nftables). 
`HOST_IP` is not required if `DEPLOY_CALLING_SERVICES=FALSE`. + +**Calling services behavior:** +- When `DEPLOY_CALLING_SERVICES=TRUE` and `HOST_IP` is not passed, the script tries to detect the publicly visible address for this setup by running `wget -qO- https://api.ipify.org`. +- When `DEPLOY_CALLING_SERVICES=FALSE`, the script skips deployment of `sftd` and `coturn`, and it does not evaluate any `HOST_IP`-dependent logic. **TLS / certificate behavior (cert-manager vs. Bring Your Own):** - By default, `bin/helm-operations.sh` has `DEPLOY_CERT_MANAGER=TRUE`, which installs cert-manager and configures a Let’s Encrypt (HTTP-01) issuer for the ingress charts. -- If you **do not** want Let’s Encrypt / cert-manager (for example, you are using **[Bring Your Own certificates](docs_ubuntu_22.04.md#acquiring--deploying-ssl-certificates)**), disable this step by passing the environment variable `DEPLOY_CERT_MANAGER=FALSE` when running `bin/helm-operations.sh`. - - When choosing `DEPLOY_CERT_MANAGER=FALSE`, ensure your ingress is configured with your own TLS secret(s) as described at [Acquiring / Deploying SSL Certificates](docs_ubuntu_22.04.md#acquiring--deploying-ssl-certificates). +- If you **do not** want to use Let’s Encrypt / cert-manager to issue the TLS certificates for the ingress, disable this step by passing the environment variable `DEPLOY_CERT_MANAGER=FALSE` when running `bin/helm-operations.sh`. + - When choosing `DEPLOY_CERT_MANAGER=FALSE`, ensure your ingress is configured with your own TLS secret(s) as described at [Acquiring / Deploying SSL Certificates](docs_ubuntu_22.04.md#acquiring--deploying-ssl-certificates). In this case, the `nginx-ingress-services` chart should be deployed manually. - When choosing `DEPLOY_CERT_MANAGER=TRUE`, ensure if further network configuration is required by following [cert-manager behaviour in NAT / bridge environments](#cert-manager-behaviour-in-nat--bridge-environments). 
**To run the automated helm chart deployment with your variables**: ```bash # example command - verify the variables before running it -d sh -c 'TARGET_SYSTEM="example.dev" CERT_MASTER_EMAIL="certmaster@example.dev" DEPLOY_CERT_MANAGER=TRUE ./bin/helm-operations.sh' +d sh -c 'TARGET_SYSTEM="example.dev" CERT_MASTER_EMAIL="certmaster@example.dev" DEPLOY_CERT_MANAGER=TRUE DEPLOY_CALLING_SERVICES=TRUE HOST_IP="a.a.a.a" ./bin/helm-operations.sh' +``` + +If you do not want to deploy the calling services, run: + +```bash +d sh -c 'TARGET_SYSTEM="example.dev" CERT_MASTER_EMAIL="certmaster@example.dev" DEPLOY_CERT_MANAGER=TRUE DEPLOY_CALLING_SERVICES=FALSE ./bin/helm-operations.sh' ``` **Charts deployed by the script:** -- External datastores and helpers: `cassandra-external`, `elasticsearch-external`, `minio-external`, `rabbitmq-external`, `databases-ephemeral`, `reaper`, `fake-aws`, `demo-smtp`. +- External datastores and helpers: `cassandra-external`, `elasticsearch-external`, `minio-external`, `rabbitmq-external`, `databases-ephemeral`, `reaper`, `fake-aws`, `smtp`. - Wire services: `wire-server`, `webapp`, `account-pages`, `team-settings`. - Ingress and certificates: `ingress-nginx-controller`, `cert-manager`, `nginx-ingress-services`. -- Calling services: `sftd`, `coturn`. +- Calling services: `sftd`, `coturn` when `DEPLOY_CALLING_SERVICES=TRUE`. **Values and secrets generation:** - Creates `values.yaml` and `secrets.yaml` from `prod-values.example.yaml` and `prod-secrets.example.yaml` for each chart under `values/`. 
@@ -205,6 +290,13 @@ d sh -c 'TARGET_SYSTEM="example.dev" CERT_MASTER_EMAIL="certmaster@example.dev" *Note: The `bin/helm-operations.sh` script above deploys these charts; you do not need to run the Helm commands manually unless you want to customize or debug.* +**Manually removing helm charts that are not required**: +- If some helm charts are not required in your environment (for example, `demo-smtp` for email relaying), use the following command to uninstall them: +```bash +# general form: d helm uninstall RELEASE_NAME +d helm uninstall demo-smtp +``` + ## Network Traffic Configuration ### Bring traffic from the adminhost to Wire services in the k8s cluster diff --git a/terraform/examples/wiab-staging-hetzner/main.tf b/terraform/examples/wiab-staging-hetzner/main.tf index 890ddd7fa..8703b02ee 100644 --- a/terraform/examples/wiab-staging-hetzner/main.tf +++ b/terraform/examples/wiab-staging-hetzner/main.tf @@ -9,8 +9,8 @@ locals { # Server type preferences with fallbacks (optimized for availability) preferred_server_types = { - small = ["cx33", "cpx22", "cx43"] # For assethost and adminhost - medium = ["cx43", "cx53", "cpx42"] # For datanodes and k8s_nodes + small = ["cpx22", "cpx32", "cpx42"] # For assethost and adminhost + medium = ["cpx42", "cpx52", "cpx62"] # For datanodes and k8s_nodes } }