From 774cdc19be76acddc3b6c514534964790ac7da83 Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Mon, 15 Jun 2026 10:43:00 +0530 Subject: [PATCH 01/10] DLPX-96312 Add InfluxDB/Telegraf infrastructure for Engine Performance Analytics PR URL: https://www.github.com/delphix/performance-diagnostics/pull/119 --- .../skills/perf-diagnostics-deploy/SKILL.md | 233 +++++++++++++++ debian/control | 2 +- debian/postinst | 8 + debian/rules | 7 +- influxdb/delphix-influxdb-init | 266 ++++++++++++++++++ influxdb/delphix-influxdb-service | 23 ++ influxdb/delphix-influxdb.service | 16 ++ influxdb/influxdb-init.conf | 14 + influxdb/influxdb-nginx.conf | 17 ++ influxdb/influxdb.toml | 10 + influxdb/perf_influxdb | 47 ++++ telegraf/connstat-stats.sh | 69 +++++ telegraf/delphix-telegraf-service | 24 ++ telegraf/telegraf.base | 126 +++++---- telegraf/telegraf.inputs.dct | 7 - telegraf/telegraf.inputs.playbook | 88 +----- telegraf/telegraf.inputs.storage_io | 99 +++++++ 17 files changed, 916 insertions(+), 140 deletions(-) create mode 100644 .claude/skills/perf-diagnostics-deploy/SKILL.md create mode 100644 influxdb/delphix-influxdb-init create mode 100644 influxdb/delphix-influxdb-service create mode 100644 influxdb/delphix-influxdb.service create mode 100644 influxdb/influxdb-init.conf create mode 100644 influxdb/influxdb-nginx.conf create mode 100644 influxdb/influxdb.toml create mode 100644 influxdb/perf_influxdb create mode 100755 telegraf/connstat-stats.sh create mode 100644 telegraf/telegraf.inputs.storage_io diff --git a/.claude/skills/perf-diagnostics-deploy/SKILL.md b/.claude/skills/perf-diagnostics-deploy/SKILL.md new file mode 100644 index 0000000..1a4e819 --- /dev/null +++ b/.claude/skills/perf-diagnostics-deploy/SKILL.md @@ -0,0 +1,233 @@ +--- +name: perf-diagnostics-deploy +description: Use when making changes to Telegraf or InfluxDB config files in the performance-diagnostics repo and wanting to test those changes on a live Delphix engine via SSH +--- + +# Performance Diagnostics Deploy & Verify + +## Overview + +Workflow for making config changes to the performance-diagnostics repo, deploying them to a Delphix test engine over SSH, and verifying the changes are working correctly via InfluxDB queries. + +## Workflow + +```dot +digraph deploy { + "Make changes" -> "Ask: test on engine?"; + "Ask: test on engine?" -> "Done" [label="no"]; + "Ask: test on engine?" -> "Ask: which engine hostname?" [label="yes"]; + "Ask: which engine hostname?" -> "SSH as delphix, ask for password"; + "SSH as delphix, ask for password" -> "Deploy changed files"; + "Deploy changed files" -> "Restart services"; + "Restart services" -> "Wait 5 min"; + "Wait 5 min" -> "Query InfluxDB, verify changes"; +} +``` + +## Step 1 — Make Changes + +Make the requested code/config changes. Summarise what was changed and why before asking about testing. + +## Step 2 — Ask to Test + +``` +Changes done. Do you want to test these on a Delphix engine? +``` + +If no → stop. If yes → proceed. + +## Step 3 — Get Engine + Password + +``` +Which Delphix engine hostname should I deploy to? +``` + +Then SSH using `sshpass`: +```bash +sshpass -p "$PASSWORD" ssh -o StrictHostKeyChecking=no delphix@$HOST "..." +``` + +Ask for the SSH password if not already known. Default user is always `delphix`. + +## Step 4 — Deploy Changed Files + +**File locations on engine:** + +| Repo path | Engine path | +|---|---| +| `telegraf/telegraf.base` | `/etc/telegraf/telegraf.base` | +| `telegraf/telegraf.inputs.*` | `/etc/telegraf/telegraf.inputs.*` | +| `telegraf/connstat-stats.sh` | `/etc/telegraf/connstat-stats.sh` | +| `telegraf/nfs-threads.sh` | `/etc/telegraf/nfs-threads.sh` | +| `telegraf/zcache-stats.sh` | `/etc/telegraf/zcache-stats.sh` | +| `telegraf/zpool-iostat-o.sh` | `/etc/telegraf/zpool-iostat-o.sh` | +| `telegraf/delphix-telegraf-service` | `/usr/bin/delphix-telegraf-service` | +| `telegraf/perf_playbook` | `/usr/bin/perf_playbook` | +| `telegraf/delphix-telegraf.service` | `/lib/systemd/system/delphix-telegraf.service` | +| `influxdb/delphix-influxdb-init` | `/usr/bin/delphix-influxdb-init` | +| `influxdb/delphix-influxdb-service` | `/usr/bin/delphix-influxdb-service` | +| `influxdb/perf_influxdb` | `/usr/bin/perf_influxdb` | +| `influxdb/influxdb.toml` | `/etc/influxdb/influxdb.toml` | +| `influxdb/influxdb-init.conf` | `/etc/influxdb/influxdb-init.conf` | +| `influxdb/delphix-influxdb.service` | `/lib/systemd/system/delphix-influxdb.service` | +| `influxdb/influxdb-nginx.conf` | `/opt/delphix/server/etc/nginx/conf.d/influxdb.conf` | + +Only copy files that were actually changed. Use `scp` to `/tmp/` first, then `sudo cp` to destination. Set `chmod +x` on any shell scripts. + +**InfluxDB data directory:** `/var/lib/influxdb/engine` + +## Step 5 — Restart Services + +```bash +sudo systemctl restart delphix-influxdb +sleep 5 +sudo systemctl restart delphix-telegraf + +# Confirm both are active +systemctl is-active delphix-influxdb +systemctl is-active delphix-telegraf +``` + +## Step 6 — Wait 5 Minutes + +Wait for data to flow into InfluxDB. Use `ScheduleWakeup` with `delaySeconds: 270` (within cache window). + +## Step 7 — Verify via InfluxDB Query + +Get the InfluxDB credentials from the engine: +```bash +sudo cat /etc/influxdb/influxdb_meta +``` + +Query InfluxDB using the Flux API to verify the changes for the **last 5 minutes** of data. Tailor the query to what was changed: + +| Change type | What to verify | +|---|---| +| New measurement added | `from(bucket:"default") |> range(start: -5m) |> filter(fn: (r) => r._measurement == "new_measurement") |> count()` | +| Field removed (e.g. `wwid` tag) | Check tag keys don't include the removed tag | +| Histogram processors | Verify `hist_estat_*` measurements exist with bucket fields | +| `microseconds` field | Check field exists in relevant measurements | +| `connstat` aggregation | Verify `tcp_stats` has `service` and `connections` fields | + +Query via curl: +```bash +curl -s -X POST "http://localhost:8086/api/v2/query?org=delphix" \ + -H "Authorization: Token $INFLUXDB_READ_TOKEN" \ + -H "Content-Type: application/vnd.flux" \ + -d 'from(bucket:"default") |> range(start: -5m) |> filter(fn:(r) => r._measurement == "MEASUREMENT") |> limit(n:5)' +``` + +Report results clearly: what measurements exist, what fields/tags are present, and whether the change is confirmed working. + +Then ask: + +``` +Verification done. Do you want to commit these changes? +``` + +If no → stop. If yes → proceed to Step 8. + +## Step 8 — Commit Changes + +Show the latest commit: +```bash +git log -1 --oneline +``` + +Ask: +``` +Latest commit: " " +Do you want to (1) amend that commit or (2) create a new commit? +``` + +**If amend:** +```bash +git add -A +git commit --amend --no-edit +``` + +**If new commit:** +Ask: +``` +What should the commit message be? (include the Jira ID, e.g. "DLPX-12345 Fix xyz") +``` + +Then: +```bash +git add -A +git commit -m "" +``` + +Then ask: +``` +Commit done. Do you want to push and update/raise a PR? +``` + +If no → stop. If yes → proceed to Step 9. + +## Step 9 — Push and PR + +`git review` is a Delphix tool that pushes the branch and creates/updates the PR in one command. Use it instead of `git push` + `gh pr create`. + +First check for an existing open PR on the current branch: +```bash +gh pr list --head "$(git branch --show-current)" --state open +``` + +**If PR exists → update it:** + +```bash +git review -r +``` + +Then fetch the current PR description and update it to reflect the new changes: +```bash +gh pr view --json body +gh pr edit --body "..." +``` +Keep the existing structure but add/update the relevant sections. + +**If no PR exists → raise a new one:** + +Ask: +``` +What is the Jira ticket number for this PR? (e.g. DLPX-12345) +``` + +Fetch the Jira issue using the Jira MCP tool (`mcp__jira__jira_get_issue`) to understand the problem context. Then run: + +```bash +git review +``` + +This creates the PR as a draft. Get the PR URL from the output, then set a full description: + +```bash +gh pr edit --title ": " --body "$(cat <<'EOF' +## Summary +- + +## Problem + + +## Solution + + +## Testing +- [ ] Deployed to test engine +- [ ] InfluxDB queries confirmed data flowing correctly +- [ ] + +Jira: +EOF +)" +``` + +Return the PR URL to the user. + +## Common Mistakes + +- Forgetting `chmod +x` on shell scripts → Telegraf fails with `EXEC` error +- Restarting Telegraf before InfluxDB is ready → Telegraf starts with `[[outputs.discard]]` +- Querying before 5 minutes pass → no data in range, looks broken but isn't +- Copying the wrong file path (influxdb vs telegraf directories) diff --git a/debian/control b/debian/control index 173d013..1cbf646 100644 --- a/debian/control +++ b/debian/control @@ -13,6 +13,6 @@ Standards-Version: 4.1.2 Package: performance-diagnostics Architecture: any -Depends: python3-bpfcc, python3-minimal, python3-psutil, telegraf, docker-ce +Depends: python3-bpfcc, python3-minimal, python3-psutil, telegraf, docker-ce, influxdb2, curl Description: eBPF-based Performance Diagnostic Tools A collection of eBPF-based tools for diagnosing performance issues. diff --git a/debian/postinst b/debian/postinst index ea9a0ce..44224e3 100644 --- a/debian/postinst +++ b/debian/postinst @@ -24,6 +24,14 @@ if ! groups "$USER" | grep -q "\b$GROUP\b"; then fi fi +# Remove the influxdb2 package default config — we use influxdb.toml exclusively. +rm -f /etc/influxdb/config.toml + +# Reload nginx to pick up the InfluxDB proxy location block. +if nginx -t -c /etc/nginx/nginx.conf &>/dev/null && systemctl is-active --quiet nginx; then + nginx -s reload +fi + #DEBHELPER# exit 0 \ No newline at end of file diff --git a/debian/rules b/debian/rules index d6f4f00..c84f85c 100755 --- a/debian/rules +++ b/debian/rules @@ -13,11 +13,12 @@ # need to rename a couple files, so do that here. # override_dh_auto_build: - mkdir -p build/cmd/ + mkdir -p build/cmd/ build/influxdb/ cp cmd/estat.py build/cmd/estat cp cmd/stbtrace.py build/cmd/stbtrace cp cmd/nfs_threads.py build/cmd/nfs_threads cp cmd/dsp.py build/cmd/dsp + cp influxdb/influxdb-nginx.conf build/influxdb/influxdb.conf override_dh_auto_install: dh_install build/cmd/* /usr/bin @@ -26,3 +27,7 @@ override_dh_auto_install: dh_install telegraf/delphix-telegraf-service telegraf/perf_playbook /usr/bin dh_install telegraf/delphix-telegraf.service /lib/systemd/system dh_install telegraf/telegraf* telegraf/*.sh /etc/telegraf + dh_install influxdb/delphix-influxdb-service influxdb/delphix-influxdb-init influxdb/perf_influxdb /usr/bin + dh_install influxdb/delphix-influxdb.service /lib/systemd/system + dh_install influxdb/influxdb.toml influxdb/influxdb-init.conf /etc/influxdb + dh_install build/influxdb/influxdb.conf /opt/delphix/server/etc/nginx/conf.d diff --git a/influxdb/delphix-influxdb-init b/influxdb/delphix-influxdb-init new file mode 100644 index 0000000..5b7435c --- /dev/null +++ b/influxdb/delphix-influxdb-init @@ -0,0 +1,266 @@ +#!/bin/bash -eu +# +# Copyright (c) 2026 by Delphix. All rights reserved. +# +# One-time InfluxDB initialization: creates org, bucket, admin token, +# a read-only token for DCT Smart Proxy, and writes the +# [[outputs.influxdb_v2]] stanza to /etc/telegraf/telegraf.outputs.influxdb, +# which is included by delphix-telegraf-service when INFLUXDB_ENABLED flag exists. +# Skips setup if InfluxDB is already initialized. +# + +INFLUXDB_URL="http://127.0.0.1:8086" +INFLUXDB_CONFIG_DIR="/etc/influxdb" +INFLUXDB_META_FILE="$INFLUXDB_CONFIG_DIR/influxdb_meta" +# State file written immediately after /api/v2/setup so the script can resume +# if it is interrupted before the metadata file is fully written. +INFLUXDB_SETUP_STATE_FILE="$INFLUXDB_CONFIG_DIR/influxdb_setup_state" +INFLUXDB_FLAG=/etc/telegraf/INFLUXDB_ENABLED +INFLUXDB_OUTPUT=/etc/telegraf/telegraf.outputs.influxdb +INFLUXDB_INIT_CONF="$INFLUXDB_CONFIG_DIR/influxdb-init.conf" + +# Load tunable configuration (org, bucket, retention, wait parameters). +# shellcheck source=/etc/influxdb/influxdb-init.conf +# shellcheck disable=SC1091 +source "$INFLUXDB_INIT_CONF" + +INFLUXDB_ADMIN_USER="admin" +INFLUXDB_ADMIN_PASSWORD="" + +# +# Log a message to stderr with a timestamp. +# +log() { + echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*" >&2 +} + +# +# Extract a field from a JSON string using python3. +# +json_field() { + local json="$1" + local field="$2" + echo "$json" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())$field)" || + { log "ERROR: Failed to parse field '$field' from JSON response."; return 1; } +} + +# +# POST to the InfluxDB HTTP API. Exits with an error if the request fails. +# +influx_post() { + local endpoint="$1" + local data="$2" + local auth_header="${3:-}" + + local curl_args=(-sf -X POST "$INFLUXDB_URL$endpoint" -H 'Content-Type: application/json' -d "$data") + [[ -n "$auth_header" ]] && curl_args+=(-H "Authorization: Token $auth_header") + + local response + response=$(curl "${curl_args[@]}") || + { log "ERROR: HTTP POST to '$endpoint' failed."; return 1; } + echo "$response" +} + +mkdir -p "$INFLUXDB_CONFIG_DIR" + +# Skip if already fully initialized. +if [[ -f "$INFLUXDB_META_FILE" ]]; then + log "InfluxDB already initialized, skipping." + exit 0 +fi + +# +# Wait for InfluxDB to be ready. +# +ready=false +for i in $(seq 1 "$INFLUXDB_WAIT_RETRIES"); do + if curl -sf "$INFLUXDB_URL/health" &>/dev/null; then + ready=true + break + fi + sleep "$INFLUXDB_WAIT_INTERVAL" +done + +if [[ "$ready" != "true" ]]; then + log "ERROR: InfluxDB did not become ready after $((INFLUXDB_WAIT_RETRIES * INFLUXDB_WAIT_INTERVAL))s." + exit 1 +fi + +# +# Initial setup — creates org, bucket, and returns admin token + IDs. +# /api/v2/setup is a one-shot operation; if the script is interrupted after +# this point and re-run, the state file lets us skip setup and reuse the +# already-created admin token. +# +ADMIN_TOKEN="" +ORG_ID="" +BUCKET_ID="" +SUPPORT_BUCKET_ID="" + +if [[ -f "$INFLUXDB_SETUP_STATE_FILE" ]]; then + while IFS= read -r line; do + key="${line%%=*}" + value="${line#*=}" + case "$key" in + ADMIN_TOKEN) ADMIN_TOKEN="$value" ;; + ORG_ID) ORG_ID="$value" ;; + BUCKET_ID) BUCKET_ID="$value" ;; + SUPPORT_BUCKET_ID) SUPPORT_BUCKET_ID="$value" ;; + INFLUXDB_ADMIN_PASSWORD) INFLUXDB_ADMIN_PASSWORD="$value" ;; + WRITE_TOKEN) WRITE_TOKEN="$value" ;; + READ_TOKEN) READ_TOKEN="$value" ;; + SUPPORT_WRITE_TOKEN) SUPPORT_WRITE_TOKEN="$value" ;; + esac + done <"$INFLUXDB_SETUP_STATE_FILE" +else + # Generate password only when actually running setup for the first time. + INFLUXDB_ADMIN_PASSWORD="$(openssl rand -hex 16)" + SETUP_RESPONSE=$(influx_post "/api/v2/setup" "{ + \"username\": \"$INFLUXDB_ADMIN_USER\", + \"password\": \"$INFLUXDB_ADMIN_PASSWORD\", + \"org\": \"$INFLUXDB_ORG\", + \"bucket\": \"$INFLUXDB_BUCKET\", + \"retentionPeriodSeconds\": $INFLUXDB_RETENTION_SECONDS + }") || exit 1 + + ADMIN_TOKEN=$(json_field "$SETUP_RESPONSE" "['auth']['token']") || exit 1 + ORG_ID=$(json_field "$SETUP_RESPONSE" "['org']['id']") || exit 1 + BUCKET_ID=$(json_field "$SETUP_RESPONSE" "['bucket']['id']") || exit 1 + + # Persist admin token + IDs + password immediately so a subsequent re-run + # can resume without repeating the one-shot setup call, and so the password + # stored in influxdb_meta always matches what InfluxDB was initialised with. + old_umask="$(umask)" + umask 077 + tmp_state="$(mktemp "${INFLUXDB_SETUP_STATE_FILE}.XXXXXX")" + printf 'ADMIN_TOKEN=%s\nORG_ID=%s\nBUCKET_ID=%s\nINFLUXDB_ADMIN_PASSWORD=%s\n' \ + "$ADMIN_TOKEN" "$ORG_ID" "$BUCKET_ID" "$INFLUXDB_ADMIN_PASSWORD" >"$tmp_state" + chmod 600 "$tmp_state" + mv "$tmp_state" "$INFLUXDB_SETUP_STATE_FILE" + umask "$old_umask" +fi + +# +# Create the support_metrics bucket (skipped if already persisted in state). +# +if [[ -z "$SUPPORT_BUCKET_ID" ]]; then + SUPPORT_BUCKET_RESPONSE=$(influx_post "/api/v2/buckets" "{ + \"orgID\": \"$ORG_ID\", + \"name\": \"$INFLUXDB_SUPPORT_BUCKET\", + \"retentionRules\": [{\"type\": \"expire\", \"everySeconds\": $INFLUXDB_SUPPORT_RETENTION_SECONDS}] + }" "$ADMIN_TOKEN") || exit 1 + SUPPORT_BUCKET_ID=$(json_field "$SUPPORT_BUCKET_RESPONSE" "['id']") || exit 1 + printf 'SUPPORT_BUCKET_ID=%s\n' "$SUPPORT_BUCKET_ID" >>"$INFLUXDB_SETUP_STATE_FILE" +fi + +# Token creation is guarded so that on crash-resume (setup state exists but +# meta file not yet written), we reuse already-created tokens rather than +# creating orphaned duplicates in InfluxDB on each retry. +WRITE_TOKEN="${WRITE_TOKEN:-}" +READ_TOKEN="${READ_TOKEN:-}" +SUPPORT_WRITE_TOKEN="${SUPPORT_WRITE_TOKEN:-}" + +# +# Create a write-only token for Telegraf (skipped if already persisted in state). +# +if [[ -z "$WRITE_TOKEN" ]]; then + WRITE_TOKEN_RESPONSE=$(influx_post "/api/v2/authorizations" "{ + \"orgID\": \"$ORG_ID\", + \"description\": \"telegraf-write-token\", + \"permissions\": [ + {\"action\": \"write\", \"resource\": {\"type\": \"buckets\", \"id\": \"$BUCKET_ID\", \"orgID\": \"$ORG_ID\"}} + ] + }" "$ADMIN_TOKEN") || exit 1 + WRITE_TOKEN=$(json_field "$WRITE_TOKEN_RESPONSE" "['token']") || exit 1 + printf 'WRITE_TOKEN=%s\n' "$WRITE_TOKEN" >>"$INFLUXDB_SETUP_STATE_FILE" +fi + +# +# Create a read-only token for DCT Smart Proxy (skipped if already persisted in state). +# +if [[ -z "$READ_TOKEN" ]]; then + READ_TOKEN_RESPONSE=$(influx_post "/api/v2/authorizations" "{ + \"orgID\": \"$ORG_ID\", + \"description\": \"dct-read-token\", + \"permissions\": [ + {\"action\": \"read\", \"resource\": {\"type\": \"buckets\", \"id\": \"$BUCKET_ID\", \"orgID\": \"$ORG_ID\"}} + ] + }" "$ADMIN_TOKEN") || exit 1 + READ_TOKEN=$(json_field "$READ_TOKEN_RESPONSE" "['token']") || exit 1 + printf 'READ_TOKEN=%s\n' "$READ_TOKEN" >>"$INFLUXDB_SETUP_STATE_FILE" +fi + +# +# Create a write-only token for the support_metrics bucket (skipped if already persisted). +# +if [[ -z "$SUPPORT_WRITE_TOKEN" ]]; then + SUPPORT_WRITE_TOKEN_RESPONSE=$(influx_post "/api/v2/authorizations" "{ + \"orgID\": \"$ORG_ID\", + \"description\": \"telegraf-support-write-token\", + \"permissions\": [ + {\"action\": \"write\", \"resource\": {\"type\": \"buckets\", \"id\": \"$SUPPORT_BUCKET_ID\", \"orgID\": \"$ORG_ID\"}} + ] + }" "$ADMIN_TOKEN") || exit 1 + SUPPORT_WRITE_TOKEN=$(json_field "$SUPPORT_WRITE_TOKEN_RESPONSE" "['token']") || exit 1 + printf 'SUPPORT_WRITE_TOKEN=%s\n' "$SUPPORT_WRITE_TOKEN" >>"$INFLUXDB_SETUP_STATE_FILE" +fi + +# +# Write two [[outputs.influxdb_v2]] stanzas to a dedicated telegraf output file: +# - default bucket: Grafana-facing measurements currently used in dashboards: +# cpu, disk, diskio, net, tcp_stats, zfs, +# estat_nfs, estat_iscsi, hist_estat_nfs, hist_estat_iscsi, hist_estat_backend-io +# - support_metrics bucket: everything else (not yet in any dashboard panel) +# The flag is read by delphix-telegraf-service to conditionally include this output. +# +cat >"$INFLUXDB_OUTPUT" <"$tmp_meta" </dev/null || true diff --git a/influxdb/delphix-influxdb-service b/influxdb/delphix-influxdb-service new file mode 100644 index 0000000..eac68a4 --- /dev/null +++ b/influxdb/delphix-influxdb-service @@ -0,0 +1,23 @@ +#!/bin/bash +# +# Copyright (c) 2026 by Delphix. All rights reserved. +# +# Wrapper script to start InfluxDB 2.x and run first-time initialization. +# + +INFLUXDB_CONFIG=/etc/influxdb/influxdb.toml +INFLUXDB_INIT=/usr/bin/delphix-influxdb-init + +# Start influxd in the background. +# influxd does not support a --config-path flag; config file is passed via env var. +INFLUXD_CONFIG_PATH="$INFLUXDB_CONFIG" /usr/bin/influxd & +INFLUXDB_PID=$! + +# Run initialization (the init script handles waiting for InfluxDB to be ready) +if ! $INFLUXDB_INIT; then + echo "ERROR: delphix-influxdb-init failed, stopping influxd" >&2 + kill "$INFLUXDB_PID" 2>/dev/null + exit 1 +fi + +wait "$INFLUXDB_PID" diff --git a/influxdb/delphix-influxdb.service b/influxdb/delphix-influxdb.service new file mode 100644 index 0000000..ec69c0b --- /dev/null +++ b/influxdb/delphix-influxdb.service @@ -0,0 +1,16 @@ +[Unit] +Description=Delphix InfluxDB Time Series Database +Documentation=https://docs.influxdata.com/influxdb/v2/ +PartOf=delphix.target +After=delphix-platform.service +PartOf=delphix-platform.service + +[Service] +User=root +ExecStart=/usr/bin/delphix-influxdb-service +Restart=on-failure +RestartForceExitStatus=SIGPIPE +KillMode=control-group + +[Install] +WantedBy=delphix.target diff --git a/influxdb/influxdb-init.conf b/influxdb/influxdb-init.conf new file mode 100644 index 0000000..dfab4ac --- /dev/null +++ b/influxdb/influxdb-init.conf @@ -0,0 +1,14 @@ +# +# Copyright (c) 2026 by Delphix. All rights reserved. +# +# Configuration for delphix-influxdb-init. +# Sourced by /usr/bin/delphix-influxdb-init at runtime. +# + +INFLUXDB_ORG="delphix" +INFLUXDB_BUCKET="default" +INFLUXDB_RETENTION_SECONDS=2592000 # 30 days (720h) +INFLUXDB_SUPPORT_BUCKET="support_metrics" +INFLUXDB_SUPPORT_RETENTION_SECONDS=2592000 # 30 days +INFLUXDB_WAIT_RETRIES=30 +INFLUXDB_WAIT_INTERVAL=2 diff --git a/influxdb/influxdb-nginx.conf b/influxdb/influxdb-nginx.conf new file mode 100644 index 0000000..ba2a74a --- /dev/null +++ b/influxdb/influxdb-nginx.conf @@ -0,0 +1,17 @@ +# +# Copyright (c) 2026 by Delphix. All rights reserved. +# +# Proxy InfluxDB 2.x API through nginx so external clients (DCT, Grafana) +# can reach it over HTTPS using the engine's existing TLS certificate. +# InfluxDB itself binds to 127.0.0.1:8086 (HTTP, localhost only). +# +location /influxdb/ { + proxy_pass http://127.0.0.1:8086/; + proxy_set_header Host $http_host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_http_version 1.1; + proxy_read_timeout 999d; + proxy_buffering off; +} diff --git a/influxdb/influxdb.toml b/influxdb/influxdb.toml new file mode 100644 index 0000000..49b7e3f --- /dev/null +++ b/influxdb/influxdb.toml @@ -0,0 +1,10 @@ +# +# Copyright 2026 Delphix. All rights reserved. +# +# InfluxDB 2.x Configuration +# + +bolt-path = "/var/lib/influxdb/influxd.bolt" +engine-path = "/var/lib/influxdb/engine" +http-bind-address = "127.0.0.1:8086" +log-level = "warn" diff --git a/influxdb/perf_influxdb b/influxdb/perf_influxdb new file mode 100644 index 0000000..00baa6f --- /dev/null +++ b/influxdb/perf_influxdb @@ -0,0 +1,47 @@ +#!/bin/bash +# +# Copyright (c) 2026 by Delphix. All rights reserved. +# +# Script that enables and disables InfluxDB metric output for Telegraf. +# + +INFLUXDB_FLAG=/etc/telegraf/INFLUXDB_ENABLED +INFLUXDB_OUTPUT=/etc/telegraf/telegraf.outputs.influxdb + +function die() { + echo -e "$(date +%T:%N:%z): $(basename $0): $*" >&2 + exit 1 +} + +[[ $EUID -ne 0 ]] && die "must be run as root" + +function usage() { + echo "$(basename $0): $*" >&2 + echo "Usage: $(basename $0) [enable|disable]" + exit 2 +} + +function enable_influxdb() { + date + [[ ! -f $INFLUXDB_OUTPUT ]] && die "$INFLUXDB_OUTPUT not found. Run delphix-influxdb-init first." + echo "Enabling InfluxDB Metric Output" + touch $INFLUXDB_FLAG + systemctl restart delphix-telegraf +} + +function disable_influxdb() { + date + echo "Disabling InfluxDB Metric Output" + rm -f $INFLUXDB_FLAG + systemctl restart delphix-telegraf +} + +if [[ $# -ne 1 ]]; then + usage +fi + +case "$1" in +enable) enable_influxdb ;; +disable) disable_influxdb ;; +*) usage ;; +esac diff --git a/telegraf/connstat-stats.sh b/telegraf/connstat-stats.sh new file mode 100755 index 0000000..2d6284f --- /dev/null +++ b/telegraf/connstat-stats.sh @@ -0,0 +1,69 @@ +#!/bin/sh +# +# Collect per-connection TCP stats from connstat and aggregate by remote +# endpoint (laddr:raddr:service) to bound cardinality on engines with many +# connections — e.g. Oracle dNFS (hundreds of connections per VDB host) or +# Elastic Data (many connections per object storage endpoint IP). +# Mirrors the aggregation done by LocalTCPStatsCollector in the mgmt stack. +# +# Service name lookup reads from /etc/services, matching LocalTCPStatsCollector +# exactly. lport is checked before rport so that listening services (where the +# engine is the server) are identified correctly. Falls back to "unknown". +# +# Output fields per aggregated endpoint: +# laddr, raddr, service +# inbytes, outbytes, retranssegs, suna, unsent (summed across connections) +# swnd, cwnd, rwnd, rtt (averaged across connections) +# connections (count of aggregated conns) +# +/usr/bin/connstat -PLe -i 10 -T u \ + -o laddr,lport,raddr,rport,inbytes,outbytes,retranssegs,suna,unsent,swnd,cwnd,rwnd,rtt \ + | awk -F',' ' +BEGIN { + # Load port->service mapping from /etc/services, same as LocalTCPStatsCollector. + # Pattern matches lines of the form: "servicename port/tcp" + while ((getline line < "/etc/services") > 0) { + sub(/^[[:space:]]+/, "", line) + if (line ~ /^(#|$)/) continue + n = split(line, f, /[[:space:]]+/) + if (n >= 2 && f[2] ~ /\/tcp/) { + split(f[2], pf, "/") + port = pf[1] + 0 + if (!(port in svc)) svc[port] = f[1] + } + } + close("/etc/services") + # Delphix-specific ports not present in /etc/services. + svc[8415] = "dlpx-sp" + svc[50001] = "network-throughput-test" + svc[8341] = "oracle-logsync" + svc[9100] = "dlpx-connector" +} +/^=/ { + for (key in cnt) { + n = cnt[key] + split(key, k, SUBSEP) + print k[1] "," k[2] "," k[3] "," \ + inb[key] "," outb[key] "," ret[key] "," sun[key] "," uns[key] "," \ + int(sw[key]/n) "," int(cw[key]/n) "," int(rw[key]/n) "," int(rt[key]/n) "," n + } + fflush() + delete cnt; delete inb; delete outb; delete ret + delete sun; delete uns; delete sw; delete cw; delete rw; delete rt + next +} +NF == 13 { + if (($2 + 0) in svc) { + service = svc[$2 + 0] + } else if (($4 + 0) in svc) { + service = svc[$4 + 0] + } else { + service = "unknown" + } + key = $1 SUBSEP $3 SUBSEP service + inb[key] += $5; outb[key] += $6; ret[key] += $7 + sun[key] += $8; uns[key] += $9 + sw[key] += $10; cw[key] += $11; rw[key] += $12; rt[key] += $13 + cnt[key]++ +} +' diff --git a/telegraf/delphix-telegraf-service b/telegraf/delphix-telegraf-service index 72df797..935465c 100755 --- a/telegraf/delphix-telegraf-service +++ b/telegraf/delphix-telegraf-service @@ -3,7 +3,10 @@ BASE_CONFIG=/etc/telegraf/telegraf.base DOSE_INPUTS=/etc/telegraf/telegraf.inputs.dose DCT_INPUTS=/etc/telegraf/telegraf.inputs.dct PLAYBOOK_INPUTS=/etc/telegraf/telegraf.inputs.playbook +STORAGE_IO_INPUTS=/etc/telegraf/telegraf.inputs.storage_io +INFLUXDB_OUTPUT=/etc/telegraf/telegraf.outputs.influxdb PLAYBOOK_FLAG=/etc/telegraf/PLAYBOOK_ENABLED +INFLUXDB_FLAG=/etc/telegraf/INFLUXDB_ENABLED TELEGRAF_CONFIG=/etc/telegraf/telegraf.conf @@ -21,6 +24,10 @@ function playbook_is_enabled() { [[ -f $PLAYBOOK_FLAG ]] } +function influxdb_is_enabled() { + [[ -f $INFLUXDB_FLAG ]] +} + rm -f $TELEGRAF_CONFIG if engine_is_object_based; then @@ -43,4 +50,21 @@ else fi fi +if influxdb_is_enabled && [[ -f $INFLUXDB_OUTPUT ]]; then + if [[ -f $STORAGE_IO_INPUTS ]]; then + cat $STORAGE_IO_INPUTS >> $TELEGRAF_CONFIG + fi + cat $INFLUXDB_OUTPUT >> $TELEGRAF_CONFIG +else + if influxdb_is_enabled; then + logger -t delphix-telegraf "WARNING: INFLUXDB_ENABLED is set but $INFLUXDB_OUTPUT is missing — metrics will be discarded. Run delphix-influxdb-init to restore InfluxDB output." + fi + # No InfluxDB output configured. Add discard so Telegraf can start — + # it requires at least one output plugin. + echo "[[outputs.discard]]" >> $TELEGRAF_CONFIG +fi + +# Restrict permissions so the InfluxDB write token is not world-readable. +chmod 640 $TELEGRAF_CONFIG + /usr/bin/telegraf -config $TELEGRAF_CONFIG diff --git a/telegraf/telegraf.base b/telegraf/telegraf.base index 7abd9a4..f516875 100644 --- a/telegraf/telegraf.base +++ b/telegraf/telegraf.base @@ -11,53 +11,20 @@ ############################################################################### # OUTPUT PLUGINS # ############################################################################### -# Define the main metric output file, excluding aggregated stats and -# Performance Playbook (estat) data. -[[outputs.file]] - files = ["/var/log/telegraf/metrics.json"] - rotation_max_size = "50MB" - rotation_max_archives = 9 - data_format = "json" - namedrop = ["*estat_*", "agg_*", "zfs", "zpool*", "zcache*", "docker*"] - -# Define output file for ZFS related metrics -[[outputs.file]] - files = ["/var/log/telegraf/metrics_zfs.json"] - rotation_max_size = "30MB" - rotation_max_archives = 5 - data_format = "json" - namepass = ["zpool*", "zcache*", "zfs"] - -# Define output file for Performance Playbook (estat) metrics -[[outputs.file]] - files = ["/var/log/telegraf/metrics_estat.json"] - rotation_max_size = "30MB" - rotation_max_archives = 5 - data_format = "json" - namepass = ["*estat_*"] - -# Define output file for aggregate statistics -[[outputs.file]] - files = ["/var/log/telegraf/metric_aggregates.json"] - rotation_max_size = "30MB" - rotation_max_archives = 5 - data_format = "json" - namepass = ["agg_*"] - -# Enable Live Monitoring, intended for internal Delphix use only: -#[[outputs.influxdb]] -# urls = ["http://dbsvr.company.com:8086"] -# database = "live_metrics" -# skip_database_creation = true -# data_format = "influx" +# All metrics are ingested into InfluxDB. The output stanza is written by +# delphix-influxdb-init to /etc/telegraf/telegraf.outputs.influxdb and +# appended here by delphix-telegraf-service when InfluxDB is enabled. +# Use 'perf_influxdb enable|disable' to toggle and restart Telegraf. ############################################################################### # INPUT PLUGINS # ############################################################################### -# Get CPU usage +# Get CPU usage — only cpu-total, not per-core (reduces data volume on +# many-CPU engines; agg_cpu automatically inherits this restriction). +# percpu defaults to true so must be explicitly set to false. [[inputs.cpu]] - percpu = true + percpu = false totalcpu = true collect_cpu_time = false report_active = false @@ -65,31 +32,58 @@ # Get mount point stats [[inputs.disk]] + interval = "60s" mount_points = ["/","/domain0"] - -# Get disk I/O stats + tagexclude = ["fstype", "mode"] + fieldpass = ["used", "free", "total"] + +# Get disk I/O stats for whole disks only — partitions add cardinality without +# diagnostic value and account for ~30% of diskio/agg_diskio line volume. +# Excluded: +# zd* — ZFS zvol internal block devices +# *p[0-9]* — NVMe partitions (nvme0n1p1, nvme0n1p9, etc.) +# sd*[0-9]* — SCSI/SATA partitions (sda1, sdb2, etc.) +# wwid is a redundant 100+ char tag; the short-form name tag is sufficient. [[inputs.diskio]] - -# Track stats for the current metric files -[[inputs.filestat]] - files = ["/var/log/telegraf/metrics.json", - "/var/log/telegraf/metrics_estat.json", - "/var/log/telegraf/metrics_zfs.json", - "/var/log/telegraf/metric_aggregates.json"] + interval = "60s" + tagdrop = {name = ["zd*", "*p[0-9]*", "sd*[0-9]*"]} + tagexclude = ["wwid"] + fieldpass = ["reads", "writes", "read_bytes", "write_bytes", "read_time", "write_time", "iops_in_progress"] # Get Memory stats [[inputs.mem]] + fieldpass = ["used", "available", "total", "free", "cached", "buffered", "dirty", "slab"] # Get some network interface stats [[inputs.net]] fieldpass = ["tcp*","bytes*","packets*","err*","drop*"] +# Per-endpoint TCP stats (bytes, RTT, window sizes) via connstat. +# Aggregated by (laddr, raddr, service) to mirror the aggregation in +# LocalTCPStatsCollector — avoids cardinality explosion on Oracle dNFS +# engines (hundreds of connections per VDB host) and Elastic Data engines +# (many connections per object storage endpoint IP). +# Cumulative fields (inbytes, outbytes, etc.) are summed; window/RTT fields +# are averaged; connections = number of TCP connections aggregated. +[[inputs.execd]] + command = ["/etc/telegraf/connstat-stats.sh"] + name_override = "tcp_stats" + signal = "none" + restart_delay = "30s" + data_format = "csv" + csv_delimiter = "," + csv_trim_space = true + csv_column_names = ["laddr", "raddr", "service", "inbytes", "outbytes", "retranssegs", "suna", "unsent", "swnd", "cwnd", "rwnd", "rtt", "connections"] + csv_column_types = ["string", "string", "string", "int", "int", "int", "int", "int", "int", "int", "int", "int", "int"] + csv_tag_columns = ["laddr", "raddr", "service"] + # Track CPU and Memory for the "delphix-mgmt" service (and children). [[inputs.procstat]] systemd_unit = "delphix-mgmt.service" include_systemd_children = true namedrop = ["procstat_lookup"] fieldpass = ["memory_usage", "cpu_usage", "memory_rss"] + tagexclude = ["cgroup_full"] # Track CPU and Memory for the "zfs-object-agent" service (and children). [[inputs.procstat]] @@ -97,19 +91,43 @@ include_systemd_children = true namedrop = ["procstat_lookup"] fieldpass = ["memory_usage", "cpu_usage", "memory_rss"] + tagexclude = ["cgroup_full"] # Get process counts [[inputs.processes]] -# Get swap memory usage -[[inputs.swap]] - # Get misc 'other' stats (load and uptime) [[inputs.system]] # ZFS kstats (arcstat, abdstat, zfetch, etc) +# arcstats_l2_* fields are L2ARC stats — unused on all appliances (no L2ARC). [[inputs.zfs]] interval = "1m" + fieldpass = [ + "arcstats_anon_data", "arcstats_anon_evictable_data", + "arcstats_anon_evictable_metadata", "arcstats_anon_metadata", + "arcstats_arc_need_free", "arcstats_arc_no_grow", "arcstats_arc_prune", + "arcstats_arc_sys_free", "arcstats_async_upgrade_sync", + "arcstats_c", "arcstats_data_size", + "arcstats_demand_data_hits", "arcstats_demand_data_misses", + "arcstats_demand_hit_predictive_prefetch", + "arcstats_evict_not_enough", "arcstats_evict_skip", + "arcstats_hits", "arcstats_misses", + "arcstats_memory_available_bytes", "arcstats_memory_direct_count", + "arcstats_memory_free_bytes", "arcstats_memory_indirect_count", + "arcstats_metadata_size", + "arcstats_mfu_data", "arcstats_mfu_evictable_data", + "arcstats_mfu_evictable_metadata", "arcstats_mfu_ghost_hits", + "arcstats_mfu_hits", "arcstats_mfu_metadata", + "arcstats_mru_data", "arcstats_mru_evictable_data", + "arcstats_mru_evictable_metadata", "arcstats_mru_ghost_hits", + "arcstats_mru_hits", "arcstats_mru_metadata", + "arcstats_prefetch_data_hits", "arcstats_prefetch_data_misses", + "arcstats_size", + "zil_commit_count", "zil_itx_count", "zil_commit_stall_count", + "zfetchstats_hits", "zfetchstats_misses", + "dmu_tx_dirty_throttle", "dmu_tx_delay" + ] # Detailed ZFS pool metrics from "zpool_influxdb" (noisy) #[[inputs.exec]] @@ -127,5 +145,5 @@ drop_original = false stats = ["min", "max", "mean", "stdev"] name_prefix = "agg_" - namepass = ["cpu","disk","diskio","mem","net","processes","system","swap"] + namepass = ["cpu","disk","diskio","mem","net","processes","system"] diff --git a/telegraf/telegraf.inputs.dct b/telegraf/telegraf.inputs.dct index 07ceb4d..07dc47f 100644 --- a/telegraf/telegraf.inputs.dct +++ b/telegraf/telegraf.inputs.dct @@ -11,12 +11,5 @@ ] docker_label_exclude = ["com.docker.compose.*", "resty*"] - [[outputs.file]] - files = ["/var/log/telegraf/metrics_docker.json"] - rotation_max_size = "30MB" - rotation_max_archives = 5 - data_format = "json" - namepass = ["docker*"] - ####################### End of Docker/DCT services Metrics ####################### diff --git a/telegraf/telegraf.inputs.playbook b/telegraf/telegraf.inputs.playbook index 5ed7e21..dcfa71e 100644 --- a/telegraf/telegraf.inputs.playbook +++ b/telegraf/telegraf.inputs.playbook @@ -1,31 +1,8 @@ ############################################################################## -# Performance Playbook (estat, nfs_threads) collection - -# Collect output from "estat nfs -jm 10" -[[inputs.execd]] - command = ["estat", "nfs", "-jm", "10"] - name_override = "estat_nfs" - signal = "none" - restart_delay = "30s" - data_format = "json" - tag_keys = [ - "name", - "axis" - ] - json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] - -# Collect output from "estat iscsi -jm 10" -[[inputs.execd]] - command = ["estat", "iscsi", "-jm", "10"] - name_override = "estat_iscsi" - signal = "none" - restart_delay = "30s" - data_format = "json" - tag_keys = [ - "name", - "axis" - ] - json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] +# Performance Playbook (estat, nfs_threads) collection +# Note: estat_nfs, estat_iscsi, and estat_backend-io live in +# telegraf.inputs.storage_io and are always collected when InfluxDB is +# enabled, independent of playbook state. # Collect output from "estat zpl -jm 10" [[inputs.execd]] @@ -40,19 +17,6 @@ ] json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] -# Collect output from "estat backend-io -jm 10" -[[inputs.execd]] - command = ["estat", "backend-io", "-jm", "10"] - name_override = "estat_backend-io" - signal = "none" - restart_delay = "30s" - data_format = "json" - tag_keys = [ - "name", - "axis" - ] - json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] - # Collect output from "estat zvol -jm 10" [[inputs.execd]] command = ["estat", "zvol", "-jm", "10"] @@ -123,47 +87,17 @@ ############################################################################### # PROCESSOR PLUGINS # ############################################################################### -# Convert strings from estat into integer values so they don't get dropped +# Convert strings from estat into integer values so they don't get dropped. +# Scoped to playbook-only metrics; estat_nfs/iscsi/backend-io have their own +# converter in telegraf.inputs.storage_io. [[processors.converter]] + namepass = ["estat_zpl", "estat_zio", "estat_zvol", "estat_zio-queue", "estat_metaslab-alloc"] [processors.converter.fields] integer = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)"] -# The estat output contains a nested latency histogram, so we need to -# parse that out as a new array metric rather than a non-JSON string. -# -# From this: -# "microseconds":"{20000,5},{30000,15},{40000,3},{50000,24}" -# to this: -# "microseconds":"{20000:5,30000:15,40000:3,50000:24}" -# -# Clone the original so we have a "new" metric with a "hist_" name prefix -[[processors.clone]] - order = 1 - name_prefix = "hist_" - namepass = ["estat_*"] - -# Rewrite the histograms for the "hist_estat_*" metrics as JSON objects -[[processors.regex]] - order = 2 - namepass = ["hist_estat_*"] - [[processors.regex.fields]] - key = "microseconds" - pattern = "{(\\d+),(\\d+)}" - replacement = "\"${1}\":${2}" - [[processors.regex.fields]] - key = "microseconds" - pattern = ".*" - replacement = "{$0}" - -# Now parse out the arrays for "hist_estat_*" metrics -[[processors.parser]] - order = 3 - merge = "override" - parse_fields = ["microseconds"] - drop_original = false - data_format = "json" - namepass = ["hist_estat_*"] - fieldpass = ["microseconds"] +# Note: histogram processors (clone/regex/parser) for all estat_* measurements +# live in telegraf.inputs.storage_io, which is always included when InfluxDB +# is enabled. No duplication needed here. # End of Processor section ############################################################################## diff --git a/telegraf/telegraf.inputs.storage_io b/telegraf/telegraf.inputs.storage_io new file mode 100644 index 0000000..2341147 --- /dev/null +++ b/telegraf/telegraf.inputs.storage_io @@ -0,0 +1,99 @@ +############################################################################## +# Storage I/O collection: NFS server, iSCSI target, and backend disk I/O. +# Always included when InfluxDB is enabled, independent of playbook state. +# NFS/iSCSI/backend-IO are also collected continuously by delphix-stat into +# analytics_datapoint (the source for Support Grafana), so full histogram +# capture here mirrors that existing always-on precedent. + +# Collect output from "estat nfs -jm 10" +[[inputs.execd]] + command = ["estat", "nfs", "-jm", "10"] + name_override = "estat_nfs" + signal = "none" + restart_delay = "30s" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat iscsi -jm 10" +[[inputs.execd]] + command = ["estat", "iscsi", "-jm", "10"] + name_override = "estat_iscsi" + signal = "none" + restart_delay = "30s" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat backend-io -jm 10" (stbtrace io equivalent) +[[inputs.execd]] + command = ["estat", "backend-io", "-jm", "10"] + name_override = "estat_backend-io" + signal = "none" + restart_delay = "30s" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Convert estat string fields to integers so they are not dropped by Telegraf. +[[processors.converter]] + namepass = ["estat_nfs", "estat_iscsi", "estat_backend-io"] + [processors.converter.fields] + integer = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)"] + +# Clone estat_* measurements as hist_estat_* to hold histogram data only. +# microseconds is removed from the originals (order 2 below) so it lives in +# hist_estat_* exclusively — no duplication. +# Keeps the original format "{20000,5},{30000,15}" compatible with import code. +[[processors.clone]] + order = 1 + name_prefix = "hist_" + namepass = ["estat_*"] + +# Drop microseconds from all original estat_* measurements after cloning. +# Covers both storage_io (estat_nfs/iscsi/backend-io) and playbook +# (estat_zpl/zvol/zio/etc) measurements in one place. +[[processors.strings]] + order = 2 + namepass = ["estat_*"] + fieldexclude = ["microseconds"] + +# Expand hist_estat_* microseconds histogram strings into per-bucket rows for +# Grafana heatmap support. Each "{upper_bound_us,count}" pair becomes a +# separate metric with le= tag and count field, replacing the +# opaque string. Runs after clone (order=1) and strings (order=2) so the +# microseconds field is still present in hist_estat_* at this point. +[[processors.starlark]] + order = 3 + namepass = ["hist_estat_*"] + source = ''' +def apply(metric): + ms = metric.fields.get("microseconds") + if ms == None: + return [metric] + + result = [] + for pair in ms[1:-1].split("},{"): + parts = pair.split(",") + if len(parts) == 2: + m = deepcopy(metric) + m.tags["le"] = parts[0] + for k in list(m.fields.keys()): + m.fields.pop(k) + m.fields["count"] = int(parts[1]) + result.append(m) + + return result if result else [metric] +''' + +# End of Storage I/O section +############################################################################## From c55fa18f80240e9fa67884bd06ce662541b919d5 Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Mon, 15 Jun 2026 10:44:51 +0530 Subject: [PATCH 02/10] DLPX-88427 Fix estat metaslab-alloc for ZFS 2.4.99 (2026.3+) PR URL: https://www.github.com/delphix/performance-diagnostics/pull/121 --- bpf/estat/metaslab-alloc.c | 159 ++++++++++++++++++++++++++++++------- 1 file changed, 131 insertions(+), 28 deletions(-) diff --git a/bpf/estat/metaslab-alloc.c b/bpf/estat/metaslab-alloc.c index a413d39..473adae 100644 --- a/bpf/estat/metaslab-alloc.c +++ b/bpf/estat/metaslab-alloc.c @@ -12,9 +12,9 @@ #define VD_NAME_SIZE 32 typedef struct { u64 ts; - u64 size; + u64 size; /* psize stored by metaslab_alloc_dva_entry (old path) */ u64 asize; - u64 alloc_time; + u8 dva_owned; /* 1 if created by metaslab_alloc_dva_entry (old path) */ char vd_name[VD_NAME_SIZE]; } data_t; @@ -40,6 +40,27 @@ equal_to_pool(char *str) return (true); } +/* + * In ZFS 2.4.99+ (2026.3 and later) the allocation path no longer goes + * through metaslab_alloc_dva at all — metaslab_group_alloc is called + * directly by a different caller. We therefore use metaslab_group_alloc + * as both the start-timing and pool-filter point. The pool name is + * available via mg->mg_class->mc_spa->spa_name. + * + * On older ZFS versions metaslab_alloc_dva is still the outer entry point. + * We probe it to set up the data_map entry (dva_owned=1) and store psize so + * metaslab_alloc_dva_exit can emit an "allocation failures" metric if the + * overall DVA allocation fails. metaslab_group_alloc_entry then populates + * vdev information into the same entry. + * + * Lifetime of map entries: + * Old path: created by metaslab_alloc_dva_entry, deleted by + * metaslab_alloc_dva_exit (metaslab_group_alloc_exit only + * clears per-group fields so multiple groups can be tried). + * New path: created and deleted by metaslab_group_alloc_entry/exit pair + * (no outer dva probe fires). + */ + // @@ kprobe|metaslab_alloc_dva|metaslab_alloc_dva_entry int metaslab_alloc_dva_entry(struct pt_regs *ctx, @@ -51,8 +72,9 @@ metaslab_alloc_dva_entry(struct pt_regs *ctx, if (!equal_to_pool(spa->spa_name)) return (0); - data.ts = bpf_ktime_get_ns(); - data.size = psize; + data.ts = bpf_ktime_get_ns(); + data.size = psize; + data.dva_owned = 1; data_map.update(&tid, &data); @@ -67,18 +89,54 @@ metaslab_group_alloc_entry(struct pt_regs *ctx, u32 tid = bpf_get_current_pid_tgid(); data_t *data = data_map.lookup(&tid); - if (data == NULL || data->ts == 0) - return (0); - - data->asize = asize; - data->alloc_time = bpf_ktime_get_ns(); - - if (mg->mg_vd->vdev_path != NULL) { - bpf_probe_read_str(data->vd_name, - sizeof(data->vd_name), mg->mg_vd->vdev_path); + if (data != NULL && data->dva_owned) { + /* + * Older path: metaslab_alloc_dva_entry already created the + * entry and set the start timestamp. Just fill in vdev info. + */ + data->asize = asize; + if (mg->mg_vd->vdev_path != NULL) { + bpf_probe_read_str(data->vd_name, + sizeof(data->vd_name), mg->mg_vd->vdev_path); + } else { + bpf_probe_read_str(data->vd_name, + sizeof(data->vd_name), + mg->mg_vd->vdev_ops->vdev_op_type); + } } else { - bpf_probe_read_str(data->vd_name, - sizeof(data->vd_name), mg->mg_vd->vdev_ops->vdev_op_type); + /* + * Newer path: metaslab_alloc_dva is not in the call chain. + * Create the entry here, filtering by pool via mc_spa. + * Read each pointer level explicitly: BCC's automatic + * three-level dereference (mg->mg_class->mc_spa->spa_name) + * does not produce a usable address for bpf_probe_read. + */ + metaslab_class_t *mc = NULL; + spa_t *spa = NULL; + bpf_probe_read(&mc, sizeof(mc), &mg->mg_class); + if (!mc) + return (0); + bpf_probe_read(&spa, sizeof(spa), &mc->mc_spa); + if (!spa) + return (0); + if (!equal_to_pool(spa->spa_name)) + return (0); + + data_t d = {}; + d.ts = bpf_ktime_get_ns(); + d.asize = asize; + /* dva_owned stays 0: this entry is owned by group entry/exit */ + + if (mg->mg_vd->vdev_path != NULL) { + bpf_probe_read_str(d.vd_name, + sizeof(d.vd_name), mg->mg_vd->vdev_path); + } else { + bpf_probe_read_str(d.vd_name, + sizeof(d.vd_name), + mg->mg_vd->vdev_ops->vdev_op_type); + } + + data_map.update(&tid, &d); } return (0); @@ -97,18 +155,56 @@ metaslab_group_alloc_exit(struct pt_regs *ctx) if (data == NULL || data->ts == 0) return (0); - if (PT_REGS_RC(ctx) == -1ULL) { + /* + * metaslab_group_alloc returns a metaslab_t * (or similar pointer) + * in both old and new ZFS. NULL (0) = failure, non-NULL = success. + */ + if (PT_REGS_RC(ctx) == 0) { axis = failure; } else { axis = success; } + /* + * Guard against garbage in vd_name (DLPX-88427): a kernel bug on + * some engine versions causes raw memory bytes to appear here. + * A single non-printable byte anywhere in the string breaks JSON + * output (Python decodes with backslashreplace and the result is + * concatenated into JSON without escaping). Scan all bytes up to + * the first NUL; replace the whole name with "unknown" if the + * string is empty or any byte is outside printable ASCII (0x20-0x7e). + */ + bool vd_valid = (data->vd_name[0] != '\0'); +#pragma unroll + for (int _i = 0; _i < VD_NAME_SIZE; _i++) { + char _c = data->vd_name[_i]; + if (_c == '\0') + break; + if (_c < 0x20 || _c > 0x7e) + vd_valid = false; + } + if (!vd_valid) { + char unknown[] = "unknown"; + __builtin_memcpy(data->vd_name, unknown, sizeof(unknown)); + } + AGGREGATE_DATA(data->vd_name, axis, bpf_ktime_get_ns() - data->ts, data->asize); - data->asize = 0; - data->alloc_time = 0; - data->vd_name[0] = '\0'; + if (data->dva_owned) { + /* + * Old path: metaslab_alloc_dva may call metaslab_group_alloc + * multiple times (one per group tried). Reset per-group fields + * so the next attempt gets a fresh vdev name, but leave the + * entry alive so metaslab_alloc_dva_exit can record an + * "allocation failures" metric if the overall DVA fails. + */ + data->asize = 0; + data->vd_name[0] = '\0'; + } else { + /* New path: no outer dva exit will run; clean up now. */ + data_map.delete(&tid); + } return (0); } @@ -118,23 +214,30 @@ int metaslab_alloc_dva_exit(struct pt_regs *ctx, spa_t *spa, metaslab_class_t *mc, uint64_t psize) { + /* spa, mc, psize match the probed function signature but are unused. */ + (void)spa; (void)mc; (void)psize; + u32 tid = bpf_get_current_pid_tgid(); data_t *data = data_map.lookup(&tid); if (data == NULL || data->ts == 0) return (0); - if (PT_REGS_RC(ctx) == 0) - return (0); - - char name[] = "allocation failures"; - char axis = 0; - AGGREGATE_DATA(name, &axis, - bpf_ktime_get_ns() - data->ts, data->size); + /* + * A live entry exists on both success and failure paths; only the + * return code determines whether to emit an "allocation failures" + * metric (non-zero RC = failure). psize is read from data->size + * rather than the kretprobe argument registers, which may be + * clobbered by the time the function returns. + */ + if (PT_REGS_RC(ctx) != 0) { + char name[] = "allocation failures"; + char axis = 0; + AGGREGATE_DATA(name, &axis, + bpf_ktime_get_ns() - data->ts, data->size); + } data->ts = 0; - data->size = 0; - data_map.delete(&tid); return (0); From 04b809ba4b417a285fb6bdff3af992e7df64747f Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Mon, 15 Jun 2026 10:47:06 +0530 Subject: [PATCH 03/10] DLPX-96312 Add InfluxDB/Telegraf infrastructure for Engine Performance Analytics PR URL: https://www.github.com/delphix/performance-diagnostics/pull/121 --- .../skills/perf-diagnostics-deploy/SKILL.md | 233 +++++++++++++++ bpf/estat/metaslab-alloc.c | 159 +++++++++-- debian/control | 2 +- debian/postinst | 8 + debian/rules | 7 +- influxdb/delphix-influxdb-init | 266 ++++++++++++++++++ influxdb/delphix-influxdb-service | 23 ++ influxdb/delphix-influxdb.service | 16 ++ influxdb/influxdb-init.conf | 14 + influxdb/influxdb-nginx.conf | 17 ++ influxdb/influxdb.toml | 10 + influxdb/perf_influxdb | 47 ++++ telegraf/connstat-stats.sh | 69 +++++ telegraf/delphix-telegraf-service | 24 ++ telegraf/telegraf.base | 126 +++++---- telegraf/telegraf.inputs.dct | 7 - telegraf/telegraf.inputs.playbook | 88 +----- telegraf/telegraf.inputs.storage_io | 99 +++++++ 18 files changed, 1047 insertions(+), 168 deletions(-) create mode 100644 .claude/skills/perf-diagnostics-deploy/SKILL.md create mode 100644 influxdb/delphix-influxdb-init create mode 100644 influxdb/delphix-influxdb-service create mode 100644 influxdb/delphix-influxdb.service create mode 100644 influxdb/influxdb-init.conf create mode 100644 influxdb/influxdb-nginx.conf create mode 100644 influxdb/influxdb.toml create mode 100644 influxdb/perf_influxdb create mode 100755 telegraf/connstat-stats.sh create mode 100644 telegraf/telegraf.inputs.storage_io diff --git a/.claude/skills/perf-diagnostics-deploy/SKILL.md b/.claude/skills/perf-diagnostics-deploy/SKILL.md new file mode 100644 index 0000000..1a4e819 --- /dev/null +++ b/.claude/skills/perf-diagnostics-deploy/SKILL.md @@ -0,0 +1,233 @@ +--- +name: perf-diagnostics-deploy +description: Use when making changes to Telegraf or InfluxDB config files in the performance-diagnostics repo and wanting to test those changes on a live Delphix engine via SSH +--- + +# Performance Diagnostics Deploy & Verify + +## Overview + +Workflow for making config changes to the performance-diagnostics repo, deploying them to a Delphix test engine over SSH, and verifying the changes are working correctly via InfluxDB queries. + +## Workflow + +```dot +digraph deploy { + "Make changes" -> "Ask: test on engine?"; + "Ask: test on engine?" -> "Done" [label="no"]; + "Ask: test on engine?" -> "Ask: which engine hostname?" [label="yes"]; + "Ask: which engine hostname?" -> "SSH as delphix, ask for password"; + "SSH as delphix, ask for password" -> "Deploy changed files"; + "Deploy changed files" -> "Restart services"; + "Restart services" -> "Wait 5 min"; + "Wait 5 min" -> "Query InfluxDB, verify changes"; +} +``` + +## Step 1 — Make Changes + +Make the requested code/config changes. Summarise what was changed and why before asking about testing. + +## Step 2 — Ask to Test + +``` +Changes done. Do you want to test these on a Delphix engine? +``` + +If no → stop. If yes → proceed. + +## Step 3 — Get Engine + Password + +``` +Which Delphix engine hostname should I deploy to? +``` + +Then SSH using `sshpass`: +```bash +sshpass -p "$PASSWORD" ssh -o StrictHostKeyChecking=no delphix@$HOST "..." +``` + +Ask for the SSH password if not already known. Default user is always `delphix`. + +## Step 4 — Deploy Changed Files + +**File locations on engine:** + +| Repo path | Engine path | +|---|---| +| `telegraf/telegraf.base` | `/etc/telegraf/telegraf.base` | +| `telegraf/telegraf.inputs.*` | `/etc/telegraf/telegraf.inputs.*` | +| `telegraf/connstat-stats.sh` | `/etc/telegraf/connstat-stats.sh` | +| `telegraf/nfs-threads.sh` | `/etc/telegraf/nfs-threads.sh` | +| `telegraf/zcache-stats.sh` | `/etc/telegraf/zcache-stats.sh` | +| `telegraf/zpool-iostat-o.sh` | `/etc/telegraf/zpool-iostat-o.sh` | +| `telegraf/delphix-telegraf-service` | `/usr/bin/delphix-telegraf-service` | +| `telegraf/perf_playbook` | `/usr/bin/perf_playbook` | +| `telegraf/delphix-telegraf.service` | `/lib/systemd/system/delphix-telegraf.service` | +| `influxdb/delphix-influxdb-init` | `/usr/bin/delphix-influxdb-init` | +| `influxdb/delphix-influxdb-service` | `/usr/bin/delphix-influxdb-service` | +| `influxdb/perf_influxdb` | `/usr/bin/perf_influxdb` | +| `influxdb/influxdb.toml` | `/etc/influxdb/influxdb.toml` | +| `influxdb/influxdb-init.conf` | `/etc/influxdb/influxdb-init.conf` | +| `influxdb/delphix-influxdb.service` | `/lib/systemd/system/delphix-influxdb.service` | +| `influxdb/influxdb-nginx.conf` | `/opt/delphix/server/etc/nginx/conf.d/influxdb.conf` | + +Only copy files that were actually changed. Use `scp` to `/tmp/` first, then `sudo cp` to destination. Set `chmod +x` on any shell scripts. + +**InfluxDB data directory:** `/var/lib/influxdb/engine` + +## Step 5 — Restart Services + +```bash +sudo systemctl restart delphix-influxdb +sleep 5 +sudo systemctl restart delphix-telegraf + +# Confirm both are active +systemctl is-active delphix-influxdb +systemctl is-active delphix-telegraf +``` + +## Step 6 — Wait 5 Minutes + +Wait for data to flow into InfluxDB. Use `ScheduleWakeup` with `delaySeconds: 270` (within cache window). + +## Step 7 — Verify via InfluxDB Query + +Get the InfluxDB credentials from the engine: +```bash +sudo cat /etc/influxdb/influxdb_meta +``` + +Query InfluxDB using the Flux API to verify the changes for the **last 5 minutes** of data. Tailor the query to what was changed: + +| Change type | What to verify | +|---|---| +| New measurement added | `from(bucket:"default") |> range(start: -5m) |> filter(fn: (r) => r._measurement == "new_measurement") |> count()` | +| Field removed (e.g. `wwid` tag) | Check tag keys don't include the removed tag | +| Histogram processors | Verify `hist_estat_*` measurements exist with bucket fields | +| `microseconds` field | Check field exists in relevant measurements | +| `connstat` aggregation | Verify `tcp_stats` has `service` and `connections` fields | + +Query via curl: +```bash +curl -s -X POST "http://localhost:8086/api/v2/query?org=delphix" \ + -H "Authorization: Token $INFLUXDB_READ_TOKEN" \ + -H "Content-Type: application/vnd.flux" \ + -d 'from(bucket:"default") |> range(start: -5m) |> filter(fn:(r) => r._measurement == "MEASUREMENT") |> limit(n:5)' +``` + +Report results clearly: what measurements exist, what fields/tags are present, and whether the change is confirmed working. + +Then ask: + +``` +Verification done. Do you want to commit these changes? +``` + +If no → stop. If yes → proceed to Step 8. + +## Step 8 — Commit Changes + +Show the latest commit: +```bash +git log -1 --oneline +``` + +Ask: +``` +Latest commit: " " +Do you want to (1) amend that commit or (2) create a new commit? +``` + +**If amend:** +```bash +git add -A +git commit --amend --no-edit +``` + +**If new commit:** +Ask: +``` +What should the commit message be? (include the Jira ID, e.g. "DLPX-12345 Fix xyz") +``` + +Then: +```bash +git add -A +git commit -m "" +``` + +Then ask: +``` +Commit done. Do you want to push and update/raise a PR? +``` + +If no → stop. If yes → proceed to Step 9. + +## Step 9 — Push and PR + +`git review` is a Delphix tool that pushes the branch and creates/updates the PR in one command. Use it instead of `git push` + `gh pr create`. + +First check for an existing open PR on the current branch: +```bash +gh pr list --head "$(git branch --show-current)" --state open +``` + +**If PR exists → update it:** + +```bash +git review -r +``` + +Then fetch the current PR description and update it to reflect the new changes: +```bash +gh pr view --json body +gh pr edit --body "..." +``` +Keep the existing structure but add/update the relevant sections. + +**If no PR exists → raise a new one:** + +Ask: +``` +What is the Jira ticket number for this PR? (e.g. DLPX-12345) +``` + +Fetch the Jira issue using the Jira MCP tool (`mcp__jira__jira_get_issue`) to understand the problem context. Then run: + +```bash +git review +``` + +This creates the PR as a draft. Get the PR URL from the output, then set a full description: + +```bash +gh pr edit --title ": " --body "$(cat <<'EOF' +## Summary +- + +## Problem + + +## Solution + + +## Testing +- [ ] Deployed to test engine +- [ ] InfluxDB queries confirmed data flowing correctly +- [ ] + +Jira: +EOF +)" +``` + +Return the PR URL to the user. + +## Common Mistakes + +- Forgetting `chmod +x` on shell scripts → Telegraf fails with `EXEC` error +- Restarting Telegraf before InfluxDB is ready → Telegraf starts with `[[outputs.discard]]` +- Querying before 5 minutes pass → no data in range, looks broken but isn't +- Copying the wrong file path (influxdb vs telegraf directories) diff --git a/bpf/estat/metaslab-alloc.c b/bpf/estat/metaslab-alloc.c index a413d39..473adae 100644 --- a/bpf/estat/metaslab-alloc.c +++ b/bpf/estat/metaslab-alloc.c @@ -12,9 +12,9 @@ #define VD_NAME_SIZE 32 typedef struct { u64 ts; - u64 size; + u64 size; /* psize stored by metaslab_alloc_dva_entry (old path) */ u64 asize; - u64 alloc_time; + u8 dva_owned; /* 1 if created by metaslab_alloc_dva_entry (old path) */ char vd_name[VD_NAME_SIZE]; } data_t; @@ -40,6 +40,27 @@ equal_to_pool(char *str) return (true); } +/* + * In ZFS 2.4.99+ (2026.3 and later) the allocation path no longer goes + * through metaslab_alloc_dva at all — metaslab_group_alloc is called + * directly by a different caller. We therefore use metaslab_group_alloc + * as both the start-timing and pool-filter point. The pool name is + * available via mg->mg_class->mc_spa->spa_name. + * + * On older ZFS versions metaslab_alloc_dva is still the outer entry point. + * We probe it to set up the data_map entry (dva_owned=1) and store psize so + * metaslab_alloc_dva_exit can emit an "allocation failures" metric if the + * overall DVA allocation fails. metaslab_group_alloc_entry then populates + * vdev information into the same entry. + * + * Lifetime of map entries: + * Old path: created by metaslab_alloc_dva_entry, deleted by + * metaslab_alloc_dva_exit (metaslab_group_alloc_exit only + * clears per-group fields so multiple groups can be tried). + * New path: created and deleted by metaslab_group_alloc_entry/exit pair + * (no outer dva probe fires). + */ + // @@ kprobe|metaslab_alloc_dva|metaslab_alloc_dva_entry int metaslab_alloc_dva_entry(struct pt_regs *ctx, @@ -51,8 +72,9 @@ metaslab_alloc_dva_entry(struct pt_regs *ctx, if (!equal_to_pool(spa->spa_name)) return (0); - data.ts = bpf_ktime_get_ns(); - data.size = psize; + data.ts = bpf_ktime_get_ns(); + data.size = psize; + data.dva_owned = 1; data_map.update(&tid, &data); @@ -67,18 +89,54 @@ metaslab_group_alloc_entry(struct pt_regs *ctx, u32 tid = bpf_get_current_pid_tgid(); data_t *data = data_map.lookup(&tid); - if (data == NULL || data->ts == 0) - return (0); - - data->asize = asize; - data->alloc_time = bpf_ktime_get_ns(); - - if (mg->mg_vd->vdev_path != NULL) { - bpf_probe_read_str(data->vd_name, - sizeof(data->vd_name), mg->mg_vd->vdev_path); + if (data != NULL && data->dva_owned) { + /* + * Older path: metaslab_alloc_dva_entry already created the + * entry and set the start timestamp. Just fill in vdev info. + */ + data->asize = asize; + if (mg->mg_vd->vdev_path != NULL) { + bpf_probe_read_str(data->vd_name, + sizeof(data->vd_name), mg->mg_vd->vdev_path); + } else { + bpf_probe_read_str(data->vd_name, + sizeof(data->vd_name), + mg->mg_vd->vdev_ops->vdev_op_type); + } } else { - bpf_probe_read_str(data->vd_name, - sizeof(data->vd_name), mg->mg_vd->vdev_ops->vdev_op_type); + /* + * Newer path: metaslab_alloc_dva is not in the call chain. + * Create the entry here, filtering by pool via mc_spa. + * Read each pointer level explicitly: BCC's automatic + * three-level dereference (mg->mg_class->mc_spa->spa_name) + * does not produce a usable address for bpf_probe_read. + */ + metaslab_class_t *mc = NULL; + spa_t *spa = NULL; + bpf_probe_read(&mc, sizeof(mc), &mg->mg_class); + if (!mc) + return (0); + bpf_probe_read(&spa, sizeof(spa), &mc->mc_spa); + if (!spa) + return (0); + if (!equal_to_pool(spa->spa_name)) + return (0); + + data_t d = {}; + d.ts = bpf_ktime_get_ns(); + d.asize = asize; + /* dva_owned stays 0: this entry is owned by group entry/exit */ + + if (mg->mg_vd->vdev_path != NULL) { + bpf_probe_read_str(d.vd_name, + sizeof(d.vd_name), mg->mg_vd->vdev_path); + } else { + bpf_probe_read_str(d.vd_name, + sizeof(d.vd_name), + mg->mg_vd->vdev_ops->vdev_op_type); + } + + data_map.update(&tid, &d); } return (0); @@ -97,18 +155,56 @@ metaslab_group_alloc_exit(struct pt_regs *ctx) if (data == NULL || data->ts == 0) return (0); - if (PT_REGS_RC(ctx) == -1ULL) { + /* + * metaslab_group_alloc returns a metaslab_t * (or similar pointer) + * in both old and new ZFS. NULL (0) = failure, non-NULL = success. + */ + if (PT_REGS_RC(ctx) == 0) { axis = failure; } else { axis = success; } + /* + * Guard against garbage in vd_name (DLPX-88427): a kernel bug on + * some engine versions causes raw memory bytes to appear here. + * A single non-printable byte anywhere in the string breaks JSON + * output (Python decodes with backslashreplace and the result is + * concatenated into JSON without escaping). Scan all bytes up to + * the first NUL; replace the whole name with "unknown" if the + * string is empty or any byte is outside printable ASCII (0x20-0x7e). + */ + bool vd_valid = (data->vd_name[0] != '\0'); +#pragma unroll + for (int _i = 0; _i < VD_NAME_SIZE; _i++) { + char _c = data->vd_name[_i]; + if (_c == '\0') + break; + if (_c < 0x20 || _c > 0x7e) + vd_valid = false; + } + if (!vd_valid) { + char unknown[] = "unknown"; + __builtin_memcpy(data->vd_name, unknown, sizeof(unknown)); + } + AGGREGATE_DATA(data->vd_name, axis, bpf_ktime_get_ns() - data->ts, data->asize); - data->asize = 0; - data->alloc_time = 0; - data->vd_name[0] = '\0'; + if (data->dva_owned) { + /* + * Old path: metaslab_alloc_dva may call metaslab_group_alloc + * multiple times (one per group tried). Reset per-group fields + * so the next attempt gets a fresh vdev name, but leave the + * entry alive so metaslab_alloc_dva_exit can record an + * "allocation failures" metric if the overall DVA fails. + */ + data->asize = 0; + data->vd_name[0] = '\0'; + } else { + /* New path: no outer dva exit will run; clean up now. */ + data_map.delete(&tid); + } return (0); } @@ -118,23 +214,30 @@ int metaslab_alloc_dva_exit(struct pt_regs *ctx, spa_t *spa, metaslab_class_t *mc, uint64_t psize) { + /* spa, mc, psize match the probed function signature but are unused. */ + (void)spa; (void)mc; (void)psize; + u32 tid = bpf_get_current_pid_tgid(); data_t *data = data_map.lookup(&tid); if (data == NULL || data->ts == 0) return (0); - if (PT_REGS_RC(ctx) == 0) - return (0); - - char name[] = "allocation failures"; - char axis = 0; - AGGREGATE_DATA(name, &axis, - bpf_ktime_get_ns() - data->ts, data->size); + /* + * A live entry exists on both success and failure paths; only the + * return code determines whether to emit an "allocation failures" + * metric (non-zero RC = failure). psize is read from data->size + * rather than the kretprobe argument registers, which may be + * clobbered by the time the function returns. + */ + if (PT_REGS_RC(ctx) != 0) { + char name[] = "allocation failures"; + char axis = 0; + AGGREGATE_DATA(name, &axis, + bpf_ktime_get_ns() - data->ts, data->size); + } data->ts = 0; - data->size = 0; - data_map.delete(&tid); return (0); diff --git a/debian/control b/debian/control index 173d013..1cbf646 100644 --- a/debian/control +++ b/debian/control @@ -13,6 +13,6 @@ Standards-Version: 4.1.2 Package: performance-diagnostics Architecture: any -Depends: python3-bpfcc, python3-minimal, python3-psutil, telegraf, docker-ce +Depends: python3-bpfcc, python3-minimal, python3-psutil, telegraf, docker-ce, influxdb2, curl Description: eBPF-based Performance Diagnostic Tools A collection of eBPF-based tools for diagnosing performance issues. diff --git a/debian/postinst b/debian/postinst index ea9a0ce..44224e3 100644 --- a/debian/postinst +++ b/debian/postinst @@ -24,6 +24,14 @@ if ! groups "$USER" | grep -q "\b$GROUP\b"; then fi fi +# Remove the influxdb2 package default config — we use influxdb.toml exclusively. +rm -f /etc/influxdb/config.toml + +# Reload nginx to pick up the InfluxDB proxy location block. +if nginx -t -c /etc/nginx/nginx.conf &>/dev/null && systemctl is-active --quiet nginx; then + nginx -s reload +fi + #DEBHELPER# exit 0 \ No newline at end of file diff --git a/debian/rules b/debian/rules index d6f4f00..c84f85c 100755 --- a/debian/rules +++ b/debian/rules @@ -13,11 +13,12 @@ # need to rename a couple files, so do that here. # override_dh_auto_build: - mkdir -p build/cmd/ + mkdir -p build/cmd/ build/influxdb/ cp cmd/estat.py build/cmd/estat cp cmd/stbtrace.py build/cmd/stbtrace cp cmd/nfs_threads.py build/cmd/nfs_threads cp cmd/dsp.py build/cmd/dsp + cp influxdb/influxdb-nginx.conf build/influxdb/influxdb.conf override_dh_auto_install: dh_install build/cmd/* /usr/bin @@ -26,3 +27,7 @@ override_dh_auto_install: dh_install telegraf/delphix-telegraf-service telegraf/perf_playbook /usr/bin dh_install telegraf/delphix-telegraf.service /lib/systemd/system dh_install telegraf/telegraf* telegraf/*.sh /etc/telegraf + dh_install influxdb/delphix-influxdb-service influxdb/delphix-influxdb-init influxdb/perf_influxdb /usr/bin + dh_install influxdb/delphix-influxdb.service /lib/systemd/system + dh_install influxdb/influxdb.toml influxdb/influxdb-init.conf /etc/influxdb + dh_install build/influxdb/influxdb.conf /opt/delphix/server/etc/nginx/conf.d diff --git a/influxdb/delphix-influxdb-init b/influxdb/delphix-influxdb-init new file mode 100644 index 0000000..5b7435c --- /dev/null +++ b/influxdb/delphix-influxdb-init @@ -0,0 +1,266 @@ +#!/bin/bash -eu +# +# Copyright (c) 2026 by Delphix. All rights reserved. +# +# One-time InfluxDB initialization: creates org, bucket, admin token, +# a read-only token for DCT Smart Proxy, and writes the +# [[outputs.influxdb_v2]] stanza to /etc/telegraf/telegraf.outputs.influxdb, +# which is included by delphix-telegraf-service when INFLUXDB_ENABLED flag exists. +# Skips setup if InfluxDB is already initialized. +# + +INFLUXDB_URL="http://127.0.0.1:8086" +INFLUXDB_CONFIG_DIR="/etc/influxdb" +INFLUXDB_META_FILE="$INFLUXDB_CONFIG_DIR/influxdb_meta" +# State file written immediately after /api/v2/setup so the script can resume +# if it is interrupted before the metadata file is fully written. +INFLUXDB_SETUP_STATE_FILE="$INFLUXDB_CONFIG_DIR/influxdb_setup_state" +INFLUXDB_FLAG=/etc/telegraf/INFLUXDB_ENABLED +INFLUXDB_OUTPUT=/etc/telegraf/telegraf.outputs.influxdb +INFLUXDB_INIT_CONF="$INFLUXDB_CONFIG_DIR/influxdb-init.conf" + +# Load tunable configuration (org, bucket, retention, wait parameters). +# shellcheck source=/etc/influxdb/influxdb-init.conf +# shellcheck disable=SC1091 +source "$INFLUXDB_INIT_CONF" + +INFLUXDB_ADMIN_USER="admin" +INFLUXDB_ADMIN_PASSWORD="" + +# +# Log a message to stderr with a timestamp. +# +log() { + echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*" >&2 +} + +# +# Extract a field from a JSON string using python3. +# +json_field() { + local json="$1" + local field="$2" + echo "$json" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())$field)" || + { log "ERROR: Failed to parse field '$field' from JSON response."; return 1; } +} + +# +# POST to the InfluxDB HTTP API. Exits with an error if the request fails. +# +influx_post() { + local endpoint="$1" + local data="$2" + local auth_header="${3:-}" + + local curl_args=(-sf -X POST "$INFLUXDB_URL$endpoint" -H 'Content-Type: application/json' -d "$data") + [[ -n "$auth_header" ]] && curl_args+=(-H "Authorization: Token $auth_header") + + local response + response=$(curl "${curl_args[@]}") || + { log "ERROR: HTTP POST to '$endpoint' failed."; return 1; } + echo "$response" +} + +mkdir -p "$INFLUXDB_CONFIG_DIR" + +# Skip if already fully initialized. +if [[ -f "$INFLUXDB_META_FILE" ]]; then + log "InfluxDB already initialized, skipping." + exit 0 +fi + +# +# Wait for InfluxDB to be ready. +# +ready=false +for i in $(seq 1 "$INFLUXDB_WAIT_RETRIES"); do + if curl -sf "$INFLUXDB_URL/health" &>/dev/null; then + ready=true + break + fi + sleep "$INFLUXDB_WAIT_INTERVAL" +done + +if [[ "$ready" != "true" ]]; then + log "ERROR: InfluxDB did not become ready after $((INFLUXDB_WAIT_RETRIES * INFLUXDB_WAIT_INTERVAL))s." + exit 1 +fi + +# +# Initial setup — creates org, bucket, and returns admin token + IDs. +# /api/v2/setup is a one-shot operation; if the script is interrupted after +# this point and re-run, the state file lets us skip setup and reuse the +# already-created admin token. +# +ADMIN_TOKEN="" +ORG_ID="" +BUCKET_ID="" +SUPPORT_BUCKET_ID="" + +if [[ -f "$INFLUXDB_SETUP_STATE_FILE" ]]; then + while IFS= read -r line; do + key="${line%%=*}" + value="${line#*=}" + case "$key" in + ADMIN_TOKEN) ADMIN_TOKEN="$value" ;; + ORG_ID) ORG_ID="$value" ;; + BUCKET_ID) BUCKET_ID="$value" ;; + SUPPORT_BUCKET_ID) SUPPORT_BUCKET_ID="$value" ;; + INFLUXDB_ADMIN_PASSWORD) INFLUXDB_ADMIN_PASSWORD="$value" ;; + WRITE_TOKEN) WRITE_TOKEN="$value" ;; + READ_TOKEN) READ_TOKEN="$value" ;; + SUPPORT_WRITE_TOKEN) SUPPORT_WRITE_TOKEN="$value" ;; + esac + done <"$INFLUXDB_SETUP_STATE_FILE" +else + # Generate password only when actually running setup for the first time. + INFLUXDB_ADMIN_PASSWORD="$(openssl rand -hex 16)" + SETUP_RESPONSE=$(influx_post "/api/v2/setup" "{ + \"username\": \"$INFLUXDB_ADMIN_USER\", + \"password\": \"$INFLUXDB_ADMIN_PASSWORD\", + \"org\": \"$INFLUXDB_ORG\", + \"bucket\": \"$INFLUXDB_BUCKET\", + \"retentionPeriodSeconds\": $INFLUXDB_RETENTION_SECONDS + }") || exit 1 + + ADMIN_TOKEN=$(json_field "$SETUP_RESPONSE" "['auth']['token']") || exit 1 + ORG_ID=$(json_field "$SETUP_RESPONSE" "['org']['id']") || exit 1 + BUCKET_ID=$(json_field "$SETUP_RESPONSE" "['bucket']['id']") || exit 1 + + # Persist admin token + IDs + password immediately so a subsequent re-run + # can resume without repeating the one-shot setup call, and so the password + # stored in influxdb_meta always matches what InfluxDB was initialised with. + old_umask="$(umask)" + umask 077 + tmp_state="$(mktemp "${INFLUXDB_SETUP_STATE_FILE}.XXXXXX")" + printf 'ADMIN_TOKEN=%s\nORG_ID=%s\nBUCKET_ID=%s\nINFLUXDB_ADMIN_PASSWORD=%s\n' \ + "$ADMIN_TOKEN" "$ORG_ID" "$BUCKET_ID" "$INFLUXDB_ADMIN_PASSWORD" >"$tmp_state" + chmod 600 "$tmp_state" + mv "$tmp_state" "$INFLUXDB_SETUP_STATE_FILE" + umask "$old_umask" +fi + +# +# Create the support_metrics bucket (skipped if already persisted in state). +# +if [[ -z "$SUPPORT_BUCKET_ID" ]]; then + SUPPORT_BUCKET_RESPONSE=$(influx_post "/api/v2/buckets" "{ + \"orgID\": \"$ORG_ID\", + \"name\": \"$INFLUXDB_SUPPORT_BUCKET\", + \"retentionRules\": [{\"type\": \"expire\", \"everySeconds\": $INFLUXDB_SUPPORT_RETENTION_SECONDS}] + }" "$ADMIN_TOKEN") || exit 1 + SUPPORT_BUCKET_ID=$(json_field "$SUPPORT_BUCKET_RESPONSE" "['id']") || exit 1 + printf 'SUPPORT_BUCKET_ID=%s\n' "$SUPPORT_BUCKET_ID" >>"$INFLUXDB_SETUP_STATE_FILE" +fi + +# Token creation is guarded so that on crash-resume (setup state exists but +# meta file not yet written), we reuse already-created tokens rather than +# creating orphaned duplicates in InfluxDB on each retry. +WRITE_TOKEN="${WRITE_TOKEN:-}" +READ_TOKEN="${READ_TOKEN:-}" +SUPPORT_WRITE_TOKEN="${SUPPORT_WRITE_TOKEN:-}" + +# +# Create a write-only token for Telegraf (skipped if already persisted in state). +# +if [[ -z "$WRITE_TOKEN" ]]; then + WRITE_TOKEN_RESPONSE=$(influx_post "/api/v2/authorizations" "{ + \"orgID\": \"$ORG_ID\", + \"description\": \"telegraf-write-token\", + \"permissions\": [ + {\"action\": \"write\", \"resource\": {\"type\": \"buckets\", \"id\": \"$BUCKET_ID\", \"orgID\": \"$ORG_ID\"}} + ] + }" "$ADMIN_TOKEN") || exit 1 + WRITE_TOKEN=$(json_field "$WRITE_TOKEN_RESPONSE" "['token']") || exit 1 + printf 'WRITE_TOKEN=%s\n' "$WRITE_TOKEN" >>"$INFLUXDB_SETUP_STATE_FILE" +fi + +# +# Create a read-only token for DCT Smart Proxy (skipped if already persisted in state). +# +if [[ -z "$READ_TOKEN" ]]; then + READ_TOKEN_RESPONSE=$(influx_post "/api/v2/authorizations" "{ + \"orgID\": \"$ORG_ID\", + \"description\": \"dct-read-token\", + \"permissions\": [ + {\"action\": \"read\", \"resource\": {\"type\": \"buckets\", \"id\": \"$BUCKET_ID\", \"orgID\": \"$ORG_ID\"}} + ] + }" "$ADMIN_TOKEN") || exit 1 + READ_TOKEN=$(json_field "$READ_TOKEN_RESPONSE" "['token']") || exit 1 + printf 'READ_TOKEN=%s\n' "$READ_TOKEN" >>"$INFLUXDB_SETUP_STATE_FILE" +fi + +# +# Create a write-only token for the support_metrics bucket (skipped if already persisted). +# +if [[ -z "$SUPPORT_WRITE_TOKEN" ]]; then + SUPPORT_WRITE_TOKEN_RESPONSE=$(influx_post "/api/v2/authorizations" "{ + \"orgID\": \"$ORG_ID\", + \"description\": \"telegraf-support-write-token\", + \"permissions\": [ + {\"action\": \"write\", \"resource\": {\"type\": \"buckets\", \"id\": \"$SUPPORT_BUCKET_ID\", \"orgID\": \"$ORG_ID\"}} + ] + }" "$ADMIN_TOKEN") || exit 1 + SUPPORT_WRITE_TOKEN=$(json_field "$SUPPORT_WRITE_TOKEN_RESPONSE" "['token']") || exit 1 + printf 'SUPPORT_WRITE_TOKEN=%s\n' "$SUPPORT_WRITE_TOKEN" >>"$INFLUXDB_SETUP_STATE_FILE" +fi + +# +# Write two [[outputs.influxdb_v2]] stanzas to a dedicated telegraf output file: +# - default bucket: Grafana-facing measurements currently used in dashboards: +# cpu, disk, diskio, net, tcp_stats, zfs, +# estat_nfs, estat_iscsi, hist_estat_nfs, hist_estat_iscsi, hist_estat_backend-io +# - support_metrics bucket: everything else (not yet in any dashboard panel) +# The flag is read by delphix-telegraf-service to conditionally include this output. +# +cat >"$INFLUXDB_OUTPUT" <"$tmp_meta" </dev/null || true diff --git a/influxdb/delphix-influxdb-service b/influxdb/delphix-influxdb-service new file mode 100644 index 0000000..eac68a4 --- /dev/null +++ b/influxdb/delphix-influxdb-service @@ -0,0 +1,23 @@ +#!/bin/bash +# +# Copyright (c) 2026 by Delphix. All rights reserved. +# +# Wrapper script to start InfluxDB 2.x and run first-time initialization. +# + +INFLUXDB_CONFIG=/etc/influxdb/influxdb.toml +INFLUXDB_INIT=/usr/bin/delphix-influxdb-init + +# Start influxd in the background. +# influxd does not support a --config-path flag; config file is passed via env var. +INFLUXD_CONFIG_PATH="$INFLUXDB_CONFIG" /usr/bin/influxd & +INFLUXDB_PID=$! + +# Run initialization (the init script handles waiting for InfluxDB to be ready) +if ! $INFLUXDB_INIT; then + echo "ERROR: delphix-influxdb-init failed, stopping influxd" >&2 + kill "$INFLUXDB_PID" 2>/dev/null + exit 1 +fi + +wait "$INFLUXDB_PID" diff --git a/influxdb/delphix-influxdb.service b/influxdb/delphix-influxdb.service new file mode 100644 index 0000000..ec69c0b --- /dev/null +++ b/influxdb/delphix-influxdb.service @@ -0,0 +1,16 @@ +[Unit] +Description=Delphix InfluxDB Time Series Database +Documentation=https://docs.influxdata.com/influxdb/v2/ +PartOf=delphix.target +After=delphix-platform.service +PartOf=delphix-platform.service + +[Service] +User=root +ExecStart=/usr/bin/delphix-influxdb-service +Restart=on-failure +RestartForceExitStatus=SIGPIPE +KillMode=control-group + +[Install] +WantedBy=delphix.target diff --git a/influxdb/influxdb-init.conf b/influxdb/influxdb-init.conf new file mode 100644 index 0000000..dfab4ac --- /dev/null +++ b/influxdb/influxdb-init.conf @@ -0,0 +1,14 @@ +# +# Copyright (c) 2026 by Delphix. All rights reserved. +# +# Configuration for delphix-influxdb-init. +# Sourced by /usr/bin/delphix-influxdb-init at runtime. +# + +INFLUXDB_ORG="delphix" +INFLUXDB_BUCKET="default" +INFLUXDB_RETENTION_SECONDS=2592000 # 30 days (720h) +INFLUXDB_SUPPORT_BUCKET="support_metrics" +INFLUXDB_SUPPORT_RETENTION_SECONDS=2592000 # 30 days +INFLUXDB_WAIT_RETRIES=30 +INFLUXDB_WAIT_INTERVAL=2 diff --git a/influxdb/influxdb-nginx.conf b/influxdb/influxdb-nginx.conf new file mode 100644 index 0000000..ba2a74a --- /dev/null +++ b/influxdb/influxdb-nginx.conf @@ -0,0 +1,17 @@ +# +# Copyright (c) 2026 by Delphix. All rights reserved. +# +# Proxy InfluxDB 2.x API through nginx so external clients (DCT, Grafana) +# can reach it over HTTPS using the engine's existing TLS certificate. +# InfluxDB itself binds to 127.0.0.1:8086 (HTTP, localhost only). +# +location /influxdb/ { + proxy_pass http://127.0.0.1:8086/; + proxy_set_header Host $http_host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_http_version 1.1; + proxy_read_timeout 999d; + proxy_buffering off; +} diff --git a/influxdb/influxdb.toml b/influxdb/influxdb.toml new file mode 100644 index 0000000..49b7e3f --- /dev/null +++ b/influxdb/influxdb.toml @@ -0,0 +1,10 @@ +# +# Copyright 2026 Delphix. All rights reserved. +# +# InfluxDB 2.x Configuration +# + +bolt-path = "/var/lib/influxdb/influxd.bolt" +engine-path = "/var/lib/influxdb/engine" +http-bind-address = "127.0.0.1:8086" +log-level = "warn" diff --git a/influxdb/perf_influxdb b/influxdb/perf_influxdb new file mode 100644 index 0000000..00baa6f --- /dev/null +++ b/influxdb/perf_influxdb @@ -0,0 +1,47 @@ +#!/bin/bash +# +# Copyright (c) 2026 by Delphix. All rights reserved. +# +# Script that enables and disables InfluxDB metric output for Telegraf. +# + +INFLUXDB_FLAG=/etc/telegraf/INFLUXDB_ENABLED +INFLUXDB_OUTPUT=/etc/telegraf/telegraf.outputs.influxdb + +function die() { + echo -e "$(date +%T:%N:%z): $(basename $0): $*" >&2 + exit 1 +} + +[[ $EUID -ne 0 ]] && die "must be run as root" + +function usage() { + echo "$(basename $0): $*" >&2 + echo "Usage: $(basename $0) [enable|disable]" + exit 2 +} + +function enable_influxdb() { + date + [[ ! -f $INFLUXDB_OUTPUT ]] && die "$INFLUXDB_OUTPUT not found. Run delphix-influxdb-init first." + echo "Enabling InfluxDB Metric Output" + touch $INFLUXDB_FLAG + systemctl restart delphix-telegraf +} + +function disable_influxdb() { + date + echo "Disabling InfluxDB Metric Output" + rm -f $INFLUXDB_FLAG + systemctl restart delphix-telegraf +} + +if [[ $# -ne 1 ]]; then + usage +fi + +case "$1" in +enable) enable_influxdb ;; +disable) disable_influxdb ;; +*) usage ;; +esac diff --git a/telegraf/connstat-stats.sh b/telegraf/connstat-stats.sh new file mode 100755 index 0000000..2d6284f --- /dev/null +++ b/telegraf/connstat-stats.sh @@ -0,0 +1,69 @@ +#!/bin/sh +# +# Collect per-connection TCP stats from connstat and aggregate by remote +# endpoint (laddr:raddr:service) to bound cardinality on engines with many +# connections — e.g. Oracle dNFS (hundreds of connections per VDB host) or +# Elastic Data (many connections per object storage endpoint IP). +# Mirrors the aggregation done by LocalTCPStatsCollector in the mgmt stack. +# +# Service name lookup reads from /etc/services, matching LocalTCPStatsCollector +# exactly. lport is checked before rport so that listening services (where the +# engine is the server) are identified correctly. Falls back to "unknown". +# +# Output fields per aggregated endpoint: +# laddr, raddr, service +# inbytes, outbytes, retranssegs, suna, unsent (summed across connections) +# swnd, cwnd, rwnd, rtt (averaged across connections) +# connections (count of aggregated conns) +# +/usr/bin/connstat -PLe -i 10 -T u \ + -o laddr,lport,raddr,rport,inbytes,outbytes,retranssegs,suna,unsent,swnd,cwnd,rwnd,rtt \ + | awk -F',' ' +BEGIN { + # Load port->service mapping from /etc/services, same as LocalTCPStatsCollector. + # Pattern matches lines of the form: "servicename port/tcp" + while ((getline line < "/etc/services") > 0) { + sub(/^[[:space:]]+/, "", line) + if (line ~ /^(#|$)/) continue + n = split(line, f, /[[:space:]]+/) + if (n >= 2 && f[2] ~ /\/tcp/) { + split(f[2], pf, "/") + port = pf[1] + 0 + if (!(port in svc)) svc[port] = f[1] + } + } + close("/etc/services") + # Delphix-specific ports not present in /etc/services. + svc[8415] = "dlpx-sp" + svc[50001] = "network-throughput-test" + svc[8341] = "oracle-logsync" + svc[9100] = "dlpx-connector" +} +/^=/ { + for (key in cnt) { + n = cnt[key] + split(key, k, SUBSEP) + print k[1] "," k[2] "," k[3] "," \ + inb[key] "," outb[key] "," ret[key] "," sun[key] "," uns[key] "," \ + int(sw[key]/n) "," int(cw[key]/n) "," int(rw[key]/n) "," int(rt[key]/n) "," n + } + fflush() + delete cnt; delete inb; delete outb; delete ret + delete sun; delete uns; delete sw; delete cw; delete rw; delete rt + next +} +NF == 13 { + if (($2 + 0) in svc) { + service = svc[$2 + 0] + } else if (($4 + 0) in svc) { + service = svc[$4 + 0] + } else { + service = "unknown" + } + key = $1 SUBSEP $3 SUBSEP service + inb[key] += $5; outb[key] += $6; ret[key] += $7 + sun[key] += $8; uns[key] += $9 + sw[key] += $10; cw[key] += $11; rw[key] += $12; rt[key] += $13 + cnt[key]++ +} +' diff --git a/telegraf/delphix-telegraf-service b/telegraf/delphix-telegraf-service index 72df797..935465c 100755 --- a/telegraf/delphix-telegraf-service +++ b/telegraf/delphix-telegraf-service @@ -3,7 +3,10 @@ BASE_CONFIG=/etc/telegraf/telegraf.base DOSE_INPUTS=/etc/telegraf/telegraf.inputs.dose DCT_INPUTS=/etc/telegraf/telegraf.inputs.dct PLAYBOOK_INPUTS=/etc/telegraf/telegraf.inputs.playbook +STORAGE_IO_INPUTS=/etc/telegraf/telegraf.inputs.storage_io +INFLUXDB_OUTPUT=/etc/telegraf/telegraf.outputs.influxdb PLAYBOOK_FLAG=/etc/telegraf/PLAYBOOK_ENABLED +INFLUXDB_FLAG=/etc/telegraf/INFLUXDB_ENABLED TELEGRAF_CONFIG=/etc/telegraf/telegraf.conf @@ -21,6 +24,10 @@ function playbook_is_enabled() { [[ -f $PLAYBOOK_FLAG ]] } +function influxdb_is_enabled() { + [[ -f $INFLUXDB_FLAG ]] +} + rm -f $TELEGRAF_CONFIG if engine_is_object_based; then @@ -43,4 +50,21 @@ else fi fi +if influxdb_is_enabled && [[ -f $INFLUXDB_OUTPUT ]]; then + if [[ -f $STORAGE_IO_INPUTS ]]; then + cat $STORAGE_IO_INPUTS >> $TELEGRAF_CONFIG + fi + cat $INFLUXDB_OUTPUT >> $TELEGRAF_CONFIG +else + if influxdb_is_enabled; then + logger -t delphix-telegraf "WARNING: INFLUXDB_ENABLED is set but $INFLUXDB_OUTPUT is missing — metrics will be discarded. Run delphix-influxdb-init to restore InfluxDB output." + fi + # No InfluxDB output configured. Add discard so Telegraf can start — + # it requires at least one output plugin. + echo "[[outputs.discard]]" >> $TELEGRAF_CONFIG +fi + +# Restrict permissions so the InfluxDB write token is not world-readable. +chmod 640 $TELEGRAF_CONFIG + /usr/bin/telegraf -config $TELEGRAF_CONFIG diff --git a/telegraf/telegraf.base b/telegraf/telegraf.base index 7abd9a4..f516875 100644 --- a/telegraf/telegraf.base +++ b/telegraf/telegraf.base @@ -11,53 +11,20 @@ ############################################################################### # OUTPUT PLUGINS # ############################################################################### -# Define the main metric output file, excluding aggregated stats and -# Performance Playbook (estat) data. -[[outputs.file]] - files = ["/var/log/telegraf/metrics.json"] - rotation_max_size = "50MB" - rotation_max_archives = 9 - data_format = "json" - namedrop = ["*estat_*", "agg_*", "zfs", "zpool*", "zcache*", "docker*"] - -# Define output file for ZFS related metrics -[[outputs.file]] - files = ["/var/log/telegraf/metrics_zfs.json"] - rotation_max_size = "30MB" - rotation_max_archives = 5 - data_format = "json" - namepass = ["zpool*", "zcache*", "zfs"] - -# Define output file for Performance Playbook (estat) metrics -[[outputs.file]] - files = ["/var/log/telegraf/metrics_estat.json"] - rotation_max_size = "30MB" - rotation_max_archives = 5 - data_format = "json" - namepass = ["*estat_*"] - -# Define output file for aggregate statistics -[[outputs.file]] - files = ["/var/log/telegraf/metric_aggregates.json"] - rotation_max_size = "30MB" - rotation_max_archives = 5 - data_format = "json" - namepass = ["agg_*"] - -# Enable Live Monitoring, intended for internal Delphix use only: -#[[outputs.influxdb]] -# urls = ["http://dbsvr.company.com:8086"] -# database = "live_metrics" -# skip_database_creation = true -# data_format = "influx" +# All metrics are ingested into InfluxDB. The output stanza is written by +# delphix-influxdb-init to /etc/telegraf/telegraf.outputs.influxdb and +# appended here by delphix-telegraf-service when InfluxDB is enabled. +# Use 'perf_influxdb enable|disable' to toggle and restart Telegraf. ############################################################################### # INPUT PLUGINS # ############################################################################### -# Get CPU usage +# Get CPU usage — only cpu-total, not per-core (reduces data volume on +# many-CPU engines; agg_cpu automatically inherits this restriction). +# percpu defaults to true so must be explicitly set to false. [[inputs.cpu]] - percpu = true + percpu = false totalcpu = true collect_cpu_time = false report_active = false @@ -65,31 +32,58 @@ # Get mount point stats [[inputs.disk]] + interval = "60s" mount_points = ["/","/domain0"] - -# Get disk I/O stats + tagexclude = ["fstype", "mode"] + fieldpass = ["used", "free", "total"] + +# Get disk I/O stats for whole disks only — partitions add cardinality without +# diagnostic value and account for ~30% of diskio/agg_diskio line volume. +# Excluded: +# zd* — ZFS zvol internal block devices +# *p[0-9]* — NVMe partitions (nvme0n1p1, nvme0n1p9, etc.) +# sd*[0-9]* — SCSI/SATA partitions (sda1, sdb2, etc.) +# wwid is a redundant 100+ char tag; the short-form name tag is sufficient. [[inputs.diskio]] - -# Track stats for the current metric files -[[inputs.filestat]] - files = ["/var/log/telegraf/metrics.json", - "/var/log/telegraf/metrics_estat.json", - "/var/log/telegraf/metrics_zfs.json", - "/var/log/telegraf/metric_aggregates.json"] + interval = "60s" + tagdrop = {name = ["zd*", "*p[0-9]*", "sd*[0-9]*"]} + tagexclude = ["wwid"] + fieldpass = ["reads", "writes", "read_bytes", "write_bytes", "read_time", "write_time", "iops_in_progress"] # Get Memory stats [[inputs.mem]] + fieldpass = ["used", "available", "total", "free", "cached", "buffered", "dirty", "slab"] # Get some network interface stats [[inputs.net]] fieldpass = ["tcp*","bytes*","packets*","err*","drop*"] +# Per-endpoint TCP stats (bytes, RTT, window sizes) via connstat. +# Aggregated by (laddr, raddr, service) to mirror the aggregation in +# LocalTCPStatsCollector — avoids cardinality explosion on Oracle dNFS +# engines (hundreds of connections per VDB host) and Elastic Data engines +# (many connections per object storage endpoint IP). +# Cumulative fields (inbytes, outbytes, etc.) are summed; window/RTT fields +# are averaged; connections = number of TCP connections aggregated. +[[inputs.execd]] + command = ["/etc/telegraf/connstat-stats.sh"] + name_override = "tcp_stats" + signal = "none" + restart_delay = "30s" + data_format = "csv" + csv_delimiter = "," + csv_trim_space = true + csv_column_names = ["laddr", "raddr", "service", "inbytes", "outbytes", "retranssegs", "suna", "unsent", "swnd", "cwnd", "rwnd", "rtt", "connections"] + csv_column_types = ["string", "string", "string", "int", "int", "int", "int", "int", "int", "int", "int", "int", "int"] + csv_tag_columns = ["laddr", "raddr", "service"] + # Track CPU and Memory for the "delphix-mgmt" service (and children). [[inputs.procstat]] systemd_unit = "delphix-mgmt.service" include_systemd_children = true namedrop = ["procstat_lookup"] fieldpass = ["memory_usage", "cpu_usage", "memory_rss"] + tagexclude = ["cgroup_full"] # Track CPU and Memory for the "zfs-object-agent" service (and children). [[inputs.procstat]] @@ -97,19 +91,43 @@ include_systemd_children = true namedrop = ["procstat_lookup"] fieldpass = ["memory_usage", "cpu_usage", "memory_rss"] + tagexclude = ["cgroup_full"] # Get process counts [[inputs.processes]] -# Get swap memory usage -[[inputs.swap]] - # Get misc 'other' stats (load and uptime) [[inputs.system]] # ZFS kstats (arcstat, abdstat, zfetch, etc) +# arcstats_l2_* fields are L2ARC stats — unused on all appliances (no L2ARC). [[inputs.zfs]] interval = "1m" + fieldpass = [ + "arcstats_anon_data", "arcstats_anon_evictable_data", + "arcstats_anon_evictable_metadata", "arcstats_anon_metadata", + "arcstats_arc_need_free", "arcstats_arc_no_grow", "arcstats_arc_prune", + "arcstats_arc_sys_free", "arcstats_async_upgrade_sync", + "arcstats_c", "arcstats_data_size", + "arcstats_demand_data_hits", "arcstats_demand_data_misses", + "arcstats_demand_hit_predictive_prefetch", + "arcstats_evict_not_enough", "arcstats_evict_skip", + "arcstats_hits", "arcstats_misses", + "arcstats_memory_available_bytes", "arcstats_memory_direct_count", + "arcstats_memory_free_bytes", "arcstats_memory_indirect_count", + "arcstats_metadata_size", + "arcstats_mfu_data", "arcstats_mfu_evictable_data", + "arcstats_mfu_evictable_metadata", "arcstats_mfu_ghost_hits", + "arcstats_mfu_hits", "arcstats_mfu_metadata", + "arcstats_mru_data", "arcstats_mru_evictable_data", + "arcstats_mru_evictable_metadata", "arcstats_mru_ghost_hits", + "arcstats_mru_hits", "arcstats_mru_metadata", + "arcstats_prefetch_data_hits", "arcstats_prefetch_data_misses", + "arcstats_size", + "zil_commit_count", "zil_itx_count", "zil_commit_stall_count", + "zfetchstats_hits", "zfetchstats_misses", + "dmu_tx_dirty_throttle", "dmu_tx_delay" + ] # Detailed ZFS pool metrics from "zpool_influxdb" (noisy) #[[inputs.exec]] @@ -127,5 +145,5 @@ drop_original = false stats = ["min", "max", "mean", "stdev"] name_prefix = "agg_" - namepass = ["cpu","disk","diskio","mem","net","processes","system","swap"] + namepass = ["cpu","disk","diskio","mem","net","processes","system"] diff --git a/telegraf/telegraf.inputs.dct b/telegraf/telegraf.inputs.dct index 07ceb4d..07dc47f 100644 --- a/telegraf/telegraf.inputs.dct +++ b/telegraf/telegraf.inputs.dct @@ -11,12 +11,5 @@ ] docker_label_exclude = ["com.docker.compose.*", "resty*"] - [[outputs.file]] - files = ["/var/log/telegraf/metrics_docker.json"] - rotation_max_size = "30MB" - rotation_max_archives = 5 - data_format = "json" - namepass = ["docker*"] - ####################### End of Docker/DCT services Metrics ####################### diff --git a/telegraf/telegraf.inputs.playbook b/telegraf/telegraf.inputs.playbook index 5ed7e21..dcfa71e 100644 --- a/telegraf/telegraf.inputs.playbook +++ b/telegraf/telegraf.inputs.playbook @@ -1,31 +1,8 @@ ############################################################################## -# Performance Playbook (estat, nfs_threads) collection - -# Collect output from "estat nfs -jm 10" -[[inputs.execd]] - command = ["estat", "nfs", "-jm", "10"] - name_override = "estat_nfs" - signal = "none" - restart_delay = "30s" - data_format = "json" - tag_keys = [ - "name", - "axis" - ] - json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] - -# Collect output from "estat iscsi -jm 10" -[[inputs.execd]] - command = ["estat", "iscsi", "-jm", "10"] - name_override = "estat_iscsi" - signal = "none" - restart_delay = "30s" - data_format = "json" - tag_keys = [ - "name", - "axis" - ] - json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] +# Performance Playbook (estat, nfs_threads) collection +# Note: estat_nfs, estat_iscsi, and estat_backend-io live in +# telegraf.inputs.storage_io and are always collected when InfluxDB is +# enabled, independent of playbook state. # Collect output from "estat zpl -jm 10" [[inputs.execd]] @@ -40,19 +17,6 @@ ] json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] -# Collect output from "estat backend-io -jm 10" -[[inputs.execd]] - command = ["estat", "backend-io", "-jm", "10"] - name_override = "estat_backend-io" - signal = "none" - restart_delay = "30s" - data_format = "json" - tag_keys = [ - "name", - "axis" - ] - json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] - # Collect output from "estat zvol -jm 10" [[inputs.execd]] command = ["estat", "zvol", "-jm", "10"] @@ -123,47 +87,17 @@ ############################################################################### # PROCESSOR PLUGINS # ############################################################################### -# Convert strings from estat into integer values so they don't get dropped +# Convert strings from estat into integer values so they don't get dropped. +# Scoped to playbook-only metrics; estat_nfs/iscsi/backend-io have their own +# converter in telegraf.inputs.storage_io. [[processors.converter]] + namepass = ["estat_zpl", "estat_zio", "estat_zvol", "estat_zio-queue", "estat_metaslab-alloc"] [processors.converter.fields] integer = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)"] -# The estat output contains a nested latency histogram, so we need to -# parse that out as a new array metric rather than a non-JSON string. -# -# From this: -# "microseconds":"{20000,5},{30000,15},{40000,3},{50000,24}" -# to this: -# "microseconds":"{20000:5,30000:15,40000:3,50000:24}" -# -# Clone the original so we have a "new" metric with a "hist_" name prefix -[[processors.clone]] - order = 1 - name_prefix = "hist_" - namepass = ["estat_*"] - -# Rewrite the histograms for the "hist_estat_*" metrics as JSON objects -[[processors.regex]] - order = 2 - namepass = ["hist_estat_*"] - [[processors.regex.fields]] - key = "microseconds" - pattern = "{(\\d+),(\\d+)}" - replacement = "\"${1}\":${2}" - [[processors.regex.fields]] - key = "microseconds" - pattern = ".*" - replacement = "{$0}" - -# Now parse out the arrays for "hist_estat_*" metrics -[[processors.parser]] - order = 3 - merge = "override" - parse_fields = ["microseconds"] - drop_original = false - data_format = "json" - namepass = ["hist_estat_*"] - fieldpass = ["microseconds"] +# Note: histogram processors (clone/regex/parser) for all estat_* measurements +# live in telegraf.inputs.storage_io, which is always included when InfluxDB +# is enabled. No duplication needed here. # End of Processor section ############################################################################## diff --git a/telegraf/telegraf.inputs.storage_io b/telegraf/telegraf.inputs.storage_io new file mode 100644 index 0000000..2341147 --- /dev/null +++ b/telegraf/telegraf.inputs.storage_io @@ -0,0 +1,99 @@ +############################################################################## +# Storage I/O collection: NFS server, iSCSI target, and backend disk I/O. +# Always included when InfluxDB is enabled, independent of playbook state. +# NFS/iSCSI/backend-IO are also collected continuously by delphix-stat into +# analytics_datapoint (the source for Support Grafana), so full histogram +# capture here mirrors that existing always-on precedent. + +# Collect output from "estat nfs -jm 10" +[[inputs.execd]] + command = ["estat", "nfs", "-jm", "10"] + name_override = "estat_nfs" + signal = "none" + restart_delay = "30s" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat iscsi -jm 10" +[[inputs.execd]] + command = ["estat", "iscsi", "-jm", "10"] + name_override = "estat_iscsi" + signal = "none" + restart_delay = "30s" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat backend-io -jm 10" (stbtrace io equivalent) +[[inputs.execd]] + command = ["estat", "backend-io", "-jm", "10"] + name_override = "estat_backend-io" + signal = "none" + restart_delay = "30s" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Convert estat string fields to integers so they are not dropped by Telegraf. +[[processors.converter]] + namepass = ["estat_nfs", "estat_iscsi", "estat_backend-io"] + [processors.converter.fields] + integer = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)"] + +# Clone estat_* measurements as hist_estat_* to hold histogram data only. +# microseconds is removed from the originals (order 2 below) so it lives in +# hist_estat_* exclusively — no duplication. +# Keeps the original format "{20000,5},{30000,15}" compatible with import code. +[[processors.clone]] + order = 1 + name_prefix = "hist_" + namepass = ["estat_*"] + +# Drop microseconds from all original estat_* measurements after cloning. +# Covers both storage_io (estat_nfs/iscsi/backend-io) and playbook +# (estat_zpl/zvol/zio/etc) measurements in one place. +[[processors.strings]] + order = 2 + namepass = ["estat_*"] + fieldexclude = ["microseconds"] + +# Expand hist_estat_* microseconds histogram strings into per-bucket rows for +# Grafana heatmap support. Each "{upper_bound_us,count}" pair becomes a +# separate metric with le= tag and count field, replacing the +# opaque string. Runs after clone (order=1) and strings (order=2) so the +# microseconds field is still present in hist_estat_* at this point. +[[processors.starlark]] + order = 3 + namepass = ["hist_estat_*"] + source = ''' +def apply(metric): + ms = metric.fields.get("microseconds") + if ms == None: + return [metric] + + result = [] + for pair in ms[1:-1].split("},{"): + parts = pair.split(",") + if len(parts) == 2: + m = deepcopy(metric) + m.tags["le"] = parts[0] + for k in list(m.fields.keys()): + m.fields.pop(k) + m.fields["count"] = int(parts[1]) + result.append(m) + + return result if result else [metric] +''' + +# End of Storage I/O section +############################################################################## From 7344355c28c438c3d7b3ac8f86d609ff03531148 Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Mon, 15 Jun 2026 21:32:09 +0530 Subject: [PATCH 04/10] DLPX-96312 Replace metaslab_alloc_dva probes with metaslab_alloc_dva_range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit metaslab_alloc_dva() is no longer in the normal write path on ZFS 2.4.99+ (Delphix 2026.3); it now only appears in vdev_removal.c. The write path is: metaslab_alloc() -> metaslab_alloc_range() -> metaslab_alloc_dva_range() -> metaslab_group_alloc() Replace the outer kprobe/kretprobe pair from metaslab_alloc_dva to metaslab_alloc_dva_range, which occupies the same structural role. metaslab_alloc_dva_range() receives spa_t * as its first argument, making pool filtering a direct equal_to_pool(spa->spa_name) call with no struct chain traversal needed. This also removes the dual-path dva_owned complexity introduced to work around the missing outer probe — with metaslab_alloc_dva_range as the anchor, metaslab_group_alloc_entry always finds an existing data_map entry and only needs to fill in vdev name and asize. On older ZFS without metaslab_alloc_dva_range, estat(8) prints a WARNING and skips those probes gracefully. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- bpf/estat/metaslab-alloc.c | 153 ++++++++++++------------------------- 1 file changed, 50 insertions(+), 103 deletions(-) diff --git a/bpf/estat/metaslab-alloc.c b/bpf/estat/metaslab-alloc.c index 473adae..af40e95 100644 --- a/bpf/estat/metaslab-alloc.c +++ b/bpf/estat/metaslab-alloc.c @@ -12,9 +12,8 @@ #define VD_NAME_SIZE 32 typedef struct { u64 ts; - u64 size; /* psize stored by metaslab_alloc_dva_entry (old path) */ + u64 size; /* psize from metaslab_alloc_dva_range_entry */ u64 asize; - u8 dva_owned; /* 1 if created by metaslab_alloc_dva_entry (old path) */ char vd_name[VD_NAME_SIZE]; } data_t; @@ -41,29 +40,31 @@ equal_to_pool(char *str) } /* - * In ZFS 2.4.99+ (2026.3 and later) the allocation path no longer goes - * through metaslab_alloc_dva at all — metaslab_group_alloc is called - * directly by a different caller. We therefore use metaslab_group_alloc - * as both the start-timing and pool-filter point. The pool name is - * available via mg->mg_class->mc_spa->spa_name. + * metaslab_alloc_dva_range() is the per-DVA entry point in the write path + * for ZFS versions that have it (Delphix 2026.3 / ZFS 2.4.99+). It receives + * spa_t * as its first argument, making pool filtering straightforward. * - * On older ZFS versions metaslab_alloc_dva is still the outer entry point. - * We probe it to set up the data_map entry (dva_owned=1) and store psize so - * metaslab_alloc_dva_exit can emit an "allocation failures" metric if the - * overall DVA allocation fails. metaslab_group_alloc_entry then populates - * vdev information into the same entry. + * Call chain: + * metaslab_alloc() + * -> metaslab_alloc_range() + * -> metaslab_alloc_dva_range() <- outer probe (entry/exit) + * -> metaslab_group_alloc() <- inner probe (entry/exit) * - * Lifetime of map entries: - * Old path: created by metaslab_alloc_dva_entry, deleted by - * metaslab_alloc_dva_exit (metaslab_group_alloc_exit only - * clears per-group fields so multiple groups can be tried). - * New path: created and deleted by metaslab_group_alloc_entry/exit pair - * (no outer dva probe fires). + * metaslab_alloc_dva_range() may call metaslab_group_alloc() multiple times + * (once per metaslab group tried). We therefore emit a metric on each + * metaslab_group_alloc_exit and reset the per-group fields so the next + * attempt gets a fresh vdev name, leaving the outer entry alive until + * metaslab_alloc_dva_range_exit cleans it up. + * + * metaslab_alloc_dva() (the old outer entry point, now only used in + * vdev_removal.c) is no longer probed. If this script is run on an older + * ZFS that lacks metaslab_alloc_dva_range, estat(8) will print a WARNING and + * skip those probes — no data will be collected. */ -// @@ kprobe|metaslab_alloc_dva|metaslab_alloc_dva_entry +// @@ kprobe|metaslab_alloc_dva_range|metaslab_alloc_dva_range_entry int -metaslab_alloc_dva_entry(struct pt_regs *ctx, +metaslab_alloc_dva_range_entry(struct pt_regs *ctx, spa_t *spa, metaslab_class_t *mc, uint64_t psize) { u32 tid = bpf_get_current_pid_tgid(); @@ -72,9 +73,8 @@ metaslab_alloc_dva_entry(struct pt_regs *ctx, if (!equal_to_pool(spa->spa_name)) return (0); - data.ts = bpf_ktime_get_ns(); - data.size = psize; - data.dva_owned = 1; + data.ts = bpf_ktime_get_ns(); + data.size = psize; data_map.update(&tid, &data); @@ -89,54 +89,18 @@ metaslab_group_alloc_entry(struct pt_regs *ctx, u32 tid = bpf_get_current_pid_tgid(); data_t *data = data_map.lookup(&tid); - if (data != NULL && data->dva_owned) { - /* - * Older path: metaslab_alloc_dva_entry already created the - * entry and set the start timestamp. Just fill in vdev info. - */ - data->asize = asize; - if (mg->mg_vd->vdev_path != NULL) { - bpf_probe_read_str(data->vd_name, - sizeof(data->vd_name), mg->mg_vd->vdev_path); - } else { - bpf_probe_read_str(data->vd_name, - sizeof(data->vd_name), - mg->mg_vd->vdev_ops->vdev_op_type); - } + if (data == NULL || data->ts == 0) + return (0); + + data->asize = asize; + + if (mg->mg_vd->vdev_path != NULL) { + bpf_probe_read_str(data->vd_name, + sizeof(data->vd_name), mg->mg_vd->vdev_path); } else { - /* - * Newer path: metaslab_alloc_dva is not in the call chain. - * Create the entry here, filtering by pool via mc_spa. - * Read each pointer level explicitly: BCC's automatic - * three-level dereference (mg->mg_class->mc_spa->spa_name) - * does not produce a usable address for bpf_probe_read. - */ - metaslab_class_t *mc = NULL; - spa_t *spa = NULL; - bpf_probe_read(&mc, sizeof(mc), &mg->mg_class); - if (!mc) - return (0); - bpf_probe_read(&spa, sizeof(spa), &mc->mc_spa); - if (!spa) - return (0); - if (!equal_to_pool(spa->spa_name)) - return (0); - - data_t d = {}; - d.ts = bpf_ktime_get_ns(); - d.asize = asize; - /* dva_owned stays 0: this entry is owned by group entry/exit */ - - if (mg->mg_vd->vdev_path != NULL) { - bpf_probe_read_str(d.vd_name, - sizeof(d.vd_name), mg->mg_vd->vdev_path); - } else { - bpf_probe_read_str(d.vd_name, - sizeof(d.vd_name), - mg->mg_vd->vdev_ops->vdev_op_type); - } - - data_map.update(&tid, &d); + bpf_probe_read_str(data->vd_name, + sizeof(data->vd_name), + mg->mg_vd->vdev_ops->vdev_op_type); } return (0); @@ -155,10 +119,6 @@ metaslab_group_alloc_exit(struct pt_regs *ctx) if (data == NULL || data->ts == 0) return (0); - /* - * metaslab_group_alloc returns a metaslab_t * (or similar pointer) - * in both old and new ZFS. NULL (0) = failure, non-NULL = success. - */ if (PT_REGS_RC(ctx) == 0) { axis = failure; } else { @@ -169,10 +129,9 @@ metaslab_group_alloc_exit(struct pt_regs *ctx) * Guard against garbage in vd_name (DLPX-88427): a kernel bug on * some engine versions causes raw memory bytes to appear here. * A single non-printable byte anywhere in the string breaks JSON - * output (Python decodes with backslashreplace and the result is - * concatenated into JSON without escaping). Scan all bytes up to - * the first NUL; replace the whole name with "unknown" if the - * string is empty or any byte is outside printable ASCII (0x20-0x7e). + * output. Scan all bytes up to the first NUL; replace the whole + * name with "unknown" if empty or any byte is outside printable + * ASCII (0x20-0x7e). */ bool vd_valid = (data->vd_name[0] != '\0'); #pragma unroll @@ -191,32 +150,21 @@ metaslab_group_alloc_exit(struct pt_regs *ctx) AGGREGATE_DATA(data->vd_name, axis, bpf_ktime_get_ns() - data->ts, data->asize); - if (data->dva_owned) { - /* - * Old path: metaslab_alloc_dva may call metaslab_group_alloc - * multiple times (one per group tried). Reset per-group fields - * so the next attempt gets a fresh vdev name, but leave the - * entry alive so metaslab_alloc_dva_exit can record an - * "allocation failures" metric if the overall DVA fails. - */ - data->asize = 0; - data->vd_name[0] = '\0'; - } else { - /* New path: no outer dva exit will run; clean up now. */ - data_map.delete(&tid); - } + /* + * Reset per-group fields so that if metaslab_alloc_dva_range retries + * with another group, metaslab_group_alloc_entry gets a clean slate. + * Leave the entry alive — metaslab_alloc_dva_range_exit owns cleanup. + */ + data->asize = 0; + data->vd_name[0] = '\0'; return (0); } -// @@ kretprobe|metaslab_alloc_dva|metaslab_alloc_dva_exit +// @@ kretprobe|metaslab_alloc_dva_range|metaslab_alloc_dva_range_exit int -metaslab_alloc_dva_exit(struct pt_regs *ctx, - spa_t *spa, metaslab_class_t *mc, uint64_t psize) +metaslab_alloc_dva_range_exit(struct pt_regs *ctx) { - /* spa, mc, psize match the probed function signature but are unused. */ - (void)spa; (void)mc; (void)psize; - u32 tid = bpf_get_current_pid_tgid(); data_t *data = data_map.lookup(&tid); @@ -224,11 +172,10 @@ metaslab_alloc_dva_exit(struct pt_regs *ctx, return (0); /* - * A live entry exists on both success and failure paths; only the - * return code determines whether to emit an "allocation failures" - * metric (non-zero RC = failure). psize is read from data->size - * rather than the kretprobe argument registers, which may be - * clobbered by the time the function returns. + * Non-zero return means the overall DVA allocation failed (no group + * succeeded). Emit an "allocation failures" metric using the psize + * stored at entry time; the kretprobe argument registers may be + * clobbered so we read from data->size instead. */ if (PT_REGS_RC(ctx) != 0) { char name[] = "allocation failures"; From f7b99c74004f8453b6a9f9eeeb42ab327985683f Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Mon, 15 Jun 2026 21:32:09 +0530 Subject: [PATCH 05/10] DLPX-96312 Replace metaslab_alloc_dva probes with metaslab_alloc_dva_range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit metaslab_alloc_dva() is no longer in the normal write path on ZFS 2.4.99+ (Delphix 2026.3); it now only appears in vdev_removal.c. The write path is: metaslab_alloc() -> metaslab_alloc_range() -> metaslab_alloc_dva_range() -> metaslab_group_alloc() Replace the outer kprobe/kretprobe pair from metaslab_alloc_dva to metaslab_alloc_dva_range, which occupies the same structural role. metaslab_alloc_dva_range() receives spa_t * as its first argument, making pool filtering a direct equal_to_pool(spa->spa_name) call with no struct chain traversal needed. This also removes the dual-path dva_owned complexity introduced to work around the missing outer probe — with metaslab_alloc_dva_range as the anchor, metaslab_group_alloc_entry always finds an existing data_map entry and only needs to fill in vdev name and asize. On older ZFS without metaslab_alloc_dva_range, estat(8) prints a WARNING and skips those probes gracefully. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- bpf/estat/metaslab-alloc.c | 153 ++++++++++++------------------------- 1 file changed, 50 insertions(+), 103 deletions(-) diff --git a/bpf/estat/metaslab-alloc.c b/bpf/estat/metaslab-alloc.c index 473adae..af40e95 100644 --- a/bpf/estat/metaslab-alloc.c +++ b/bpf/estat/metaslab-alloc.c @@ -12,9 +12,8 @@ #define VD_NAME_SIZE 32 typedef struct { u64 ts; - u64 size; /* psize stored by metaslab_alloc_dva_entry (old path) */ + u64 size; /* psize from metaslab_alloc_dva_range_entry */ u64 asize; - u8 dva_owned; /* 1 if created by metaslab_alloc_dva_entry (old path) */ char vd_name[VD_NAME_SIZE]; } data_t; @@ -41,29 +40,31 @@ equal_to_pool(char *str) } /* - * In ZFS 2.4.99+ (2026.3 and later) the allocation path no longer goes - * through metaslab_alloc_dva at all — metaslab_group_alloc is called - * directly by a different caller. We therefore use metaslab_group_alloc - * as both the start-timing and pool-filter point. The pool name is - * available via mg->mg_class->mc_spa->spa_name. + * metaslab_alloc_dva_range() is the per-DVA entry point in the write path + * for ZFS versions that have it (Delphix 2026.3 / ZFS 2.4.99+). It receives + * spa_t * as its first argument, making pool filtering straightforward. * - * On older ZFS versions metaslab_alloc_dva is still the outer entry point. - * We probe it to set up the data_map entry (dva_owned=1) and store psize so - * metaslab_alloc_dva_exit can emit an "allocation failures" metric if the - * overall DVA allocation fails. metaslab_group_alloc_entry then populates - * vdev information into the same entry. + * Call chain: + * metaslab_alloc() + * -> metaslab_alloc_range() + * -> metaslab_alloc_dva_range() <- outer probe (entry/exit) + * -> metaslab_group_alloc() <- inner probe (entry/exit) * - * Lifetime of map entries: - * Old path: created by metaslab_alloc_dva_entry, deleted by - * metaslab_alloc_dva_exit (metaslab_group_alloc_exit only - * clears per-group fields so multiple groups can be tried). - * New path: created and deleted by metaslab_group_alloc_entry/exit pair - * (no outer dva probe fires). + * metaslab_alloc_dva_range() may call metaslab_group_alloc() multiple times + * (once per metaslab group tried). We therefore emit a metric on each + * metaslab_group_alloc_exit and reset the per-group fields so the next + * attempt gets a fresh vdev name, leaving the outer entry alive until + * metaslab_alloc_dva_range_exit cleans it up. + * + * metaslab_alloc_dva() (the old outer entry point, now only used in + * vdev_removal.c) is no longer probed. If this script is run on an older + * ZFS that lacks metaslab_alloc_dva_range, estat(8) will print a WARNING and + * skip those probes — no data will be collected. */ -// @@ kprobe|metaslab_alloc_dva|metaslab_alloc_dva_entry +// @@ kprobe|metaslab_alloc_dva_range|metaslab_alloc_dva_range_entry int -metaslab_alloc_dva_entry(struct pt_regs *ctx, +metaslab_alloc_dva_range_entry(struct pt_regs *ctx, spa_t *spa, metaslab_class_t *mc, uint64_t psize) { u32 tid = bpf_get_current_pid_tgid(); @@ -72,9 +73,8 @@ metaslab_alloc_dva_entry(struct pt_regs *ctx, if (!equal_to_pool(spa->spa_name)) return (0); - data.ts = bpf_ktime_get_ns(); - data.size = psize; - data.dva_owned = 1; + data.ts = bpf_ktime_get_ns(); + data.size = psize; data_map.update(&tid, &data); @@ -89,54 +89,18 @@ metaslab_group_alloc_entry(struct pt_regs *ctx, u32 tid = bpf_get_current_pid_tgid(); data_t *data = data_map.lookup(&tid); - if (data != NULL && data->dva_owned) { - /* - * Older path: metaslab_alloc_dva_entry already created the - * entry and set the start timestamp. Just fill in vdev info. - */ - data->asize = asize; - if (mg->mg_vd->vdev_path != NULL) { - bpf_probe_read_str(data->vd_name, - sizeof(data->vd_name), mg->mg_vd->vdev_path); - } else { - bpf_probe_read_str(data->vd_name, - sizeof(data->vd_name), - mg->mg_vd->vdev_ops->vdev_op_type); - } + if (data == NULL || data->ts == 0) + return (0); + + data->asize = asize; + + if (mg->mg_vd->vdev_path != NULL) { + bpf_probe_read_str(data->vd_name, + sizeof(data->vd_name), mg->mg_vd->vdev_path); } else { - /* - * Newer path: metaslab_alloc_dva is not in the call chain. - * Create the entry here, filtering by pool via mc_spa. - * Read each pointer level explicitly: BCC's automatic - * three-level dereference (mg->mg_class->mc_spa->spa_name) - * does not produce a usable address for bpf_probe_read. - */ - metaslab_class_t *mc = NULL; - spa_t *spa = NULL; - bpf_probe_read(&mc, sizeof(mc), &mg->mg_class); - if (!mc) - return (0); - bpf_probe_read(&spa, sizeof(spa), &mc->mc_spa); - if (!spa) - return (0); - if (!equal_to_pool(spa->spa_name)) - return (0); - - data_t d = {}; - d.ts = bpf_ktime_get_ns(); - d.asize = asize; - /* dva_owned stays 0: this entry is owned by group entry/exit */ - - if (mg->mg_vd->vdev_path != NULL) { - bpf_probe_read_str(d.vd_name, - sizeof(d.vd_name), mg->mg_vd->vdev_path); - } else { - bpf_probe_read_str(d.vd_name, - sizeof(d.vd_name), - mg->mg_vd->vdev_ops->vdev_op_type); - } - - data_map.update(&tid, &d); + bpf_probe_read_str(data->vd_name, + sizeof(data->vd_name), + mg->mg_vd->vdev_ops->vdev_op_type); } return (0); @@ -155,10 +119,6 @@ metaslab_group_alloc_exit(struct pt_regs *ctx) if (data == NULL || data->ts == 0) return (0); - /* - * metaslab_group_alloc returns a metaslab_t * (or similar pointer) - * in both old and new ZFS. NULL (0) = failure, non-NULL = success. - */ if (PT_REGS_RC(ctx) == 0) { axis = failure; } else { @@ -169,10 +129,9 @@ metaslab_group_alloc_exit(struct pt_regs *ctx) * Guard against garbage in vd_name (DLPX-88427): a kernel bug on * some engine versions causes raw memory bytes to appear here. * A single non-printable byte anywhere in the string breaks JSON - * output (Python decodes with backslashreplace and the result is - * concatenated into JSON without escaping). Scan all bytes up to - * the first NUL; replace the whole name with "unknown" if the - * string is empty or any byte is outside printable ASCII (0x20-0x7e). + * output. Scan all bytes up to the first NUL; replace the whole + * name with "unknown" if empty or any byte is outside printable + * ASCII (0x20-0x7e). */ bool vd_valid = (data->vd_name[0] != '\0'); #pragma unroll @@ -191,32 +150,21 @@ metaslab_group_alloc_exit(struct pt_regs *ctx) AGGREGATE_DATA(data->vd_name, axis, bpf_ktime_get_ns() - data->ts, data->asize); - if (data->dva_owned) { - /* - * Old path: metaslab_alloc_dva may call metaslab_group_alloc - * multiple times (one per group tried). Reset per-group fields - * so the next attempt gets a fresh vdev name, but leave the - * entry alive so metaslab_alloc_dva_exit can record an - * "allocation failures" metric if the overall DVA fails. - */ - data->asize = 0; - data->vd_name[0] = '\0'; - } else { - /* New path: no outer dva exit will run; clean up now. */ - data_map.delete(&tid); - } + /* + * Reset per-group fields so that if metaslab_alloc_dva_range retries + * with another group, metaslab_group_alloc_entry gets a clean slate. + * Leave the entry alive — metaslab_alloc_dva_range_exit owns cleanup. + */ + data->asize = 0; + data->vd_name[0] = '\0'; return (0); } -// @@ kretprobe|metaslab_alloc_dva|metaslab_alloc_dva_exit +// @@ kretprobe|metaslab_alloc_dva_range|metaslab_alloc_dva_range_exit int -metaslab_alloc_dva_exit(struct pt_regs *ctx, - spa_t *spa, metaslab_class_t *mc, uint64_t psize) +metaslab_alloc_dva_range_exit(struct pt_regs *ctx) { - /* spa, mc, psize match the probed function signature but are unused. */ - (void)spa; (void)mc; (void)psize; - u32 tid = bpf_get_current_pid_tgid(); data_t *data = data_map.lookup(&tid); @@ -224,11 +172,10 @@ metaslab_alloc_dva_exit(struct pt_regs *ctx, return (0); /* - * A live entry exists on both success and failure paths; only the - * return code determines whether to emit an "allocation failures" - * metric (non-zero RC = failure). psize is read from data->size - * rather than the kretprobe argument registers, which may be - * clobbered by the time the function returns. + * Non-zero return means the overall DVA allocation failed (no group + * succeeded). Emit an "allocation failures" metric using the psize + * stored at entry time; the kretprobe argument registers may be + * clobbered so we read from data->size instead. */ if (PT_REGS_RC(ctx) != 0) { char name[] = "allocation failures"; From 8fe4222ec383991d7106e4254273f68bc5b75cdf Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Wed, 17 Jun 2026 15:45:01 +0530 Subject: [PATCH 06/10] DLPX-96312 Drop hist_estat_* clones that have no histogram data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Starlark processor was returning [metric] (pass-through) for hist_estat_* rows whose microseconds field is absent — specifically name=total summary rows, which carry only iops and throughput. Those clones are pure duplicates of the corresponding estat_* row and should not exist in hist_estat_*. Fix: return [] to drop the metric when microseconds is None. Only rows with actual histogram data produce le= output. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- telegraf/telegraf.inputs.storage_io | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/telegraf/telegraf.inputs.storage_io b/telegraf/telegraf.inputs.storage_io index 2341147..6ac542b 100644 --- a/telegraf/telegraf.inputs.storage_io +++ b/telegraf/telegraf.inputs.storage_io @@ -51,8 +51,9 @@ integer = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)"] # Clone estat_* measurements as hist_estat_* to hold histogram data only. -# microseconds is removed from the originals (order 2 below) so it lives in -# hist_estat_* exclusively — no duplication. +# Rows without a microseconds field (e.g. name=total summary rows) have no +# histogram data; the Starlark processor below drops those clones so they +# don't duplicate the summary stats already present in estat_*. # Keeps the original format "{20000,5},{30000,15}" compatible with import code. [[processors.clone]] order = 1 @@ -79,7 +80,7 @@ def apply(metric): ms = metric.fields.get("microseconds") if ms == None: - return [metric] + return [] # no histogram data — drop to avoid duplicating estat_* summary rows result = [] for pair in ms[1:-1].split("},{"): From 303da165182b913d49464796ee5ad06b9ac2522a Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Thu, 18 Jun 2026 11:06:17 +0530 Subject: [PATCH 07/10] DLPX-96312 Drop hist_estat_* clones with empty microseconds field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When microseconds is present but parses to no valid bucket pairs (e.g. "{ }" from a zero-activity interval), the Starlark function was falling through to `return result if result else [metric]` and passing the metric through unchanged — a duplicate of the estat_* summary row for that series. Fix: return [] instead of [metric] when parsing yields no results. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- telegraf/telegraf.inputs.storage_io | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/telegraf/telegraf.inputs.storage_io b/telegraf/telegraf.inputs.storage_io index 6ac542b..ef177ab 100644 --- a/telegraf/telegraf.inputs.storage_io +++ b/telegraf/telegraf.inputs.storage_io @@ -93,7 +93,7 @@ def apply(metric): m.fields["count"] = int(parts[1]) result.append(m) - return result if result else [metric] + return result if result else [] # empty microseconds (e.g. "{ }") — drop ''' # End of Storage I/O section From 26da557077f9267f24f46635691d2c87c211e091 Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Thu, 18 Jun 2026 11:06:17 +0530 Subject: [PATCH 08/10] DLPX-96312 Drop hist_estat_* clones with empty microseconds field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When microseconds is present but parses to no valid bucket pairs (e.g. "{ }" from a zero-activity interval), the Starlark function was falling through to `return result if result else [metric]` and passing the metric through unchanged — a duplicate of the estat_* summary row for that series. Fix: return [] instead of [metric] when parsing yields no results. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- telegraf/telegraf.inputs.storage_io | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/telegraf/telegraf.inputs.storage_io b/telegraf/telegraf.inputs.storage_io index 6ac542b..0ad27ae 100644 --- a/telegraf/telegraf.inputs.storage_io +++ b/telegraf/telegraf.inputs.storage_io @@ -50,6 +50,18 @@ [processors.converter.fields] integer = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)"] +# Drop name=total rows from estat_backend-io. total = read + write and can be +# derived in Grafana, so storing it is redundant. Runs before the clone (order=1) +# so hist_estat_backend-io also never receives a name=total row. +[[processors.starlark]] + namepass = ["estat_backend-io"] + source = ''' +def apply(metric): + if metric.tags.get("name") == "total": + return [] + return metric +''' + # Clone estat_* measurements as hist_estat_* to hold histogram data only. # Rows without a microseconds field (e.g. name=total summary rows) have no # histogram data; the Starlark processor below drops those clones so they @@ -93,7 +105,7 @@ def apply(metric): m.fields["count"] = int(parts[1]) result.append(m) - return result if result else [metric] + return result if result else [] # empty microseconds (e.g. "{ }") — drop ''' # End of Storage I/O section From 5636a678e475215d950dcbe48a834942818b7767 Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Fri, 19 Jun 2026 14:16:36 +0530 Subject: [PATCH 09/10] DLPX-96312 Fix tcp_stats output stall in mawk/Telegraf execd environment mawk 1.3.4 does not flush its stdout buffer via fflush() when writing to a Telegraf execd pipe, causing tcp_stats data to never reach InfluxDB on engines where mawk is the default awk. Wrapping connstat in a while loop with -c 2 forces awk to exit naturally after each 10-second interval, triggering the C runtime exit flush (fclose) that reliably delivers data to Telegraf. The END block captures the partial second interval on each awk exit so no data is lost. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- telegraf/connstat-stats.sh | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/telegraf/connstat-stats.sh b/telegraf/connstat-stats.sh index 2d6284f..5b2b6b3 100755 --- a/telegraf/connstat-stats.sh +++ b/telegraf/connstat-stats.sh @@ -16,7 +16,13 @@ # swnd, cwnd, rwnd, rtt (averaged across connections) # connections (count of aggregated conns) # -/usr/bin/connstat -PLe -i 10 -T u \ +# mawk (the default awk on Delphix engines) does not flush its stdout buffer +# when writing to a Telegraf execd pipe, even with explicit fflush() calls. +# Wrapping connstat in a loop with -c 2 causes awk to exit naturally after +# each 10-second interval, which triggers the C runtime exit flush (fclose) +# and reliably delivers data to Telegraf. +while true; do +/usr/bin/connstat -PLe -i 10 -c 2 -T u \ -o laddr,lport,raddr,rport,inbytes,outbytes,retranssegs,suna,unsent,swnd,cwnd,rwnd,rtt \ | awk -F',' ' BEGIN { @@ -66,4 +72,15 @@ NF == 13 { sw[key] += $10; cw[key] += $11; rw[key] += $12; rt[key] += $13 cnt[key]++ } +END { + for (key in cnt) { + n = cnt[key] + split(key, k, SUBSEP) + print k[1] "," k[2] "," k[3] "," \ + inb[key] "," outb[key] "," ret[key] "," sun[key] "," uns[key] "," \ + int(sw[key]/n) "," int(cw[key]/n) "," int(rw[key]/n) "," int(rt[key]/n) "," n + } + fflush() +} ' +done From 042e04720ce195eb3fd7f82da6c900c100aed6ba Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Fri, 19 Jun 2026 22:38:40 +0530 Subject: [PATCH 10/10] DLPX-96312 Drop name=total from estat_nfs and estat_iscsi Extend the existing name=total starlark drop filter (previously estat_backend-io only) to also cover estat_nfs and estat_iscsi. The total row is read+write summed at collection time and can be derived in Grafana, so storing it wastes ~33% of each measurement's space. The hist_estat_* clones are unaffected since name=total rows carry no microseconds field and are already dropped by the histogram processor. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- telegraf/telegraf.inputs.storage_io | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/telegraf/telegraf.inputs.storage_io b/telegraf/telegraf.inputs.storage_io index 0ad27ae..6392331 100644 --- a/telegraf/telegraf.inputs.storage_io +++ b/telegraf/telegraf.inputs.storage_io @@ -50,11 +50,11 @@ [processors.converter.fields] integer = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)"] -# Drop name=total rows from estat_backend-io. total = read + write and can be -# derived in Grafana, so storing it is redundant. Runs before the clone (order=1) -# so hist_estat_backend-io also never receives a name=total row. +# Drop name=total rows from estat_nfs, estat_iscsi, and estat_backend-io. +# total = read + write and can be derived in Grafana, so storing it is redundant. +# Runs before the clone (order=1) so hist_estat_* also never receives a name=total row. [[processors.starlark]] - namepass = ["estat_backend-io"] + namepass = ["estat_nfs", "estat_iscsi", "estat_backend-io"] source = ''' def apply(metric): if metric.tags.get("name") == "total":