From 6c2e10729e9d893394e02c33616b7ada561d0c44 Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 01:02:28 +0300 Subject: [PATCH 01/21] refactor(findings): rename dashboard-sprawl class/category to broken-panel/dashboard-hygiene Scope narrows in v0.1 to only broken-panel detection; the old names had no live emitters. --- internal/findings/classes.go | 2 +- internal/findings/classes_test.go | 2 +- internal/findings/finding.go | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/internal/findings/classes.go b/internal/findings/classes.go index 7d11468..76b35c4 100644 --- a/internal/findings/classes.go +++ b/internal/findings/classes.go @@ -18,7 +18,7 @@ const ( ClassLabelPatternOverlyGranular Class = "label-pattern-overly-granular" ClassNeverFiringAlert Class = "never-firing-alert" ClassAlwaysFiringAlert Class = "always-firing-alert" - ClassDashboardSprawl Class = "dashboard-sprawl" + ClassBrokenPanel Class = "broken-panel" ) const docsBaseURL = "https://remetric.dev/findings/" diff --git a/internal/findings/classes_test.go b/internal/findings/classes_test.go index 24ea530..3d9e3c6 100644 --- a/internal/findings/classes_test.go +++ b/internal/findings/classes_test.go @@ -80,7 +80,7 @@ func TestDocURL(t *testing.T) { {"never-firing-alert", findings.ClassNeverFiringAlert, "https://remetric.dev/findings/never-firing-alert"}, {"always-firing-alert", findings.ClassAlwaysFiringAlert, "https://remetric.dev/findings/always-firing-alert"}, {"label-pattern-overly-granular", findings.ClassLabelPatternOverlyGranular, "https://remetric.dev/findings/label-pattern-overly-granular"}, - {"dashboard-sprawl", findings.ClassDashboardSprawl, "https://remetric.dev/findings/dashboard-sprawl"}, + {"broken-panel", findings.ClassBrokenPanel, "https://remetric.dev/findings/broken-panel"}, {"empty class returns empty URL", "", ""}, } for _, tt := range tests { diff --git a/internal/findings/finding.go b/internal/findings/finding.go index 374f355..7551b42 100644 --- a/internal/findings/finding.go +++ b/internal/findings/finding.go @@ -8,11 +8,11 @@ type Category string // Known finding categories. const ( - CategoryCardinality Category = "cardinality" - CategoryUnusedMetrics Category = "unused_metrics" - CategoryLabelPatterns Category = "label_patterns" - CategoryAlertHygiene Category = "alert_hygiene" - CategoryDashboardSprawl Category = "dashboard_sprawl" + CategoryCardinality Category = "cardinality" + CategoryUnusedMetrics Category = "unused_metrics" + CategoryLabelPatterns Category = "label_patterns" + CategoryAlertHygiene Category = "alert_hygiene" + CategoryDashboardHygiene Category = "dashboard_hygiene" ) // Finding is a single piece of feedback produced by an analyzer. From ed524eb8621759b1f2afc717ed469a50d3e1000b Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 01:07:29 +0300 Subject: [PATCH 02/21] feat(findings): add Finding.Dashboard field Used by DashboardHygieneAnalyzer to carry the dashboard title. omitempty keeps the wire form clean for non-dashboard findings. --- internal/findings/finding.go | 23 +++++++++++---------- internal/findings/finding_test.go | 34 +++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 11 deletions(-) diff --git a/internal/findings/finding.go b/internal/findings/finding.go index 7551b42..e3f1e6f 100644 --- a/internal/findings/finding.go +++ b/internal/findings/finding.go @@ -17,17 +17,18 @@ const ( // Finding is a single piece of feedback produced by an analyzer. type Finding struct { - ID string `json:"id"` - Severity Severity `json:"severity"` - Category Category `json:"category"` - Class Class `json:"class,omitempty"` - Title string `json:"title"` - Metric string `json:"metric,omitempty"` - Alert string `json:"alert,omitempty"` - Evidence Evidence `json:"evidence"` - Fix Fix `json:"fix"` - Impact Impact `json:"impact"` - DocURL string `json:"documentation_url,omitempty"` + ID string `json:"id"` + Severity Severity `json:"severity"` + Category Category `json:"category"` + Class Class `json:"class,omitempty"` + Title string `json:"title"` + Metric string `json:"metric,omitempty"` + Alert string `json:"alert,omitempty"` + Dashboard string `json:"dashboard,omitempty"` + Evidence Evidence `json:"evidence"` + Fix Fix `json:"fix"` + Impact Impact `json:"impact"` + DocURL string `json:"documentation_url,omitempty"` } // Evidence describes the observation that triggered a finding. diff --git a/internal/findings/finding_test.go b/internal/findings/finding_test.go index a1808ff..06f90d7 100644 --- a/internal/findings/finding_test.go +++ b/internal/findings/finding_test.go @@ -157,3 +157,37 @@ func TestFinding_AlertJSONRoundTrip(t *testing.T) { t.Errorf("expected alert key, got: %s", b) } } + +func TestFinding_DashboardField_OmitEmptyOnNonDashboardFindings(t *testing.T) { + f := Finding{ + ID: "x", + Severity: SeverityMedium, + Category: CategoryCardinality, + Title: "t", + Metric: "m", + } + b, err := json.Marshal(f) + if err != nil { + t.Fatalf("Marshal: %v", err) + } + if strings.Contains(string(b), `"dashboard"`) { + t.Errorf("empty Dashboard should be omitted from JSON; got: %s", string(b)) + } +} + +func TestFinding_DashboardField_PresentWhenSet(t *testing.T) { + f := Finding{ + ID: "x", + Severity: SeverityMedium, + Category: CategoryDashboardHygiene, + Title: "t", + Dashboard: "Frontend SLOs", + } + b, err := json.Marshal(f) + if err != nil { + t.Fatalf("Marshal: %v", err) + } + if !strings.Contains(string(b), `"dashboard":"Frontend SLOs"`) { + t.Errorf("Dashboard not in JSON output: %s", string(b)) + } +} From a9bd3f81e63e6a7a1f5946bcd78a250b1d3ea6f0 Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 01:13:03 +0300 Subject: [PATCH 03/21] feat(ignore): add Dashboard pattern + --ignore-dashboard flag Anchored regex against Finding.Dashboard. Empty field never matches. Wires through config.IgnoreConfig.Dashboard. --- internal/config/config.go | 16 ++++++++++------ internal/ignore/ignore.go | 31 ++++++++++++++++++++----------- internal/ignore/ignore_test.go | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 17 deletions(-) diff --git a/internal/config/config.go b/internal/config/config.go index f962ac6..f32e7e0 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -40,9 +40,10 @@ type Config struct { // IgnoreConfig holds the raw regex patterns to drop findings whose // structured fields match. Parsed once in Validate. type IgnoreConfig struct { - Metric []string `mapstructure:"metric"` - Label []string `mapstructure:"label"` - Alert []string `mapstructure:"alert"` + Metric []string `mapstructure:"metric"` + Label []string `mapstructure:"label"` + Alert []string `mapstructure:"alert"` + Dashboard []string `mapstructure:"dashboard"` } // PrometheusConfig holds Prometheus client options. @@ -97,6 +98,7 @@ func BindFlags(fs *pflag.FlagSet) { fs.StringArray("ignore-metric", nil, "Drop findings whose metric name matches this regex (repeatable). Anchored full-match.") fs.StringArray("ignore-label", nil, "Drop findings whose evidence label matches this regex (repeatable). Anchored full-match.") fs.StringArray("ignore-alert", nil, "Drop findings whose alert name matches this regex (repeatable). Anchored full-match.") + fs.StringArray("ignore-dashboard", nil, "Drop findings whose dashboard title matches this regex (repeatable). Anchored full-match.") fs.BoolP("verbose", "v", false, "Verbose logging") fs.Duration("timeout", 5*time.Minute, "Total operation timeout") fs.String("config", "", "Path to config file") @@ -138,6 +140,7 @@ func Load(fs *pflag.FlagSet, cfgPath string) (*Config, error) { "ignore-metric": "ignore.metric", "ignore-label": "ignore.label", "ignore-alert": "ignore.alert", + "ignore-dashboard": "ignore.dashboard", "verbose": "verbose", "timeout": "timeout", } @@ -209,9 +212,10 @@ func (c *Config) Validate() error { c.failOnSeverity = sev c.failOnEnabled = enabled filter, err := ignore.New(ignore.Patterns{ - Metric: c.Ignore.Metric, - Label: c.Ignore.Label, - Alert: c.Ignore.Alert, + Metric: c.Ignore.Metric, + Label: c.Ignore.Label, + Alert: c.Ignore.Alert, + Dashboard: c.Ignore.Dashboard, }) if err != nil { return fmt.Errorf("invalid --ignore-*: %w", err) diff --git a/internal/ignore/ignore.go b/internal/ignore/ignore.go index c63cdff..7c2954f 100644 --- a/internal/ignore/ignore.go +++ b/internal/ignore/ignore.go @@ -2,7 +2,7 @@ // Copyright 2026 Andrei Taranik // Package ignore implements post-render filtering of findings by -// regex patterns matched against structured fields (Metric, Label, Alert). +// regex patterns matched against structured fields (Metric, Label, Alert, Dashboard). // // Patterns are anchored full-match: the user writes `foo_.*` and the // filter wraps it as `^(foo_.*)$`. Empty / whitespace-only patterns are @@ -19,17 +19,19 @@ import ( // Patterns groups raw regex strings by the field they match. type Patterns struct { - Metric []string - Label []string - Alert []string + Metric []string + Label []string + Alert []string + Dashboard []string } // Filter applies compiled regex patterns to a slice of findings. // The zero value passes every finding through. type Filter struct { - metric []*regexp.Regexp - label []*regexp.Regexp - alert []*regexp.Regexp + metric []*regexp.Regexp + label []*regexp.Regexp + alert []*regexp.Regexp + dashboard []*regexp.Regexp } // New compiles Patterns into a Filter. Returns an error on any @@ -48,7 +50,11 @@ func New(p Patterns) (*Filter, error) { if err != nil { return nil, err } - return &Filter{metric: mc, label: lc, alert: ac}, nil + dc, err := compile("dashboard", p.Dashboard) + if err != nil { + return nil, err + } + return &Filter{metric: mc, label: lc, alert: ac, dashboard: dc}, nil } func compile(kind string, raw []string) ([]*regexp.Regexp, error) { @@ -71,7 +77,7 @@ func compile(kind string, raw []string) ([]*regexp.Regexp, error) { // is preserved among kept findings. When the filter has no patterns at // all, Apply returns the input slice unchanged (no allocation). func (f *Filter) Apply(in []findings.Finding) (kept []findings.Finding, ignored int) { - if f == nil || (len(f.metric) == 0 && len(f.label) == 0 && len(f.alert) == 0) { + if f == nil || (len(f.metric) == 0 && len(f.label) == 0 && len(f.alert) == 0 && len(f.dashboard) == 0) { return in, 0 } kept = make([]findings.Finding, 0, len(in)) @@ -86,8 +92,8 @@ func (f *Filter) Apply(in []findings.Finding) (kept []findings.Finding, ignored } // drops reports whether any of the configured pattern lists matches one -// of the structured fields of fnd. Order is metric -> label -> alert, -// short-circuit on first match. +// of the structured fields of fnd. Order is metric -> label -> alert -> +// dashboard, short-circuit on first match. func (f *Filter) drops(fnd findings.Finding) bool { if fnd.Metric != "" && matchAny(f.metric, fnd.Metric) { return true @@ -98,6 +104,9 @@ func (f *Filter) drops(fnd findings.Finding) bool { if fnd.Alert != "" && matchAny(f.alert, fnd.Alert) { return true } + if fnd.Dashboard != "" && matchAny(f.dashboard, fnd.Dashboard) { + return true + } return false } diff --git a/internal/ignore/ignore_test.go b/internal/ignore/ignore_test.go index 82478dc..6a3ca76 100644 --- a/internal/ignore/ignore_test.go +++ b/internal/ignore/ignore_test.go @@ -196,3 +196,36 @@ func TestFilter_Apply_PreservesOrder(t *testing.T) { t.Errorf("kept order mismatch (-want +got):\n%s", diff) } } + +func TestFilter_Apply_DashboardPatternDrops(t *testing.T) { + f, err := ignore.New(ignore.Patterns{Dashboard: []string{"Legacy .*"}}) + if err != nil { + t.Fatalf("New: %v", err) + } + in := []findings.Finding{ + {ID: "1", Dashboard: "Legacy Disk Stats"}, + {ID: "2", Dashboard: "Current SLOs"}, + } + kept, ignored := f.Apply(in) + if ignored != 1 { + t.Errorf("ignored = %d, want 1", ignored) + } + if len(kept) != 1 || kept[0].ID != "2" { + t.Errorf("kept = %+v, want [{ID:2 ...}]", kept) + } +} + +func TestFilter_Apply_DashboardEmptyFieldNoMatch(t *testing.T) { + f, err := ignore.New(ignore.Patterns{Dashboard: []string{".*"}}) + if err != nil { + t.Fatalf("New: %v", err) + } + in := []findings.Finding{{ID: "1", Dashboard: ""}, {ID: "2", Metric: "m"}} + kept, ignored := f.Apply(in) + if ignored != 0 { + t.Errorf("ignored = %d, want 0 (empty Dashboard must not match)", ignored) + } + if len(kept) != 2 { + t.Errorf("kept len = %d, want 2", len(kept)) + } +} From a2680ec51888d50b655f640051fb58eedf6bf99c Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 01:19:45 +0300 Subject: [PATCH 04/21] feat(grafana): add Dashboard.PanelTargets and Client.BaseURL PanelTargets walks rows recursively, returns (panel-title, expr) pairs filtered to Prometheus targets. BaseURL is a defensive copy used by the dashboard-hygiene analyzer to build absolute dashboard URLs in Fix snippets. --- internal/grafana/client.go | 10 ++++++++ internal/grafana/client_test.go | 20 ++++++++++++++++ internal/grafana/dashboard.go | 37 ++++++++++++++++++++++++++++++ internal/grafana/dashboard_test.go | 36 +++++++++++++++++++++++++++++ 4 files changed, 103 insertions(+) diff --git a/internal/grafana/client.go b/internal/grafana/client.go index 747dcb5..272e0c2 100644 --- a/internal/grafana/client.go +++ b/internal/grafana/client.go @@ -121,6 +121,16 @@ func (c *Client) do(ctx context.Context, method, pathAndQuery string, body io.Re return buf, nil } +// BaseURL returns a defensive copy of the configured base URL. +// Mutating the returned value does not affect future requests. +func (c *Client) BaseURL() *url.URL { + if c.baseURL == nil { + return nil + } + cp := *c.baseURL + return &cp +} + // singleJoin joins URL path segments without collapsing slashes // across two non-empty path components. func singleJoin(a, b string) string { diff --git a/internal/grafana/client_test.go b/internal/grafana/client_test.go index 48d12f9..129ac9a 100644 --- a/internal/grafana/client_test.go +++ b/internal/grafana/client_test.go @@ -83,3 +83,23 @@ func TestDo_ReturnsErrNotFoundOn404(t *testing.T) { t.Errorf("err = %v, want ErrNotFound", err) } } + +func TestClient_BaseURL_ReturnsAbsoluteCopy(t *testing.T) { + c, err := New("https://grafana.example.com/sub/") + if err != nil { + t.Fatalf("New: %v", err) + } + u := c.BaseURL() + if u == nil { + t.Fatal("BaseURL returned nil") + } + if u.Scheme != "https" || u.Host != "grafana.example.com" || u.Path != "/sub/" { + t.Errorf("BaseURL = %+v, want https://grafana.example.com/sub/", u) + } + // Caller mutation must not affect the client. + u.Path = "/mutated" + again := c.BaseURL() + if again.Path != "/sub/" { + t.Errorf("BaseURL after caller mutation = %q, want /sub/ (defensive copy)", again.Path) + } +} diff --git a/internal/grafana/dashboard.go b/internal/grafana/dashboard.go index d793294..89c72b3 100644 --- a/internal/grafana/dashboard.go +++ b/internal/grafana/dashboard.go @@ -18,6 +18,7 @@ type Dashboard struct { } type panel struct { + Title string `json:"title"` Type string `json:"type"` Panels []panel `json:"panels,omitempty"` Targets []target `json:"targets,omitempty"` @@ -93,3 +94,39 @@ func collectPanelQueries(panels []panel, out *[]string) { } } } + +// PanelTarget is a flat (panel-title, expr) pair for analyzers that +// care about who-references-what. Skips non-Prometheus targets and +// empty expressions. +type PanelTarget struct { + PanelTitle string + Expr string +} + +// PanelTargets returns every Prometheus target across all panels, +// recursing through rows. Panel titles fall back to "" +// when the title field is empty. +func (d *Dashboard) PanelTargets() []PanelTarget { + var out []PanelTarget + collectPanelTargets(d.Panels, &out) + return out +} + +func collectPanelTargets(panels []panel, out *[]PanelTarget) { + for _, p := range panels { + collectPanelTargets(p.Panels, out) + title := p.Title + if title == "" { + title = "" + } + for _, t := range p.Targets { + if t.Datasource.Type != "" && t.Datasource.Type != "prometheus" { + continue + } + if t.Expr == "" { + continue + } + *out = append(*out, PanelTarget{PanelTitle: title, Expr: t.Expr}) + } + } +} diff --git a/internal/grafana/dashboard_test.go b/internal/grafana/dashboard_test.go index 8c9bdb6..ca0c59c 100644 --- a/internal/grafana/dashboard_test.go +++ b/internal/grafana/dashboard_test.go @@ -112,6 +112,42 @@ func TestDashboard_Queries_NonPrometheusDatasourceSkipped(t *testing.T) { } } +func TestDashboard_PanelTargets_WalksRecursivelyAndFiltersNonProm(t *testing.T) { + body := `{"dashboard":{"uid":"d1","title":"Top","panels":[ + {"type":"graph","title":"P1","targets":[{"expr":"up","datasource":{"type":"prometheus"}}]}, + {"type":"row","title":"Row","panels":[ + {"type":"graph","title":"P2","targets":[{"expr":"down","datasource":{"type":"prometheus"}}]}, + {"type":"logs","title":"P3","targets":[{"expr":"{job=\"app\"}","datasource":{"type":"loki"}}]} + ]}, + {"type":"graph","title":"","targets":[{"expr":"node_load1","datasource":{"type":"prometheus"}}]} + ]}}` + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(body)) + })) + defer ts.Close() + c, err := New(ts.URL) + if err != nil { + t.Fatalf("New: %v", err) + } + dash, err := c.Dashboard(context.Background(), "d1") + if err != nil { + t.Fatalf("Dashboard: %v", err) + } + got := dash.PanelTargets() + // Recursion mirrors collectPanelQueries: for each top-level panel, + // descend into its children before processing its own targets. So + // P1's own targets emit before we descend into the Row sibling. + want := []PanelTarget{ + {PanelTitle: "P1", Expr: "up"}, + {PanelTitle: "P2", Expr: "down"}, // nested under Row; P3 (loki) skipped + {PanelTitle: "", Expr: "node_load1"}, + } + if diff := cmp.Diff(want, got); diff != "" { + t.Errorf("PanelTargets mismatch (-want +got):\n%s", diff) + } +} + func TestDashboard_Queries_TargetWithoutExpr(t *testing.T) { body := `{ "dashboard": { From 0b403a89283910d0938c37d0c9c972ecb6bd0a44 Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 01:26:28 +0300 Subject: [PATCH 05/21] feat(analyzers): add dashboardhygiene skeleton New analyzer flags Grafana dashboards whose panel queries reference missing Prometheus metrics. Skeleton + nil-Graf warning path; full algorithm in subsequent commits. --- .../dashboardhygiene/dashboardhygiene.go | 34 +++++++++++++++++ .../dashboardhygiene/dashboardhygiene_test.go | 37 +++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 internal/analyzers/dashboardhygiene/dashboardhygiene.go create mode 100644 internal/analyzers/dashboardhygiene/dashboardhygiene_test.go diff --git a/internal/analyzers/dashboardhygiene/dashboardhygiene.go b/internal/analyzers/dashboardhygiene/dashboardhygiene.go new file mode 100644 index 0000000..d6bdca7 --- /dev/null +++ b/internal/analyzers/dashboardhygiene/dashboardhygiene.go @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Andrei Taranik + +// Package dashboardhygiene flags Grafana dashboards whose panel queries +// reference Prometheus metric names that do not exist in the head +// series or in any recording-rule output. One finding per +// (dashboard, missing-metric) pair, severity Medium. +package dashboardhygiene + +import ( + "context" + + "github.com/remetric-dev/remetric/internal/analyzers" +) + +// Analyzer is the dashboard-hygiene analyzer (broken panels). +type Analyzer struct{} + +// New constructs an Analyzer with default settings. +func New() *Analyzer { return &Analyzer{} } + +// Name implements analyzers.Analyzer. +func (a *Analyzer) Name() string { return "dashboardhygiene" } + +// Analyze implements analyzers.Analyzer. Without a Grafana client +// the analyzer cannot operate: returns a single warning, no findings, +// no error (consistent with unusedmetrics). +func (a *Analyzer) Analyze(_ context.Context, d analyzers.Deps) (analyzers.Result, error) { + if d.Graf == nil { + return analyzers.Result{Warnings: []string{"grafana not configured: skipped"}}, nil + } + // Implementation continues in Task 6+. + return analyzers.Result{}, nil +} diff --git a/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go b/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go new file mode 100644 index 0000000..d2f4d1c --- /dev/null +++ b/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Andrei Taranik + +package dashboardhygiene + +import ( + "context" + "log/slog" + "testing" + + "github.com/remetric-dev/remetric/internal/analyzers" +) + +func TestAnalyzer_Name(t *testing.T) { + if got := New().Name(); got != "dashboardhygiene" { + t.Errorf("Name = %q, want %q", got, "dashboardhygiene") + } +} + +func TestAnalyzer_NoGrafanaClientReturnsWarning(t *testing.T) { + res, err := New().Analyze(context.Background(), analyzers.Deps{ + Logger: slog.Default(), + Limits: analyzers.DefaultLimits(), + }) + if err != nil { + t.Fatalf("Analyze: %v", err) + } + if len(res.Findings) != 0 { + t.Errorf("Findings len = %d, want 0", len(res.Findings)) + } + if len(res.Warnings) != 1 { + t.Fatalf("Warnings len = %d, want 1; got %v", len(res.Warnings), res.Warnings) + } + if res.Warnings[0] != "grafana not configured: skipped" { + t.Errorf("Warnings[0] = %q, want %q", res.Warnings[0], "grafana not configured: skipped") + } +} From e5023fad7e632f9f49d1ec7b0ea639e23a0c9410 Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 01:35:52 +0300 Subject: [PATCH 06/21] feat(analyzers): dashboardhygiene happy-path detection Walks every Grafana dashboard, parses Prometheus targets via promqlx, and groups (dashboard, missing-metric) pairs. Severity Medium per the design. Recording-rule outputs and fix snippet come in later commits. --- .../dashboardhygiene/dashboardhygiene.go | 143 ++++++++++++++- .../dashboardhygiene/dashboardhygiene_test.go | 167 ++++++++++++++++++ 2 files changed, 304 insertions(+), 6 deletions(-) diff --git a/internal/analyzers/dashboardhygiene/dashboardhygiene.go b/internal/analyzers/dashboardhygiene/dashboardhygiene.go index d6bdca7..522eba3 100644 --- a/internal/analyzers/dashboardhygiene/dashboardhygiene.go +++ b/internal/analyzers/dashboardhygiene/dashboardhygiene.go @@ -9,8 +9,14 @@ package dashboardhygiene import ( "context" + "fmt" + "net/url" + "sort" "github.com/remetric-dev/remetric/internal/analyzers" + "github.com/remetric-dev/remetric/internal/findings" + "github.com/remetric-dev/remetric/internal/grafana" + "github.com/remetric-dev/remetric/internal/promqlx" ) // Analyzer is the dashboard-hygiene analyzer (broken panels). @@ -22,13 +28,138 @@ func New() *Analyzer { return &Analyzer{} } // Name implements analyzers.Analyzer. func (a *Analyzer) Name() string { return "dashboardhygiene" } -// Analyze implements analyzers.Analyzer. Without a Grafana client -// the analyzer cannot operate: returns a single warning, no findings, -// no error (consistent with unusedmetrics). -func (a *Analyzer) Analyze(_ context.Context, d analyzers.Deps) (analyzers.Result, error) { +// dashAgg accumulates per-(dashboard, missing-metric) state during +// the dashboard walk: which panels referenced the metric (titles +// preserved in walk order), the dashboard's title, and an absolute +// URL into Grafana for the Fix snippet. +type dashAgg struct { + dashTitle string + dashURL string + panelTitles []string +} + +// Analyze implements analyzers.Analyzer. +func (a *Analyzer) Analyze(ctx context.Context, d analyzers.Deps) (analyzers.Result, error) { if d.Graf == nil { return analyzers.Result{Warnings: []string{"grafana not configured: skipped"}}, nil } - // Implementation continues in Task 6+. - return analyzers.Result{}, nil + + ingested, err := d.Prom.LabelValues(ctx, "__name__") + if err != nil { + return analyzers.Result{}, fmt.Errorf("dashboardhygiene: ingested names: %w", err) + } + exists := make(map[string]struct{}, len(ingested)) + for _, n := range ingested { + exists[n] = struct{}{} + } + + refs, err := d.Graf.Search(ctx) + if err != nil { + return analyzers.Result{}, fmt.Errorf("dashboardhygiene: grafana search: %w", err) + } + + type key struct { + uid string + metric string + } + groups := map[key]*dashAgg{} + var warnings []string + + for _, ref := range refs { + dash, err := d.Graf.Dashboard(ctx, ref.UID) + if err != nil { + warnings = append(warnings, fmt.Sprintf("dashboardhygiene: dashboard %q: %v", ref.UID, err)) + continue + } + dashURL := absoluteDashboardURL(d.Graf, ref.URL) + for _, pt := range dash.PanelTargets() { + names, err := promqlx.ExtractFromQuery(pt.Expr) + if err != nil { + continue + } + for name := range names { + if _, ok := exists[name]; ok { + continue + } + k := key{uid: ref.UID, metric: name} + agg := groups[k] + if agg == nil { + agg = &dashAgg{dashTitle: dash.Title, dashURL: dashURL} + groups[k] = agg + } + agg.panelTitles = append(agg.panelTitles, pt.PanelTitle) + } + } + } + + out := make([]findings.Finding, 0, len(groups)) + for k, agg := range groups { + out = append(out, buildFinding(k.uid, k.metric, *agg)) + } + sort.SliceStable(out, func(i, j int) bool { + if out[i].Severity != out[j].Severity { + return out[i].Severity > out[j].Severity + } + ni, nj := len(out[i].Evidence.SampleValues), len(out[j].Evidence.SampleValues) + if ni != nj { + return ni > nj + } + return out[i].Metric < out[j].Metric + }) + return analyzers.Result{Findings: out, Warnings: warnings}, nil +} + +// absoluteDashboardURL joins the Grafana base URL with the relative +// path from /api/search. Empty rel returns empty string. +func absoluteDashboardURL(c *grafana.Client, rel string) string { + if rel == "" || c == nil { + return "" + } + base := c.BaseURL() + if base == nil { + return "" + } + relURL, err := url.Parse(rel) + if err != nil { + return "" + } + return base.ResolveReference(relURL).String() +} + +// buildFinding renders one Finding for a (dashboard, missing-metric) pair. +func buildFinding(uid, missing string, a dashAgg) findings.Finding { + n := len(a.panelTitles) + sample := a.panelTitles + if n > 5 { + sample = sample[:5] + } + return findings.Finding{ + ID: "broken-panel:" + uid + ":" + missing, + Severity: findings.SeverityMedium, + Category: findings.CategoryDashboardHygiene, + Class: findings.ClassBrokenPanel, + Title: fmt.Sprintf("dashboard %q references missing metric %q", a.dashTitle, missing), + Metric: missing, + Dashboard: a.dashTitle, + Evidence: findings.Evidence{ + Description: fmt.Sprintf( + "%d panel(s) in dashboard %q query %q which is not present in head series or recording-rule outputs", + n, a.dashTitle, missing), + SampleValues: sample, + }, + Fix: findings.Fix{ + Type: "edit_dashboard", + Config: buildFix(a.dashTitle, missing, a.dashURL, a.panelTitles), + }, + Impact: findings.Impact{ + EstimationMethod: "broken_panel", + }, + } +} + +// buildFix returns a paste-ready instruction block. Full implementation +// comes in Task 11 (fix.go). For now the stub returns an empty string +// so happy-path tests can pass without locking in the snippet format. +func buildFix(_, _, _ string, _ []string) string { + return "" } diff --git a/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go b/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go index d2f4d1c..8913fa3 100644 --- a/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go +++ b/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go @@ -5,10 +5,16 @@ package dashboardhygiene import ( "context" + "encoding/json" "log/slog" + "net/http" + "net/http/httptest" "testing" "github.com/remetric-dev/remetric/internal/analyzers" + "github.com/remetric-dev/remetric/internal/findings" + "github.com/remetric-dev/remetric/internal/grafana" + "github.com/remetric-dev/remetric/internal/prometheus" ) func TestAnalyzer_Name(t *testing.T) { @@ -35,3 +41,164 @@ func TestAnalyzer_NoGrafanaClientReturnsWarning(t *testing.T) { t.Errorf("Warnings[0] = %q, want %q", res.Warnings[0], "grafana not configured: skipped") } } + +// stubsConfig defines the canned responses for the Prom + Grafana servers. +type stubsConfig struct { + IngestedNames []string // /api/v1/label/__name__/values + Rules []ruleStub // /api/v1/rules + Dashboards []dashboardStub // /api/search + /api/dashboards/uid/ + PromHandler http.HandlerFunc // optional override for Prom + GrafHandler http.HandlerFunc // optional override for Grafana +} + +type ruleStub struct { + Name string // recording rule output name + Query string + Type string // "alerting" or "recording" +} + +type dashboardStub struct { + UID string + Title string + URL string // relative path Grafana would return in /api/search + Body string // raw JSON body for /api/dashboards/uid/ +} + +func newStubs(t *testing.T, cfg stubsConfig) (promURL, grafURL string) { + t.Helper() + prom := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if cfg.PromHandler != nil { + cfg.PromHandler(w, r) + return + } + w.Header().Set("Content-Type", "application/json") + switch r.URL.Path { + case "/api/v1/label/__name__/values": + names := cfg.IngestedNames + if names == nil { + names = []string{} + } + _ = json.NewEncoder(w).Encode(map[string]any{"status": "success", "data": names}) + case "/api/v1/rules": + rules := make([]map[string]any, 0, len(cfg.Rules)) + for _, rl := range cfg.Rules { + rules = append(rules, map[string]any{"name": rl.Name, "query": rl.Query, "type": rl.Type}) + } + _ = json.NewEncoder(w).Encode(map[string]any{ + "status": "success", + "data": map[string]any{ + "groups": []map[string]any{{"name": "g", "file": "f", "rules": rules}}, + }, + }) + default: + http.NotFound(w, r) + } + })) + t.Cleanup(prom.Close) + + graf := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if cfg.GrafHandler != nil { + cfg.GrafHandler(w, r) + return + } + w.Header().Set("Content-Type", "application/json") + if r.URL.Path == "/api/search" { + refs := make([]map[string]any, 0, len(cfg.Dashboards)) + for _, d := range cfg.Dashboards { + refs = append(refs, map[string]any{"uid": d.UID, "title": d.Title, "url": d.URL}) + } + _ = json.NewEncoder(w).Encode(refs) + return + } + for _, d := range cfg.Dashboards { + if r.URL.Path == "/api/dashboards/uid/"+d.UID { + _, _ = w.Write([]byte(d.Body)) + return + } + } + http.NotFound(w, r) + })) + t.Cleanup(graf.Close) + return prom.URL, graf.URL +} + +func mustClients(t *testing.T, promURL, grafURL string) (*prometheus.Client, *grafana.Client) { + t.Helper() + pc, err := prometheus.New(promURL, prometheus.WithFlavorHint(prometheus.FlavorProm)) + if err != nil { + t.Fatalf("prometheus.New: %v", err) + } + gc, err := grafana.New(grafURL) + if err != nil { + t.Fatalf("grafana.New: %v", err) + } + return pc, gc +} + +func TestAnalyzer_FlagsMissingMetric(t *testing.T) { + promURL, grafURL := newStubs(t, stubsConfig{ + IngestedNames: []string{"up", "node_cpu_seconds_total"}, + Dashboards: []dashboardStub{{ + UID: "d1", + Title: "Frontend SLOs", + URL: "/d/d1/frontend-slos", + Body: `{"dashboard":{"uid":"d1","title":"Frontend SLOs","panels":[ + {"type":"graph","title":"Latency p99","targets":[{"expr":"missing_metric_xyz","datasource":{"type":"prometheus"}}]} + ]}}`, + }}, + }) + pc, gc := mustClients(t, promURL, grafURL) + + res, err := New().Analyze(context.Background(), analyzers.Deps{ + Prom: pc, + Graf: gc, + Logger: slog.Default(), + Limits: analyzers.DefaultLimits(), + }) + if err != nil { + t.Fatalf("Analyze: %v", err) + } + if len(res.Findings) != 1 { + t.Fatalf("Findings len = %d, want 1; got %+v", len(res.Findings), res.Findings) + } + f := res.Findings[0] + if f.Class != findings.ClassBrokenPanel { + t.Errorf("Class = %q, want %q", f.Class, findings.ClassBrokenPanel) + } + if f.Category != findings.CategoryDashboardHygiene { + t.Errorf("Category = %q, want %q", f.Category, findings.CategoryDashboardHygiene) + } + if f.Severity != findings.SeverityMedium { + t.Errorf("Severity = %v, want Medium", f.Severity) + } + if f.Metric != "missing_metric_xyz" { + t.Errorf("Metric = %q, want %q", f.Metric, "missing_metric_xyz") + } + if f.Dashboard != "Frontend SLOs" { + t.Errorf("Dashboard = %q, want %q", f.Dashboard, "Frontend SLOs") + } +} + +func TestAnalyzer_ExistingMetricIsClean(t *testing.T) { + promURL, grafURL := newStubs(t, stubsConfig{ + IngestedNames: []string{"up"}, + Dashboards: []dashboardStub{{ + UID: "d1", + Title: "Frontend SLOs", + Body: `{"dashboard":{"uid":"d1","title":"Frontend SLOs","panels":[ + {"type":"graph","title":"P1","targets":[{"expr":"up","datasource":{"type":"prometheus"}}]} + ]}}`, + }}, + }) + pc, gc := mustClients(t, promURL, grafURL) + + res, err := New().Analyze(context.Background(), analyzers.Deps{ + Prom: pc, Graf: gc, Logger: slog.Default(), Limits: analyzers.DefaultLimits(), + }) + if err != nil { + t.Fatalf("Analyze: %v", err) + } + if len(res.Findings) != 0 { + t.Errorf("Findings len = %d, want 0; got %+v", len(res.Findings), res.Findings) + } +} From aef8e1506183c9710a2c07e70c87bb3dafe385c8 Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 01:44:01 +0300 Subject: [PATCH 07/21] refactor(dashboardhygiene): deterministic sort + named const + param rename Code-quality follow-ups to the happy-path commit: - add Dashboard tiebreaker to the comparator so output is deterministic across map iterations - extract the comparator into findingLess to keep Analyze under the gocyclo limit (was 15, the extra branch pushed it to 16) - extract sample-cap 5 to a named const for grep-ability - rename buildFinding parameter to mirror the call-site variable name --- .../dashboardhygiene/dashboardhygiene.go | 53 ++++++++++++------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/internal/analyzers/dashboardhygiene/dashboardhygiene.go b/internal/analyzers/dashboardhygiene/dashboardhygiene.go index 522eba3..9c3ca85 100644 --- a/internal/analyzers/dashboardhygiene/dashboardhygiene.go +++ b/internal/analyzers/dashboardhygiene/dashboardhygiene.go @@ -19,6 +19,11 @@ import ( "github.com/remetric-dev/remetric/internal/promqlx" ) +// maxSamplePanelTitles is the upper bound on Evidence.SampleValues +// per broken-panel finding. Beyond this, panel titles are listed +// only in the Fix snippet, not as structured evidence. +const maxSamplePanelTitles = 5 + // Analyzer is the dashboard-hygiene analyzer (broken panels). type Analyzer struct{} @@ -96,19 +101,29 @@ func (a *Analyzer) Analyze(ctx context.Context, d analyzers.Deps) (analyzers.Res for k, agg := range groups { out = append(out, buildFinding(k.uid, k.metric, *agg)) } - sort.SliceStable(out, func(i, j int) bool { - if out[i].Severity != out[j].Severity { - return out[i].Severity > out[j].Severity - } - ni, nj := len(out[i].Evidence.SampleValues), len(out[j].Evidence.SampleValues) - if ni != nj { - return ni > nj - } - return out[i].Metric < out[j].Metric - }) + sort.SliceStable(out, func(i, j int) bool { return findingLess(out[i], out[j]) }) return analyzers.Result{Findings: out, Warnings: warnings}, nil } +// findingLess orders broken-panel findings deterministically by +// severity (desc), sample-count (desc), dashboard (asc), metric (asc). +// The Dashboard tiebreaker is required because two findings can share +// (severity, sample-count, metric) when different dashboards reference +// the same missing metric. +func findingLess(a, b findings.Finding) bool { + if a.Severity != b.Severity { + return a.Severity > b.Severity + } + na, nb := len(a.Evidence.SampleValues), len(b.Evidence.SampleValues) + if na != nb { + return na > nb + } + if a.Dashboard != b.Dashboard { + return a.Dashboard < b.Dashboard + } + return a.Metric < b.Metric +} + // absoluteDashboardURL joins the Grafana base URL with the relative // path from /api/search. Empty rel returns empty string. func absoluteDashboardURL(c *grafana.Client, rel string) string { @@ -127,29 +142,29 @@ func absoluteDashboardURL(c *grafana.Client, rel string) string { } // buildFinding renders one Finding for a (dashboard, missing-metric) pair. -func buildFinding(uid, missing string, a dashAgg) findings.Finding { - n := len(a.panelTitles) - sample := a.panelTitles - if n > 5 { - sample = sample[:5] +func buildFinding(uid, missing string, agg dashAgg) findings.Finding { + n := len(agg.panelTitles) + sample := agg.panelTitles + if n > maxSamplePanelTitles { + sample = sample[:maxSamplePanelTitles] } return findings.Finding{ ID: "broken-panel:" + uid + ":" + missing, Severity: findings.SeverityMedium, Category: findings.CategoryDashboardHygiene, Class: findings.ClassBrokenPanel, - Title: fmt.Sprintf("dashboard %q references missing metric %q", a.dashTitle, missing), + Title: fmt.Sprintf("dashboard %q references missing metric %q", agg.dashTitle, missing), Metric: missing, - Dashboard: a.dashTitle, + Dashboard: agg.dashTitle, Evidence: findings.Evidence{ Description: fmt.Sprintf( "%d panel(s) in dashboard %q query %q which is not present in head series or recording-rule outputs", - n, a.dashTitle, missing), + n, agg.dashTitle, missing), SampleValues: sample, }, Fix: findings.Fix{ Type: "edit_dashboard", - Config: buildFix(a.dashTitle, missing, a.dashURL, a.panelTitles), + Config: buildFix(agg.dashTitle, missing, agg.dashURL, agg.panelTitles), }, Impact: findings.Impact{ EstimationMethod: "broken_panel", From 25b0934674baa8cf715fe427351e6dc527079bb6 Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 01:48:12 +0300 Subject: [PATCH 08/21] test(dashboardhygiene): pin grouping by (dashboard, missing-metric) Same missing metric across multiple panels yields one finding; distinct missing metrics in the same dashboard emit separate findings. --- .../dashboardhygiene/dashboardhygiene_test.go | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go b/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go index 8913fa3..a9c8a60 100644 --- a/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go +++ b/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go @@ -9,8 +9,11 @@ import ( "log/slog" "net/http" "net/http/httptest" + "sort" "testing" + "github.com/google/go-cmp/cmp" + "github.com/remetric-dev/remetric/internal/analyzers" "github.com/remetric-dev/remetric/internal/findings" "github.com/remetric-dev/remetric/internal/grafana" @@ -202,3 +205,70 @@ func TestAnalyzer_ExistingMetricIsClean(t *testing.T) { t.Errorf("Findings len = %d, want 0; got %+v", len(res.Findings), res.Findings) } } + +func TestAnalyzer_GroupsByDashboardAndMetric(t *testing.T) { + promURL, grafURL := newStubs(t, stubsConfig{ + IngestedNames: []string{"up"}, + Dashboards: []dashboardStub{{ + UID: "d1", + Title: "Disk", + Body: `{"dashboard":{"uid":"d1","title":"Disk","panels":[ + {"type":"graph","title":"Disk I/O 5m","targets":[{"expr":"missing_xyz","datasource":{"type":"prometheus"}}]}, + {"type":"graph","title":"Disk I/O 1h","targets":[{"expr":"missing_xyz","datasource":{"type":"prometheus"}}]}, + {"type":"graph","title":"Disk I/O 24h","targets":[{"expr":"sum(missing_xyz)","datasource":{"type":"prometheus"}}]} + ]}}`, + }}, + }) + pc, gc := mustClients(t, promURL, grafURL) + + res, err := New().Analyze(context.Background(), analyzers.Deps{ + Prom: pc, Graf: gc, Logger: slog.Default(), Limits: analyzers.DefaultLimits(), + }) + if err != nil { + t.Fatalf("Analyze: %v", err) + } + if len(res.Findings) != 1 { + t.Fatalf("Findings len = %d, want 1; got %+v", len(res.Findings), res.Findings) + } + f := res.Findings[0] + if got := len(f.Evidence.SampleValues); got != 3 { + t.Errorf("Evidence.SampleValues len = %d, want 3", got) + } + wantTitles := []string{"Disk I/O 5m", "Disk I/O 1h", "Disk I/O 24h"} + if diff := cmp.Diff(wantTitles, f.Evidence.SampleValues); diff != "" { + t.Errorf("SampleValues mismatch (-want +got):\n%s", diff) + } +} + +func TestAnalyzer_DistinctMissingMetricsEmitSeparateFindings(t *testing.T) { + promURL, grafURL := newStubs(t, stubsConfig{ + IngestedNames: []string{"up"}, + Dashboards: []dashboardStub{{ + UID: "d1", + Title: "Mixed", + Body: `{"dashboard":{"uid":"d1","title":"Mixed","panels":[ + {"type":"graph","title":"P1","targets":[{"expr":"missing_a","datasource":{"type":"prometheus"}}]}, + {"type":"graph","title":"P2","targets":[{"expr":"missing_b","datasource":{"type":"prometheus"}}]} + ]}}`, + }}, + }) + pc, gc := mustClients(t, promURL, grafURL) + + res, err := New().Analyze(context.Background(), analyzers.Deps{ + Prom: pc, Graf: gc, Logger: slog.Default(), Limits: analyzers.DefaultLimits(), + }) + if err != nil { + t.Fatalf("Analyze: %v", err) + } + if len(res.Findings) != 2 { + t.Fatalf("Findings len = %d, want 2; got %+v", len(res.Findings), res.Findings) + } + metrics := make([]string, 0, len(res.Findings)) + for _, f := range res.Findings { + metrics = append(metrics, f.Metric) + } + sort.Strings(metrics) + if diff := cmp.Diff([]string{"missing_a", "missing_b"}, metrics); diff != "" { + t.Errorf("missing metrics mismatch (-want +got):\n%s", diff) + } +} From eb26f17a22fc62640b9e69e4bd1f44d0d4871326 Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 01:54:28 +0300 Subject: [PATCH 09/21] feat(dashboardhygiene): recording-rule outputs count as existing A recording rule whose output is not yet in head series must still be treated as a known metric. Mirrors the resolution flow in unusedmetrics, including the VictoriaMetrics graceful-degrade sentinel. --- .../dashboardhygiene/dashboardhygiene.go | 58 ++++++++++++++++++- .../dashboardhygiene/dashboardhygiene_test.go | 29 ++++++++++ 2 files changed, 86 insertions(+), 1 deletion(-) diff --git a/internal/analyzers/dashboardhygiene/dashboardhygiene.go b/internal/analyzers/dashboardhygiene/dashboardhygiene.go index 9c3ca85..29d2549 100644 --- a/internal/analyzers/dashboardhygiene/dashboardhygiene.go +++ b/internal/analyzers/dashboardhygiene/dashboardhygiene.go @@ -9,6 +9,7 @@ package dashboardhygiene import ( "context" + "errors" "fmt" "net/url" "sort" @@ -16,6 +17,7 @@ import ( "github.com/remetric-dev/remetric/internal/analyzers" "github.com/remetric-dev/remetric/internal/findings" "github.com/remetric-dev/remetric/internal/grafana" + "github.com/remetric-dev/remetric/internal/prometheus" "github.com/remetric-dev/remetric/internal/promqlx" ) @@ -24,6 +26,12 @@ import ( // only in the Fix snippet, not as structured evidence. const maxSamplePanelTitles = 5 +// errRulesUnavailable signals that the rules endpoint is unreachable +// on the detected flavor (VictoriaMetrics without a vmalert client). +// Callers convert it to a Result warning rather than failing the +// analyzer. Matches the contract in internal/analyzers/unusedmetrics. +var errRulesUnavailable = errors.New("rules unavailable") + // Analyzer is the dashboard-hygiene analyzer (broken panels). type Analyzer struct{} @@ -58,6 +66,15 @@ func (a *Analyzer) Analyze(ctx context.Context, d analyzers.Deps) (analyzers.Res exists[n] = struct{}{} } + var warnings []string + if err := collectRecordingOutputs(ctx, d, exists); err != nil { + if errors.Is(err, errRulesUnavailable) { + warnings = append(warnings, "rules unavailable: VictoriaMetrics detected without --vmalert URL - recording rules ignored") + } else { + return analyzers.Result{}, err + } + } + refs, err := d.Graf.Search(ctx) if err != nil { return analyzers.Result{}, fmt.Errorf("dashboardhygiene: grafana search: %w", err) @@ -68,7 +85,6 @@ func (a *Analyzer) Analyze(ctx context.Context, d analyzers.Deps) (analyzers.Res metric string } groups := map[key]*dashAgg{} - var warnings []string for _, ref := range refs { dash, err := d.Graf.Dashboard(ctx, ref.UID) @@ -105,6 +121,46 @@ func (a *Analyzer) Analyze(ctx context.Context, d analyzers.Deps) (analyzers.Res return analyzers.Result{Findings: out, Warnings: warnings}, nil } +// collectRecordingOutputs adds the output name of every recording +// rule to exists. Prefers d.VMAlert over d.Prom when present. On +// VictoriaMetrics without a vmalert URL the sentinel errRulesUnavailable +// is returned so callers can degrade gracefully. +// +// Unlike unusedmetrics.collectRuleUsage, this helper does not extract +// metric names from rule expressions. The question this analyzer asks +// is "does this metric exist?", which is answered by the rule's output +// name, not by the metrics it consumes. +func collectRecordingOutputs(ctx context.Context, d analyzers.Deps, exists map[string]struct{}) error { + client := d.Prom + if d.VMAlert != nil { + client = d.VMAlert + } + rules, err := client.Rules(ctx) + if err != nil { + if errors.Is(err, prometheus.ErrNotFound) { + _, _ = d.Prom.BuildInfo(ctx) // trigger cached flavor detection + if d.Prom.Flavor() == prometheus.FlavorVictoria { + return errRulesUnavailable + } + } + return fmt.Errorf("dashboardhygiene: rules: %w", err) + } + if d.VMAlert == nil { + _, _ = d.Prom.BuildInfo(ctx) // trigger cached flavor detection + if d.Prom.Flavor() == prometheus.FlavorVictoria { + return errRulesUnavailable + } + } + for _, g := range rules.Groups { + for _, r := range g.Rules { + if r.Type == "recording" && r.Name != "" { + exists[r.Name] = struct{}{} + } + } + } + return nil +} + // findingLess orders broken-panel findings deterministically by // severity (desc), sample-count (desc), dashboard (asc), metric (asc). // The Dashboard tiebreaker is required because two findings can share diff --git a/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go b/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go index a9c8a60..9860ae9 100644 --- a/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go +++ b/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go @@ -272,3 +272,32 @@ func TestAnalyzer_DistinctMissingMetricsEmitSeparateFindings(t *testing.T) { t.Errorf("missing metrics mismatch (-want +got):\n%s", diff) } } + +func TestAnalyzer_RecordingRuleOutputCountsAsExisting(t *testing.T) { + promURL, grafURL := newStubs(t, stubsConfig{ + IngestedNames: []string{"up"}, // freshly_recorded_metric not in head yet + Rules: []ruleStub{ + {Name: "freshly_recorded_metric", Query: "sum(up)", Type: "recording"}, + {Name: "AlertA", Query: "up == 0", Type: "alerting"}, + }, + Dashboards: []dashboardStub{{ + UID: "d1", + Title: "RR", + Body: `{"dashboard":{"uid":"d1","title":"RR","panels":[ + {"type":"graph","title":"P1","targets":[{"expr":"freshly_recorded_metric","datasource":{"type":"prometheus"}}]} + ]}}`, + }}, + }) + pc, gc := mustClients(t, promURL, grafURL) + + res, err := New().Analyze(context.Background(), analyzers.Deps{ + Prom: pc, Graf: gc, Logger: slog.Default(), Limits: analyzers.DefaultLimits(), + }) + if err != nil { + t.Fatalf("Analyze: %v", err) + } + if len(res.Findings) != 0 { + t.Errorf("Findings len = %d, want 0 (RR output should count as existing); got %+v", + len(res.Findings), res.Findings) + } +} From 86022d7f9362e4e3e4bc5b062fcba241c45b0590 Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 02:00:57 +0300 Subject: [PATCH 10/21] refactor(dashboardhygiene): enrich VM-flavor comments + strengthen RR test Code-quality follow-ups to the recording-rule resolution commit: - expand the two BuildInfo-adjacent comments to explain WHY each VM-flavor check exists (404 path vs 200-empty-groups path), so a future reader doesn't need to cross-reference unusedmetrics - strengthen the RR test by querying AlertA from a second panel: proves the type filter (r.Type == "recording") actually held, not just that the recording-rule output was added to exists --- .../analyzers/dashboardhygiene/dashboardhygiene.go | 13 +++++++++++-- .../dashboardhygiene/dashboardhygiene_test.go | 11 ++++++++--- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/internal/analyzers/dashboardhygiene/dashboardhygiene.go b/internal/analyzers/dashboardhygiene/dashboardhygiene.go index 29d2549..ae15925 100644 --- a/internal/analyzers/dashboardhygiene/dashboardhygiene.go +++ b/internal/analyzers/dashboardhygiene/dashboardhygiene.go @@ -138,15 +138,24 @@ func collectRecordingOutputs(ctx context.Context, d analyzers.Deps, exists map[s rules, err := client.Rules(ctx) if err != nil { if errors.Is(err, prometheus.ErrNotFound) { - _, _ = d.Prom.BuildInfo(ctx) // trigger cached flavor detection + // 404 on /api/v1/rules can mean older VM builds or non-VM + // gateways; trigger cached flavor detection so the next + // check reads Flavor() reliably regardless of call order. + // Same pattern as unusedmetrics.collectRuleUsage. + _, _ = d.Prom.BuildInfo(ctx) if d.Prom.Flavor() == prometheus.FlavorVictoria { return errRulesUnavailable } } return fmt.Errorf("dashboardhygiene: rules: %w", err) } + // VictoriaMetrics single-node serves /api/v1/rules at HTTP 200 with + // empty or partial groups because recording rules live in vmalert. + // Without --vmalert we can't trust this payload to be complete, so + // short-circuit to the same sentinel the 404 path takes. Mirrors + // the unusedmetrics.collectRuleUsage rationale (intentional dup). if d.VMAlert == nil { - _, _ = d.Prom.BuildInfo(ctx) // trigger cached flavor detection + _, _ = d.Prom.BuildInfo(ctx) if d.Prom.Flavor() == prometheus.FlavorVictoria { return errRulesUnavailable } diff --git a/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go b/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go index 9860ae9..947522b 100644 --- a/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go +++ b/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go @@ -284,7 +284,8 @@ func TestAnalyzer_RecordingRuleOutputCountsAsExisting(t *testing.T) { UID: "d1", Title: "RR", Body: `{"dashboard":{"uid":"d1","title":"RR","panels":[ - {"type":"graph","title":"P1","targets":[{"expr":"freshly_recorded_metric","datasource":{"type":"prometheus"}}]} + {"type":"graph","title":"FromRecording","targets":[{"expr":"freshly_recorded_metric","datasource":{"type":"prometheus"}}]}, + {"type":"graph","title":"FromAlertName","targets":[{"expr":"AlertA","datasource":{"type":"prometheus"}}]} ]}}`, }}, }) @@ -296,8 +297,12 @@ func TestAnalyzer_RecordingRuleOutputCountsAsExisting(t *testing.T) { if err != nil { t.Fatalf("Analyze: %v", err) } - if len(res.Findings) != 0 { - t.Errorf("Findings len = %d, want 0 (RR output should count as existing); got %+v", + if len(res.Findings) != 1 { + t.Fatalf("Findings len = %d, want 1 (only AlertA should remain; freshly_recorded_metric is in exists); got %+v", len(res.Findings), res.Findings) } + if res.Findings[0].Metric != "AlertA" { + t.Errorf("Finding Metric = %q, want %q (proves type filter held: RR output added, alert name not added)", + res.Findings[0].Metric, "AlertA") + } } From 84733a9a330ed53620e638a89381ef16a390b85d Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 02:08:03 +0300 Subject: [PATCH 11/21] fix(promqlx): filter sentinel substrings, not just exact matches Grafana template variables like ${metric}_total sanitise to __remetric_var___total - a valid PromQL identifier that parses cleanly and leaks into the extracted metric set. Change isSentinel to a Contains check so any name containing the sentinel substring is treated as a sanitiser artifact and filtered. Without this, dashboardhygiene and unusedmetrics would treat template-variable expressions as references to bogus metrics named '__remetric_var__*'. --- internal/promqlx/extract_test.go | 10 ++++++++++ internal/promqlx/variables.go | 19 +++++++++++++++---- internal/promqlx/variables_test.go | 8 ++++++++ 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/internal/promqlx/extract_test.go b/internal/promqlx/extract_test.go index a72657b..a328f44 100644 --- a/internal/promqlx/extract_test.go +++ b/internal/promqlx/extract_test.go @@ -51,6 +51,16 @@ func TestExtractFromQuery(t *testing.T) { in: `$metric_name{job="api"}`, want: []string{}, // sentinel filtered, no real metric names captured }, + { + name: "grafana_var_suffix_concat", + in: `${metric_name}_total`, + want: []string{}, // sanitises to __remetric_var___total - must be filtered as sentinel artifact + }, + { + name: "grafana_var_prefix_concat", + in: `foo_${suffix}`, + want: []string{}, // sanitises to foo___remetric_var__ - must be filtered as sentinel artifact + }, { name: "deduplication", in: `up + up + up`, diff --git a/internal/promqlx/variables.go b/internal/promqlx/variables.go index 5630911..1dcf7b4 100644 --- a/internal/promqlx/variables.go +++ b/internal/promqlx/variables.go @@ -7,7 +7,10 @@ // by substituting them with a sentinel identifier before parsing. package promqlx -import "regexp" +import ( + "regexp" + "strings" +) // sentinel replaces Grafana template variables that would otherwise // break PromQL parsing. The string is a valid PromQL identifier @@ -29,8 +32,16 @@ func sanitiseGrafanaVars(expr string) string { return grafanaVarPattern.ReplaceAllString(expr, sentinel) } -// isSentinel reports whether name is the sentinel identifier used -// to stand in for Grafana template variables. +// isSentinel reports whether name is, or was derived from, the +// sentinel identifier used to stand in for Grafana template variables. +// +// A substring (not equality) check is required because Grafana panels +// often concatenate template variables with literal text, e.g. +// "${metric}_total" or "foo_${suffix}". After sanitisation these +// become "__remetric_var___total" / "foo___remetric_var__", which are +// valid PromQL identifiers and parse cleanly. Any name containing the +// sentinel is therefore a sanitiser artifact, not a real metric, and +// must be filtered out of the extracted set. func isSentinel(name string) bool { - return name == sentinel + return strings.Contains(name, sentinel) } diff --git a/internal/promqlx/variables_test.go b/internal/promqlx/variables_test.go index 453e790..dd8ac1e 100644 --- a/internal/promqlx/variables_test.go +++ b/internal/promqlx/variables_test.go @@ -35,4 +35,12 @@ func TestIsSentinel(t *testing.T) { if isSentinel("__remetric_var") { t.Error("isSentinel(non-sentinel) = true, want false") } + // Concatenation forms: sanitiser leaves the sentinel as a substring + // when a template variable is glued to literal text in the source. + if !isSentinel("__remetric_var___total") { + t.Error("isSentinel(suffix-concat) = false, want true") + } + if !isSentinel("foo___remetric_var__") { + t.Error("isSentinel(prefix-concat) = false, want true") + } } From 26cc1a838e783b7a2f2efe4cf4baa54326e1476b Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 02:09:04 +0300 Subject: [PATCH 12/21] test(dashboardhygiene): pin silent skips for template-variable exprs and Loki targets Grafana template-variable queries (${metric}_total) and non-Prometheus datasources must not generate findings or warnings. The template-variable path relies on promqlx filtering sentinel-derived metric names; the Loki path relies on PanelTargets filtering by datasource type. --- .../dashboardhygiene/dashboardhygiene_test.go | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go b/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go index 947522b..448fe59 100644 --- a/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go +++ b/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go @@ -273,6 +273,57 @@ func TestAnalyzer_DistinctMissingMetricsEmitSeparateFindings(t *testing.T) { } } +func TestAnalyzer_TemplateVariableExprSkippedSilently(t *testing.T) { + promURL, grafURL := newStubs(t, stubsConfig{ + IngestedNames: []string{"up"}, + Dashboards: []dashboardStub{{ + UID: "d1", + Title: "Tmpl", + Body: `{"dashboard":{"uid":"d1","title":"Tmpl","panels":[ + {"type":"graph","title":"P1","targets":[{"expr":"${metric_name}_total","datasource":{"type":"prometheus"}}]} + ]}}`, + }}, + }) + pc, gc := mustClients(t, promURL, grafURL) + + res, err := New().Analyze(context.Background(), analyzers.Deps{ + Prom: pc, Graf: gc, Logger: slog.Default(), Limits: analyzers.DefaultLimits(), + }) + if err != nil { + t.Fatalf("Analyze: %v", err) + } + if len(res.Findings) != 0 { + t.Errorf("Findings len = %d, want 0 (template-variable expr must skip silently)", len(res.Findings)) + } + if len(res.Warnings) != 0 { + t.Errorf("Warnings = %v, want none (silent skip means no warning either)", res.Warnings) + } +} + +func TestAnalyzer_NonPromDatasourceSkipped(t *testing.T) { + promURL, grafURL := newStubs(t, stubsConfig{ + IngestedNames: []string{"up"}, + Dashboards: []dashboardStub{{ + UID: "d1", + Title: "Loki", + Body: `{"dashboard":{"uid":"d1","title":"Loki","panels":[ + {"type":"logs","title":"App logs","targets":[{"expr":"{job=\"app\"} |= \"error\"","datasource":{"type":"loki"}}]} + ]}}`, + }}, + }) + pc, gc := mustClients(t, promURL, grafURL) + + res, err := New().Analyze(context.Background(), analyzers.Deps{ + Prom: pc, Graf: gc, Logger: slog.Default(), Limits: analyzers.DefaultLimits(), + }) + if err != nil { + t.Fatalf("Analyze: %v", err) + } + if len(res.Findings) != 0 { + t.Errorf("Findings len = %d, want 0 (non-prom target should be filtered)", len(res.Findings)) + } +} + func TestAnalyzer_RecordingRuleOutputCountsAsExisting(t *testing.T) { promURL, grafURL := newStubs(t, stubsConfig{ IngestedNames: []string{"up"}, // freshly_recorded_metric not in head yet From f3430811a447771ebf2451e3765bb170782a7629 Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 02:18:25 +0300 Subject: [PATCH 13/21] test(dashboardhygiene): pin error surface (per-dashboard, search, VM) Per-dashboard fetch errors degrade to warnings without aborting the analyzer. Search() failure is fatal. VictoriaMetrics without --vmalert emits the recording-rules-unavailable warning. --- .../dashboardhygiene/dashboardhygiene_test.go | 132 ++++++++++++++++++ 1 file changed, 132 insertions(+) diff --git a/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go b/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go index 448fe59..2512cab 100644 --- a/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go +++ b/internal/analyzers/dashboardhygiene/dashboardhygiene_test.go @@ -10,6 +10,7 @@ import ( "net/http" "net/http/httptest" "sort" + "strings" "testing" "github.com/google/go-cmp/cmp" @@ -357,3 +358,134 @@ func TestAnalyzer_RecordingRuleOutputCountsAsExisting(t *testing.T) { res.Findings[0].Metric, "AlertA") } } + +func TestAnalyzer_PerDashboardFetchErrorDegrades(t *testing.T) { + // Two dashboards: d1 returns 500, d2 returns a valid body with a broken panel. + // Per-dashboard fetch failures should append a warning and the analyzer + // must continue processing remaining dashboards. + promURL, grafURL := newStubs(t, stubsConfig{ + IngestedNames: []string{"up"}, + GrafHandler: func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch r.URL.Path { + case "/api/search": + _ = json.NewEncoder(w).Encode([]map[string]any{ + {"uid": "d1", "title": "Broken Fetch"}, + {"uid": "d2", "title": "OK"}, + }) + case "/api/dashboards/uid/d1": + w.WriteHeader(http.StatusInternalServerError) + case "/api/dashboards/uid/d2": + _, _ = w.Write([]byte(`{"dashboard":{"uid":"d2","title":"OK","panels":[ + {"type":"graph","title":"P","targets":[{"expr":"missing_xyz","datasource":{"type":"prometheus"}}]} + ]}}`)) + default: + http.NotFound(w, r) + } + }, + }) + pc, gc := mustClients(t, promURL, grafURL) + + res, err := New().Analyze(context.Background(), analyzers.Deps{ + Prom: pc, Graf: gc, Logger: slog.Default(), Limits: analyzers.DefaultLimits(), + }) + if err != nil { + t.Fatalf("Analyze: %v (should degrade, not fail)", err) + } + if len(res.Findings) != 1 || res.Findings[0].Metric != "missing_xyz" { + t.Errorf("Findings = %+v; want one finding for d2", res.Findings) + } + if len(res.Warnings) != 1 { + t.Fatalf("Warnings len = %d, want 1; got %v", len(res.Warnings), res.Warnings) + } + if !strings.Contains(res.Warnings[0], `dashboard "d1"`) { + t.Errorf("warning %q should mention d1", res.Warnings[0]) + } +} + +func TestAnalyzer_SearchErrorIsFatal(t *testing.T) { + // Grafana /api/search 500 is unrecoverable: without the dashboard + // list there is nothing for the analyzer to walk. Must wrap and return. + promURL, grafURL := newStubs(t, stubsConfig{ + IngestedNames: []string{"up"}, + GrafHandler: func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + }, + }) + pc, gc := mustClients(t, promURL, grafURL) + + _, err := New().Analyze(context.Background(), analyzers.Deps{ + Prom: pc, Graf: gc, Logger: slog.Default(), Limits: analyzers.DefaultLimits(), + }) + if err == nil { + t.Fatal("Analyze returned nil error; Search() 500 should be fatal") + } + if !strings.Contains(err.Error(), "grafana search") { + t.Errorf("error %q should mention grafana search", err.Error()) + } +} + +func TestAnalyzer_VictoriaMetricsWithoutVMAlertWarns(t *testing.T) { + // VictoriaMetrics single-node serves /api/v1/rules but the payload + // is incomplete without --vmalert. collectRecordingOutputs returns + // errRulesUnavailable, which Analyze converts to the "rules unavailable" + // warning. The dashboard walk still runs. + // + // Mirrors unusedmetrics' TestAnalyzer_VictoriaWithoutVMAlert_WarnsAndContinues: + // force the flavor with WithFlavorHint(FlavorVictoria) rather than relying + // on buildinfo auto-detection. Build the prom client inline so the shared + // mustClients helper (which hints FlavorProm) is unaffected. + prom := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch r.URL.Path { + case "/api/v1/label/__name__/values": + _, _ = w.Write([]byte(`{"status":"success","data":["up"]}`)) + case "/api/v1/rules": + _, _ = w.Write([]byte(`{"status":"success","data":{"groups":[]}}`)) + default: + http.NotFound(w, r) + } + })) + t.Cleanup(prom.Close) + + graf := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch r.URL.Path { + case "/api/search": + _ = json.NewEncoder(w).Encode([]map[string]any{{"uid": "d1", "title": "VM"}}) + case "/api/dashboards/uid/d1": + _, _ = w.Write([]byte(`{"dashboard":{"uid":"d1","title":"VM","panels":[ + {"type":"graph","title":"P","targets":[{"expr":"up","datasource":{"type":"prometheus"}}]} + ]}}`)) + default: + http.NotFound(w, r) + } + })) + t.Cleanup(graf.Close) + + pc, err := prometheus.New(prom.URL, prometheus.WithFlavorHint(prometheus.FlavorVictoria)) + if err != nil { + t.Fatalf("prometheus.New: %v", err) + } + gc, err := grafana.New(graf.URL) + if err != nil { + t.Fatalf("grafana.New: %v", err) + } + + res, err := New().Analyze(context.Background(), analyzers.Deps{ + Prom: pc, Graf: gc, Logger: slog.Default(), Limits: analyzers.DefaultLimits(), + }) + if err != nil { + t.Fatalf("Analyze: %v", err) + } + found := false + for _, w := range res.Warnings { + if strings.Contains(w, "rules unavailable") && strings.Contains(w, "VictoriaMetrics") { + found = true + break + } + } + if !found { + t.Errorf("expected VM-without-vmalert warning; got %v", res.Warnings) + } +} From 168e3326c9a60c0821d61b09f90aa387b358d76f Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 02:27:27 +0300 Subject: [PATCH 14/21] feat(dashboardhygiene): fix-snippet builder Renders a paste-ready instruction block: restore the metric or remove the broken queries. Drops the URL line when no absolute dashboard URL is available. Caps the panel list at 10 entries with a '... and N more' tail. --- .../dashboardhygiene/dashboardhygiene.go | 7 --- internal/analyzers/dashboardhygiene/fix.go | 45 ++++++++++++++ .../analyzers/dashboardhygiene/fix_test.go | 61 +++++++++++++++++++ 3 files changed, 106 insertions(+), 7 deletions(-) create mode 100644 internal/analyzers/dashboardhygiene/fix.go create mode 100644 internal/analyzers/dashboardhygiene/fix_test.go diff --git a/internal/analyzers/dashboardhygiene/dashboardhygiene.go b/internal/analyzers/dashboardhygiene/dashboardhygiene.go index ae15925..eaee268 100644 --- a/internal/analyzers/dashboardhygiene/dashboardhygiene.go +++ b/internal/analyzers/dashboardhygiene/dashboardhygiene.go @@ -236,10 +236,3 @@ func buildFinding(uid, missing string, agg dashAgg) findings.Finding { }, } } - -// buildFix returns a paste-ready instruction block. Full implementation -// comes in Task 11 (fix.go). For now the stub returns an empty string -// so happy-path tests can pass without locking in the snippet format. -func buildFix(_, _, _ string, _ []string) string { - return "" -} diff --git a/internal/analyzers/dashboardhygiene/fix.go b/internal/analyzers/dashboardhygiene/fix.go new file mode 100644 index 0000000..d388b75 --- /dev/null +++ b/internal/analyzers/dashboardhygiene/fix.go @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Andrei Taranik + +package dashboardhygiene + +import ( + "fmt" + "strings" +) + +// buildFix renders a paste-ready instruction block for a single +// (dashboard, missing-metric) finding. The block opens with a header +// identifying the dashboard (with optional absolute URL), lists the +// two remediation paths (restore the metric or remove the queries), +// names the affected panels (up to maxFixPanels, then "... and N more"), +// and closes with a Reference link to the public docs page. +func buildFix(dashTitle, missing, dashURL string, panelTitles []string) string { + var b strings.Builder + if dashURL != "" { + fmt.Fprintf(&b, "Edit dashboard %q (%s)\n", dashTitle, dashURL) + } else { + fmt.Fprintf(&b, "Edit dashboard %q\n", dashTitle) + } + b.WriteString("and either:\n") + fmt.Fprintf(&b, " 1. Restore metric %q (re-enable the scrape job or recording rule), or\n", missing) + b.WriteString(" 2. Remove/replace the broken queries in panel(s):\n") + n := len(panelTitles) + limit := n + if limit > maxFixPanels { + limit = maxFixPanels + } + for i := 0; i < limit; i++ { + fmt.Fprintf(&b, " - %s\n", panelTitles[i]) + } + if n > maxFixPanels { + fmt.Fprintf(&b, " ... and %d more\n", n-maxFixPanels) + } + b.WriteString("Reference: https://remetric.dev/findings/broken-panel") + return b.String() +} + +// maxFixPanels caps the number of panel titles listed in the Fix +// snippet. Beyond this, panel titles are elided with "... and N more" +// to keep the output skimmable. +const maxFixPanels = 10 diff --git a/internal/analyzers/dashboardhygiene/fix_test.go b/internal/analyzers/dashboardhygiene/fix_test.go new file mode 100644 index 0000000..298b454 --- /dev/null +++ b/internal/analyzers/dashboardhygiene/fix_test.go @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Andrei Taranik + +package dashboardhygiene + +import ( + "strings" + "testing" +) + +func TestBuildFix_WithURLAndPanelList(t *testing.T) { + got := buildFix( + "Frontend SLOs", + "node_disk_io_now", + "https://grafana.example.com/d/abc123/frontend-slos", + []string{"Disk I/O - last 5m", "Disk I/O - last 1h"}, + ) + for _, want := range []string{ + `Edit dashboard "Frontend SLOs"`, + `https://grafana.example.com/d/abc123/frontend-slos`, + `Restore metric "node_disk_io_now"`, + `Disk I/O - last 5m`, + `Disk I/O - last 1h`, + `https://remetric.dev/findings/broken-panel`, + } { + if !strings.Contains(got, want) { + t.Errorf("buildFix output missing %q\nfull output:\n%s", want, got) + } + } +} + +func TestBuildFix_OmitsURLWhenEmpty(t *testing.T) { + got := buildFix("D", "m", "", []string{"P"}) + if strings.Contains(got, "(http") { + t.Errorf("empty URL should drop the URL parenthetical; got:\n%s", got) + } + if strings.Contains(got, "https://grafana") { + t.Errorf("empty URL must not emit any grafana URL; got:\n%s", got) + } + if !strings.Contains(got, `Edit dashboard "D"`) { + t.Errorf("title still expected; got:\n%s", got) + } +} + +func TestBuildFix_TruncatesPanelListPast10(t *testing.T) { + titles := []string{ + "P01", "P02", "P03", "P04", "P05", + "P06", "P07", "P08", "P09", "P10", + "P11", "P12", + } + got := buildFix("D", "m", "", titles) + if !strings.Contains(got, "P10") { + t.Errorf("10th panel must still appear; got:\n%s", got) + } + if strings.Contains(got, "P11") { + t.Errorf("11th panel must be elided; got:\n%s", got) + } + if !strings.Contains(got, "... and 2 more") { + t.Errorf("expected '... and 2 more' tail; got:\n%s", got) + } +} From 9fa28798a48e9c3e676e1ecd0d6a5dab3c50ada4 Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 02:34:18 +0300 Subject: [PATCH 15/21] refactor(dashboardhygiene): DRY docs URL + idiomatic range loop + min builtin Code-quality follow-ups to the fix-snippet commit: - pull the broken-panel docs URL from findings.DocURL(ClassBrokenPanel) instead of a hardcoded literal, so the single source of truth in internal/findings/ stays authoritative - replace explicit limit-clamping with the Go 1.21+ min() builtin - replace C-style index loop with idiomatic range over the slice --- internal/analyzers/dashboardhygiene/fix.go | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/internal/analyzers/dashboardhygiene/fix.go b/internal/analyzers/dashboardhygiene/fix.go index d388b75..1cb8b51 100644 --- a/internal/analyzers/dashboardhygiene/fix.go +++ b/internal/analyzers/dashboardhygiene/fix.go @@ -6,6 +6,8 @@ package dashboardhygiene import ( "fmt" "strings" + + "github.com/remetric-dev/remetric/internal/findings" ) // buildFix renders a paste-ready instruction block for a single @@ -25,17 +27,14 @@ func buildFix(dashTitle, missing, dashURL string, panelTitles []string) string { fmt.Fprintf(&b, " 1. Restore metric %q (re-enable the scrape job or recording rule), or\n", missing) b.WriteString(" 2. Remove/replace the broken queries in panel(s):\n") n := len(panelTitles) - limit := n - if limit > maxFixPanels { - limit = maxFixPanels - } - for i := 0; i < limit; i++ { - fmt.Fprintf(&b, " - %s\n", panelTitles[i]) + limit := min(n, maxFixPanels) + for _, title := range panelTitles[:limit] { + fmt.Fprintf(&b, " - %s\n", title) } if n > maxFixPanels { fmt.Fprintf(&b, " ... and %d more\n", n-maxFixPanels) } - b.WriteString("Reference: https://remetric.dev/findings/broken-panel") + b.WriteString("Reference: " + findings.DocURL(findings.ClassBrokenPanel)) return b.String() } From ac9701fc6622146809a3173c37cd364115a4e90a Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 02:41:34 +0300 Subject: [PATCH 16/21] feat(cli): add 'remetric dashboards broken' subcommand New top-level subject 'dashboards' with one action 'broken'. Requires --prometheus and --grafana. Honors --output, --min-severity, --ignore-dashboard, --ignore-metric, --fail-on, --limit, --timeout. --- internal/cli/dashboards.go | 145 ++++++++++++++++++++++++++++ internal/cli/dashboards_test.go | 162 ++++++++++++++++++++++++++++++++ internal/cli/root.go | 1 + 3 files changed, 308 insertions(+) create mode 100644 internal/cli/dashboards.go create mode 100644 internal/cli/dashboards_test.go diff --git a/internal/cli/dashboards.go b/internal/cli/dashboards.go new file mode 100644 index 0000000..8fd1516 --- /dev/null +++ b/internal/cli/dashboards.go @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Andrei Taranik + +package cli + +import ( + "context" + "errors" + "fmt" + "sort" + + "github.com/spf13/cobra" + + "github.com/remetric-dev/remetric/internal/analyzers" + "github.com/remetric-dev/remetric/internal/analyzers/dashboardhygiene" + "github.com/remetric-dev/remetric/internal/findings" +) + +func newDashboardsCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "dashboards", + Short: "Inspect dashboard-level findings (broken panels).", + Long: `Dashboard subcommands flag Grafana dashboards whose panel +queries reference metrics that don't exist in Prometheus. + +Subcommands: + broken Flag panels referencing missing metrics.`, + } + cmd.AddCommand(newDashboardsBrokenCmd()) + return cmd +} + +//nolint:gocyclo // RunE closure is straight-line config-gathering; mirrors metrics.go +func newDashboardsBrokenCmd() *cobra.Command { + var ( + limit int + minSeverity string + ) + cmd := &cobra.Command{ + Use: "broken", + Short: "Flag dashboards whose panels reference missing metrics.", + Long: `Walks every Grafana dashboard, parses each Prometheus target +query, and reports (dashboard, missing-metric) pairs as one +finding each. A metric is "missing" if it is not present in +Prometheus head series (LabelValues) and not declared as the +output of any recording rule. + +Requires both --prometheus and --grafana.`, + Example: ` # All broken-panel findings + remetric dashboards broken --prometheus http://localhost:9090 --grafana http://localhost:3000 + + # JSON output for CI + remetric dashboards broken --prometheus ... --grafana ... --output json + + # Suppress a known-broken dashboard + remetric dashboards broken --prometheus ... --grafana ... --ignore-dashboard "Legacy Disk Stats"`, + RunE: func(cmd *cobra.Command, _ []string) error { + ctx := cmd.Context() + cfg := configFrom(ctx) + if cfg == nil || cfg.Prometheus.URL == "" { + return &flagError{err: errors.New("--prometheus is required")} + } + if cfg.Grafana.URL == "" { + return &flagError{err: errors.New("--grafana is required for dashboards broken")} + } + if err := validateOutput(cfg.Output); err != nil { + return &flagError{err: err} + } + minSev, err := findings.ParseSeverity(minSeverity) + if err != nil { + return &flagError{err: fmt.Errorf("invalid --min-severity: %w", err)} + } + if cfg.Timeout > 0 { + var cancel context.CancelFunc + ctx, cancel = context.WithTimeout(ctx, cfg.Timeout) + defer cancel() + } + + prom, err := buildPromClient(cfg, "remetric/dashboards-broken") + if err != nil { + return &exitError{code: 2, err: err} + } + graf, err := buildGrafanaClient(cfg, "remetric/dashboards-broken") + if err != nil { + return &exitError{code: 2, err: err} + } + vmalert, err := buildVMAlertClient(cfg, "remetric/dashboards-broken") + if err != nil { + return &exitError{code: 2, err: err} + } + + d := analyzers.Deps{ + Prom: prom, + Graf: graf, + VMAlert: vmalert, + Logger: loggerFrom(ctx), + Limits: analyzers.DefaultLimits(), + } + res, err := dashboardhygiene.New().Analyze(ctx, d) + if err != nil { + return &exitError{code: 1, err: err} + } + for _, w := range res.Warnings { + _, _ = fmt.Fprintf(cmd.ErrOrStderr(), "! warning: %s\n", w) + } + all, ignored := cfg.IgnoreFilter().Apply(res.Findings) + + filtered := filterAtLeast(all, minSev) + if len(filtered) == 0 { + return renderEmpty(cfg, cmd.OutOrStdout(), brokenPanelCopy, minSev, len(all), tallyBySeverity(all), ignored) + } + sort.SliceStable(filtered, func(i, j int) bool { + if filtered[i].Severity != filtered[j].Severity { + return filtered[i].Severity > filtered[j].Severity + } + if filtered[i].Dashboard != filtered[j].Dashboard { + return filtered[i].Dashboard < filtered[j].Dashboard + } + return filtered[i].Metric < filtered[j].Metric + }) + if limit >= 0 && len(filtered) > limit { + filtered = filtered[:limit] + } + if len(filtered) == 0 { + return renderEmpty(cfg, cmd.OutOrStdout(), brokenPanelCopy, minSev, 0, nil, ignored) + } + if err := renderFindings(cfg, cmd.OutOrStdout(), filtered, ignored); err != nil { + return err + } + sev, enabled := cfg.FailOnThreshold() + if findings.ShouldFail(sev, enabled, filtered) { + return &exitError{code: 3, err: fmt.Errorf("findings at or above --fail-on=%s", cfg.FailOn)} + } + return nil + }, + } + cmd.Flags().IntVar(&limit, "limit", 20, "Maximum findings to print (0 = none)") + cmd.Flags().StringVar(&minSeverity, "min-severity", "medium", "Minimum severity to print: low|medium|high|critical") + return cmd +} + +var brokenPanelCopy = emptyCopy{ + NoResults: "No broken panels found - every dashboard panel query resolves to a metric that exists in Prometheus head series or as a recording-rule output.", + Subject: "broken panels", +} diff --git a/internal/cli/dashboards_test.go b/internal/cli/dashboards_test.go new file mode 100644 index 0000000..9757075 --- /dev/null +++ b/internal/cli/dashboards_test.go @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Andrei Taranik + +package cli_test + +import ( + "bytes" + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" + + cli "github.com/remetric-dev/remetric/internal/cli" +) + +// newDashboardsStubs spins up a Prom + Grafana pair where one dashboard +// references a missing metric "missing_xyz". +func newDashboardsStubs(t *testing.T) (promURL, grafURL string) { + t.Helper() + prom := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch r.URL.Path { + case "/api/v1/label/__name__/values": + _ = json.NewEncoder(w).Encode(map[string]any{"status": "success", "data": []string{"up"}}) + case "/api/v1/rules": + _ = json.NewEncoder(w).Encode(map[string]any{ + "status": "success", + "data": map[string]any{"groups": []map[string]any{}}, + }) + default: + http.NotFound(w, r) + } + })) + t.Cleanup(prom.Close) + + graf := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch r.URL.Path { + case "/api/search": + _ = json.NewEncoder(w).Encode([]map[string]any{ + {"uid": "d1", "title": "Disk", "url": "/d/d1/disk"}, + }) + case "/api/dashboards/uid/d1": + _, _ = w.Write([]byte(`{"dashboard":{"uid":"d1","title":"Disk","panels":[ + {"type":"graph","title":"P","targets":[{"expr":"missing_xyz","datasource":{"type":"prometheus"}}]} + ]}}`)) + default: + http.NotFound(w, r) + } + })) + t.Cleanup(graf.Close) + return prom.URL, graf.URL +} + +func TestDashboardsBroken_RequiresGrafana(t *testing.T) { + var stdout, stderr bytes.Buffer + code := cli.ExecuteWith(cli.Args{ + Version: "test", + Args: []string{"dashboards", "broken", "--prometheus", "http://127.0.0.1:1"}, + Stdout: &stdout, + Stderr: &stderr, + }) + if code != 2 { + t.Errorf("exit code = %d, want 2 (flag error); stderr=%s", code, stderr.String()) + } + if !strings.Contains(stderr.String(), "--grafana is required") { + t.Errorf("stderr = %q, want contains '--grafana is required'", stderr.String()) + } +} + +func TestDashboardsBroken_TextOutput(t *testing.T) { + promURL, grafURL := newDashboardsStubs(t) + var stdout, stderr bytes.Buffer + code := cli.ExecuteWith(cli.Args{ + Version: "test", + Args: []string{ + "dashboards", "broken", + "--prometheus", promURL, + "--grafana", grafURL, + }, + Stdout: &stdout, + Stderr: &stderr, + }) + if code != 0 { + t.Fatalf("exit code = %d, want 0; stderr=%s", code, stderr.String()) + } + if !strings.Contains(stdout.String(), "missing_xyz") { + t.Errorf("stdout should mention missing_xyz; got: %s", stdout.String()) + } +} + +func TestDashboardsBroken_JSONOutput(t *testing.T) { + promURL, grafURL := newDashboardsStubs(t) + var stdout, stderr bytes.Buffer + code := cli.ExecuteWith(cli.Args{ + Version: "test", + Args: []string{ + "dashboards", "broken", + "--prometheus", promURL, + "--grafana", grafURL, + "--output", "json", + }, + Stdout: &stdout, + Stderr: &stderr, + }) + if code != 0 { + t.Fatalf("exit code = %d, want 0; stderr=%s", code, stderr.String()) + } + var payload struct { + Findings []struct { + Class string `json:"class"` + Metric string `json:"metric"` + Dashboard string `json:"dashboard"` + } `json:"findings"` + } + if err := json.Unmarshal(stdout.Bytes(), &payload); err != nil { + t.Fatalf("json.Unmarshal: %v\nstdout=%s", err, stdout.String()) + } + if len(payload.Findings) != 1 { + t.Fatalf("findings len = %d, want 1; payload=%+v", len(payload.Findings), payload) + } + if payload.Findings[0].Class != "broken-panel" || payload.Findings[0].Metric != "missing_xyz" { + t.Errorf("finding = %+v, want class=broken-panel metric=missing_xyz", payload.Findings[0]) + } + if payload.Findings[0].Dashboard != "Disk" { + t.Errorf("Dashboard = %q, want %q", payload.Findings[0].Dashboard, "Disk") + } +} + +func TestDashboardsBroken_IgnoreDashboardDrops(t *testing.T) { + promURL, grafURL := newDashboardsStubs(t) + var stdout, stderr bytes.Buffer + code := cli.ExecuteWith(cli.Args{ + Version: "test", + Args: []string{ + "dashboards", "broken", + "--prometheus", promURL, + "--grafana", grafURL, + "--ignore-dashboard", "Disk", + "--output", "json", + }, + Stdout: &stdout, + Stderr: &stderr, + }) + if code != 0 { + t.Fatalf("exit code = %d, want 0; stderr=%s", code, stderr.String()) + } + var payload struct { + Findings []any `json:"findings"` + IgnoredCount int `json:"ignored_count"` + } + if err := json.Unmarshal(stdout.Bytes(), &payload); err != nil { + t.Fatalf("json.Unmarshal: %v\nstdout=%s", err, stdout.String()) + } + if len(payload.Findings) != 0 { + t.Errorf("findings len = %d, want 0 (dropped by --ignore-dashboard)", len(payload.Findings)) + } + if payload.IgnoredCount != 1 { + t.Errorf("IgnoredCount = %d, want 1", payload.IgnoredCount) + } +} diff --git a/internal/cli/root.go b/internal/cli/root.go index bdf9eca..ceb8c51 100644 --- a/internal/cli/root.go +++ b/internal/cli/root.go @@ -133,6 +133,7 @@ Supported Prometheus versions: 2.30 and newer.`, cmd.AddCommand(newCardinalityCmd()) cmd.AddCommand(newMetricsCmd()) cmd.AddCommand(newAlertsCmd()) + cmd.AddCommand(newDashboardsCmd()) cmd.AddCommand(newScanCmd()) cmd.AddCommand(newReportCmd()) return cmd From 2f83eab498eead4d3b0e79fb169209311cee3e75 Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 02:51:58 +0300 Subject: [PATCH 17/21] refactor(cli): trust analyzer sort order + move brokenPanelCopy to empty.go Code-quality follow-ups to the dashboards broken subcommand: - drop the CLI's local re-sort; the analyzer already orders by (severity desc, sample-count desc, dashboard asc, metric asc), and the filter passes are stable. The local sort was discarding the sample-count tiebreaker - meaningful signal that broken-from- many-panels metrics rank higher within a severity tier - move brokenPanelCopy to empty.go for parity with cardinalityCopy and labelPatternCopy - extend TestEmptyCopy_Values to cover brokenPanelCopy and the previously-uncovered unusedMetricsCopy --- internal/cli/dashboards.go | 18 ------------------ internal/cli/empty.go | 4 ++++ internal/cli/empty_test.go | 12 ++++++++++++ 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/internal/cli/dashboards.go b/internal/cli/dashboards.go index 8fd1516..3762527 100644 --- a/internal/cli/dashboards.go +++ b/internal/cli/dashboards.go @@ -7,7 +7,6 @@ import ( "context" "errors" "fmt" - "sort" "github.com/spf13/cobra" @@ -109,21 +108,9 @@ Requires both --prometheus and --grafana.`, if len(filtered) == 0 { return renderEmpty(cfg, cmd.OutOrStdout(), brokenPanelCopy, minSev, len(all), tallyBySeverity(all), ignored) } - sort.SliceStable(filtered, func(i, j int) bool { - if filtered[i].Severity != filtered[j].Severity { - return filtered[i].Severity > filtered[j].Severity - } - if filtered[i].Dashboard != filtered[j].Dashboard { - return filtered[i].Dashboard < filtered[j].Dashboard - } - return filtered[i].Metric < filtered[j].Metric - }) if limit >= 0 && len(filtered) > limit { filtered = filtered[:limit] } - if len(filtered) == 0 { - return renderEmpty(cfg, cmd.OutOrStdout(), brokenPanelCopy, minSev, 0, nil, ignored) - } if err := renderFindings(cfg, cmd.OutOrStdout(), filtered, ignored); err != nil { return err } @@ -138,8 +125,3 @@ Requires both --prometheus and --grafana.`, cmd.Flags().StringVar(&minSeverity, "min-severity", "medium", "Minimum severity to print: low|medium|high|critical") return cmd } - -var brokenPanelCopy = emptyCopy{ - NoResults: "No broken panels found - every dashboard panel query resolves to a metric that exists in Prometheus head series or as a recording-rule output.", - Subject: "broken panels", -} diff --git a/internal/cli/empty.go b/internal/cli/empty.go index 42fa819..3ad5b3e 100644 --- a/internal/cli/empty.go +++ b/internal/cli/empty.go @@ -36,6 +36,10 @@ var ( NoResults: "No unused metrics found - every ingested metric is referenced by a dashboard, alert, or recording rule.", Subject: "unused metrics", } + brokenPanelCopy = emptyCopy{ + NoResults: "No broken panels found - every dashboard panel query resolves to a metric that exists in Prometheus head series or as a recording-rule output.", + Subject: "broken panels", + } ) // tallyBySeverity counts findings per severity tier. diff --git a/internal/cli/empty_test.go b/internal/cli/empty_test.go index b69ecb6..360690d 100644 --- a/internal/cli/empty_test.go +++ b/internal/cli/empty_test.go @@ -81,6 +81,18 @@ func TestEmptyCopy_Values(t *testing.T) { if labelPatternCopy.NoResults == "" { t.Error("labelPatternCopy.NoResults is empty") } + if unusedMetricsCopy.Subject == "" { + t.Error("unusedMetricsCopy.Subject is empty") + } + if unusedMetricsCopy.NoResults == "" { + t.Error("unusedMetricsCopy.NoResults is empty") + } + if brokenPanelCopy.Subject == "" { + t.Error("brokenPanelCopy.Subject is empty") + } + if brokenPanelCopy.NoResults == "" { + t.Error("brokenPanelCopy.NoResults is empty") + } } func TestRenderEmpty_Terminal_NoResults(t *testing.T) { From 6d16642e9a1245bc7f282c37b361a3881f2b05b3 Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 02:56:45 +0300 Subject: [PATCH 18/21] feat(cli): wire dashboardhygiene into scan + report runners Both flows include the new analyzer; without --grafana it emits a warning and zero findings (consistent with unusedmetrics). --- internal/cli/report.go | 2 ++ internal/cli/scan.go | 2 ++ 2 files changed, 4 insertions(+) diff --git a/internal/cli/report.go b/internal/cli/report.go index 4c310dc..6ef6475 100644 --- a/internal/cli/report.go +++ b/internal/cli/report.go @@ -17,6 +17,7 @@ import ( "github.com/remetric-dev/remetric/internal/analyzers" "github.com/remetric-dev/remetric/internal/analyzers/alerthygiene" "github.com/remetric-dev/remetric/internal/analyzers/cardinality" + "github.com/remetric-dev/remetric/internal/analyzers/dashboardhygiene" "github.com/remetric-dev/remetric/internal/analyzers/labelpattern" "github.com/remetric-dev/remetric/internal/analyzers/unusedmetrics" "github.com/remetric-dev/remetric/internal/findings" @@ -96,6 +97,7 @@ instead. Use --out FILE to write to a file (or '-' for stdout).`, labelpattern.New(), unusedmetrics.New(), alerthygiene.New(alerthygiene.Config{Lookback: lookback, Step: step}), + dashboardhygiene.New(), } var ( all []findings.Finding diff --git a/internal/cli/scan.go b/internal/cli/scan.go index 46a751b..91fc67f 100644 --- a/internal/cli/scan.go +++ b/internal/cli/scan.go @@ -15,6 +15,7 @@ import ( "github.com/remetric-dev/remetric/internal/analyzers" "github.com/remetric-dev/remetric/internal/analyzers/alerthygiene" "github.com/remetric-dev/remetric/internal/analyzers/cardinality" + "github.com/remetric-dev/remetric/internal/analyzers/dashboardhygiene" "github.com/remetric-dev/remetric/internal/analyzers/labelpattern" "github.com/remetric-dev/remetric/internal/analyzers/unusedmetrics" "github.com/remetric-dev/remetric/internal/config" @@ -85,6 +86,7 @@ into a single findings.Report shape - see the JSON schema in the spec.`, labelpattern.New(), unusedmetrics.New(), alerthygiene.New(alerthygiene.Config{}), + dashboardhygiene.New(), } prog := progress.New(cmd.ErrOrStderr(), cfg.NoProgress) for _, a := range runners { From 8419fd2bb71cd9131e9954c794c9f0bf235f1539 Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 03:03:43 +0300 Subject: [PATCH 19/21] docs(findings): replace dashboard-sprawl placeholder with broken-panel page Real content for the broken-panel finding class. Updates the catalog, the unused-metric cross-link, the mkdocs nav, and the 'What's still missing in v0.1' README section since the analyzer now ships in v0.1. --- README.md | 6 -- docs/findings/broken-panel.md | 97 +++++++++++++++++++++++++++++++ docs/findings/dashboard-sprawl.md | 22 ------- docs/findings/index.md | 2 +- docs/findings/unused-metric.md | 4 +- mkdocs.yml | 2 +- 6 files changed, 101 insertions(+), 32 deletions(-) create mode 100644 docs/findings/broken-panel.md delete mode 100644 docs/findings/dashboard-sprawl.md diff --git a/README.md b/README.md index 5becb94..0380a7e 100644 --- a/README.md +++ b/README.md @@ -276,12 +276,6 @@ Global flags (subset; see `--help` for the full list): Full reference at [remetric.dev](https://remetric.dev/) - one page per finding class with detection rules, fix snippets, and false-positive notes. -## What's still missing in v0.1 - -- No dashboard sprawl analyzer. - -This lands in a subsequent release. - ## CI integration Pair any analyzer command with `--fail-on=critical` to fail the build when a diff --git a/docs/findings/broken-panel.md b/docs/findings/broken-panel.md new file mode 100644 index 0000000..30d80d2 --- /dev/null +++ b/docs/findings/broken-panel.md @@ -0,0 +1,97 @@ +# Broken panel + +A Grafana dashboard panel queries a Prometheus metric that does not exist. + +**Class:** `broken-panel` +**Severity:** Medium +**Category:** `dashboard_hygiene` +**Detected by:** `remetric dashboards broken`, `remetric scan` + +## What it means + +The panel's PromQL target references a metric name that is not in +Prometheus head series and is not declared as the output of any +recording rule. The panel will render as empty (no data) or as a +flat line, and the user will see a silent gap rather than the +expected signal. + +## Why it matters + +Empty dashboard tiles are dangerous because they look the same as +a system that is fine. An on-call engineer scanning a dashboard +for problems can miss a real issue because the relevant tile shows +"no data" instead of the metric they expect. A scan that surfaces +every (dashboard, missing-metric) pair lets you either restore the +metric or remove the dead query. + +## How remetric detects it + +1. Build the set of "known metrics": + - Every metric name returned by `/api/v1/label/__name__/values`. + - Every recording-rule output name returned by `/api/v1/rules` + (a freshly-added recording rule counts as known even before + it emits its first sample). +2. For each Grafana dashboard, parse every panel's Prometheus + target with the PromQL parser and extract referenced metric + names. +3. Any extracted name not in the known set is "missing". Findings + are grouped by `(dashboard, missing-metric)` so 50 panels in one + dashboard referencing the same removed metric collapse into a + single finding listing the affected panels. + +## Known false positives + +- **Intermittent metrics.** A metric that is only present during + scheduled cron-job runs may appear missing during a scan that + runs between executions. Suppress with `--ignore-metric `. +- **Freshly-rotated retention.** If a metric is in long-term storage + but no longer in head series, it counts as missing. Most users + do not hit this because head-series retention is usually 15 days + or more. +- **VictoriaMetrics without `--vmalert`.** Recording rules live in + the `vmalert` process, not in `vmselect`. When `--vmalert` is not + provided, recording outputs are not visible to remetric and + any panel querying a recording-rule output is reported as broken. + The analyzer surfaces this case as a warning rather than failing. + +## How to fix + +Pick one of: + +1. **Restore the metric.** Re-enable the scrape job, fix the + exporter, or add back the recording rule that emits the + metric. +2. **Remove the dead query.** Edit the dashboard in Grafana, + delete the panel or rewrite its query to use a metric that + still exists. +3. **Suppress.** If the dashboard is known-stale and you cannot + delete it yet, use `--ignore-dashboard "Legacy.*"` (anchored + regex against the dashboard title). + +## Sample JSON + +```json +{ + "id": "broken-panel:abc123:node_disk_io_now", + "severity": "medium", + "category": "dashboard_hygiene", + "class": "broken-panel", + "title": "dashboard \"Frontend SLOs\" references missing metric \"node_disk_io_now\"", + "metric": "node_disk_io_now", + "dashboard": "Frontend SLOs", + "evidence": { + "description": "2 panel(s) in dashboard \"Frontend SLOs\" query \"node_disk_io_now\" which is not present in head series or recording-rule outputs", + "sample_values": ["Disk I/O - last 5m", "Disk I/O - last 1h"] + }, + "fix": { + "type": "edit_dashboard", + "config": "Edit dashboard \"Frontend SLOs\" (https://grafana.example.com/d/abc123/frontend-slos)\nand either:\n 1. Restore metric \"node_disk_io_now\" (...)\n 2. Remove/replace the broken queries in panel(s):\n - Disk I/O - last 5m\n - Disk I/O - last 1h\nReference: https://remetric.dev/findings/broken-panel" + }, + "impact": { + "series_reduction": 0, + "percentage": 0, + "estimation_method": "broken_panel" + }, + "documentation_url": "https://remetric.dev/findings/broken-panel" +} +``` diff --git a/docs/findings/dashboard-sprawl.md b/docs/findings/dashboard-sprawl.md deleted file mode 100644 index 0cade47..0000000 --- a/docs/findings/dashboard-sprawl.md +++ /dev/null @@ -1,22 +0,0 @@ -# Dashboard sprawl - -**Class:** `dashboard-sprawl` -**Category:** `dashboard_sprawl` - -!!! info "Not yet implemented" - The `DashboardSprawlAnalyzer` is on the v0.1 launch path but not yet - shipped. The Class slug is reserved. - -## What it will detect - -When implemented, this class will surface: - -- **Untouched dashboards** - dashboards that haven't been viewed in over N days - (last-viewed timestamp via the Grafana API). -- **Broken panels** - dashboard panels whose queries reference metrics that no - longer exist (or never existed) in Prometheus. - -## Status - -See the [backlog](https://github.com/remetric-dev/remetric/issues) for current -implementation status. diff --git a/docs/findings/index.md b/docs/findings/index.md index 146b910..fe44529 100644 --- a/docs/findings/index.md +++ b/docs/findings/index.md @@ -11,4 +11,4 @@ that triggered it. Each class has its own reference page below. | [Label pattern overly granular](label-pattern-overly-granular.md) | Low / Medium | label_patterns | | [Never-firing alert](never-firing-alert.md) | Medium | alert_hygiene | | [Always-firing alert](always-firing-alert.md) | Critical | alert_hygiene | -| [Dashboard sprawl](dashboard-sprawl.md) | n/a (coming soon) | dashboard_sprawl | +| [Broken panel](broken-panel.md) | Medium | dashboard_hygiene | diff --git a/docs/findings/unused-metric.md b/docs/findings/unused-metric.md index ed800e6..025331b 100644 --- a/docs/findings/unused-metric.md +++ b/docs/findings/unused-metric.md @@ -89,5 +89,5 @@ Suppress via `--ignore-metric `. See the - [Hot label](hot-label.md) - if the metric is heavily used but bloated by one label, fix the label rather than dropping the metric. -- [Dashboard sprawl](dashboard-sprawl.md) - if the metric is referenced only by - dashboards nobody looks at, its "used" status may itself be a lie. +- [Broken panel](broken-panel.md) - the inverse case: a dashboard panel + references a metric that no longer exists in Prometheus. diff --git a/mkdocs.yml b/mkdocs.yml index 9ede4da..6519cba 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -37,7 +37,7 @@ nav: - Label pattern overly granular: findings/label-pattern-overly-granular.md - Never-firing alert: findings/never-firing-alert.md - Always-firing alert: findings/always-firing-alert.md - - Dashboard sprawl (coming soon): findings/dashboard-sprawl.md + - Broken panel: findings/broken-panel.md markdown_extensions: - admonition From 247b8432f1eef6e88ca048354b89ba5545d01e3d Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 03:12:58 +0300 Subject: [PATCH 20/21] test(e2e): broken-panel scenario against demo stack Provisions a Grafana dashboard whose only panel queries a metric Prometheus does not scrape; runs 'remetric dashboards broken' and asserts the finding is emitted with class=broken-panel. --- e2e/dashboards_e2e_test.go | 54 +++++++++++++++++++ e2e/grafana/dashboards/broken-panel-demo.json | 17 ++++++ 2 files changed, 71 insertions(+) create mode 100644 e2e/dashboards_e2e_test.go create mode 100644 e2e/grafana/dashboards/broken-panel-demo.json diff --git a/e2e/dashboards_e2e_test.go b/e2e/dashboards_e2e_test.go new file mode 100644 index 0000000..83f54b2 --- /dev/null +++ b/e2e/dashboards_e2e_test.go @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Andrei Taranik + +//go:build e2e + +package e2e + +import ( + "encoding/json" + "strings" + "testing" +) + +// TestE2E_DashboardsBroken_JSON verifies the dashboards broken subcommand +// surfaces the broken panel provisioned by e2e/grafana/dashboards/broken-panel-demo.json, +// which references a metric Prometheus does not scrape. +func TestE2E_DashboardsBroken_JSON(t *testing.T) { + out, err := runCmd(t, binPath(t), + "dashboards", "broken", + "--prometheus", "http://localhost:9090", + "--grafana", "http://localhost:3000", + "--output", "json", + ) + if err != nil { + t.Fatalf("dashboards broken failed: %v\noutput:\n%s", err, out) + } + + var payload struct { + Findings []struct { + Class string `json:"class"` + Metric string `json:"metric"` + Dashboard string `json:"dashboard"` + } `json:"findings"` + } + if err := json.Unmarshal([]byte(out), &payload); err != nil { + t.Fatalf("json.Unmarshal: %v\noutput:\n%s", err, out) + } + if len(payload.Findings) == 0 { + t.Fatalf("expected at least 1 finding; got 0\noutput:\n%s", out) + } + found := false + for _, f := range payload.Findings { + if f.Class == "broken-panel" && f.Metric == "nonexistent_broken_panel_demo_xyz_total" { + if !strings.Contains(f.Dashboard, "Broken Panel Demo") { + t.Errorf("Dashboard field = %q, want %q", f.Dashboard, "Broken Panel Demo") + } + found = true + break + } + } + if !found { + t.Errorf("expected broken-panel finding for nonexistent_broken_panel_demo_xyz_total; got %+v", payload.Findings) + } +} diff --git a/e2e/grafana/dashboards/broken-panel-demo.json b/e2e/grafana/dashboards/broken-panel-demo.json new file mode 100644 index 0000000..d8576c4 --- /dev/null +++ b/e2e/grafana/dashboards/broken-panel-demo.json @@ -0,0 +1,17 @@ +{ + "uid": "broken-panel-demo", + "title": "Broken Panel Demo", + "schemaVersion": 36, + "version": 1, + "panels": [ + { + "id": 1, + "type": "graph", + "title": "Bogus Metric", + "datasource": {"type": "prometheus", "uid": "Prometheus"}, + "targets": [ + {"expr": "nonexistent_broken_panel_demo_xyz_total", "refId": "A"} + ] + } + ] +} From dc6b7c70b36e549c8cec4159c5fd7f89b7398f04 Mon Sep 17 00:00:00 2001 From: Andrei Taranik Date: Sat, 23 May 2026 03:31:02 +0300 Subject: [PATCH 21/21] docs: surface dashboardhygiene + --ignore-dashboard in README + help text Follow-up to the analyzer landing. Updates user-facing copy that still listed the v0.1 analyzer set as four (now five with broken-panel) and omitted --ignore-dashboard from the ignore-* table. --- README.md | 17 ++++++++++------- internal/cli/report.go | 8 +++++--- internal/cli/scan.go | 9 ++++++--- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 0380a7e..d8ee492 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,10 @@ Re-metric your stack - find waste in Prometheus, Grafana & Loki. Prometheus server and it prints a ranked, actionable list of cardinality problems with suggested `metric_relabel_configs` fixes. -> Status: **alpha** - cardinality, label-pattern, unused-metric, and -> alert-hygiene analyzers are wired up. JSON output, Grafana integration, -> unified `remetric scan`, and HTML/Markdown reports shipped. +> Status: **alpha** - cardinality, label-pattern, unused-metric, +> alert-hygiene, and dashboard-hygiene (broken-panel) analyzers are +> wired up. JSON output, Grafana integration, unified `remetric scan`, +> and HTML/Markdown reports shipped. ![remetric demo](demo/remetric.gif) @@ -248,6 +249,7 @@ flag is ignored by `report` - use `--format` instead. | `remetric metrics unused` | Ingested ∖ used metrics (needs Grafana for dashboard coverage)| | `remetric alerts unused` | Alerts that never fired in the lookback window | | `remetric alerts always-firing` | Alerts firing >=95% of the lookback window | +| `remetric dashboards broken` | Flag dashboards whose panels reference missing metrics | | `remetric report` | Run every analyzer, render terminal/json/html/markdown | | `remetric scan` | Run every available analyzer, emit a unified Report | @@ -302,13 +304,14 @@ Suppress findings that are known noise or out of scope. Patterns are **anchored full-match** regexes: `foo_.*` matches `foo_bar` but not `xfoo_bar`. Empty / whitespace-only patterns are silently ignored. -Three target fields, each with its own flag (repeatable): +Four target fields, each with its own flag (repeatable): | Flag | Drops findings whose ... | |------|--------------------------| -| `--ignore-metric REGEX` | metric name matches | -| `--ignore-label REGEX` | evidence label matches | -| `--ignore-alert REGEX` | alert name matches | +| `--ignore-metric REGEX` | metric name matches | +| `--ignore-label REGEX` | evidence label matches | +| `--ignore-alert REGEX` | alert name matches | +| `--ignore-dashboard REGEX` | dashboard title matches | ```bash # Repeatable flag diff --git a/internal/cli/report.go b/internal/cli/report.go index 6ef6475..2fb7857 100644 --- a/internal/cli/report.go +++ b/internal/cli/report.go @@ -40,9 +40,11 @@ func newReportCmd() *cobra.Command { cmd := &cobra.Command{ Use: "report", Short: "Run all analyzers and emit a report in the chosen format.", - Long: `Runs cardinality, label patterns, unused metrics, and alert hygiene, -then renders the unified report to stdout or a file in terminal, JSON, -HTML, or Markdown format. + Long: `Runs cardinality, label patterns, unused metrics, alert hygiene, +and dashboard hygiene (broken panels), then renders the unified report +to stdout or a file in terminal, JSON, HTML, or Markdown format. +unused-metrics and dashboard-hygiene need --grafana to fully populate +their signal; without it they emit a warning and skip the dashboard walk. The global --output flag is ignored by this subcommand; use --format instead. Use --out FILE to write to a file (or '-' for stdout).`, diff --git a/internal/cli/scan.go b/internal/cli/scan.go index 91fc67f..88b1332 100644 --- a/internal/cli/scan.go +++ b/internal/cli/scan.go @@ -29,9 +29,12 @@ func newScanCmd() *cobra.Command { cmd := &cobra.Command{ Use: "scan", Short: "Run every available analyzer and emit a unified report.", - Long: `Orchestrates cardinality, label-pattern, alert-hygiene, and -(if --grafana is set) unused-metrics analyzers. Findings are merged -into a single findings.Report shape - see the JSON schema in the spec.`, + Long: `Orchestrates the cardinality, label-pattern, alert-hygiene, +unused-metrics, and dashboard-hygiene (broken panels) analyzers. +unused-metrics and dashboard-hygiene need --grafana to fully +populate their signal; without it they emit a warning and skip the +dashboard walk. Findings are merged into a single findings.Report +shape - see the JSON schema in the spec.`, Example: ` # Full scan with Grafana remetric scan --prometheus http://localhost:9090 --grafana http://localhost:3000