From 02099a4eee5383233cd2878cfe8be92bee48b0ef Mon Sep 17 00:00:00 2001 From: Ben Cairns Date: Tue, 2 Jun 2026 18:06:11 -0500 Subject: [PATCH] telemetry: add ip-route and ip-pim-neighbor to default state-collect kinds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two kinds to the state-ingest server's default state-to-collect map: - ip-route: show ip route vrf all Exposes admin distance, programmed flags (hardwareProgrammed, kernelProgrammed), routeAction, routeLeaked, directlyConnected — fields the gNMI AFT path does not surface. Unblocks malbeclabs/infra#1410. - ip-pim-neighbor: show ip pim neighbor PIM neighbor adjacencies (state, hello-interval, expires, etc.) on devices running PIM sparse-mode. Unblocks the per-multicast-group dashboard's PIM neighbor health panel (malbeclabs/infra#1428). Both are CLI-based state-collect rather than gNMI because: - ip-route: gNMI AFT omits operationally critical fields (admin distance, programmed flags). - ip-pim-neighbor: OpenConfig PIM coverage in Arista's gNMI server is limited and the existing multicast state-collect path (mroute, MSDP) is the established pattern. Per-command interval + device-pubkey targeting (infra#1409) is a separate change. These two kinds initially run on the default cycle. --- CHANGELOG.md | 1 + telemetry/state-ingest/pkg/server/config.go | 2 ++ telemetry/state-ingest/pkg/server/handler_test.go | 4 +++- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ea38262f2..da6682b59 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ All notable changes to this project will be documented in this file. - Add `tools/stress/device-observer/`, a per-device sampling tool for the GRE Tunnel Capacity Study. Each tick issues five eAPI `show` commands, scrapes the doublezero-agent Prometheus endpoint, polls EOS syslog via `show logging last` with cross-tick dedupe, tails the orchestrator's agent log for abort-trigger patterns, and tails the orchestrator's runlog to compute provision/deprovision durations. The abort decider is stubbed. - Telemetry - Drop the redundant `ip-msdp-sa-cache` kind from the state-ingest server's default state-collect command list. `show ip msdp sa-cache rejected` already returns the full SA cache (accepted SAs in the `acceptedSaMsg` array plus any rejected SAs in `rejectedSaMsg`), so the bare `show ip msdp sa-cache` collection is redundant — devices were running both commands per tick and uploading the same accepted-SA data twice. The `ip-msdp-sa-cache-rejected` kind is retained. + - Add `ip-route` (`show ip route vrf all`) and `ip-pim-neighbor` (`show ip pim neighbor`) to the state-ingest server's default state-collect command list. `ip-route` exposes admin distance, programmed flags (`hardwareProgrammed`, `kernelProgrammed`), and route action — fields the gNMI AFT path doesn't surface. `ip-pim-neighbor` unblocks the multicast operator dashboard's PIM neighbor health panel. - Telemetry (geoprobe) - Retry transient `bind: invalid argument` failures when allocating per-probe UDP sockets in `Publisher.AddProbe`, matching the existing retry-on-bind pattern in `Pinger`. The shared retry helper is lifted into `retry.go` so the publisher and pinger paths use the same exponential-backoff logic. Fixes intermittent `TestPublisher_RemoveProbe`/`TestPublisher_AddProbe` CI flakes caused by concurrent ephemeral-port allocation ([#3765](https://github.com/malbeclabs/doublezero/issues/3765)) diff --git a/telemetry/state-ingest/pkg/server/config.go b/telemetry/state-ingest/pkg/server/config.go index 6db35b2ad..e5deace74 100644 --- a/telemetry/state-ingest/pkg/server/config.go +++ b/telemetry/state-ingest/pkg/server/config.go @@ -23,8 +23,10 @@ var ( defaultStateToCollectShowCommands = map[string]string{ "snmp-mib-ifmib-ifindex": "show snmp mib ifmib ifindex", "isis-database-detail": "show isis database detail", + "ip-route": "show ip route vrf all", "ip-mroute": "show ip mroute", "ip-mroute-count": "show ip mroute count", + "ip-pim-neighbor": "show ip pim neighbor", "ip-msdp-summary": "show ip msdp summary", "ip-msdp-pim-sa-cache": "show ip msdp pim sa-cache", "ip-msdp-sa-cache-rejected": "show ip msdp sa-cache rejected", diff --git a/telemetry/state-ingest/pkg/server/handler_test.go b/telemetry/state-ingest/pkg/server/handler_test.go index f1e52f6d5..899c6c0db 100644 --- a/telemetry/state-ingest/pkg/server/handler_test.go +++ b/telemetry/state-ingest/pkg/server/handler_test.go @@ -860,7 +860,7 @@ func TestTelemetry_StateIngest_Handler_StateToCollect_UsesDefaultShowCommandsAnd var resp types.StateToCollectResponse require.NoError(t, json.Unmarshal(rr.Body.Bytes(), &resp)) - require.Len(t, resp.ShowCommands, 7) + require.Len(t, resp.ShowCommands, 9) require.Len(t, resp.Custom, 1) // Convert to map for order-independent comparison (map iteration is non-deterministic) @@ -871,8 +871,10 @@ func TestTelemetry_StateIngest_Handler_StateToCollect_UsesDefaultShowCommandsAnd require.Equal(t, map[string]string{ "snmp-mib-ifmib-ifindex": "show snmp mib ifmib ifindex", "isis-database-detail": "show isis database detail", + "ip-route": "show ip route vrf all", "ip-mroute": "show ip mroute", "ip-mroute-count": "show ip mroute count", + "ip-pim-neighbor": "show ip pim neighbor", "ip-msdp-summary": "show ip msdp summary", "ip-msdp-pim-sa-cache": "show ip msdp pim sa-cache", "ip-msdp-sa-cache-rejected": "show ip msdp sa-cache rejected",