From f838035adca256d04b63fe81b4a0586f69fb7511 Mon Sep 17 00:00:00 2001 From: Nik Weidenbacher Date: Wed, 1 Jul 2026 16:42:51 +0000 Subject: [PATCH 1/5] e2e/qa: run testnet shred-subscription settlement on solana devnet The testnet shred-subscription program moved from the DZ Ledger testnet to Solana devnet. Stop forcing the testnet settlement RPC onto the DZ ledger so --url (and the USDC balance / program-state reads) resolve to networkConfig.SolanaRPCURL, which SOLANA_RPC_URL overrides to the Solana devnet RPC endpoint at run time. DZ devnet is unchanged. Point the testnet USDC mint at the Solana devnet mint so GetUSDCBalance derives the right ATA and --usdc-mint carries the correct mint. Serviceability still flows over --dz-ledger-url (DZ Ledger testnet), unchanged. Part of malbeclabs/infra#1761 / epic malbeclabs/infra#1758. --- config/constants.go | 4 +++- e2e/internal/qa/client.go | 12 ++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/config/constants.go b/config/constants.go index 69b2fbcf21..6b55040af7 100644 --- a/config/constants.go +++ b/config/constants.go @@ -30,7 +30,9 @@ const ( TestnetTelemetryStateIngestURL = "https://telemetry-state-in-testnet.doublezero.xyz" TestnetGeolocationProgramID = "3AG2BCA7gAm47Q6xZzPQcUUYvnBjxAvPKnPz919cxHF4" TestnetShredSubscriptionProgramID = "dzshrr3yL57SB13sJPYHYo3TV8Bo1i1FxkyrZr3bKNE" - TestnetUSDCMint = "uSDZq2RMuxrEf7gqgDjR8wJCtCyaDAQk2e5jLAaoeeM" + // Testnet shred-subscription runs on Solana devnet, so settlement uses the + // Solana devnet USDC mint (not the DZ ledger testnet mint). + TestnetUSDCMint = "4zMMC9srt5Ri5X14GAgXhaHii3GnPAEERYPJgZJDncDU" TestnetTelemetryGNMITunnelServerAddr = "gnmic-testnet.doublezero.xyz:443" // Devnet constants. 
diff --git a/e2e/internal/qa/client.go b/e2e/internal/qa/client.go index 5beafb84b6..8164adae53 100644 --- a/e2e/internal/qa/client.go +++ b/e2e/internal/qa/client.go @@ -125,7 +125,8 @@ type Client struct { // Settlement config passed to doublezero-solana shreds commands. // SolanaRPCURL is the Solana RPC endpoint for settlement transactions (--url). - // On testnet this is the DZ ledger URL; on mainnet it's the public Solana RPC. + // On testnet this is Solana devnet (via SOLANA_RPC_URL); on mainnet the public + // Solana RPC; on devnet the DZ ledger URL. SolanaRPCURL string ShredSubscriptionProgramID string DZLedgerURL string @@ -155,11 +156,14 @@ func NewClient(ctx context.Context, log *slog.Logger, hostname string, port int, serviceabilityClient := serviceability.New(rpc.New(networkConfig.LedgerPublicRPCURL), networkConfig.ServiceabilityProgramID) - // Settlement transactions on testnet/devnet use the DZ ledger RPC endpoint - // (which hosts the settlement programs). Mainnet and localnet use the + // Settlement transactions on devnet use the DZ ledger RPC endpoint (which + // hosts the settlement programs there). Testnet reads/writes the + // shred-subscription program on Solana devnet via networkConfig.SolanaRPCURL + // (the SOLANA_RPC_URL override, a Solana devnet RPC endpoint in CI; + // defaults to the public Solana endpoint). Mainnet and localnet use the // standard Solana RPC. 
solanaRPCURL := networkConfig.SolanaRPCURL - if networkConfig.Moniker == config.EnvTestnet || networkConfig.Moniker == config.EnvDevnet { + if networkConfig.Moniker == config.EnvDevnet { solanaRPCURL = networkConfig.LedgerPublicRPCURL } From a21a184c163b27d37230a50d37f23b4fd9d460ef Mon Sep 17 00:00:00 2001 From: Nik Weidenbacher Date: Wed, 1 Jul 2026 20:16:00 +0000 Subject: [PATCH 2/5] fixup --- config/constants.go | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/config/constants.go b/config/constants.go index 6b55040af7..74d0c3b92a 100644 --- a/config/constants.go +++ b/config/constants.go @@ -19,17 +19,17 @@ const ( MainnetUSDCMint = "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v" // Testnet constants. - TestnetLedgerPublicRPCURL = "https://doublezerolocalnet.rpcpool.com/8a4fd3f4-0977-449f-88c7-63d4b0f10f16" - TestnetServiceabilityProgramID = "DZtnuQ839pSaDMFG5q1ad2V95G82S5EC4RrB3Ndw2Heb" - TestnetTelemetryProgramID = "3KogTMmVxc5eUHtjZnwm136H5P8tvPwVu4ufbGPvM7p1" - TestnetInternetLatencyCollectorPK = "HWGQSTmXWMB85NY2vFLhM1nGpXA8f4VCARRyeGNbqDF1" - TestnetDeviceLocalASN = 65342 - TestnetTwoZOracleURL = "https://sol-2z-oracle-api-v1.testnet.doublezero.xyz" - TestnetSolanaRPC = "https://api.testnet.solana.com" - TestnetTelemetryFlowIngestURL = "http://telemetry-flow-in.testnet.doublezero.xyz" - TestnetTelemetryStateIngestURL = "https://telemetry-state-in-testnet.doublezero.xyz" - TestnetGeolocationProgramID = "3AG2BCA7gAm47Q6xZzPQcUUYvnBjxAvPKnPz919cxHF4" - TestnetShredSubscriptionProgramID = "dzshrr3yL57SB13sJPYHYo3TV8Bo1i1FxkyrZr3bKNE" + TestnetLedgerPublicRPCURL = "https://doublezerolocalnet.rpcpool.com/8a4fd3f4-0977-449f-88c7-63d4b0f10f16" + TestnetServiceabilityProgramID = "DZtnuQ839pSaDMFG5q1ad2V95G82S5EC4RrB3Ndw2Heb" + TestnetTelemetryProgramID = "3KogTMmVxc5eUHtjZnwm136H5P8tvPwVu4ufbGPvM7p1" + TestnetInternetLatencyCollectorPK = "HWGQSTmXWMB85NY2vFLhM1nGpXA8f4VCARRyeGNbqDF1" + TestnetDeviceLocalASN = 65342 + 
TestnetTwoZOracleURL = "https://sol-2z-oracle-api-v1.testnet.doublezero.xyz" + TestnetSolanaRPC = "https://api.testnet.solana.com" + TestnetTelemetryFlowIngestURL = "http://telemetry-flow-in.testnet.doublezero.xyz" + TestnetTelemetryStateIngestURL = "https://telemetry-state-in-testnet.doublezero.xyz" + TestnetGeolocationProgramID = "3AG2BCA7gAm47Q6xZzPQcUUYvnBjxAvPKnPz919cxHF4" + TestnetShredSubscriptionProgramID = "dzshrr3yL57SB13sJPYHYo3TV8Bo1i1FxkyrZr3bKNE" // Testnet shred-subscription runs on Solana devnet, so settlement uses the // Solana devnet USDC mint (not the DZ ledger testnet mint). TestnetUSDCMint = "4zMMC9srt5Ri5X14GAgXhaHii3GnPAEERYPJgZJDncDU" From f31cac0dbb3f7f799896a184f58498de3c71b790 Mon Sep 17 00:00:00 2001 From: Nik Weidenbacher Date: Wed, 1 Jul 2026 21:15:02 +0000 Subject: [PATCH 3/5] e2e/qa: query seat price by device pubkey to avoid CLI code resolution The settlement test's query_seat_price ran 'doublezero-solana shreds price' without a device filter, which resolves device codes via serviceability. The CLI refuses code resolution when it can't classify the cluster (a private Solana devnet RPC URL is seen as localnet), failing the query. Pass the closest device's pubkey through FeedSeatPrice so the agent runs 'shreds price --device PUBKEY', which skips code resolution. Match the returned price by pubkey since code may be absent. pay/withdraw already use --device, so this makes the whole settlement path code-resolution-free. Part of malbeclabs/infra#1761 / epic malbeclabs/infra#1758. 
--- e2e/internal/qa/client_settlement.go | 13 ++++++++----- e2e/internal/rpc/agent.go | 7 +++++++ e2e/proto/qa/agent.proto | 1 + e2e/proto/qa/gen/pb-go/agent.pb.go | 15 ++++++++++++--- e2e/qa_multicast_settlement_test.go | 6 ++++-- 5 files changed, 32 insertions(+), 10 deletions(-) diff --git a/e2e/internal/qa/client_settlement.go b/e2e/internal/qa/client_settlement.go index f485b2cd1f..59c09feb1c 100644 --- a/e2e/internal/qa/client_settlement.go +++ b/e2e/internal/qa/client_settlement.go @@ -106,11 +106,13 @@ func (c *Client) ClosestDevice(ctx context.Context) (*Device, error) { return device, nil } -// FeedSeatPrice calls the FeedSeatPrice RPC to query device seat prices. -// This is an idempotent read, so on RPC failure it fails over to the next -// endpoint and retries. -func (c *Client) FeedSeatPrice(ctx context.Context) ([]*pb.DevicePrice, error) { - c.log.Debug("Querying seat prices", "host", c.Host) +// FeedSeatPrice calls the FeedSeatPrice RPC to query seat pricing for a single +// device (by pubkey). Querying by pubkey avoids device-code resolution, which +// the CLI refuses when it can't classify the cluster (e.g. a private Solana +// devnet RPC URL). This is an idempotent read, so on RPC failure it fails over +// to the next endpoint and retries. 
+func (c *Client) FeedSeatPrice(ctx context.Context, devicePubkey string) ([]*pb.DevicePrice, error) { + c.log.Debug("Querying seat prices", "host", c.Host, "device", devicePubkey) var prices []*pb.DevicePrice err := c.withReadFailover(func(rpcURL string) error { resp, err := c.grpcClient.FeedSeatPrice(ctx, &pb.FeedSeatPriceRequest{ @@ -119,6 +121,7 @@ func (c *Client) FeedSeatPrice(ctx context.Context) ([]*pb.DevicePrice, error) { UsdcMint: c.USDCMint, Keypair: c.Keypair, ShredSubscriptionProgramId: c.ShredSubscriptionProgramID, + DevicePubkey: devicePubkey, }) if err != nil { return err diff --git a/e2e/internal/rpc/agent.go b/e2e/internal/rpc/agent.go index 92d16ce1c3..0992288306 100644 --- a/e2e/internal/rpc/agent.go +++ b/e2e/internal/rpc/agent.go @@ -362,6 +362,13 @@ func (q *QAAgent) FeedSeatPrice(ctx context.Context, req *pb.FeedSeatPriceReques if req.GetSolanaRpcUrl() != "" { args = append(args, "--url", req.GetSolanaRpcUrl()) } + // Query a single device by pubkey rather than listing all. The list path + // resolves device codes via serviceability, which the CLI refuses when it + // can't classify the cluster (e.g. a private Solana devnet RPC URL, seen as + // localnet); passing --device sidesteps code resolution entirely. 
+ if req.GetDevicePubkey() != "" { + args = append(args, "--device", req.GetDevicePubkey()) + } cmdCtx, cancel := context.WithTimeout(ctx, 60*time.Second) defer cancel() diff --git a/e2e/proto/qa/agent.proto b/e2e/proto/qa/agent.proto index d00b4cb10d..7997216375 100644 --- a/e2e/proto/qa/agent.proto +++ b/e2e/proto/qa/agent.proto @@ -238,6 +238,7 @@ message FeedSeatPriceRequest { string usdc_mint = 3; string keypair = 4; string shred_subscription_program_id = 5; + string device_pubkey = 6; } message DevicePrice { diff --git a/e2e/proto/qa/gen/pb-go/agent.pb.go b/e2e/proto/qa/gen/pb-go/agent.pb.go index 645ca9310a..4dfad737cc 100644 --- a/e2e/proto/qa/gen/pb-go/agent.pb.go +++ b/e2e/proto/qa/gen/pb-go/agent.pb.go @@ -1,7 +1,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.36.10 -// protoc (unknown) +// protoc v3.21.12 // source: agent.proto package qa @@ -1949,6 +1949,7 @@ type FeedSeatPriceRequest struct { UsdcMint string `protobuf:"bytes,3,opt,name=usdc_mint,json=usdcMint,proto3" json:"usdc_mint,omitempty"` Keypair string `protobuf:"bytes,4,opt,name=keypair,proto3" json:"keypair,omitempty"` ShredSubscriptionProgramId string `protobuf:"bytes,5,opt,name=shred_subscription_program_id,json=shredSubscriptionProgramId,proto3" json:"shred_subscription_program_id,omitempty"` + DevicePubkey string `protobuf:"bytes,6,opt,name=device_pubkey,json=devicePubkey,proto3" json:"device_pubkey,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -2018,6 +2019,13 @@ func (x *FeedSeatPriceRequest) GetShredSubscriptionProgramId() string { return "" } +func (x *FeedSeatPriceRequest) GetDevicePubkey() string { + if x != nil { + return x.DevicePubkey + } + return "" +} + type DevicePrice struct { state protoimpl.MessageState `protogen:"open.v1"` DeviceCode string `protobuf:"bytes,1,opt,name=device_code,json=deviceCode,proto3" json:"device_code,omitempty"` @@ -2416,13 +2424,14 @@ const file_agent_proto_rawDesc = "" + 
"\x1dshred_subscription_program_id\x18\x05 \x01(\tR\x1ashredSubscriptionProgramId\x12\"\n" + "\rdz_ledger_url\x18\x06 \x01(\tR\vdzLedgerUrl\x12\x1b\n" + "\tusdc_mint\x18\a \x01(\tR\busdcMint\x12\x18\n" + - "\akeypair\x18\b \x01(\tR\akeypair\"\xda\x01\n" + + "\akeypair\x18\b \x01(\tR\akeypair\"\xff\x01\n" + "\x14FeedSeatPriceRequest\x12$\n" + "\x0esolana_rpc_url\x18\x01 \x01(\tR\fsolanaRpcUrl\x12\"\n" + "\rdz_ledger_url\x18\x02 \x01(\tR\vdzLedgerUrl\x12\x1b\n" + "\tusdc_mint\x18\x03 \x01(\tR\busdcMint\x12\x18\n" + "\akeypair\x18\x04 \x01(\tR\akeypair\x12A\n" + - "\x1dshred_subscription_program_id\x18\x05 \x01(\tR\x1ashredSubscriptionProgramId\"\xd1\x02\n" + + "\x1dshred_subscription_program_id\x18\x05 \x01(\tR\x1ashredSubscriptionProgramId\x12#\n" + + "\rdevice_pubkey\x18\x06 \x01(\tR\fdevicePubkey\"\xd1\x02\n" + "\vDevicePrice\x12\x1f\n" + "\vdevice_code\x18\x01 \x01(\tR\n" + "deviceCode\x12#\n" + diff --git a/e2e/qa_multicast_settlement_test.go b/e2e/qa_multicast_settlement_test.go index a38504f6c8..71785d9ebf 100644 --- a/e2e/qa_multicast_settlement_test.go +++ b/e2e/qa_multicast_settlement_test.go @@ -130,12 +130,14 @@ func TestQA_MulticastSettlement(t *testing.T) { } if !t.Run("query_seat_price", func(t *testing.T) { - prices, err := client.FeedSeatPrice(ctx) + prices, err := client.FeedSeatPrice(ctx, device.PubKey) require.NoError(t, err, "failed to get seat prices") + // Match by pubkey, not code: querying by --device skips code resolution, + // so the returned rows may not carry a device_code. 
var price *pb.DevicePrice for _, p := range prices { - if p.DeviceCode == device.Code { + if p.DevicePubkey == device.PubKey { price = p break } From bb9fe8b5eb9db71d7ef3551ce8ed75f03085e160 Mon Sep 17 00:00:00 2001 From: Nik Weidenbacher Date: Thu, 2 Jul 2026 02:29:24 +0000 Subject: [PATCH 4/5] e2e/qa: give multicast tunnel-up its own 180s timeout validate_tunnel_up (WaitForMulticastStatusUp) shared the 90s waitForStatusUpTimeout, but the multicast tunnel only comes up after the oracle subscribes the seat to its shred groups on its ~60s reconcile loop (not on seat-allocation ack), so tunnel-up races the cadence and flakes on devnet. Give multicast a dedicated 180s timeout (> one reconcile cycle + tx confirm + client poll); unicast keeps 90s. Stopgap until the oracle subscribes on grant (event-driven). --- e2e/internal/qa/client.go | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/e2e/internal/qa/client.go b/e2e/internal/qa/client.go index 8164adae53..d495d4556f 100644 --- a/e2e/internal/qa/client.go +++ b/e2e/internal/qa/client.go @@ -28,6 +28,12 @@ import ( const ( disconnectTimeout = 150 * time.Second waitForStatusUpTimeout = 90 * time.Second + // Multicast (shred-subscription) tunnels come up only after the oracle + // subscribes the seat to its shred groups, which is driven by the oracle's + // ~60s reconcile loop rather than on seat-allocation ack. Allow more than + // one full reconcile cycle (plus tx confirm + client poll) so tunnel-up + // doesn't race the cadence. (Remove once the oracle subscribes on grant.) + waitForMulticastStatusUpTimeout = 180 * time.Second waitForStatusDisconnectedTimeout = 90 * time.Second waitForUserDeletionTimeout = 90 * time.Second @@ -397,19 +403,19 @@ func (c *Client) WaitForStatusUp(ctx context.Context) error { // its session is up. Prefer this over WaitForStatusUp in multi-tunnel contexts // where other tunnel types may already be present. 
func (c *Client) WaitForUnicastStatusUp(ctx context.Context) error { - return c.waitForUserTypeStatusUp(ctx, "IBRL", FindIBRLStatus) + return c.waitForUserTypeStatusUp(ctx, "IBRL", FindIBRLStatus, waitForStatusUpTimeout) } // WaitForMulticastStatusUp polls until a Multicast status entry exists and // its session is up. Prefer this over WaitForStatusUp in multi-tunnel contexts // where other tunnel types may already be present. func (c *Client) WaitForMulticastStatusUp(ctx context.Context) error { - return c.waitForUserTypeStatusUp(ctx, "Multicast", FindMulticastStatus) + return c.waitForUserTypeStatusUp(ctx, "Multicast", FindMulticastStatus, waitForMulticastStatusUpTimeout) } // waitForUserTypeStatusUp polls until find returns a non-nil status whose // session is up. userType is used only for log context. -func (c *Client) waitForUserTypeStatusUp(ctx context.Context, userType string, find func([]*pb.Status) *pb.Status) error { +func (c *Client) waitForUserTypeStatusUp(ctx context.Context, userType string, find func([]*pb.Status) *pb.Status, timeout time.Duration) error { c.log.Debug("Waiting for status to be up", "host", c.Host, "userType", userType) err := poll.Until(ctx, func() (bool, error) { resp, err := c.grpcClient.GetStatus(ctx, &emptypb.Empty{}) @@ -418,7 +424,7 @@ func (c *Client) waitForUserTypeStatusUp(ctx context.Context, userType string, f } s := find(resp.Status) return s != nil && IsStatusUp(s.SessionStatus), nil - }, waitForStatusUpTimeout, waitInterval) + }, timeout, waitInterval) if err != nil { return fmt.Errorf("failed to wait for %s status to be up on host %s: %w", userType, c.Host, err) } From e60da713aaccf9db5c12422dcf935c16ba9eb015 Mon Sep 17 00:00:00 2001 From: Nik Weidenbacher Date: Thu, 2 Jul 2026 02:55:26 +0000 Subject: [PATCH 5/5] e2e/qa: crank multicast tunnel up/down timeouts to 360s (debug) Observed subscribe latency up to ~3.5 min on devnet (oracle subscribes on its ~60s reconcile loop, gated on the seat becoming active), which even 
180s didn't cover. Bump multicast up and add a dedicated multicast-disconnect timeout, both 360s, to blow past tunnel up/down and surface any downstream breakage. Temporary padding; real fix is event-driven subscribe/unsubscribe in the oracle. --- e2e/internal/qa/client.go | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/e2e/internal/qa/client.go b/e2e/internal/qa/client.go index d495d4556f..853644f8cc 100644 --- a/e2e/internal/qa/client.go +++ b/e2e/internal/qa/client.go @@ -26,16 +26,18 @@ import ( ) const ( - disconnectTimeout = 150 * time.Second - waitForStatusUpTimeout = 90 * time.Second - // Multicast (shred-subscription) tunnels come up only after the oracle - // subscribes the seat to its shred groups, which is driven by the oracle's - // ~60s reconcile loop rather than on seat-allocation ack. Allow more than - // one full reconcile cycle (plus tx confirm + client poll) so tunnel-up - // doesn't race the cadence. (Remove once the oracle subscribes on grant.) - waitForMulticastStatusUpTimeout = 180 * time.Second - waitForStatusDisconnectedTimeout = 90 * time.Second - waitForUserDeletionTimeout = 90 * time.Second + disconnectTimeout = 150 * time.Second + waitForStatusUpTimeout = 90 * time.Second + // Multicast (shred-subscription) tunnel up/down is driven by the oracle's + // ~60s reconcile loop (subscribe once the seat is active; unsubscribe after + // withdrawal), NOT on seat-allocation ack / withdrawal — observed latency up + // to ~3.5 min on devnet. Generous, dedicated timeouts so up/down don't race + // the cadence. TEMPORARY debug padding to surface downstream failures; the + // real fix is event-driven subscribe/unsubscribe in the oracle. 
+ waitForMulticastStatusUpTimeout = 360 * time.Second + waitForMulticastStatusDisconnectedTimeout = 360 * time.Second + waitForStatusDisconnectedTimeout = 90 * time.Second + waitForUserDeletionTimeout = 90 * time.Second // NOTE: This needs to be longer than 1m since BGP can sometimes throttle activity for that // amount of time if too much is happening consecutively for the same peers. @@ -513,7 +515,7 @@ func (c *Client) WaitForStatusDisconnected(ctx context.Context) error { // Prefer this over WaitForStatusDisconnected in multi-tunnel contexts where // other tunnel types (e.g. IBRL) remain up after a multicast seat is withdrawn. func (c *Client) WaitForMulticastStatusDisconnected(ctx context.Context) error { - return c.waitForUserTypeStatusDisconnected(ctx, "Multicast", FindMulticastStatus) + return c.waitForUserTypeStatusDisconnected(ctx, "Multicast", FindMulticastStatus, waitForMulticastStatusDisconnectedTimeout) } // WaitForIBRLStatusDisconnected polls until no IBRL (or IBRLWithAllocatedIP) @@ -547,7 +549,7 @@ func (c *Client) WaitForIBRLStatusDisconnected(ctx context.Context) error { // waitForUserTypeStatusDisconnected polls until find returns nil or a status // whose session is disconnected. userType is used only for log context. 
-func (c *Client) waitForUserTypeStatusDisconnected(ctx context.Context, userType string, find func([]*pb.Status) *pb.Status) error { +func (c *Client) waitForUserTypeStatusDisconnected(ctx context.Context, userType string, find func([]*pb.Status) *pb.Status, timeout time.Duration) error { c.log.Debug("Waiting for status to be disconnected", "host", c.Host, "userType", userType) err := poll.Until(ctx, func() (bool, error) { resp, err := c.grpcClient.GetStatus(ctx, &emptypb.Empty{}) @@ -556,7 +558,7 @@ func (c *Client) waitForUserTypeStatusDisconnected(ctx context.Context, userType } s := find(resp.Status) return s == nil || s.SessionStatus == UserStatusDisconnected, nil - }, waitForStatusDisconnectedTimeout, waitInterval) + }, timeout, waitInterval) if err != nil { return fmt.Errorf("failed to wait for %s status to be disconnected on host %s: %w", userType, c.Host, err) }