Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 26 additions & 5 deletions internal/connections/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,21 +192,33 @@ func (s *Client) List(ctx context.Context, sort SortMode) (ConnectionList, error
}

deadline := time.Now().Add(s.retryBudget)
var partial ConnectionList
havePartial := false
for {
list, retryAfter, err := s.tryList(ctx, sort)
if err == nil {
switch {
case err == nil && !list.HasUnreachableInstance():
return list, nil
}
if isTimeoutError(err) && ctx.Err() != nil {
case err == nil:
// 200 OK, but the server flagged one or more instances unreachable
// (a partial fan-out failure). These are usually momentary under
// load and clear within a poll or two, so retry within the budget.
// Retry locally; the response carries no Retry-After hint.
partial, havePartial = list, true
retryAfter = 0
case isTimeoutError(err) && ctx.Err() != nil:

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Timeout drops saved partial list

Medium Severity

After a partial 200 is stored for retry, the overall List timeout can fire either while waiting between attempts or inside tryList. The wait path returns the saved partial with a nil error, but the in-flight timeout path always returns a timeout error and discards havePartial, so callers see a failed refresh instead of the degraded snapshot the PR intends.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 179b666. Configure here.

if callerCtx.Err() != nil {
return ConnectionList{}, fmt.Errorf("list connections: %w", callerCtx.Err())
}
return ConnectionList{}, fmt.Errorf("list connections: request timed out after %s, please retry", s.cfg.RequestTimeout)
}
if !errors.Is(err, errListWarming) {
case !errors.Is(err, errListWarming):
return ConnectionList{}, err
}

if s.retryBudget <= 0 {
if havePartial {
return partial, nil
}
return ConnectionList{}, errListWarmingExhausted
}

Expand All @@ -215,12 +227,21 @@ func (s *Client) List(ctx context.Context, sort SortMode) (ConnectionList, error
delay = s.retryDelay()
}
if time.Now().Add(delay).After(deadline) {
if havePartial {
return partial, nil
}
return ConnectionList{}, errListWarmingExhausted
}

select {
case <-time.After(delay):
case <-ctx.Done():
if callerCtx.Err() != nil {
return ConnectionList{}, fmt.Errorf("list connections: %w", callerCtx.Err())
}
if havePartial {
return partial, nil
}
return ConnectionList{}, fmt.Errorf("list connections: %w", ctx.Err())
}
}
Expand Down
133 changes: 130 additions & 3 deletions internal/connections/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ const sampleListResponse = `{
"captured_at": "2026-04-29T12:34:56.789Z",
"instances": [
{"id": "primary", "role": "primary", "error": null},
{"id": "replica-1", "role": "replica", "error": "timeout after 2s"}
{"id": "replica-1", "role": "replica", "error": null}
],
"data": [
{
Expand Down Expand Up @@ -180,8 +180,8 @@ func TestClient_ListDecodesConnectionList(t *testing.T) {
if list.Instances[0] != (InstanceMeta{ID: "primary", Role: "primary"}) {
t.Errorf("Instances[0] = %+v, want primary/primary/(no error)", list.Instances[0])
}
if list.Instances[1] != (InstanceMeta{ID: "replica-1", Role: "replica", Error: "timeout after 2s"}) {
t.Errorf("Instances[1] = %+v, want replica-1/replica/timeout", list.Instances[1])
if list.Instances[1] != (InstanceMeta{ID: "replica-1", Role: "replica"}) {
t.Errorf("Instances[1] = %+v, want replica-1/replica/(no error)", list.Instances[1])
}

if first.InstanceRole != "primary" {
Expand Down Expand Up @@ -526,6 +526,103 @@ func TestClientListPartialResponseFriendlyMessage(t *testing.T) {
}
}

// partialInstanceListResponse models a 200 OK that the server returns while a
// fan-out to one instance failed: the primary is flagged unreachable while the
// replica reports cleanly. The error string mirrors a real captured trace.
const partialInstanceListResponse = `{
"type": "list",
"captured_at": "2026-04-29T12:34:56.789Z",
"instances": [
{"id": "primary", "role": "primary", "error": "remote service unavailable"},
{"id": "replica-1", "role": "replica", "error": null}
],
"data": []
}`

func TestClient_ListRetriesPartialInstanceErrorThenReturnsClean(t *testing.T) {
var calls atomic.Int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.Header().Set("Content-Type", "application/json")
if calls.Add(1) == 1 {
_, _ = io.WriteString(w, partialInstanceListResponse)
return
}
_, _ = io.WriteString(w, sampleListResponse)
}))
defer srv.Close()

c := newClientWithTimings(t, srv.URL, 500*time.Millisecond, 10*time.Millisecond, 0)
list, err := c.List(context.Background(), SortByTransactionStart)
if err != nil {
t.Fatalf("List: %v", err)
}
if got := calls.Load(); got != 2 {
t.Fatalf("calls = %d, want 2 (retried past the partial-instance failure)", got)
}
for _, inst := range list.Instances {
if inst.Error != "" {
t.Fatalf("returned list still reports unreachable instance %q (%q), want a clean retry result", inst.ID, inst.Error)
}
}
}

func TestClient_ListReturnsPartialAfterRetryBudgetExhausted(t *testing.T) {
var calls atomic.Int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
calls.Add(1)
w.Header().Set("Content-Type", "application/json")
_, _ = io.WriteString(w, partialInstanceListResponse)
}))
defer srv.Close()

c := newClientWithTimings(t, srv.URL, 40*time.Millisecond, 10*time.Millisecond, 0)
start := time.Now()
list, err := c.List(context.Background(), SortByTransactionStart)
elapsed := time.Since(start)
if err != nil {
t.Fatalf("List: want partial list, got error %v", err)
}
if calls.Load() < 2 {
t.Fatalf("calls = %d, want at least 2 (retried before giving up)", calls.Load())
}
if elapsed > time.Second {
t.Fatalf("List took %v, want retry budget to expire promptly", elapsed)
}
// A persistent partial failure still returns useful data: the reachable
// instances plus the per-instance error that drives the unreachable banner.
var found bool
for _, inst := range list.Instances {
if inst.ID == "primary" && inst.Error == "remote service unavailable" {
found = true
}
}
if !found {
t.Fatalf("partial list lost the instance error; instances = %+v", list.Instances)
}
}

func TestClient_ListDoesNotRetryPartialWhenBudgetDisabled(t *testing.T) {
var calls atomic.Int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
calls.Add(1)
w.Header().Set("Content-Type", "application/json")
_, _ = io.WriteString(w, partialInstanceListResponse)
}))
defer srv.Close()

c := newClientWithTimings(t, srv.URL, 0, 10*time.Millisecond, 0)
list, err := c.List(context.Background(), SortByTransactionStart)
if err != nil {
t.Fatalf("List: %v", err)
}
if got := calls.Load(); got != 1 {
t.Fatalf("calls = %d, want 1 (no retry when budget disabled)", got)
}
if got := list.Instances[0].Error; got != "remote service unavailable" {
t.Fatalf("Instances[0].Error = %q, want decoded instance error", got)
}
}

func TestClient_ListRetryWaitHonorsContextCancellation(t *testing.T) {
var calls atomic.Int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
Expand Down Expand Up @@ -555,6 +652,36 @@ func TestClient_ListRetryWaitHonorsContextCancellation(t *testing.T) {
}
}

func TestClient_ListPartialRetryWaitHonorsContextCancellation(t *testing.T) {
var calls atomic.Int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
calls.Add(1)
w.Header().Set("Content-Type", "application/json")
_, _ = io.WriteString(w, partialInstanceListResponse)
}))
defer srv.Close()

ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
defer cancel()

c := newClientWithTimings(t, srv.URL, 500*time.Millisecond, 250*time.Millisecond, 0)
start := time.Now()
_, err := c.List(ctx, SortByTransactionStart)
elapsed := time.Since(start)
if err == nil {
t.Fatal("List: want context error, got nil")
}
if !errors.Is(err, context.DeadlineExceeded) {
t.Fatalf("err = %v, want errors.Is(err, context.DeadlineExceeded)", err)
}
if got := calls.Load(); got != 1 {
t.Fatalf("calls = %d, want 1 because context expired during partial retry wait", got)
}
if elapsed > time.Second {
t.Fatalf("List took %v, want context cancellation to interrupt partial retry wait", elapsed)
}
}

func TestClient_ListHonorsRetryAfterHeader(t *testing.T) {
var calls atomic.Int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
Expand Down
12 changes: 12 additions & 0 deletions internal/connections/connection_list.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,18 @@ type ConnectionList struct {
Topology *Topology `json:"topology,omitempty"`
}

// HasUnreachableInstance reports whether the server flagged any instance as
// unreachable in the list response. These partial fan-out failures drive the
// "N of M instances unreachable" banner.
func (l ConnectionList) HasUnreachableInstance() bool {
for _, inst := range l.Instances {
if inst.Error != "" {
return true
}
}
return false
}

type Connection struct {
PID int `json:"pid"`
Instance string `json:"instance"`
Expand Down
4 changes: 3 additions & 1 deletion internal/connections/tui/detail.go
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,9 @@ func blockerTreeLabel(row blockerRow, selected bool) string {
// consistent across views.
return selectedRowStyle.Render("▶ " + label)
}
return label
// Reserve the caret's width on unselected rows so moving the selection
// doesn't shift the row text, matching the table view's fixed-width marker.
return " " + label
}

func wrapLines(text string, width int) []string {
Expand Down
17 changes: 17 additions & 0 deletions internal/connections/tui/detail_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -688,3 +688,20 @@ func TestDetailHelpHidesRefreshWhilePaused(t *testing.T) {
c.Assert(footer, qt.Not(qt.Contains), "r refresh")
c.Assert(footer, qt.Contains, "space resume")
}

func TestBlockerTreeLabelReservesCaretWidth(t *testing.T) {
c := qt.New(t)
row := blockerRow{PID: 123, Present: false}

// The selection caret ("▶ ") must not shift the row text: an unselected
// row reserves the caret's width so the label sits at the same column
// whether or not it is selected (matching the table view's fixed marker).
unselected := blockerTreeLabel(row, false)
c.Assert(unselected, qt.Equals, " "+blockerLabel(row))

// Selected rows use the caret + a space — the same two display columns as
// the unselected padding, so the label text never shifts between states.
selectedInner := ansi.Strip(blockerTreeLabel(row, true))
c.Assert(strings.HasPrefix(selectedInner, "▶ "), qt.IsTrue)
c.Assert(strings.TrimPrefix(selectedInner, "▶ "), qt.Equals, strings.TrimPrefix(unselected, " "))
}
9 changes: 9 additions & 0 deletions internal/connections/tui/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,15 @@ func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
m.consecutiveErrors++
return m, nil
}
// A degraded result reached us after the client exhausted its retries on
// a partial-instance failure. Don't swap it into the live view: hold the
// last good frame and let the staleness dot and captured-age surface it,
// rather than flashing the "N unreachable" banner over good data. First
// load has no good frame to hold, so fall through and show what we have.
if m.hasList && msg.list.HasUnreachableInstance() {
m.consecutiveErrors++
return m, nil
}
m.consecutiveErrors = 0
m.initialAccessDenied = false
cursor := m.samples.Push(msg.list)
Expand Down
40 changes: 40 additions & 0 deletions internal/connections/tui/model_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,46 @@ func TestModelKeepsLastListWhenRefreshFails(t *testing.T) {
c.Assert(view, qt.Contains, "error: connection refused")
}

func degradedList(captured time.Time, pid int) live.ConnectionList {
list := live.NewConnectionList(captured, []live.Connection{{PID: pid, Instance: "primary"}}, live.SortByTransactionStart)
list.Instances = []live.InstanceMeta{
{ID: "primary", Role: "primary", Error: "remote service unavailable"},
{ID: "replica-1", Role: "replica"},
}
return list
}

func TestModelHoldsLastGoodListOnPersistentPartial(t *testing.T) {
c := qt.New(t)
model := NewModel(context.Background(), &clientStub{}, time.Second, 0)
good := live.NewConnectionList(time.Now(), []live.Connection{{PID: 10, Instance: "primary"}}, live.SortByTransactionStart)

updated, _ := model.Update(listMsg{list: good})
// A degraded refresh (client retries already exhausted) must not replace the
// good frame or show the unreachable banner — it holds and goes stale.
updated, _ = updated.(Model).Update(listMsg{list: degradedList(time.Now(), 20)})
got := updated.(Model)
view := got.View()

c.Assert(view, qt.Contains, "10")
c.Assert(view, qt.Not(qt.Contains), "20")
c.Assert(view, qt.Not(qt.Contains), "unreachable")
c.Assert(got.consecutiveErrors, qt.Equals, 1)
}

func TestModelShowsPartialOnFirstLoadWithNoPriorFrame(t *testing.T) {
c := qt.New(t)
model := NewModel(context.Background(), &clientStub{}, time.Second, 0)

// No good frame to hold yet, so the first result is shown even if degraded.
updated, _ := model.Update(listMsg{list: degradedList(time.Now(), 30)})
got := updated.(Model)
view := got.View()

c.Assert(view, qt.Contains, "30")
c.Assert(view, qt.Contains, "unreachable")
}

func TestModelShowsInitialListErrorBeforeAnySuccessfulList(t *testing.T) {
c := qt.New(t)
model := NewModel(context.Background(), &clientStub{}, time.Second, 0)
Expand Down
Loading
Loading