diff --git a/.conductor/registry/workflows/actionable.yaml b/.conductor/registry/workflows/actionable.yaml index 6b1720e..483dbf5 100644 --- a/.conductor/registry/workflows/actionable.yaml +++ b/.conductor/registry/workflows/actionable.yaml @@ -176,7 +176,7 @@ agents: # Reads `workflow.input.executor`, validates it is one of the allowed # values, and emits `{ executor, work_item_id }` so the routes can # branch. Unknown executor values surface as an `error` field that - # falls through to workflow_error_gate. + # falls through to abort_run. # # Invariants: # Pre: workflow.input.executor is one of {polyphony, human}. @@ -199,9 +199,9 @@ agents: - to: human_satisfaction_gate when: "{{ executor_router.output.executor == 'human' }}" # Catch-all per M4: an unknown executor value (router emits - # `output.error`) routes to the workflow error gate so a human - # decides retry/abandon rather than `No matching route found`. - - to: workflow_error_gate + # `output.error`) aborts deterministically rather than raising + # `No matching route found`. + - to: abort_run # ── Evidence branch ensure (polyphony leg) ────────────────────────────── # @@ -235,7 +235,7 @@ agents: - "--from-ref" - "{{ workflow.input.from_ref }}" routes: - - to: workflow_error_gate + - to: abort_run when: "{{ ensure_evidence_branch.output.action == 'error' or (ensure_evidence_branch.output.error is defined and ensure_evidence_branch.output.error != '') }}" - to: compose_addendum @@ -260,7 +260,7 @@ agents: # Post: output.facets / output.skills / output.mcps / output.guidance # / output.guidance_present are populated. On failure # output.error and output.error_code are populated and the - # workflow routes to workflow_error_gate; the agent never sees + # workflow routes to abort_run; the agent never sees # a partial envelope. - name: compose_addendum type: script @@ -271,7 +271,7 @@ agents: - "compose-addendum" - "{{ workflow.input.work_item_id }}" routes: - - to: workflow_error_gate + - to: abort_run when: "{{ compose_addendum.output.error is defined and compose_addendum.output.error != '' }}" - to: guidance_loader @@ -491,7 +491,7 @@ agents: - "--repository" - "{{ workflow.input.repository }}" routes: - - to: workflow_error_gate + - to: abort_run when: "{{ open_evidence_pr.output.error is defined and open_evidence_pr.output.error != '' }}" - to: evidence_floor_check @@ -509,14 +509,12 @@ agents: # province. Do not extend with content-quality checks here. # # Routing (per the verb's routing-style envelope contract): - # - error_code populated (pr_not_found / gh_failed) → workflow_error_gate + # - error_code populated (pr_not_found / gh_failed) → abort_run # - passes_floor == false → floor_failed_gate # - passes_floor == true → evidence_reviewer # # The error_code branch fires before the floor branch because a - # transport failure means the floor was not actually evaluated — the - # error gate's retry path lets the operator re-run after fixing the - # underlying gh / network issue. + # transport failure means the floor was not actually evaluated. # # Invariants: # Pre: open_evidence_pr.output.pr_number > 0. @@ -539,7 +537,7 @@ agents: - "--repository-override" - "{{ workflow.input.repository }}" routes: - - to: workflow_error_gate + - to: abort_run when: "{{ evidence_floor_check.output.error_code is defined and evidence_floor_check.output.error_code != null }}" - to: floor_failed_gate when: "{{ evidence_floor_check.output.passes_floor == false }}" @@ -714,12 +712,11 @@ agents: when: "{{ evidence_reviewer.output.decision == 'approve' }}" - to: revise_loop_gate when: "{{ evidence_reviewer.output.decision == 'request_changes' }}" - - to: workflow_error_gate + - to: workflow_abandoned when: "{{ evidence_reviewer.output.decision == 'block' }}" - # Catch-all per M4: unknown / missing decision routes to the - # error gate so the operator decides rather than raising - # `No matching route found`. - - to: workflow_error_gate + # Catch-all per M4: unknown / missing decision abandons the workflow + # rather than raising `No matching route found`. + - to: workflow_abandoned # ── Revise loop gate (polyphony leg) ──────────────────────────────────── # @@ -804,10 +801,10 @@ agents: routes: - to: workflow_completed when: "{{ merge_evidence_pr.output.merged == true }}" - - to: workflow_error_gate + - to: workflow_abandoned when: "{{ merge_evidence_pr.output.merged == false }}" - # Catch-all per M4: missing/malformed merged field -> error gate. - - to: workflow_error_gate + # Catch-all per M4: missing/malformed merged field -> abandoned. + - to: workflow_abandoned # ── Human satisfaction gate (human leg) ──────────────────────────────── # @@ -851,71 +848,26 @@ agents: value: abandoned route: workflow_abandoned - # ── Workflow error gate (shared) ─────────────────────────────────────── + # ── Terminal abort (deterministic workflow error) ────────────────────── # - # Fires on any error path — verb error envelope, reviewer block, or - # merge failure. Per P7 the retry option has no automatic cap; the - # human decides when to abandon. Retry re-enters at the executor - # router so a flipped-mind operator can change the executor input - # mid-recovery. - # - # Per M3 (StrictUndefined), every reference to a verb's output is - # guarded with `is defined` because at most one of the upstream verbs - # has actually populated an error envelope when this gate fires. - # - # Invariants: - # Pre: At least one upstream step has signaled an error. - # Post: Operator chose retry (re-enter executor_router) or - # abandon (workflow_abandoned). - - name: workflow_error_gate - type: human_gate - description: An error occurred — operator decides retry or abandon - prompt: | - ## ❌ Actionable workflow error — item #{{ workflow.input.work_item_id }} - - An upstream step signaled an error or unrecoverable state. - - {% if executor_router is defined and executor_router.output.error is defined and executor_router.output.error != '' -%} - **Stage:** executor_router - **Detail:** {{ executor_router.output.error }} - {%- elif ensure_evidence_branch is defined and ensure_evidence_branch.output.error is defined and ensure_evidence_branch.output.error != '' -%} - **Stage:** ensure_evidence_branch - **Detail:** {{ ensure_evidence_branch.output.error }} - {%- elif compose_addendum is defined and compose_addendum.output.error is defined and compose_addendum.output.error != '' -%} - **Stage:** compose_addendum - **Detail:** {{ compose_addendum.output.error }}{% if compose_addendum.output.error_code is defined %} ({{ compose_addendum.output.error_code }}){% endif %} - {%- elif open_evidence_pr is defined and open_evidence_pr.output.error is defined and open_evidence_pr.output.error != '' -%} - **Stage:** open_evidence_pr - **Detail:** {{ open_evidence_pr.output.error }} - {%- elif evidence_floor_check is defined and evidence_floor_check.output.error_code is defined and evidence_floor_check.output.error_code != null -%} - **Stage:** evidence_floor_check - **Detail:** [{{ evidence_floor_check.output.error_code }}] {{ evidence_floor_check.output.error_message }} - {%- elif evidence_reviewer is defined and evidence_reviewer.output.decision == 'block' -%} - **Stage:** evidence_reviewer - **Detail:** {{ evidence_reviewer.output.comment }} - {%- elif merge_evidence_pr is defined -%} - **Stage:** merge_evidence_pr - **Detail:** {% if merge_evidence_pr.output.error is defined %}{{ merge_evidence_pr.output.error }}{% else %}polyphony pr merge-evidence-pr reported merged=false (no error message){% endif %} - {%- else -%} - **Stage:** unknown - **Detail:** Routed to error gate without an explicit error envelope. - {%- endif %} - - Choose an action: - - - **Retry** — re-enter at the executor router. Both - `branch ensure-evidence-branch` and `pr open-evidence-pr` are - idempotent (existing branch / PR is reused). Safe regardless - of which stage failed. - - **Abandon** — terminate the workflow without satisfying the - item. - options: - - label: "🔄 Retry" - value: retry - route: executor_router - - label: "🛑 Abandon" - value: abandon - route: workflow_abandoned + # Invokes abort-run.ps1, which POSTs to conductor's /api/stop endpoint. + # Used for infrastructure errors that previously paused for human retry. + - name: abort_run + type: script + description: Deterministic actionable workflow abort — POSTs /api/stop to halt the run cleanly + command: pwsh + args: + - "-NoProfile" + - "-File" + - "{{ workflow.dir }}/../scripts/abort-run.ps1" + - "-Reason" + - "workflow-error" + - "-WorkItemId" + - "{{ workflow.input.work_item_id }}" + - "-Stage" + - "actionable" + routes: + - to: $end # ── Terminal: workflow completed ─────────────────────────────────────── # diff --git a/.conductor/registry/workflows/ado-pr.yaml b/.conductor/registry/workflows/ado-pr.yaml index 4ed1968..bf15dd3 100644 --- a/.conductor/registry/workflows/ado-pr.yaml +++ b/.conductor/registry/workflows/ado-pr.yaml @@ -484,7 +484,8 @@ agents: - "--pr-number" - "{{ workflow.input.pr_number }}" routes: - - to: poll_error_gate + # on_error: auto-abort (AB#3257 — trivial gate removed; retry via conductor on_error: when Phase 1 ships) + - to: abort_run when: "{{ poll_status.output.error is defined and poll_status.output.error }}" - to: closed_unmerged_emitter when: "{{ poll_status.output.route == 'abort_unmerged' }}" @@ -502,34 +503,6 @@ agents: # Catch-all per M4 — defensive deferral to analyzer. - to: pr_feedback_analyzer - # ── Poll error gate ─────────────────────────────────────────────────── - - name: poll_error_gate - type: human_gate - prompt: | - ## ⚠️ PR Poll Failed — ADO PR #{{ workflow.input.pr_number }} - - Could not read the PR review state for ADO PR - **#{{ workflow.input.pr_number }}**. - - **State:** `{{ poll_status.output.state if poll_status.output.state is defined else "unknown" }}` - **Error:** {{ poll_status.output.error if poll_status.output.error is defined else "Unexpected state — check polyphony pr poll-status-ado output schema" }} - - Common causes: missing/invalid PAT, ADO timeout, transient API - failure. Retry is safe (single read). - - --- - - Choose an action: - - **Retry** — re-poll the PR - - **Abort** — halt the entire run (terminates the conductor process) - options: - - label: "🔄 Retry" - value: retry - route: poll_status - - label: "🛑 Abort" - value: abort - route: abort_run - # ── Already-merged emitter ──────────────────────────────────────────── # # Operator completed the PR through the ADO UI (or it was completed via diff --git a/.conductor/registry/workflows/github-pr.yaml b/.conductor/registry/workflows/github-pr.yaml index e35b0bf..f720628 100644 --- a/.conductor/registry/workflows/github-pr.yaml +++ b/.conductor/registry/workflows/github-pr.yaml @@ -385,7 +385,8 @@ agents: } $json.Trim() routes: - - to: poll_error_gate + # on_error: auto-abort (AB#3257 — trivial gate removed; retry via conductor on_error: when Phase 1 ships) + - to: abort_run when: "{{ poll_status.output.error is defined and poll_status.output.error }}" - to: closed_unmerged_emitter when: "{{ poll_status.output.route == 'abort_unmerged' }}" @@ -403,31 +404,6 @@ agents: # Catch-all per M4 — defensive deferral to analyzer. - to: pr_feedback_analyzer - # ── Poll error gate ─────────────────────────────────────────────────── - - name: poll_error_gate - type: human_gate - prompt: | - ## ⚠️ PR Poll Failed — PR #{{ workflow.input.pr_number }} - - Could not read the PR review state for PR - **#{{ workflow.input.pr_number }}**. - - **State:** `{{ poll_status.output.state if poll_status.output.state is defined else "unknown" }}` - **Error:** {{ poll_status.output.error if poll_status.output.error is defined else "Unexpected state — check polyphony pr poll-status output schema" }} - - --- - - Choose an action: - - **Retry** — re-poll the PR - - **Abort** — halt the entire run (terminates the conductor process) - options: - - label: "🔄 Retry" - value: retry - route: poll_status - - label: "🛑 Abort" - value: abort - route: abort_run - # ── Already-merged emitter ──────────────────────────────────────────── # # Operator-merged the PR through the GitHub UI. Emit the canonical diff --git a/.conductor/registry/workflows/implement-merge-group.yaml b/.conductor/registry/workflows/implement-merge-group.yaml index 4e71f18..77b95b7 100644 --- a/.conductor/registry/workflows/implement-merge-group.yaml +++ b/.conductor/registry/workflows/implement-merge-group.yaml @@ -244,7 +244,8 @@ agents: # twig sync). Without this route the catch-all below silently # treats errors as "done with tasks" and proceeds toward MG # closure — masking real failures and leaving items unstarted. - - to: root_router_error_gate + # on_error: auto-abort (AB#3257 — trivial gate removed) + - to: abort_run when: "{{ root_router.output.action == 'error' or (root_router.output.error is defined and root_router.output.error != '') }}" # Catch-all per M4: unknown action from next-impl should fail # safely to dependency_check (the natural "we're done with tasks" @@ -925,7 +926,7 @@ agents: # commit list to an operator via # squash_coverage_mismatch_gate. # - 'error' → verb itself failed (git invocation, ref resolution); - # surface to squash_coverage_error_gate for retry/abort. + # auto-abort via abort_run (AB#3257). # # Invariants: # Pre: Squash committed to `origin/mg/{root}_{path}`; local ref @@ -958,7 +959,8 @@ agents: to: delete_impl_branch - when: "{{ assert_impl_pr_coverage.output.action == 'mismatch' }}" to: squash_coverage_mismatch_gate - - to: squash_coverage_error_gate + # on_error: auto-abort (AB#3257 — trivial gate removed) + - to: abort_run # ── Coverage mismatch gate (AB#3211) ────────────────────────────────── # @@ -1023,46 +1025,6 @@ agents: value: force_accept route: delete_impl_branch - # ── Coverage assertion error gate (AB#3211) ─────────────────────────── - # - # Fires when `assert-impl-pr-coverage` itself returns action='error' - # (typically a git invocation failure: missing ref, transient network). - # Distinct from the mismatch gate — this is a tool failure, not a - # data-loss incident. - - name: squash_coverage_error_gate - type: human_gate - description: assert-impl-pr-coverage tool error — operator decides retry or abort - prompt: | - ## ⚠️ Coverage assertion failed to run — task `{{ root_router.output.root_id }}` - - `polyphony pr assert-impl-pr-coverage` returned an error envelope - (the verb itself could not complete; this is a tool failure, not - a coverage mismatch). - - **Detail:** {{ assert_impl_pr_coverage.output.error | default('(no error message)') }} - - | Field | Value | - |---|---| - | Impl ref | `{{ assert_impl_pr_coverage.output.impl_ref | default('(unresolved)') }}` | - | MG ref | `{{ assert_impl_pr_coverage.output.mg_ref | default('(unresolved)') }}` | - - Common causes: stale local refs (re-fetch may help), missing - `origin/impl/{root}-{item}` (impl branch deleted prematurely), - transient git failure. - - Choose an action: - - - **Retry** — re-run the fetch + assertion. Use after manual - recovery (e.g. `git fetch origin`). - - **Abort** — END this workflow without deleting the impl branch. - options: - - label: "🔄 Retry (re-fetch + re-assert)" - value: retry - route: fetch_for_coverage_assertion - - label: "🛑 Abort" - value: abort - route: abort_run - # ── Delete impl branch (deferred from impl_pr_merge per AB#3211) ────── # # Performs the branch deletion that `merge-impl-pr` would normally do @@ -1222,60 +1184,6 @@ agents: routes: - to: root_router - # ── Primary router error gate ───────────────────────────────────────── - # - # Fires when `branch next-impl` (root_router) emits action='error' or - # a non-empty `error` field. Without an explicit route, the catch-all - # below would silently treat the error as "all items done" and proceed - # toward MG closure — masking the failure and skipping every remaining - # task. - # - # AB#3126 root cause coupling: next-impl now flushes its - # begin_implementation transition via a post-state `twig sync` before - # returning. Network/auth blips during that sync surface here as - # action='error'. - # - # Operator options: - # - Retry: re-enter root_router. `next-impl` is idempotent — - # transitions a Doing item to Doing is a no-op for the validator - # (begin_implementation accepts both Proposed and InProgress), and - # the second `twig sync` will retry the push. - # - Abort: $end. The MG remains in its current state; no items are - # advanced to Done. Re-running the workflow later resumes from the - # same point. - # - # Invariants: - # Pre: root_router emitted action='error' or a populated `error`. - # Post: Operator has retried (route to root_router) or aborted ($end). - - name: root_router_error_gate - type: human_gate - description: next-impl signaled error — operator decides retry or abort - prompt: | - ## ❌ next-impl error — MG `mg/{{ workflow.input.root_id }}_{{ workflow.input.mg_path }}` - - `polyphony branch next-impl` returned an error envelope. The most - common cause is a transient ADO failure during the post-state - `twig sync` that flushes the `begin_implementation` transition - (AB#3126). - - **Detail:** {{ root_router.output.error | default('(no error message)') }} - - Choose an action: - - - **Retry** — re-enter `root_router`. `next-impl` is idempotent - (selecting the same task again, validating `begin_implementation` - on a Doing item, and re-flushing all are no-ops or safe retries). - - **Abort** — exit without advancing any further items. The MG - remains in its current state; a later workflow re-entry resumes - from the same point. - options: - - label: "🔄 Retry (re-run next-impl)" - value: retry - route: root_router - - label: "🛑 Abort" - value: abort - route: abort_run - # ── Dependency check ────────────────────────────────────────────────── # # Checks ADO predecessor links to determine if the MG is blocked by diff --git a/.conductor/registry/workflows/plan-level.yaml b/.conductor/registry/workflows/plan-level.yaml index 910caf0..29c6c7c 100644 --- a/.conductor/registry/workflows/plan-level.yaml +++ b/.conductor/registry/workflows/plan-level.yaml @@ -315,7 +315,7 @@ agents: routes: - to: root_fallback_gate when: "{{ root_resolver.output.fallback_required == true }}" - - to: root_resolver_error_gate + - to: abort_run when: "{{ root_resolver.output.error is defined and root_resolver.output.error }}" - to: type_loader @@ -343,26 +343,6 @@ agents: value: abort route: abort_run - # ── Root resolver error gate ────────────────────────────────────────── - - name: root_resolver_error_gate - type: human_gate - prompt: | - ## ⚠️ Root Resolution Failed - - Could not resolve the run root for work item - **{{ workflow.input.work_item_id }}**. - - **Error:** {{ root_resolver.output.error if root_resolver.output.error is defined else "Unknown error — check that the work item exists in ADO." }} - - --- - - Choose an action: - - **Abort** — halt the entire run (terminates the conductor process) - options: - - label: "🛑 Abort" - value: abort - route: abort_run - # ── Type loader ────────────────────────────────────────────────────────── # # Loads type-specific context from .polyphony-config/work-item-types/. Outputs @@ -379,27 +359,7 @@ agents: routes: - to: ancestor_chain when: "{{ type_loader.output.type is defined and type_loader.output.type != '' }}" - - to: type_loader_error_gate - - # ── Type loader error gate ─────────────────────────────────────────────── - - name: type_loader_error_gate - type: human_gate - prompt: | - ## ⚠️ Type Context Load Failed - - Planning for work item **{{ workflow.input.work_item_id }}** cannot continue - because the type context could not be loaded. - - **Error:** {{ type_loader.output.error if type_loader.output.error is defined else "Unknown error — check that .polyphony-config/work-item-types/ is configured and the work item exists in ADO." }} - - --- - - Choose an action: - - **Abort** — halt the entire run (terminates the conductor process) - options: - - label: "🛑 Abort" - value: abort - route: abort_run + - to: abort_run # ── Ancestor chain derivation ───────────────────────────────────────── # @@ -421,34 +381,10 @@ agents: - "--item-id" - "{{ workflow.input.work_item_id }}" routes: - - to: ancestor_chain_error_gate + - to: abort_run when: "{{ ancestor_chain.output.error is defined and ancestor_chain.output.error }}" - to: state_detector - # ── Ancestor chain error gate ───────────────────────────────────────── - - name: ancestor_chain_error_gate - type: human_gate - prompt: | - ## ⚠️ Ancestor Chain Derivation Failed - - Could not derive the plan-tree ancestor chain for work item - **{{ workflow.input.work_item_id }}** under root - **{{ root_resolver.output.resolved_root_id }}**. - - **Error:** {{ ancestor_chain.output.error if ancestor_chain.output.error is defined else "Unknown error" }} - - Common causes: work item is not actually a descendant of the root, - the parent chain is broken, or a cycle was detected. - - --- - - Choose an action: - - **Abort** — halt the entire run (terminates the conductor process) - options: - - label: "🛑 Abort" - value: abort - route: abort_run - # ── State detector ──────────────────────────────────────────────────── # # The single oracle for "where is this plan tree branch right now?" @@ -492,7 +428,7 @@ agents: - "--repository-override" - "{{ workflow.input.repository }}" routes: - - to: state_detector_error_gate + - to: abort_run when: "{{ state_detector.output.state == 'error' }}" - to: child_router when: "{{ state_detector.output.state == 'complete' }}" @@ -511,31 +447,8 @@ agents: when: "{{ state_detector.output.state == 'stale_generation' }}" - to: architect when: "{{ state_detector.output.state == 'not_started' }}" - # Catch-all per M4: unexpected state value should surface as a gate, - # not raise mid-run "no matching route". - - to: state_detector_error_gate - - # ── State detector error gate ───────────────────────────────────────── - - name: state_detector_error_gate - type: human_gate - prompt: | - ## ⚠️ Plan State Detection Failed - - Could not detect plan-tree state for work item - **{{ workflow.input.work_item_id }}** under root - **{{ root_resolver.output.resolved_root_id }}**. - - **State:** `{{ state_detector.output.state if state_detector.output.state is defined else "unknown" }}` - **Error:** {{ state_detector.output.error if state_detector.output.error is defined else "Unexpected state value — check polyphony plan detect-state output schema" }} - - --- - - Choose an action: - - **Abort** — halt the entire run (terminates the conductor process) - options: - - label: "🛑 Abort" - value: abort - route: abort_run + # Catch-all per M4: unexpected state value; abort deterministically (AB#3257). + - to: abort_run # ── Closed-unmerged gate ─────────────────────────────────────────────── # @@ -1036,34 +949,10 @@ agents: - "--children-json" - "{{ architect.output.children | tojson }}" routes: - - to: write_plan_error_gate + - to: abort_run when: "{{ write_plan.output.error is defined and write_plan.output.error }}" - to: commit_and_push - # ── Write plan error gate ────────────────────────────────────────────── - - name: write_plan_error_gate - type: human_gate - prompt: | - ## ⚠️ Plan Write Failed - - Could not write the plan for work item - **{{ workflow.input.work_item_id }}**. - - **Error:** {{ write_plan.output.error if write_plan.output.error is defined else "Unknown error" }} - - --- - - Choose an action: - - **Retry** — re-run `polyphony plan write-plan` (idempotent) - - **Abort** — halt the entire run (terminates the conductor process) - options: - - label: "🔁 Retry" - value: retry - route: write_plan - - label: "🛑 Abort" - value: abort - route: abort_run - # ── Ensure plan branch ──────────────────────────────────────────────── # # Idempotently creates (or fast-forwards to) the plan branch and checks @@ -1090,35 +979,10 @@ agents: - "--parent-item-id" - "{{ ancestor_chain.output.parent_item_id | default(0) }}" routes: - - to: ensure_plan_branch_error_gate + - to: abort_run when: "{{ ensure_plan_branch.output.error is defined and ensure_plan_branch.output.error }}" - to: write_plan - # ── Ensure plan branch error gate ───────────────────────────────────── - - name: ensure_plan_branch_error_gate - type: human_gate - prompt: | - ## ⚠️ Plan Branch Setup Failed - - Could not create or check out the plan branch for work item - **{{ workflow.input.work_item_id }}**. - - **Error:** {{ ensure_plan_branch.output.error if ensure_plan_branch.output.error is defined else "Unknown error" }} - - --- - - Choose an action: - - **Retry** — re-run `polyphony branch ensure-plan` (idempotent; - recovers from transient git-network failures) - - **Abort** — halt the entire run (terminates the conductor process) - options: - - label: "🔁 Retry" - value: retry - route: ensure_plan_branch - - label: "🛑 Abort" - value: abort - route: abort_run - # ── Commit and push ──────────────────────────────────────────────────── # # Stages plans/plan-{item_id}.md, commits with a deterministic message, @@ -1148,7 +1012,7 @@ agents: - "--paths" - "plans/plan-{{ workflow.input.work_item_id }}.md,plans/plan-{{ workflow.input.work_item_id }}.children.json" routes: - - to: commit_and_push_error_gate + - to: abort_run when: "{{ commit_and_push.output.error is defined and commit_and_push.output.error }}" # v2.1: route through pr_open_platform_router so the same # commit-and-push leg can feed either open_plan_pr (github) or @@ -1156,33 +1020,6 @@ agents: # no-op fall-through. - to: pr_open_platform_router - # ── Commit and push error gate ──────────────────────────────────────── - - name: commit_and_push_error_gate - type: human_gate - prompt: | - ## ⚠️ Plan Commit/Push Failed - - Could not commit and push the plan for work item - **{{ workflow.input.work_item_id }}**. - - **Branch:** `{{ ensure_plan_branch.output.branch }}` - **Error:** {{ commit_and_push.output.error if commit_and_push.output.error is defined else "Unknown error" }} - - --- - - Choose an action: - - **Retry** — re-run `polyphony plan commit-and-push` (idempotent; - emits no_changes when nothing is staged, recovers from transient - push-network failures) - - **Abort** — halt the entire run (terminates the conductor process) - options: - - label: "🔁 Retry" - value: retry - route: commit_and_push - - label: "🛑 Abort" - value: abort - route: abort_run - # ── Open plan PR ─────────────────────────────────────────────────────── # # Opens the plan PR (or returns the existing one) and embeds the @@ -1212,41 +1049,10 @@ agents: - "--ancestor-ids" - "{{ ancestor_chain.output.ancestor_ids }}" routes: - - to: open_plan_pr_error_gate + - to: abort_run when: "{{ open_plan_pr.output.error is defined and open_plan_pr.output.error }}" - to: plan_reviewer - # ── Open plan PR error gate ──────────────────────────────────────────── - - name: open_plan_pr_error_gate - type: human_gate - prompt: | - ## ⚠️ Plan PR Open Failed - - Could not open the plan PR for work item - **{{ workflow.input.work_item_id }}**. - - **Branch:** `{{ ensure_plan_branch.output.branch }}` - **Error:** {{ open_plan_pr.output.error if open_plan_pr.output.error is defined else "Unknown error" }} - - Common causes: stale ancestor snapshot on an existing PR (operator - must close/recreate); gh authentication missing; network timeout - (gh rate-limit, GitHub API hiccup, DNS); transient gh process hang. - - --- - - Choose an action: - - **Retry** — re-run `polyphony pr open-plan-pr` (idempotent — reuses - an existing PR with a matching ancestor snapshot; safe for - transient gh/network failures) - - **Abort** — halt the entire run (terminates the conductor process) - options: - - label: "🔁 Retry" - value: retry - route: open_plan_pr - - label: "🛑 Abort" - value: abort - route: abort_run - # ── Plan reviewer (LLM advisory — sentiment-driven model) ───────────── # # Single LLM reviewer per design doc Q2. Produces a critique using a @@ -1509,7 +1315,7 @@ agents: - "{{ open_plan_pr.output.pr_url if (open_plan_pr is defined and open_plan_pr.output.pr_url is defined) else state_detector.output.pr_url }}" - "--include-metadata" routes: - - to: poll_error_gate + - to: abort_run when: "{{ poll_status.output.error is defined and poll_status.output.error }}" # Terminal: operator merged through the platform UI. - to: seeder @@ -1531,31 +1337,6 @@ agents: # Catch-all per M4 — defensive; treat unknown as deferral to analyzer. - to: pr_feedback_analyzer - # ── Poll error gate ─────────────────────────────────────────────────── - - name: poll_error_gate - type: human_gate - prompt: | - ## ⚠️ Plan PR Poll Failed - - Could not read the plan PR review state for work item - **{{ workflow.input.work_item_id }}**. - - **State:** `{{ poll_status_ado.output.state if (poll_status_ado is defined and poll_status_ado.output.state is defined) else (poll_status.output.state if (poll_status is defined and poll_status.output.state is defined) else "unknown") }}` - **Error:** {{ poll_status_ado.output.error if (poll_status_ado is defined and poll_status_ado.output.error is defined) else (poll_status.output.error if (poll_status is defined and poll_status.output.error is defined) else "Unexpected state — check polyphony pr poll-status / poll-status-ado output schema") }} - - --- - - Choose an action: - - **Retry** — re-poll the PR - - **Abort** — halt the entire run (terminates the conductor process) - options: - - label: "🔄 Retry" - value: retry - route: pr_poll_platform_router - - label: "🛑 Abort" - value: abort - route: abort_run - # ── Extract parent patch (P8d wiring) ────────────────────────────────── # # Reached when either (a) state_detector returns parent_change_pending — @@ -2346,7 +2127,8 @@ agents: routes: - to: stale_generation_gate when: "{{ merge_plan_pr.output.error_code is defined and merge_plan_pr.output.error_code == 'stale_generation' }}" - - to: merge_error_gate + # on_error: auto-abort (AB#3257 — trivial gate removed; cause info in run event log) + - to: abort_run when: "{{ merge_plan_pr.output.error_code is defined and merge_plan_pr.output.error_code != '' and merge_plan_pr.output.error_code != 'stale_generation' }}" - to: extract_renegotiation_flag @@ -2473,99 +2255,6 @@ agents: routes: - to: merge_plan_pr - # ── Merge error gate ────────────────────────────────────────────────── - # - # Cause-aware: surfaces the verb's full diagnostic message (which is - # already specific — e.g. worktree_dirty includes the porcelain entry - # count and remediation hint) and then a per-error-code "Next step" - # block tailored to what the operator actually needs to do before - # retrying. Falls back to a generic message for codes not yet enumerated - # so a new error_code never silently regresses to "no guidance shown". - - name: merge_error_gate - type: human_gate - prompt: | - ## ⚠️ Plan PR Merge Failed - - Could not merge the plan PR for work item - **{{ workflow.input.work_item_id }}**. - - **Error code:** `{{ merge_plan_pr.output.error_code }}` - - {% if merge_plan_pr.output.error is defined and merge_plan_pr.output.error %} - **Details:** {{ merge_plan_pr.output.error }} - {% endif %} - - --- - - **Next step:** - {% if merge_plan_pr.output.error_code == 'worktree_dirty' %} - The worktree has uncommitted changes. From the run's worktree - directory, run `git status` to see what's there, then commit, - stash, or discard the changes before retrying. Common culprit: - stray scratch files left behind by an agent tool call (look for - orphan `.txt` / `.json` files at the repo root). - {% elif merge_plan_pr.output.error_code == 'lock_held' %} - Another polyphony run holds the same-root run lock. Check for an - in-flight workflow on the same root; wait for it to finish, or - use `polyphony lock force-release` if it's a stale lock. - {% elif merge_plan_pr.output.error_code == 'lock_stale' %} - The run lock is older than the staleness threshold. Run - `polyphony lock force-release --root-id ` to clear it, then - retry. - {% elif merge_plan_pr.output.error_code == 'manifest_push_rejected' %} - A concurrent run pushed to the manifest branch first. Retry — - the verb will refetch and re-attempt; usually succeeds on the - next try. - {% elif merge_plan_pr.output.error_code in ['repo_not_resolved', 'config_error'] %} - Configuration / environment issue. Check that gh is authenticated - to the right user (`gh auth status`), the repo slug resolves, and - the run's `.conductor` config is valid. Retry once fixed. - {% elif merge_plan_pr.output.error_code == 'pr_not_found' %} - The PR number on record no longer exists on the platform. - Someone may have closed or deleted it manually. Aborting and - restarting plan-level for this work item is usually safer than - retrying. - {% elif merge_plan_pr.output.error_code in ['head_ref_mismatch', 'base_ref_mismatch'] %} - The PR's head or base branch doesn't match what the manifest - expects. Inspect the PR on the platform — it may have been - retargeted manually. Aborting and re-opening the PR via - plan-level is safer than retrying. - {% elif merge_plan_pr.output.error_code == 'pr_state_unmergeable' %} - The platform reports the PR is not in a mergeable state - (conflicts, failing required checks, blocked by branch protection). - Resolve the underlying block on the PR page, then retry. - {% elif merge_plan_pr.output.error_code == 'merge_failed' %} - The platform's merge call returned an error. Check the PR page - for the platform-specific reason, address it, then retry. - {% elif merge_plan_pr.output.error_code in ['missing_merge_commit', 'ledger_conflict'] %} - Post-merge bookkeeping failed. Manifest may be partially - updated. Inspect the manifest branch before retrying. - {% elif merge_plan_pr.output.error_code == 'validation_blocked' %} - A pre-merge validation check rejected the merge. See **Details** - above for the specific rule that blocked it; remediate, then - retry. - {% elif merge_plan_pr.output.error_code == 'internal_error' %} - Unexpected verb-internal failure. Capture the run's event log - and file a polyphony issue. Retry is safe but may reproduce. - {% else %} - No targeted guidance for this error code. See **Details** above - and inspect the run's event log for context. Retry is safe but - may reproduce. - {% endif %} - - --- - - Choose an action: - - **Retry** — re-attempt the merge - - **Abort** — halt the entire run (terminates the conductor process) - options: - - label: "🔄 Retry" - value: retry - route: merge_plan_pr - - label: "🛑 Abort" - value: abort - route: abort_run - # ── Seeder ───────────────────────────────────────────────────────────── # # Idempotent seeder. Consumes architect.output.children and reconciles @@ -2625,66 +2314,10 @@ agents: - "--children-from-ref" - "origin/main" routes: - - to: seeder_error_gate + - to: child_router when: "{{ seeder.output.error_count is defined and seeder.output.error_count > 0 }}" - to: child_router - # ── Seeder error gate (AB#3067) ───────────────────────────────────────── - # - # `polyphony plan seed-children` always exits 0 and carries per-child - # failures in `errors[]`. This gate fires when `error_count > 0` so the - # operator sees the failure rather than continuing into child_router with - # a half-seeded tree (which silently terminates the root blocked). - # - # Retry routes back to `seeder` because the verb is idempotent — it - # reconciles against existing children via marker-aware matching, so - # re-running after fixing the upstream cause (e.g. resolving a duplicate - # title) is safe. - - name: seeder_error_gate - type: human_gate - prompt: | - ## ⚠️ Seeder Reported Errors - - `polyphony plan seed-children` for work item - **{{ workflow.input.work_item_id }}** returned - `error_count = {{ seeder.output.error_count if seeder.output.error_count is defined else 'unknown' }}`. - - The seeder is idempotent — it reconciles architect-emitted - children against existing children of the parent work item. - Common causes: - - Duplicate child title under the parent (twig dedup conflict). - - Invalid or inaccessible parent ID at the platform layer. - - Twig workspace misconfiguration. - - **Errors (truncated to a first line per item):** - {%- if seeder.output.errors is defined %} - {%- for e in seeder.output.errors %} - - `{{ e.child_id if e.child_id is defined and e.child_id else (e.title if e.title is defined and e.title else '(unknown)') }}` — {{ e.error if e.error is defined else '(no detail)' }} - {%- endfor %} - {%- else %} - _(no per-child error envelopes — see verb stderr in run logs)_ - {%- endif %} - - --- - - Choose an action: - - **Retry** — re-run the seeder (idempotent; safe after fixing the - upstream cause, e.g. resolving a duplicate title) - - **Continue** — proceed to `child_router` anyway (a partial seed is - usually still useful; the next driver iteration will revisit any - children that did get created) - - **Abort** — halt the entire run (terminates the conductor process) - options: - - label: "🔁 Retry" - value: retry - route: seeder - - label: "▶️ Continue" - value: continue - route: child_router - - label: "🛑 Abort" - value: abort - route: abort_run - # ── Child router ───────────────────────────────────────────────────────── # # Discovers plannable children of the current work item. Filters by @@ -2829,41 +2462,10 @@ agents: - "--ancestor-ids" - "{{ ancestor_chain.output.ancestor_ids }}" routes: - - to: open_plan_pr_ado_error_gate + - to: abort_run when: "{{ open_plan_pr_ado.output.error is defined and open_plan_pr_ado.output.error }}" - to: plan_reviewer - # ── Open plan PR (ADO) error gate ───────────────────────────────────── - - name: open_plan_pr_ado_error_gate - type: human_gate - prompt: | - ## ⚠️ ADO Plan PR Open Failed - - Could not open (or reuse) the plan PR on Azure DevOps for work item - **{{ workflow.input.work_item_id }}**. - - **Branch:** `{{ ensure_plan_branch.output.branch }}` - **Error:** {{ open_plan_pr_ado.output.error if open_plan_pr_ado.output.error is defined else "Unknown error" }} - - Common causes: stale ancestor snapshot on an existing PR (operator - must abandon/recreate); ADO PAT missing or insufficient scope; - network timeout; repository identifier mismatch. - - --- - - Choose an action: - - **Retry** — re-run `polyphony pr open-plan-pr-ado` (idempotent — - reuses an existing PR with a matching ancestor snapshot; safe for - transient ADO API / network failures) - - **Abort** — halt the entire run (terminates the conductor process) - options: - - label: "🔁 Retry" - value: retry - route: open_plan_pr_ado - - label: "🛑 Abort" - value: abort - route: abort_run - # ── Plan reviewer poster (ADO) ──────────────────────────────────────── # # Posts the comment body produced by plan_reviewer to the ADO PR via @@ -2939,7 +2541,7 @@ agents: - "--include-metadata" - "--allow-any-approval-vote" routes: - - to: poll_error_gate + - to: abort_run when: "{{ poll_status_ado.output.error is defined and poll_status_ado.output.error }}" - to: seeder when: "{{ poll_status_ado.output.route == 'already_merged' }}" @@ -2988,7 +2590,7 @@ agents: # use a dedicated gate. - to: merge_plan_pr_ado_stale_gate when: "{{ merge_plan_pr_ado.output.error_code is defined and merge_plan_pr_ado.output.error_code == 'stale_generation' }}" - - to: merge_plan_pr_ado_error_gate + - to: abort_run when: "{{ merge_plan_pr_ado.output.error_code is defined and merge_plan_pr_ado.output.error_code != '' and merge_plan_pr_ado.output.error_code != 'stale_generation' }}" - to: restack_remedy @@ -3033,34 +2635,6 @@ agents: routes: - to: merge_plan_pr_ado - # ── Merge plan PR (ADO) error gate ──────────────────────────────────── - - name: merge_plan_pr_ado_error_gate - type: human_gate - prompt: | - ## ⚠️ ADO Plan PR Merge Failed - - Could not merge the plan PR for work item - **{{ workflow.input.work_item_id }}**. - - **Error code:** `{{ merge_plan_pr_ado.output.error_code }}` - - Common causes: lock acquisition failed (another run holds the - same-root lock); push conflict on manifest branch; ADO PAT auth; - stale head SHA; reviewer requirements not met. - - --- - - Choose an action: - - **Retry** — re-attempt the merge - - **Abort** — halt the entire run (terminates the conductor process) - options: - - label: "🔄 Retry" - value: retry - route: merge_plan_pr_ado - - label: "🛑 Abort" - value: abort - route: abort_run - # ── Plan children summary gate ─────────────────────────────────────────── # # Fires only when one or more child planning sub-workflows failed. diff --git a/.conductor/registry/workflows/restack-remedy.yaml b/.conductor/registry/workflows/restack-remedy.yaml index da54c0b..ead9f99 100644 --- a/.conductor/registry/workflows/restack-remedy.yaml +++ b/.conductor/registry/workflows/restack-remedy.yaml @@ -99,9 +99,9 @@ agents: - "--repository-override" - "{{ workflow.input.repository }}" routes: - # Verb-level error (manifest read/parse, repo slug, etc.) — surface - # to the restack-error gate so the operator can decide. - - to: classify_error_gate + # Verb-level error (manifest read/parse, repo slug, etc.) — auto-skip. + # on_error: auto-skip (AB#3257 — trivial gate removed; routes to $end on classify error) + - to: $end when: "{{ classify.output.error_code is defined and classify.output.error_code != null and classify.output.error_code != '' }}" # Nothing stale — done. - to: $end @@ -109,35 +109,6 @@ agents: # At least one stale entry — fan out remedies. - to: remedy_group - # ── Classify error gate ─────────────────────────────────────────────── - - name: classify_error_gate - type: human_gate - prompt: | - ## ⚠️ Restack Classify Failed - - `polyphony plan classify-stale-descendants` could not enumerate - stale descendants for root **{{ workflow.input.root_id }}**. - - - **Error code:** `{{ classify.output.error_code }}` - - **Error:** {{ classify.output.error | default('(none)') }} - - Common causes: manifest missing or malformed in the per-root state - directory (`/polyphony/{{ workflow.input.root_id }}/run.yaml`), - repo slug not resolvable from `origin`, twig cache stale. - - --- - - Choose an action: - - **Retry** — re-run the classifier (after fixing the cause). - - **Skip restack** — exit without remediating descendant PRs. - options: - - label: "🔄 Retry classify" - value: retry - route: classify - - label: "⏭️ Skip restack" - value: skip - route: $end - # ── Restack summary gate ────────────────────────────────────────────── # # Fires only when remedy_group.errors is non-empty (i.e. at least one diff --git a/docs/projects/on-error-migration-inventory.md b/docs/projects/on-error-migration-inventory.md index 7ee9156..0f2b35f 100644 --- a/docs/projects/on-error-migration-inventory.md +++ b/docs/projects/on-error-migration-inventory.md @@ -1,6 +1,6 @@ # on_error Migration Inventory -**Status:** Read-only inventory; informs AB#3257 sequencing +**Status:** Migration complete as of #528; conductor on_error: retry support pending AB#3257 Phase 1 **Owner:** polyphony-internal architecture **Work item:** AB#3257 — Failure-mode gate elimination **Companion:** [conductor on_error brief](https://github.com/PolyphonyRequiem/conductor/blob/02eace858dbaf2d1d22598a6e7debdf2d4c8a439/docs/projects/error-routing/on-error-routing.brainstorm.md)