From 9c1e1fb19c0a9180d2acecc6dfdeab7687a21e26 Mon Sep 17 00:00:00 2001 From: sungju Yun Date: Tue, 30 Jun 2026 15:43:15 +0900 Subject: [PATCH] feat(gate): emit self JUnit + reclassify 15 doc/scaffold test_refs as evidence_refs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cladding's UNVERIFIED_AC detector verifies that each done AC's test_refs actually RAN and PASSED (via a JUnit report) — but cladding never emitted one for itself, so on its own gate the check degraded to existence-only and gave ZERO self-benefit. Enabling emission first surfaced 21 warnings: the test_refs lists of 15 ACs across 10 features held 21 refs pointing at things that are not executable tests (docs, underscore-prefixed scaffolds, .gitignore, spec.yaml dogfood markers), so they have no observed result in any JUnit report. - Wire vitest to emit JUnit to .cladding/test-report.junit.xml (a DEFAULT_REPORT_CANDIDATE the detector auto-discovers; gitignored, so it never pollutes git status). In CI `npm test` runs before `clad check`, so the gate reads a fresh, complete report and now genuinely verifies AC↔test↔pass. - Reclassify those 21 non-test refs from test_refs → evidence_refs in the 15 affected ACs. MISSING_TESTS is satisfied by EITHER test_refs or evidence_refs, so each AC stays verified; the refs are simply recorded as the doc/artifact evidence they actually are (the v0.2.2 "lump everything into test_refs" dishonesty, recurring). One AC (F-d8223c) kept its real self-consistency test_ref and moved only the scaffold. Result: UNVERIFIED_AC 0 / UNTESTED_AC 0 / MISSING_TESTS 0, strict gate GREEN with the report present — the self-gate now actually exercises its own JUnit feature instead of perpetually degrading. 
Co-Authored-By: Claude Opus 4.8 --- spec.yaml | 2 +- spec/attestation.yaml | 12 ++++++------ spec/features/ab-evaluation-4db939.yaml | 4 ++-- spec/features/ab-ext-dashboard-ef2fd9.yaml | 4 ++-- spec/features/ab-ext-scenarios-emit-f334fa.yaml | 4 ++-- spec/features/ab-ext-uncommit-demos-9a3b61.yaml | 6 +++--- spec/features/ab-extended-task-manager-0144b9.yaml | 2 +- spec/features/ab-outcome-quality-ba2e05.yaml | 2 +- .../features/ai-hints-preferred-patterns-32b1e0.yaml | 2 +- spec/features/blind-author-agent-d8223c.yaml | 5 +++-- .../spec-yaml-inventory-and-hints-5b9f9f.yaml | 2 +- spec/features/spec-yaml-metadata-3a5339.yaml | 2 +- vitest.config.ts | 10 ++++++++++ 13 files changed, 34 insertions(+), 23 deletions(-) diff --git a/spec.yaml b/spec.yaml index e84f4682..36794313 100644 --- a/spec.yaml +++ b/spec.yaml @@ -53,4 +53,4 @@ inventory: scenarios: 2 capabilities: 6 test_files: 170 - last_synced: "2026-07-01" + last_synced: "2026-07-02" diff --git a/spec/attestation.yaml b/spec/attestation.yaml index e1fdf1df..3afa309c 100644 --- a/spec/attestation.yaml +++ b/spec/attestation.yaml @@ -29,7 +29,7 @@ attested: F-021: 8a1a82a59a1c45c7 F-022: 8f596a1c737f6d42 F-02343cd1: 77875ee09a7ea3bf - F-023: 0c14948e5a91bb0f + F-023: 222ad5805d96f197 F-024: f67a86816b06f8ee F-025: 187339b684896b8e F-026: bb35faf43ba582cb @@ -67,7 +67,7 @@ attested: F-059: 140dab087ebf5346 F-060: 96dc261212a0dd3d F-061: c16123610e8fe7fc - F-062: 0ab83282a7f7b1ef + F-062: 6a7566e43056bc20 F-063: 76a719993cc71fa8 F-064: e176668e53f61c6e F-065: e6ed3ef916201947 @@ -107,10 +107,10 @@ attested: F-2de65d: 84ad71574d306c81 F-315fd7: c3b042c80fa7c187 F-31eeb8: d88a9880d29ae411 - F-32b1e0: 9b29e21313d121b1 + F-32b1e0: 9792422c1c008207 F-3788c2: af9778dea8687b29 F-37b4a8: e067655bad681488 - F-3a5339: b2b2ea8775f99267 + F-3a5339: 0dfdeec23abfa841 F-3b3690: 6a36aad282d36f3a F-40327b: 8295358f7b813c8a F-417ff0: 0cc5eeefc5e08377 @@ -125,7 +125,7 @@ attested: F-570a3f: 3f60012b22c9b715 F-59f093: 
26735424fba6308c F-5b188856: 92b72281c248eba3 - F-5b9f9f: 0b972209be8b642f + F-5b9f9f: 9d362a585206c5fa F-5d3ed2: 9452eac28760fb99 F-5f6b45: 15323c4f5b619de7 F-64a5c159: adedb516a257c7ec @@ -184,7 +184,7 @@ attested: F-d2c806: b3d8668905855a6c F-d3bde4: 915d13b33258d3fc F-d49585: 11e3ac2dce796fc6 - F-d6b93648: f755a47c66e07635 + F-d6b93648: 2191277c0e5982ee F-d7312b: 000237d094145b6a F-d8223c: 0501e9564231899b F-d980359c: 8f1559276afc5c03 diff --git a/spec/features/ab-evaluation-4db939.yaml b/spec/features/ab-evaluation-4db939.yaml index 54aa4450..5f7baee0 100644 --- a/spec/features/ab-evaluation-4db939.yaml +++ b/spec/features/ab-evaluation-4db939.yaml @@ -51,10 +51,10 @@ acceptance_criteria: action: surface the spec-gated-detector limitation honestly — vanilla's "0 errors" is absence of signal, not absence of drift, because detectors need spec to evaluate response: 'renderFindings explicitly notes "absence of signal, not absence of drift" in the per-case markdown; docs/ab-evaluation/README.md §Limitations item 2 expands on which detectors are spec-gated (REFERENCE_INTEGRITY, MISSING_IMPLEMENTATION, ARCHITECTURE_FROM_SPEC, CAPABILITIES_FEATURE_MAPPING); docs/ab-evaluation/summary.md verdict on H4 marks it ⚠️ Partially supported with major caveat' text: The system shall surface the spec-gated-detector limitation honestly in all 3 documents. 
- test_refs: [docs/ab-evaluation/README.md, docs/ab-evaluation/summary.md] + evidence_refs: [docs/ab-evaluation/README.md, docs/ab-evaluation/summary.md] - id: AC-007 ears: ubiquitous action: ship the vanilla file sets at senior quality (real zod validation, executable handlers, multiple test cases, proper README) so the comparison stays fair and cladding's value does not depend on vanilla being underwritten response: 'tests/scenarios/ab/_vanilla-sim.ts vanilla code is real TypeScript with zod validation, executable handlers, multiple test cases, proper README structure (Install/Usage/API), production-grade package.json with vitest + typescript devDeps; docs/ab-evaluation/README.md §Limitations item 1 explicitly invites readers to inspect the file sets and judge fairness for themselves' text: The system shall ship vanilla file sets at senior quality so the comparison stays fair and cladding's value does not depend on vanilla being underwritten. - test_refs: [tests/scenarios/ab/_vanilla-sim.ts, docs/ab-evaluation/README.md] + evidence_refs: [tests/scenarios/ab/_vanilla-sim.ts, docs/ab-evaluation/README.md] diff --git a/spec/features/ab-ext-dashboard-ef2fd9.yaml b/spec/features/ab-ext-dashboard-ef2fd9.yaml index 01f8f19c..91ba2394 100644 --- a/spec/features/ab-ext-dashboard-ef2fd9.yaml +++ b/spec/features/ab-ext-dashboard-ef2fd9.yaml @@ -19,7 +19,7 @@ acceptance_criteria: ears: ubiquitous action: 'extract shared React + Vite + TS + Tailwind scaffold into _shared-scaffold.ts so multiple scenarios can reuse package.json / tsconfig / vite.config / tailwind.config / index.html / index.css / main.tsx / README templates without duplication' text: The system shall extract shared React + Vite scaffold into a single module reused by all extended scenarios. 
- test_refs: [tests/scenarios/ab-extended/_shared-scaffold.ts] + evidence_refs: [tests/scenarios/ab-extended/_shared-scaffold.ts] - id: AC-002 ears: ubiquitous action: 'define 30 analytics-dashboard features across 5 categories (layout 8 + cards 8 + charts 5 + data 5 + preferences 4) with deterministic F-hash6 ids and a dashboard-domain capabilities binding' @@ -44,4 +44,4 @@ acceptance_criteria: ears: ubiquitous action: 'rewrite docs/ab-evaluation-extended/summary.md as the cross-scenario verdict matrix declaring AB-마무리 (AB-evaluation wrap-up) complete — task-manager + dashboard share identical 3/4 cladding-exclusive drift catch rate proving cladding value generalizes across domains' text: The system shall declare AB-evaluation wrap-up complete with a cross-scenario verdict matrix proving domain-independence. - test_refs: [docs/ab-evaluation-extended/summary.md] + evidence_refs: [docs/ab-evaluation-extended/summary.md] diff --git a/spec/features/ab-ext-scenarios-emit-f334fa.yaml b/spec/features/ab-ext-scenarios-emit-f334fa.yaml index 6d665b47..6b34ba8f 100644 --- a/spec/features/ab-ext-scenarios-emit-f334fa.yaml +++ b/spec/features/ab-ext-scenarios-emit-f334fa.yaml @@ -24,9 +24,9 @@ acceptance_criteria: ears: ubiquitous action: 'with scenario shards in place, AI-query Q5 answers in 1 directory read per scenario — bringing the cladding low-cost answer rate from 2 of 5 to 3 of 5 in both scenarios while vanilla stays at 0 of 5' text: The system shall raise the AI-query low-cost answer rate from 2 of 5 to 3 of 5 in both scenarios. 
- test_refs: [docs/ab-evaluation-extended/scenarios/task-manager/report.md, docs/ab-evaluation-extended/scenarios/dashboard/report.md] + evidence_refs: [docs/ab-evaluation-extended/scenarios/task-manager/report.md, docs/ab-evaluation-extended/scenarios/dashboard/report.md] - id: AC-004 ears: ubiquitous action: 'cross-scenario summary upgrades the H10 verdict from partial to supported (with caveat about Q1 and Q2 linear-scan) so AB-evaluation wrap-up reflects the new measurement' text: The system shall upgrade the H10 cross-scenario verdict to supported. - test_refs: [docs/ab-evaluation-extended/summary.md] + evidence_refs: [docs/ab-evaluation-extended/summary.md] diff --git a/spec/features/ab-ext-uncommit-demos-9a3b61.yaml b/spec/features/ab-ext-uncommit-demos-9a3b61.yaml index ff269015..88155d20 100644 --- a/spec/features/ab-ext-uncommit-demos-9a3b61.yaml +++ b/spec/features/ab-ext-uncommit-demos-9a3b61.yaml @@ -14,14 +14,14 @@ acceptance_criteria: ears: ubiquitous action: 'remove the 4 committed React project directories (task-manager + dashboard × cladding + vanilla) from docs/ab-evaluation-extended/scenarios/ — together they were ~160 files / ~10K LoC of regeneratable bloat' text: The system shall remove the 4 committed React demo projects under docs/ab-evaluation-extended/scenarios/. - test_refs: [.gitignore] + evidence_refs: [.gitignore] - id: AC-002 ears: ubiquitous action: 'add gitignore rules pattern-excluding scenarios/*/cladding/ + scenarios/*/vanilla/ so the demos stay local-only when regenerated via UPDATE_AB_REPORTS=1' text: The system shall pattern-exclude scenarios cladding and vanilla directories from version control. 
- test_refs: [.gitignore] + evidence_refs: [.gitignore] - id: AC-003 ears: ubiquitous action: 'update README + summary to clarify that demos are regeneratable on demand rather than committed — preserve report.md as the permanent metric record while pointing users at the UPDATE_AB_REPORTS=1 workflow' text: The system shall document the regenerate-on-demand workflow in README and summary. - test_refs: [docs/ab-evaluation-extended/README.md, docs/ab-evaluation-extended/summary.md] + evidence_refs: [docs/ab-evaluation-extended/README.md, docs/ab-evaluation-extended/summary.md] diff --git a/spec/features/ab-extended-task-manager-0144b9.yaml b/spec/features/ab-extended-task-manager-0144b9.yaml index 8236593d..6fcbc4f6 100644 --- a/spec/features/ab-extended-task-manager-0144b9.yaml +++ b/spec/features/ab-extended-task-manager-0144b9.yaml @@ -44,4 +44,4 @@ acceptance_criteria: ears: ubiquitous action: 'verify all measurements support the new hypotheses H9 (linear scale), H10 (AI query cost partial — domain-tuned limitation), H11 (drift catch rate preserved at 3/4), H12 (capture duration bounded under 2s)' text: The system shall report H9, H10, H11, H12 verdicts with quantitative anchors. 
- test_refs: [docs/ab-evaluation-extended/summary.md] + evidence_refs: [docs/ab-evaluation-extended/summary.md] diff --git a/spec/features/ab-outcome-quality-ba2e05.yaml b/spec/features/ab-outcome-quality-ba2e05.yaml index 21521e06..c3a752c6 100644 --- a/spec/features/ab-outcome-quality-ba2e05.yaml +++ b/spec/features/ab-outcome-quality-ba2e05.yaml @@ -44,4 +44,4 @@ acceptance_criteria: action: surface real cladding bugs that the test had to work around — LLM seed architecture format incompatible with ARCHITECTURE_FROM_SPEC detector schema, and inline F-001 placeholder in init-seeded spec.yaml blocks sharded feature loading response: '_drift-injection.ts exports claddingifyForDriftCatch helper that (a) clears spec.yaml inline features so sharded F-4db939 loads via spec.load.ts heuristic, and (b) rewrites spec/architecture.yaml in canonical string[][] + {from,to}[] schema; comment block in the helper documents both as real cladding bugs tracked in docs/ab-evaluation/summary.md Future Work table' text: The system shall surface the architecture-schema mismatch and inline-F-001 blocker as known cladding bugs in the future-work table. 
- test_refs: [tests/scenarios/ab/_drift-injection.ts, docs/ab-evaluation/summary.md] + evidence_refs: [tests/scenarios/ab/_drift-injection.ts, docs/ab-evaluation/summary.md] diff --git a/spec/features/ai-hints-preferred-patterns-32b1e0.yaml b/spec/features/ai-hints-preferred-patterns-32b1e0.yaml index 5ef1d666..fb928ce6 100644 --- a/spec/features/ai-hints-preferred-patterns-32b1e0.yaml +++ b/spec/features/ai-hints-preferred-patterns-32b1e0.yaml @@ -37,4 +37,4 @@ acceptance_criteria: ears: ubiquitous action: 'dogfood cladding-self spec.yaml with 4 preferred_patterns covering new detector implementation, spec authoring, AI agent lookup, and Iron Law stage structure — each triple uses when plus prefer plus over to be both readable and actionable' text: The system shall dogfood preferred_patterns on cladding-self spec.yaml with 4 representative triples. - test_refs: [spec.yaml] + evidence_refs: [spec.yaml] diff --git a/spec/features/blind-author-agent-d8223c.yaml b/spec/features/blind-author-agent-d8223c.yaml index 7914b6de..d7310a53 100644 --- a/spec/features/blind-author-agent-d8223c.yaml +++ b/spec/features/blind-author-agent-d8223c.yaml @@ -25,5 +25,6 @@ acceptance_criteria: text: "The agent shall be mirrored by the plugin build (agents/ + codex skills) and registered in the glossary like every persona, with the size-budget ratchet extended, so the new definition lives under the same drift discipline as the existing five." 
test_refs: - - "tests/scenarios/_size-budgets.ts" - - "tests/self-consistency.test.ts#every persona file under src/agents/ has a glossary row" \ No newline at end of file + - "tests/self-consistency.test.ts#every persona file under src/agents/ has a glossary row" + evidence_refs: + - "tests/scenarios/_size-budgets.ts" \ No newline at end of file diff --git a/spec/features/spec-yaml-inventory-and-hints-5b9f9f.yaml b/spec/features/spec-yaml-inventory-and-hints-5b9f9f.yaml index f950ab72..e74ba79d 100644 --- a/spec/features/spec-yaml-inventory-and-hints-5b9f9f.yaml +++ b/spec/features/spec-yaml-inventory-and-hints-5b9f9f.yaml @@ -32,7 +32,7 @@ acceptance_criteria: ears: ubiquitous action: 'dogfood cladding-self spec.yaml with ai_hints (preferred_persona software-engineer, token_budget_per_session 4000, test_framework vitest, primary_branch develop, forbidden_patterns including eval and innerHTML and dangerouslySetInnerHTML) plus the auto-emitted inventory block' text: The system shall dogfood ai_hints + inventory on cladding-self spec.yaml. 
- test_refs: [spec.yaml] + evidence_refs: [spec.yaml] - id: AC-005 ears: ubiquitous action: 'last_synced field uses YYYY-MM-DD ISO date (not full ISO-8601 with time) so multiple sync runs on the same day produce identical spec.yaml content and do not create commit churn' diff --git a/spec/features/spec-yaml-metadata-3a5339.yaml b/spec/features/spec-yaml-metadata-3a5339.yaml index dcbf9cdf..dcdc7b84 100644 --- a/spec/features/spec-yaml-metadata-3a5339.yaml +++ b/spec/features/spec-yaml-metadata-3a5339.yaml @@ -26,4 +26,4 @@ acceptance_criteria: ears: ubiquitous action: 'dogfood the new fields in both cladding-self spec.yaml (description, version 0.3.49, repository URL, intent_summary) and the A/B-extended task-manager curator template; regenerate the committed task-manager spec.yaml so reviewers see the enriched front door' text: The system shall dogfood the new metadata fields in cladding-self plus the task-manager curator and regenerate committed outputs. - test_refs: [spec.yaml, tests/scenarios/ab-extended/_curator.ts] + evidence_refs: [spec.yaml, tests/scenarios/ab-extended/_curator.ts] diff --git a/vitest.config.ts b/vitest.config.ts index 9ab73fcb..85f25282 100644 --- a/vitest.config.ts +++ b/vitest.config.ts @@ -9,6 +9,16 @@ import {defineConfig} from 'vitest/config'; export default defineConfig({ test: { include: ['tests/**/*.test.ts'], + // Emit a JUnit report alongside the console output so cladding's own + // UNVERIFIED_AC detector can VERIFY (not just existence-check) that each + // done AC's test_refs actually ran and passed — closing the self-gate's + // AC→test→observed-pass loop (cladding dogfoods its own JUnit feature). The + // path is a DEFAULT_REPORT_CANDIDATE the detector auto-discovers; it lives + // under .cladding/ (gitignored) so it never pollutes git status. 
In CI + // `npm test` runs before `clad check`, so the gate reads a fresh, complete + // report; standalone `clad check` reads the prior run's (the documented + // degrade-to-existence baseline applies when no report is present yet). + reporters: ['default', ['junit', {outputFile: '.cladding/test-report.junit.xml'}]], // The heavy scenario suites (init + observed-path scan over the 8-file // fixtures, then detector snapshots) run ~6s locally but several × slower // under the 2-core GitHub runner's worker contention. The default 5s