diff --git a/multiple-comparison-control-assistant/README.md b/multiple-comparison-control-assistant/README.md new file mode 100644 index 00000000..6c9f7eb2 --- /dev/null +++ b/multiple-comparison-control-assistant/README.md @@ -0,0 +1,29 @@ +# Multiple-Comparison Control Assistant + +This module adds a focused AI Peer Review Aid slice for SCIBASE issue #13. It checks whether manuscript result families handle multiple testing before a draft claims statistical significance. + +The assistant is intentionally narrower than broad summarizers, citation tools, manuscript similarity checks, unit consistency, reporting-guideline readiness, certainty/tone calibration, and general statistical review. It targets multiplicity risk: uncorrected p-values, missing adjusted values, incompatible FDR claims, unclear hypothesis families, and missing endpoint hierarchy. + +## What It Checks + +- declared hypothesis family boundaries +- accepted correction methods such as Benjamini-Hochberg, Bonferroni, Holm, or pre-registered hierarchy +- adjusted p-values or q-values for significant tests +- result statements that overclaim based on unadjusted values +- FDR threshold consistency for claimed significant findings +- endpoint hierarchy for confirmatory families +- exploratory labels when exploratory analyses use strong significance language + +## Usage + +```bash +npm run check +npm test +npm run demo +``` + +`npm run demo` writes deterministic JSON, Markdown, and SVG reviewer artifacts to `reports/`, plus an MP4 when `ffmpeg` is available. + +## Safety + +All sample manuscripts are synthetic. The module does not process private manuscripts, call external APIs, open network connections, or include credentials. 
diff --git a/multiple-comparison-control-assistant/acceptance-notes.md b/multiple-comparison-control-assistant/acceptance-notes.md new file mode 100644 index 00000000..7cefd489 --- /dev/null +++ b/multiple-comparison-control-assistant/acceptance-notes.md @@ -0,0 +1,21 @@ +# Acceptance Notes + +## Reviewer Path + +1. Run `npm run check` to verify syntax. +2. Run `npm test` to verify ready, revise, clean single-endpoint, and deterministic digest behavior. +3. Run `npm run demo` to regenerate reviewer artifacts in `reports/`. +4. Inspect `reports/multiple-comparison-report.md` for manuscript decisions and the review queue. + +## Expected Results + +- `cardio-biomarker-study` is `peer_review_ready`. +- `neuro-screening-draft` is `revise_before_submission` because confirmatory claims rely on unadjusted p-values without an accepted correction method or endpoint hierarchy. +- `metabolomics-exploration` is `revise_before_submission` because one exploratory claim overstates a finding whose adjusted value exceeds the FDR threshold. 
+
+## Demo Artifacts
+
+- `reports/multiple-comparison-packet.json`
+- `reports/multiple-comparison-report.md`
+- `reports/summary.svg`
+- `reports/demo.mp4`
diff --git a/multiple-comparison-control-assistant/demo.js b/multiple-comparison-control-assistant/demo.js
new file mode 100644
index 00000000..c22d6590
--- /dev/null
+++ b/multiple-comparison-control-assistant/demo.js
@@ -0,0 +1,94 @@
+const fs = require("node:fs")
+const path = require("node:path")
+const { spawnSync } = require("node:child_process")
+const { evaluateMultipleComparisonControls } = require("./index")
+const { manuscripts, policy } = require("./sample-data")
+
+// Regenerates the reviewer artifacts under reports/ from the sample data.
+const reportsDir = path.join(__dirname, "reports")
+fs.mkdirSync(reportsDir, { recursive: true })
+
+// A fixed asOf keeps the packet and its audit digest identical between runs.
+const packet = evaluateMultipleComparisonControls({
+  asOf: "2026-05-23T03:00:00.000Z",
+  manuscripts,
+  policy,
+})
+
+fs.writeFileSync(
+  path.join(reportsDir, "multiple-comparison-packet.json"),
+  `${JSON.stringify(packet, null, 2)}\n`,
+)
+
+const markdown = [
+  "# Multiple-Comparison Control Assistant Report",
+  "",
+  `Manuscripts reviewed: ${packet.summary.totalManuscripts}`,
+  `Ready manuscripts: ${packet.summary.readyManuscripts}`,
+  `Revise before submission: ${packet.summary.reviseManuscripts}`,
+  `Analysis families: ${packet.summary.totalFamilies}`,
+  `Tests reviewed: ${packet.summary.totalTests}`,
+  `Critical findings: ${packet.summary.criticalFindings}`,
+  `Warning findings: ${packet.summary.warningFindings}`,
+  `Audit digest: \`${packet.audit.digest}\``,
+  "",
+  "## Manuscript Decisions",
+  ...packet.manuscripts.flatMap((manuscript) => [
+    "",
+    `### ${manuscript.title}`,
+    `- Status: ${manuscript.status}`,
+    `- Families: ${manuscript.summary.families}`,
+    `- Tests: ${manuscript.summary.tests}`,
+    `- Finding codes: ${manuscript.findings.map((finding) => finding.code).join(", ") || "none"}`,
+  ]),
+  "",
+  "## Review Queue",
+  ...packet.reviewQueue.map((item) => (
+    `- ${item.manuscriptId}/${item.familyId}: ${item.action} (${item.severity})`
+  )),
+  "",
+]
+
+fs.writeFileSync(path.join(reportsDir, "multiple-comparison-report.md"), markdown.join("\n"))
+
+// NOTE(review): SVG markup rebuilt around the original captions; box geometry
+// and colours mirror the ffmpeg drawbox layout below - confirm the styling.
+const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="960" height="540" viewBox="0 0 960 540" role="img" aria-label="Multiple-comparison control summary">
+  <rect width="960" height="540" fill="#16202a"/>
+  <text x="48" y="82" fill="#f8fafc" font-family="Arial, sans-serif" font-size="34" font-weight="700">Multiple-Comparison Control Assistant</text>
+  <text x="48" y="122" fill="#cbd5e1" font-family="Arial, sans-serif" font-size="18">AI peer-review aid for FDR, adjusted p-values, and endpoint hierarchy</text>
+  <rect x="48" y="172" width="250" height="150" rx="8" fill="#047857"/>
+  <text x="72" y="252" fill="#ffffff" font-family="Arial, sans-serif" font-size="56" font-weight="700">${packet.summary.readyManuscripts}</text>
+  <text x="72" y="292" fill="#ffffff" font-family="Arial, sans-serif" font-size="20">ready</text>
+  <rect x="355" y="172" width="250" height="150" rx="8" fill="#7c3aed"/>
+  <text x="379" y="252" fill="#ffffff" font-family="Arial, sans-serif" font-size="56" font-weight="700">${packet.summary.totalTests}</text>
+  <text x="379" y="292" fill="#ffffff" font-family="Arial, sans-serif" font-size="20">tests reviewed</text>
+  <rect x="662" y="172" width="250" height="150" rx="8" fill="#be123c"/>
+  <text x="686" y="252" fill="#ffffff" font-family="Arial, sans-serif" font-size="56" font-weight="700">${packet.summary.criticalFindings}</text>
+  <text x="686" y="292" fill="#ffffff" font-family="Arial, sans-serif" font-size="20">critical</text>
+  <text x="48" y="392" fill="#cbd5e1" font-family="Arial, sans-serif" font-size="15">Controls: family boundaries, correction methods, adjusted values, FDR threshold, hierarchy, exploratory labels.</text>
+  <text x="48" y="436" fill="#94a3b8" font-family="monospace" font-size="16">Digest ${packet.audit.digest.slice(0, 28)}...</text>
+</svg>
+`
+
+fs.writeFileSync(path.join(reportsDir, "summary.svg"), svg)
+
+// The MP4 is best-effort: a missing or failing ffmpeg must not abort the demo.
+const ffmpeg = spawnSync("ffmpeg", [
+  "-y",
+  "-f",
+  "lavfi",
+  "-i",
+  "color=c=0x16202a:s=960x540:d=5:r=15",
+  "-vf",
+  "drawbox=x=48:y=172:w=250:h=150:color=0x047857@1:t=fill,drawbox=x=355:y=172:w=250:h=150:color=0x7c3aed@1:t=fill,drawbox=x=662:y=172:w=250:h=150:color=0xbe123c@1:t=fill,drawbox=x=48:y=368:w=864:h=18:color=0x38bdf8@1:t=fill",
+  "-pix_fmt",
+  "yuv420p",
+  path.join(reportsDir, "demo.mp4"),
+], { stdio: "ignore" })
+
+if (ffmpeg.status !== 0) {
+  console.warn("ffmpeg video generation failed; JSON, Markdown, and SVG reports were still generated.")
+}
+
+console.log(`Wrote multiple-comparison assistant artifacts to ${reportsDir}`)
diff --git a/multiple-comparison-control-assistant/index.js b/multiple-comparison-control-assistant/index.js
new file mode 100644
index 00000000..57800c49
--- /dev/null
+++ b/multiple-comparison-control-assistant/index.js
@@ -0,0 +1,407 @@
+const crypto = require("node:crypto")
+
+/**
+ * Default thresholds and accepted correction methods; `input.policy` keys
+ * override these. The method list is frozen too, because the merged policy
+ * returned to callers shares this array by reference.
+ * NOTE(review): `familySizeWarningThreshold` is not read by any check below.
+ */
+const DEFAULT_POLICY = Object.freeze({
+  pValueThreshold: 0.05,
+  fdrThreshold: 0.05,
+  familySizeWarningThreshold: 5,
+  allowedCorrectionMethods: Object.freeze([
+    "benjamini-hochberg",
+    "bonferroni",
+    "holm",
+    "pre-registered-hierarchy",
+  ]),
+  policyVersion: "multiple-comparison-control-v1",
+})
+
+/**
+ * Reviews each manuscript's analysis families against the multiplicity policy.
+ *
+ * @param {object} [input]
+ * @param {string} [input.asOf] - Review timestamp; defaults to the current time,
+ *   which makes the audit digest differ between runs.
+ * @param {object[]} [input.manuscripts] - Non-array values are treated as empty.
+ * @param {object} [input.policy] - Keys merged over DEFAULT_POLICY.
+ * @returns {object} Packet with summary, manuscript decisions, review and ready
+ *   queues, and a SHA-256 audit digest.
+ */
+function evaluateMultipleComparisonControls(input = {}) {
+  const policy = { ...DEFAULT_POLICY, ...(input.policy || {}) }
+  const asOf = input.asOf || new Date().toISOString()
+  const manuscripts = Array.isArray(input.manuscripts) ? input.manuscripts : []
+  const manuscriptDecisions = manuscripts.map((manuscript) => evaluateManuscript(manuscript, policy))
+  const reviewQueue = manuscriptDecisions.flatMap((manuscript) => manuscript.reviewQueue)
+  const summary = summarize(manuscriptDecisions, reviewQueue)
+  const auditPayload = {
+    asOf,
+    policy,
+    summary,
+    manuscripts: manuscriptDecisions.map((manuscript) => ({
+      manuscriptId: manuscript.manuscriptId,
+      status: manuscript.status,
+      findingCodes: manuscript.findings.map((finding) => finding.code),
+    })),
+  }
+
+  return {
+    asOf,
+    policy,
+    summary,
+    manuscripts: manuscriptDecisions,
+    reviewQueue,
+    readyQueue: manuscriptDecisions
+      .filter((manuscript) => manuscript.status === "peer_review_ready")
+      .map((manuscript) => ({
+        manuscriptId: manuscript.manuscriptId,
+        action: "allow_peer_review_submission",
+        checkedFamilies: manuscript.summary.families,
+      })),
+    audit: {
+      algorithm: "sha256",
+      digest: digest(auditPayload),
+      policyVersion: policy.policyVersion,
+    },
+  }
+}
+
+/**
+ * Evaluates every family of one manuscript and rolls the findings up: any
+ * critical finding means revise, otherwise any warning means statistical
+ * review, otherwise the manuscript is ready for peer review.
+ *
+ * @param {object} manuscript
+ * @param {object} policy - Merged policy.
+ * @returns {object} Manuscript decision, including its own review queue.
+ */
+function evaluateManuscript(manuscript, policy) {
+  const families = Array.isArray(manuscript.analysisFamilies) ? manuscript.analysisFamilies : []
+  const statements = Array.isArray(manuscript.resultStatements) ? manuscript.resultStatements : []
+  const familyDecisions = families.map((family) => evaluateFamily(family, statements, policy))
+  const findings = familyDecisions.flatMap((family) => family.findings)
+  const criticalFindings = findings.filter((finding) => finding.severity === "critical")
+  const warningFindings = findings.filter((finding) => finding.severity === "warning")
+  const status = criticalFindings.length > 0
+    ? "revise_before_submission"
+    : warningFindings.length > 0
+      ? "statistical_review"
+      : "peer_review_ready"
+
+  return {
+    manuscriptId: manuscript.manuscriptId,
+    title: manuscript.title,
+    status,
+    summary: {
+      families: familyDecisions.length,
+      tests: familyDecisions.reduce((total, family) => total + family.testsReviewed, 0),
+      passedFamilies: familyDecisions.filter((family) => family.status === "passed").length,
+      criticalFindings: criticalFindings.length,
+      warningFindings: warningFindings.length,
+    },
+    families: familyDecisions,
+    findings,
+    reviewQueue: findings.map((finding) => ({
+      manuscriptId: manuscript.manuscriptId,
+      familyId: finding.familyId,
+      code: finding.code,
+      severity: finding.severity,
+      action: reviewAction(finding.code),
+      remediation: finding.remediation,
+    })),
+  }
+}
+
+/**
+ * Runs the seven multiplicity checks for one analysis family.
+ *
+ * @param {object} family - Family with `tests`, `intent`, `correctionMethod`, ...
+ * @param {object[]} statements - All result statements of the manuscript; only
+ *   those whose `familyId` matches this family are considered.
+ * @param {object} policy - Merged policy.
+ * @returns {object} Family decision with every check and the failed ones as findings.
+ */
+function evaluateFamily(family, statements, policy) {
+  const tests = Array.isArray(family.tests) ? family.tests : []
+  const familyStatements = statements.filter((statement) => statement.familyId === family.familyId)
+  const findings = []
+  const checks = []
+  const declaredFamilySize = Number.isFinite(family.declaredFamilySize) ? family.declaredFamilySize : null
+  const familySize = declaredFamilySize || tests.length
+  const correctionMethod = normalizeMethod(family.correctionMethod)
+  // Correction is required once the declared or observed family exceeds one test.
+  const needsCorrection = familySize > 1 || tests.length > 1
+  const confirmatory = family.intent === "confirmatory"
+  const exploratory = family.intent === "exploratory"
+  const hasPrimaryEndpoints = Boolean(family.endpointHierarchy && family.endpointHierarchy.primaryEndpoints && family.endpointHierarchy.primaryEndpoints.length > 0)
+
+  addCheck(checks, findings, {
+    familyId: family.familyId,
+    code: "family_boundary_declared",
+    severity: "warning",
+    pass: Boolean(declaredFamilySize && declaredFamilySize === tests.length),
+    message: !declaredFamilySize
+      ? "No hypothesis family size is declared."
+      : declaredFamilySize === tests.length
+        ? `Declared family size ${declaredFamilySize} covers ${tests.length} tests.`
+        : `Declared family size ${declaredFamilySize} does not match the ${tests.length} listed tests.`,
+    remediation: "Declare the hypothesis family boundary and number of tests used for correction.",
+    evidence: { declaredFamilySize, observedTests: tests.length },
+  })
+
+  const methodAllowed = policy.allowedCorrectionMethods.includes(correctionMethod)
+  addCheck(checks, findings, {
+    familyId: family.familyId,
+    code: "correction_method_allowed",
+    severity: confirmatory ? "critical" : "warning",
+    pass: !needsCorrection || methodAllowed,
+    message: methodAllowed
+      ? `Correction method ${correctionMethod} is accepted.`
+      : needsCorrection
+        ? `Correction method ${correctionMethod || "missing"} is not accepted for this family.`
+        : "No multiplicity correction is required for a family with at most one test.",
+    remediation: "Choose a declared multiplicity method such as Benjamini-Hochberg, Bonferroni, Holm, or a pre-registered hierarchy.",
+    evidence: { correctionMethod: correctionMethod || null, needsCorrection, intent: family.intent },
+  })
+
+  const missingAdjusted = significantTests(tests, policy)
+    .filter((test) => needsCorrection && !hasAdjustedEvidence(test, correctionMethod))
+  addCheck(checks, findings, {
+    familyId: family.familyId,
+    code: "adjusted_values_present",
+    severity: "critical",
+    pass: missingAdjusted.length === 0,
+    message: missingAdjusted.length === 0
+      ? "Every significant test has adjusted p/q evidence when correction is required."
+      : `${missingAdjusted.length} significant tests are missing adjusted p/q evidence.`,
+    remediation: "Report adjusted p-values or q-values for every significant claim in the family.",
+    evidence: { missingTestIds: missingAdjusted.map((test) => test.testId) },
+  })
+
+  const inflatedClaims = findInflatedClaims(familyStatements, tests, policy, correctionMethod, needsCorrection)
+  addCheck(checks, findings, {
+    familyId: family.familyId,
+    code: "unadjusted_significance_claim",
+    severity: "critical",
+    pass: inflatedClaims.length === 0,
+    message: inflatedClaims.length === 0
+      ? "Result claims align with adjusted evidence."
+      : `${inflatedClaims.length} result claims rely on unadjusted or non-significant adjusted evidence.`,
+    remediation: "Rewrite significance language or support it with adjusted p/q values below the declared threshold.",
+    evidence: { claimIds: inflatedClaims.map((claim) => claim.claimId) },
+  })
+
+  const thresholdMismatches = tests
+    .filter((test) => adjustedValue(test) !== null)
+    .filter((test) => test.claimedSignificant === true && adjustedValue(test) > policy.fdrThreshold)
+  addCheck(checks, findings, {
+    familyId: family.familyId,
+    code: "fdr_threshold_consistency",
+    severity: "critical",
+    pass: thresholdMismatches.length === 0,
+    message: thresholdMismatches.length === 0
+      ? "Claimed significant tests are within the adjusted threshold."
+      : `${thresholdMismatches.length} tests exceed the adjusted threshold but are still claimed significant.`,
+    remediation: "Downgrade the claim or revise the correction threshold and reported adjusted values.",
+    evidence: {
+      threshold: policy.fdrThreshold,
+      testIds: thresholdMismatches.map((test) => test.testId),
+    },
+  })
+
+  addCheck(checks, findings, {
+    familyId: family.familyId,
+    code: "endpoint_hierarchy_covered",
+    severity: "warning",
+    pass: !confirmatory || hasPrimaryEndpoints,
+    message: !confirmatory
+      ? "Endpoint hierarchy is optional for exploratory families."
+      : hasPrimaryEndpoints
+        ? "Confirmatory family endpoint hierarchy reviewed."
+        : "Confirmatory family does not declare any primary endpoints.",
+    remediation: "Declare primary and secondary endpoints before making confirmatory significance claims.",
+    evidence: {
+      intent: family.intent,
+      primaryEndpoints: family.endpointHierarchy ? family.endpointHierarchy.primaryEndpoints || [] : [],
+    },
+  })
+
+  const exploratoryOverclaims = exploratory
+    ? familyStatements.filter((statement) => hasConfirmatoryLanguage(statement.text) && !statement.exploratoryLabel)
+    : []
+  addCheck(checks, findings, {
+    familyId: family.familyId,
+    code: "exploratory_labeling",
+    severity: "warning",
+    pass: exploratoryOverclaims.length === 0,
+    message: exploratoryOverclaims.length === 0
+      ? "Exploratory claims are labeled without confirmatory overstatement."
+      : `${exploratoryOverclaims.length} exploratory claims use confirmatory language without a label.`,
+    remediation: "Mark exploratory findings clearly and avoid confirmatory language for hypothesis-generating analyses.",
+    evidence: { claimIds: exploratoryOverclaims.map((statement) => statement.claimId) },
+  })
+
+  const criticalFindings = findings.filter((finding) => finding.severity === "critical")
+  const warningFindings = findings.filter((finding) => finding.severity === "warning")
+
+  return {
+    familyId: family.familyId,
+    label: family.label,
+    intent: family.intent,
+    correctionMethod: correctionMethod || null,
+    testsReviewed: tests.length,
+    status: criticalFindings.length > 0 ? "failed" : warningFindings.length > 0 ? "needs_statistical_review" : "passed",
+    checks,
+    findings,
+  }
+}
+
+/** Tests whose raw p-value is finite and at or below the policy p-value threshold. */
+function significantTests(tests, policy) {
+  return tests.filter((test) => Number.isFinite(test.pValue) && test.pValue <= policy.pValueThreshold)
+}
+
+/**
+ * Whether a test carries corrected evidence: a hierarchy decision for
+ * pre-registered hierarchies, otherwise an adjusted p-value or q-value.
+ */
+function hasAdjustedEvidence(test, correctionMethod) {
+  if (correctionMethod === "pre-registered-hierarchy") {
+    return Boolean(test.hierarchyDecision)
+  }
+
+  return adjustedValue(test) !== null
+}
+
+/** Returns the finite adjusted p-value, else the finite q-value, else null. */
+function adjustedValue(test) {
+  if (Number.isFinite(test.adjustedPValue)) {
+    return test.adjustedPValue
+  }
+
+  if (Number.isFinite(test.qValue)) {
+    return test.qValue
+  }
+
+  return null
+}
+
+/**
+ * Finds statements whose significance language is not backed by evidence.
+ *
+ * @param {object[]} statements - Statements of one family.
+ * @param {object[]} tests - Tests of that family.
+ * @param {object} policy - Merged policy.
+ * @param {string} correctionMethod - Normalized method name.
+ * @param {boolean} [needsCorrection=true] - False for families with at most one
+ *   test, where a significant raw p-value legitimately supports the claim.
+ * @returns {object[]} Statements that overclaim, including ones citing an unknown test.
+ */
+function findInflatedClaims(statements, tests, policy, correctionMethod, needsCorrection = true) {
+  const testsById = new Map(tests.map((test) => [test.testId, test]))
+
+  return statements.filter((statement) => {
+    if (!statement.usesSignificanceLanguage) {
+      return false
+    }
+
+    const test = testsById.get(statement.testId)
+    if (!test) {
+      return true
+    }
+
+    const adjusted = adjustedValue(test)
+    if (correctionMethod === "pre-registered-hierarchy" && test.hierarchyDecision) {
+      return false
+    }
+
+    if (adjusted === null) {
+      // A raw p-value only overclaims when the family needed a correction.
+      return needsCorrection && Number.isFinite(test.pValue) && test.pValue <= policy.pValueThreshold
+    }
+
+    return adjusted > policy.fdrThreshold
+  })
+}
+
+/** Records a check result and, when it failed, a matching finding with remediation. */
+function addCheck(checks, findings, check) {
+  checks.push({
+    code: check.code,
+    severity: check.severity,
+    pass: check.pass,
+    message: check.message,
+    evidence: check.evidence || {},
+  })
+
+  if (!check.pass) {
+    findings.push({
+      familyId: check.familyId,
+      code: check.code,
+      severity: check.severity,
+      message: check.message,
+      remediation: check.remediation,
+      evidence: check.evidence || {},
+    })
+  }
+}
+
+/** Aggregates manuscript statuses, family/test totals, and finding severities. */
+function summarize(manuscriptDecisions, reviewQueue) {
+  return {
+    totalManuscripts: manuscriptDecisions.length,
+    readyManuscripts: manuscriptDecisions.filter((manuscript) => manuscript.status === "peer_review_ready").length,
+    statisticalReviewManuscripts: manuscriptDecisions.filter((manuscript) => manuscript.status === "statistical_review").length,
+    reviseManuscripts: manuscriptDecisions.filter((manuscript) => manuscript.status === "revise_before_submission").length,
+    totalFamilies: manuscriptDecisions.reduce((total, manuscript) => total + manuscript.summary.families, 0),
+    totalTests: manuscriptDecisions.reduce((total, manuscript) => total + manuscript.summary.tests, 0),
+    totalFindings: reviewQueue.length,
+    criticalFindings: reviewQueue.filter((item) => item.severity === "critical").length,
+    warningFindings: reviewQueue.filter((item) => item.severity === "warning").length,
+  }
+}
+
+/** Trims and lower-cases a method name; non-strings become the empty string. */
+function normalizeMethod(method) {
+  return typeof method === "string" ? method.trim().toLowerCase() : ""
+}
+
+/** Detects confirmatory wording (e.g. "confirmed", "primary", "proved") in claim text. */
+function hasConfirmatoryLanguage(text) {
+  if (typeof text !== "string") {
+    return false
+  }
+
+  return /\b(confirm|confirmed|primary|definitive|significant effect|proved)\b/i.test(text)
+}
+
+/** Maps a finding code to its review-queue action, with a generic fallback. */
+function reviewAction(code) {
+  const actions = {
+    family_boundary_declared: "declare_hypothesis_family",
+    correction_method_allowed: "choose_multiplicity_method",
+    adjusted_values_present: "report_adjusted_values",
+    unadjusted_significance_claim: "rewrite_significance_claim",
+    fdr_threshold_consistency: "align_claim_with_adjusted_threshold",
+    endpoint_hierarchy_covered: "declare_endpoint_hierarchy",
+    exploratory_labeling: "label_exploratory_claims",
+  }
+
+  return actions[code] || "statistical_reviewer_followup"
+}
+
+/** SHA-256 hex digest of the JSON-serialized payload. */
+function digest(payload) {
+  return crypto
+    .createHash("sha256")
+    .update(JSON.stringify(payload))
+    .digest("hex")
+}
+
+module.exports = {
+  DEFAULT_POLICY,
+  evaluateMultipleComparisonControls,
+}
diff --git a/multiple-comparison-control-assistant/package.json b/multiple-comparison-control-assistant/package.json
new file mode 100644
index 00000000..f4084363
--- /dev/null
+++ b/multiple-comparison-control-assistant/package.json
@@ -0,0 +1,11 @@
+{
+  "name": "multiple-comparison-control-assistant",
+  "version": "1.0.0",
+  "private": true,
+  "type":
"commonjs", + "scripts": { + "check": "node --check index.js && node --check sample-data.js && node --check test.js && node --check demo.js", + "test": "node test.js", + "demo": "node demo.js" + } +} diff --git a/multiple-comparison-control-assistant/reports/demo.mp4 b/multiple-comparison-control-assistant/reports/demo.mp4 new file mode 100644 index 00000000..176dbd84 Binary files /dev/null and b/multiple-comparison-control-assistant/reports/demo.mp4 differ diff --git a/multiple-comparison-control-assistant/reports/multiple-comparison-packet.json b/multiple-comparison-control-assistant/reports/multiple-comparison-packet.json new file mode 100644 index 00000000..334a0cb8 --- /dev/null +++ b/multiple-comparison-control-assistant/reports/multiple-comparison-packet.json @@ -0,0 +1,637 @@ +{ + "asOf": "2026-05-23T03:00:00.000Z", + "policy": { + "pValueThreshold": 0.05, + "fdrThreshold": 0.05, + "familySizeWarningThreshold": 5, + "allowedCorrectionMethods": [ + "benjamini-hochberg", + "bonferroni", + "holm", + "pre-registered-hierarchy" + ], + "policyVersion": "multiple-comparison-control-v1" + }, + "summary": { + "totalManuscripts": 3, + "readyManuscripts": 1, + "statisticalReviewManuscripts": 0, + "reviseManuscripts": 2, + "totalFamilies": 3, + "totalTests": 15, + "totalFindings": 7, + "criticalFindings": 5, + "warningFindings": 2 + }, + "manuscripts": [ + { + "manuscriptId": "cardio-biomarker-study", + "title": "Cardiac Biomarker Response Study", + "status": "peer_review_ready", + "summary": { + "families": 1, + "tests": 3, + "passedFamilies": 1, + "criticalFindings": 0, + "warningFindings": 0 + }, + "families": [ + { + "familyId": "primary-biomarkers", + "label": "Primary biomarker outcomes", + "intent": "confirmatory", + "correctionMethod": "benjamini-hochberg", + "testsReviewed": 3, + "status": "passed", + "checks": [ + { + "code": "family_boundary_declared", + "severity": "warning", + "pass": true, + "message": "Declared family size 3 covers 3 tests.", + 
"evidence": { + "declaredFamilySize": 3, + "observedTests": 3 + } + }, + { + "code": "correction_method_allowed", + "severity": "critical", + "pass": true, + "message": "Correction method benjamini-hochberg is accepted.", + "evidence": { + "correctionMethod": "benjamini-hochberg", + "needsCorrection": true, + "intent": "confirmatory" + } + }, + { + "code": "adjusted_values_present", + "severity": "critical", + "pass": true, + "message": "Every significant test has adjusted p/q evidence when correction is required.", + "evidence": { + "missingTestIds": [] + } + }, + { + "code": "unadjusted_significance_claim", + "severity": "critical", + "pass": true, + "message": "Result claims align with adjusted evidence.", + "evidence": { + "claimIds": [] + } + }, + { + "code": "fdr_threshold_consistency", + "severity": "critical", + "pass": true, + "message": "Claimed significant tests are within the adjusted threshold.", + "evidence": { + "threshold": 0.05, + "testIds": [] + } + }, + { + "code": "endpoint_hierarchy_covered", + "severity": "warning", + "pass": true, + "message": "Confirmatory family endpoint hierarchy reviewed.", + "evidence": { + "intent": "confirmatory", + "primaryEndpoints": [ + "troponin-change" + ] + } + }, + { + "code": "exploratory_labeling", + "severity": "warning", + "pass": true, + "message": "Exploratory claims are labeled without confirmatory overstatement.", + "evidence": { + "claimIds": [] + } + } + ], + "findings": [] + } + ], + "findings": [], + "reviewQueue": [] + }, + { + "manuscriptId": "neuro-screening-draft", + "title": "Neuro Screening Draft", + "status": "revise_before_submission", + "summary": { + "families": 1, + "tests": 8, + "passedFamilies": 0, + "criticalFindings": 3, + "warningFindings": 1 + }, + "families": [ + { + "familyId": "neuro-panel", + "label": "Neurocognitive screening panel", + "intent": "confirmatory", + "correctionMethod": "none", + "testsReviewed": 8, + "status": "failed", + "checks": [ + { + "code": 
"family_boundary_declared", + "severity": "warning", + "pass": true, + "message": "Declared family size 8 covers 8 tests.", + "evidence": { + "declaredFamilySize": 8, + "observedTests": 8 + } + }, + { + "code": "correction_method_allowed", + "severity": "critical", + "pass": false, + "message": "Correction method none is not accepted for this family.", + "evidence": { + "correctionMethod": "none", + "needsCorrection": true, + "intent": "confirmatory" + } + }, + { + "code": "adjusted_values_present", + "severity": "critical", + "pass": false, + "message": "4 significant tests are missing adjusted p/q evidence.", + "evidence": { + "missingTestIds": [ + "memory", + "attention", + "language", + "mood" + ] + } + }, + { + "code": "unadjusted_significance_claim", + "severity": "critical", + "pass": false, + "message": "2 result claims rely on unadjusted or non-significant adjusted evidence.", + "evidence": { + "claimIds": [ + "claim-neuro-1", + "claim-neuro-2" + ] + } + }, + { + "code": "fdr_threshold_consistency", + "severity": "critical", + "pass": true, + "message": "Claimed significant tests are within the adjusted threshold.", + "evidence": { + "threshold": 0.05, + "testIds": [] + } + }, + { + "code": "endpoint_hierarchy_covered", + "severity": "warning", + "pass": false, + "message": "Confirmatory family endpoint hierarchy reviewed.", + "evidence": { + "intent": "confirmatory", + "primaryEndpoints": [] + } + }, + { + "code": "exploratory_labeling", + "severity": "warning", + "pass": true, + "message": "Exploratory claims are labeled without confirmatory overstatement.", + "evidence": { + "claimIds": [] + } + } + ], + "findings": [ + { + "familyId": "neuro-panel", + "code": "correction_method_allowed", + "severity": "critical", + "message": "Correction method none is not accepted for this family.", + "remediation": "Choose a declared multiplicity method such as Benjamini-Hochberg, Bonferroni, Holm, or a pre-registered hierarchy.", + "evidence": { + 
"correctionMethod": "none", + "needsCorrection": true, + "intent": "confirmatory" + } + }, + { + "familyId": "neuro-panel", + "code": "adjusted_values_present", + "severity": "critical", + "message": "4 significant tests are missing adjusted p/q evidence.", + "remediation": "Report adjusted p-values or q-values for every significant claim in the family.", + "evidence": { + "missingTestIds": [ + "memory", + "attention", + "language", + "mood" + ] + } + }, + { + "familyId": "neuro-panel", + "code": "unadjusted_significance_claim", + "severity": "critical", + "message": "2 result claims rely on unadjusted or non-significant adjusted evidence.", + "remediation": "Rewrite significance language or support it with adjusted p/q values below the declared threshold.", + "evidence": { + "claimIds": [ + "claim-neuro-1", + "claim-neuro-2" + ] + } + }, + { + "familyId": "neuro-panel", + "code": "endpoint_hierarchy_covered", + "severity": "warning", + "message": "Confirmatory family endpoint hierarchy reviewed.", + "remediation": "Declare primary and secondary endpoints before making confirmatory significance claims.", + "evidence": { + "intent": "confirmatory", + "primaryEndpoints": [] + } + } + ] + } + ], + "findings": [ + { + "familyId": "neuro-panel", + "code": "correction_method_allowed", + "severity": "critical", + "message": "Correction method none is not accepted for this family.", + "remediation": "Choose a declared multiplicity method such as Benjamini-Hochberg, Bonferroni, Holm, or a pre-registered hierarchy.", + "evidence": { + "correctionMethod": "none", + "needsCorrection": true, + "intent": "confirmatory" + } + }, + { + "familyId": "neuro-panel", + "code": "adjusted_values_present", + "severity": "critical", + "message": "4 significant tests are missing adjusted p/q evidence.", + "remediation": "Report adjusted p-values or q-values for every significant claim in the family.", + "evidence": { + "missingTestIds": [ + "memory", + "attention", + "language", + "mood" + 
] + } + }, + { + "familyId": "neuro-panel", + "code": "unadjusted_significance_claim", + "severity": "critical", + "message": "2 result claims rely on unadjusted or non-significant adjusted evidence.", + "remediation": "Rewrite significance language or support it with adjusted p/q values below the declared threshold.", + "evidence": { + "claimIds": [ + "claim-neuro-1", + "claim-neuro-2" + ] + } + }, + { + "familyId": "neuro-panel", + "code": "endpoint_hierarchy_covered", + "severity": "warning", + "message": "Confirmatory family endpoint hierarchy reviewed.", + "remediation": "Declare primary and secondary endpoints before making confirmatory significance claims.", + "evidence": { + "intent": "confirmatory", + "primaryEndpoints": [] + } + } + ], + "reviewQueue": [ + { + "manuscriptId": "neuro-screening-draft", + "familyId": "neuro-panel", + "code": "correction_method_allowed", + "severity": "critical", + "action": "choose_multiplicity_method", + "remediation": "Choose a declared multiplicity method such as Benjamini-Hochberg, Bonferroni, Holm, or a pre-registered hierarchy." + }, + { + "manuscriptId": "neuro-screening-draft", + "familyId": "neuro-panel", + "code": "adjusted_values_present", + "severity": "critical", + "action": "report_adjusted_values", + "remediation": "Report adjusted p-values or q-values for every significant claim in the family." + }, + { + "manuscriptId": "neuro-screening-draft", + "familyId": "neuro-panel", + "code": "unadjusted_significance_claim", + "severity": "critical", + "action": "rewrite_significance_claim", + "remediation": "Rewrite significance language or support it with adjusted p/q values below the declared threshold." + }, + { + "manuscriptId": "neuro-screening-draft", + "familyId": "neuro-panel", + "code": "endpoint_hierarchy_covered", + "severity": "warning", + "action": "declare_endpoint_hierarchy", + "remediation": "Declare primary and secondary endpoints before making confirmatory significance claims." 
+ } + ] + }, + { + "manuscriptId": "metabolomics-exploration", + "title": "Metabolomics Exploration", + "status": "revise_before_submission", + "summary": { + "families": 1, + "tests": 4, + "passedFamilies": 0, + "criticalFindings": 2, + "warningFindings": 1 + }, + "families": [ + { + "familyId": "metabolite-screen", + "label": "Exploratory metabolite screen", + "intent": "exploratory", + "correctionMethod": "benjamini-hochberg", + "testsReviewed": 4, + "status": "failed", + "checks": [ + { + "code": "family_boundary_declared", + "severity": "warning", + "pass": true, + "message": "Declared family size 4 covers 4 tests.", + "evidence": { + "declaredFamilySize": 4, + "observedTests": 4 + } + }, + { + "code": "correction_method_allowed", + "severity": "warning", + "pass": true, + "message": "Correction method benjamini-hochberg is accepted.", + "evidence": { + "correctionMethod": "benjamini-hochberg", + "needsCorrection": true, + "intent": "exploratory" + } + }, + { + "code": "adjusted_values_present", + "severity": "critical", + "pass": true, + "message": "Every significant test has adjusted p/q evidence when correction is required.", + "evidence": { + "missingTestIds": [] + } + }, + { + "code": "unadjusted_significance_claim", + "severity": "critical", + "pass": false, + "message": "1 result claims rely on unadjusted or non-significant adjusted evidence.", + "evidence": { + "claimIds": [ + "claim-metab-2" + ] + } + }, + { + "code": "fdr_threshold_consistency", + "severity": "critical", + "pass": false, + "message": "1 tests exceed the adjusted threshold but are still claimed significant.", + "evidence": { + "threshold": 0.05, + "testIds": [ + "metabolite-b" + ] + } + }, + { + "code": "endpoint_hierarchy_covered", + "severity": "warning", + "pass": true, + "message": "Endpoint hierarchy is optional for exploratory families.", + "evidence": { + "intent": "exploratory", + "primaryEndpoints": [] + } + }, + { + "code": "exploratory_labeling", + "severity": "warning", + 
"pass": false, + "message": "1 exploratory claims use confirmatory language without a label.", + "evidence": { + "claimIds": [ + "claim-metab-2" + ] + } + } + ], + "findings": [ + { + "familyId": "metabolite-screen", + "code": "unadjusted_significance_claim", + "severity": "critical", + "message": "1 result claims rely on unadjusted or non-significant adjusted evidence.", + "remediation": "Rewrite significance language or support it with adjusted p/q values below the declared threshold.", + "evidence": { + "claimIds": [ + "claim-metab-2" + ] + } + }, + { + "familyId": "metabolite-screen", + "code": "fdr_threshold_consistency", + "severity": "critical", + "message": "1 tests exceed the adjusted threshold but are still claimed significant.", + "remediation": "Downgrade the claim or revise the correction threshold and reported adjusted values.", + "evidence": { + "threshold": 0.05, + "testIds": [ + "metabolite-b" + ] + } + }, + { + "familyId": "metabolite-screen", + "code": "exploratory_labeling", + "severity": "warning", + "message": "1 exploratory claims use confirmatory language without a label.", + "remediation": "Mark exploratory findings clearly and avoid confirmatory language for hypothesis-generating analyses.", + "evidence": { + "claimIds": [ + "claim-metab-2" + ] + } + } + ] + } + ], + "findings": [ + { + "familyId": "metabolite-screen", + "code": "unadjusted_significance_claim", + "severity": "critical", + "message": "1 result claims rely on unadjusted or non-significant adjusted evidence.", + "remediation": "Rewrite significance language or support it with adjusted p/q values below the declared threshold.", + "evidence": { + "claimIds": [ + "claim-metab-2" + ] + } + }, + { + "familyId": "metabolite-screen", + "code": "fdr_threshold_consistency", + "severity": "critical", + "message": "1 tests exceed the adjusted threshold but are still claimed significant.", + "remediation": "Downgrade the claim or revise the correction threshold and reported adjusted 
values.", + "evidence": { + "threshold": 0.05, + "testIds": [ + "metabolite-b" + ] + } + }, + { + "familyId": "metabolite-screen", + "code": "exploratory_labeling", + "severity": "warning", + "message": "1 exploratory claims use confirmatory language without a label.", + "remediation": "Mark exploratory findings clearly and avoid confirmatory language for hypothesis-generating analyses.", + "evidence": { + "claimIds": [ + "claim-metab-2" + ] + } + } + ], + "reviewQueue": [ + { + "manuscriptId": "metabolomics-exploration", + "familyId": "metabolite-screen", + "code": "unadjusted_significance_claim", + "severity": "critical", + "action": "rewrite_significance_claim", + "remediation": "Rewrite significance language or support it with adjusted p/q values below the declared threshold." + }, + { + "manuscriptId": "metabolomics-exploration", + "familyId": "metabolite-screen", + "code": "fdr_threshold_consistency", + "severity": "critical", + "action": "align_claim_with_adjusted_threshold", + "remediation": "Downgrade the claim or revise the correction threshold and reported adjusted values." + }, + { + "manuscriptId": "metabolomics-exploration", + "familyId": "metabolite-screen", + "code": "exploratory_labeling", + "severity": "warning", + "action": "label_exploratory_claims", + "remediation": "Mark exploratory findings clearly and avoid confirmatory language for hypothesis-generating analyses." + } + ] + } + ], + "reviewQueue": [ + { + "manuscriptId": "neuro-screening-draft", + "familyId": "neuro-panel", + "code": "correction_method_allowed", + "severity": "critical", + "action": "choose_multiplicity_method", + "remediation": "Choose a declared multiplicity method such as Benjamini-Hochberg, Bonferroni, Holm, or a pre-registered hierarchy." 
+ }, + { + "manuscriptId": "neuro-screening-draft", + "familyId": "neuro-panel", + "code": "adjusted_values_present", + "severity": "critical", + "action": "report_adjusted_values", + "remediation": "Report adjusted p-values or q-values for every significant claim in the family." + }, + { + "manuscriptId": "neuro-screening-draft", + "familyId": "neuro-panel", + "code": "unadjusted_significance_claim", + "severity": "critical", + "action": "rewrite_significance_claim", + "remediation": "Rewrite significance language or support it with adjusted p/q values below the declared threshold." + }, + { + "manuscriptId": "neuro-screening-draft", + "familyId": "neuro-panel", + "code": "endpoint_hierarchy_covered", + "severity": "warning", + "action": "declare_endpoint_hierarchy", + "remediation": "Declare primary and secondary endpoints before making confirmatory significance claims." + }, + { + "manuscriptId": "metabolomics-exploration", + "familyId": "metabolite-screen", + "code": "unadjusted_significance_claim", + "severity": "critical", + "action": "rewrite_significance_claim", + "remediation": "Rewrite significance language or support it with adjusted p/q values below the declared threshold." + }, + { + "manuscriptId": "metabolomics-exploration", + "familyId": "metabolite-screen", + "code": "fdr_threshold_consistency", + "severity": "critical", + "action": "align_claim_with_adjusted_threshold", + "remediation": "Downgrade the claim or revise the correction threshold and reported adjusted values." + }, + { + "manuscriptId": "metabolomics-exploration", + "familyId": "metabolite-screen", + "code": "exploratory_labeling", + "severity": "warning", + "action": "label_exploratory_claims", + "remediation": "Mark exploratory findings clearly and avoid confirmatory language for hypothesis-generating analyses." 
+ } + ], + "readyQueue": [ + { + "manuscriptId": "cardio-biomarker-study", + "action": "allow_peer_review_submission", + "checkedFamilies": 1 + } + ], + "audit": { + "algorithm": "sha256", + "digest": "b092238857bd2f7d0f4e18118723850c200466ee9f6ccc42ab2872d3c324cd19", + "policyVersion": "multiple-comparison-control-v1" + } +} diff --git a/multiple-comparison-control-assistant/reports/multiple-comparison-report.md b/multiple-comparison-control-assistant/reports/multiple-comparison-report.md new file mode 100644 index 00000000..f05f0c06 --- /dev/null +++ b/multiple-comparison-control-assistant/reports/multiple-comparison-report.md @@ -0,0 +1,39 @@ +# Multiple-Comparison Control Assistant Report + +Manuscripts reviewed: 3 +Ready manuscripts: 1 +Revise before submission: 2 +Analysis families: 3 +Tests reviewed: 15 +Critical findings: 5 +Warning findings: 2 +Audit digest: `b092238857bd2f7d0f4e18118723850c200466ee9f6ccc42ab2872d3c324cd19` + +## Manuscript Decisions + +### Cardiac Biomarker Response Study +- Status: peer_review_ready +- Families: 1 +- Tests: 3 +- Finding codes: none + +### Neuro Screening Draft +- Status: revise_before_submission +- Families: 1 +- Tests: 8 +- Finding codes: correction_method_allowed, adjusted_values_present, unadjusted_significance_claim, endpoint_hierarchy_covered + +### Metabolomics Exploration +- Status: revise_before_submission +- Families: 1 +- Tests: 4 +- Finding codes: unadjusted_significance_claim, fdr_threshold_consistency, exploratory_labeling + +## Review Queue +- neuro-screening-draft/neuro-panel: choose_multiplicity_method (critical) +- neuro-screening-draft/neuro-panel: report_adjusted_values (critical) +- neuro-screening-draft/neuro-panel: rewrite_significance_claim (critical) +- neuro-screening-draft/neuro-panel: declare_endpoint_hierarchy (warning) +- metabolomics-exploration/metabolite-screen: rewrite_significance_claim (critical) +- metabolomics-exploration/metabolite-screen: align_claim_with_adjusted_threshold 
(critical) +- metabolomics-exploration/metabolite-screen: label_exploratory_claims (warning) diff --git a/multiple-comparison-control-assistant/reports/summary.svg b/multiple-comparison-control-assistant/reports/summary.svg new file mode 100644 index 00000000..20d557e2 --- /dev/null +++ b/multiple-comparison-control-assistant/reports/summary.svg @@ -0,0 +1,16 @@ + + + Multiple-Comparison Control Assistant + AI peer-review aid for FDR, adjusted p-values, and endpoint hierarchy + + 1 + ready + + 15 + tests reviewed + + 5 + critical + Controls: family boundaries, correction methods, adjusted values, FDR threshold, hierarchy, exploratory labels. + Digest b092238857bd2f7d0f4e18118723... + diff --git a/multiple-comparison-control-assistant/requirements-map.md b/multiple-comparison-control-assistant/requirements-map.md new file mode 100644 index 00000000..71efab5d --- /dev/null +++ b/multiple-comparison-control-assistant/requirements-map.md @@ -0,0 +1,13 @@ +# Requirements Map + +| Issue #13 requirement | Implementation coverage | +| --- | --- | +| AI Peer Review Aid | Produces manuscript-level diagnostics, reviewer actions, and readiness status for statistical multiplicity risks. | +| Statistical error detection | Detects missing multiple-comparison correction, missing adjusted values, FDR mismatches, and overclaimed p-value results. | +| Customizable review templates | Separates confirmatory and exploratory family logic, endpoint hierarchy checks, and correction-method policy. | +| Immediate diagnostic report | Emits deterministic JSON and Markdown reports plus a review queue for each manuscript and result family. | +| Institutional quality dashboard | Exposes summary counts, manuscript statuses, family statuses, and audit digest for dashboard/API use. | + +## Non-Overlap Notes + +This PR focuses only on multiple-comparison control. 
It does not implement a broad research-tools suite, paper summarizer, citation recommender, citation context-fit checker, manuscript similarity detector, ethics/data availability checks, unit consistency, biomethods provenance, reporting-guideline compliance, or certainty/tone calibration. diff --git a/multiple-comparison-control-assistant/sample-data.js b/multiple-comparison-control-assistant/sample-data.js new file mode 100644 index 00000000..ef0e607c --- /dev/null +++ b/multiple-comparison-control-assistant/sample-data.js @@ -0,0 +1,169 @@ +const policy = { + pValueThreshold: 0.05, + fdrThreshold: 0.05, + familySizeWarningThreshold: 5, +} + +const manuscripts = [ + { + manuscriptId: "cardio-biomarker-study", + title: "Cardiac Biomarker Response Study", + analysisFamilies: [ + { + familyId: "primary-biomarkers", + label: "Primary biomarker outcomes", + intent: "confirmatory", + declaredFamilySize: 3, + correctionMethod: "benjamini-hochberg", + endpointHierarchy: { + primaryEndpoints: ["troponin-change"], + secondaryEndpoints: ["crp-change", "bnp-change"], + }, + tests: [ + { + testId: "troponin-change", + outcome: "Troponin change", + pValue: 0.003, + qValue: 0.009, + claimedSignificant: true, + }, + { + testId: "crp-change", + outcome: "CRP change", + pValue: 0.021, + qValue: 0.031, + claimedSignificant: true, + }, + { + testId: "bnp-change", + outcome: "BNP change", + pValue: 0.14, + qValue: 0.14, + claimedSignificant: false, + }, + ], + }, + ], + resultStatements: [ + { + familyId: "primary-biomarkers", + claimId: "claim-cardio-1", + testId: "troponin-change", + text: "Troponin change remained significant after Benjamini-Hochberg correction.", + usesSignificanceLanguage: true, + }, + { + familyId: "primary-biomarkers", + claimId: "claim-cardio-2", + testId: "bnp-change", + text: "BNP did not pass the adjusted threshold.", + usesSignificanceLanguage: false, + }, + ], + }, + { + manuscriptId: "neuro-screening-draft", + title: "Neuro Screening Draft", + 
analysisFamilies: [ + { + familyId: "neuro-panel", + label: "Neurocognitive screening panel", + intent: "confirmatory", + declaredFamilySize: 8, + correctionMethod: "none", + tests: [ + { testId: "memory", outcome: "Memory", pValue: 0.018, claimedSignificant: true }, + { testId: "attention", outcome: "Attention", pValue: 0.041, claimedSignificant: true }, + { testId: "motor", outcome: "Motor", pValue: 0.22, claimedSignificant: false }, + { testId: "language", outcome: "Language", pValue: 0.031, claimedSignificant: true }, + { testId: "spatial", outcome: "Spatial", pValue: 0.08, claimedSignificant: false }, + { testId: "sleep", outcome: "Sleep", pValue: 0.19, claimedSignificant: false }, + { testId: "mood", outcome: "Mood", pValue: 0.049, claimedSignificant: true }, + { testId: "fatigue", outcome: "Fatigue", pValue: 0.16, claimedSignificant: false }, + ], + }, + ], + resultStatements: [ + { + familyId: "neuro-panel", + claimId: "claim-neuro-1", + testId: "memory", + text: "Memory showed a significant effect at p < 0.05.", + usesSignificanceLanguage: true, + }, + { + familyId: "neuro-panel", + claimId: "claim-neuro-2", + testId: "attention", + text: "Attention was significant and confirms the primary hypothesis.", + usesSignificanceLanguage: true, + }, + ], + }, + { + manuscriptId: "metabolomics-exploration", + title: "Metabolomics Exploration", + analysisFamilies: [ + { + familyId: "metabolite-screen", + label: "Exploratory metabolite screen", + intent: "exploratory", + declaredFamilySize: 4, + correctionMethod: "benjamini-hochberg", + tests: [ + { + testId: "metabolite-a", + outcome: "Metabolite A", + pValue: 0.004, + qValue: 0.041, + claimedSignificant: true, + }, + { + testId: "metabolite-b", + outcome: "Metabolite B", + pValue: 0.011, + qValue: 0.071, + claimedSignificant: true, + }, + { + testId: "metabolite-c", + outcome: "Metabolite C", + pValue: 0.052, + qValue: 0.09, + claimedSignificant: false, + }, + { + testId: "metabolite-d", + outcome: "Metabolite D", 
+ pValue: 0.18, + qValue: 0.21, + claimedSignificant: false, + }, + ], + }, + ], + resultStatements: [ + { + familyId: "metabolite-screen", + claimId: "claim-metab-1", + testId: "metabolite-a", + text: "Metabolite A is an exploratory FDR-controlled hit.", + usesSignificanceLanguage: true, + exploratoryLabel: true, + }, + { + familyId: "metabolite-screen", + claimId: "claim-metab-2", + testId: "metabolite-b", + text: "Metabolite B showed a significant effect that confirms pathway activation.", + usesSignificanceLanguage: true, + exploratoryLabel: false, + }, + ], + }, +] + +module.exports = { + policy, + manuscripts, +} diff --git a/multiple-comparison-control-assistant/test.js b/multiple-comparison-control-assistant/test.js new file mode 100644 index 00000000..14b4950a --- /dev/null +++ b/multiple-comparison-control-assistant/test.js @@ -0,0 +1,82 @@ +const assert = require("node:assert/strict") +const { evaluateMultipleComparisonControls } = require("./index") +const { manuscripts, policy } = require("./sample-data") + +const packet = evaluateMultipleComparisonControls({ + asOf: "2026-05-23T03:00:00.000Z", + manuscripts, + policy, +}) + +assert.equal(packet.summary.totalManuscripts, 3) +assert.equal(packet.summary.readyManuscripts, 1) +assert.equal(packet.summary.statisticalReviewManuscripts, 0) +assert.equal(packet.summary.reviseManuscripts, 2) +assert.equal(packet.summary.totalFamilies, 3) +assert.equal(packet.summary.totalTests, 15) +assert.equal(packet.summary.criticalFindings, 5) +assert.equal(packet.summary.warningFindings, 2) +assert.equal(packet.readyQueue.length, 1) +assert.equal(packet.readyQueue[0].manuscriptId, "cardio-biomarker-study") + +const cardio = packet.manuscripts.find((manuscript) => manuscript.manuscriptId === "cardio-biomarker-study") +assert.equal(cardio.status, "peer_review_ready") +assert(cardio.families[0].checks.every((check) => check.pass)) + +const neuro = packet.manuscripts.find((manuscript) => manuscript.manuscriptId === 
"neuro-screening-draft") +assert.equal(neuro.status, "revise_before_submission") +assert(neuro.findings.some((finding) => finding.code === "correction_method_allowed")) +assert(neuro.findings.some((finding) => finding.code === "adjusted_values_present")) +assert(neuro.findings.some((finding) => finding.code === "unadjusted_significance_claim")) +assert(neuro.findings.some((finding) => finding.code === "endpoint_hierarchy_covered")) + +const adjustedFinding = neuro.findings.find((finding) => finding.code === "adjusted_values_present") +assert.deepEqual(adjustedFinding.evidence.missingTestIds, ["memory", "attention", "language", "mood"]) + +const metabolomics = packet.manuscripts.find((manuscript) => manuscript.manuscriptId === "metabolomics-exploration") +assert.equal(metabolomics.status, "revise_before_submission") +assert(metabolomics.findings.some((finding) => finding.code === "unadjusted_significance_claim")) +assert(metabolomics.findings.some((finding) => finding.code === "fdr_threshold_consistency")) +assert(metabolomics.findings.some((finding) => finding.code === "exploratory_labeling")) + +const cleanSingle = evaluateMultipleComparisonControls({ + asOf: "2026-05-23T03:00:00.000Z", + manuscripts: [{ + manuscriptId: "clean-single-endpoint", + title: "Clean Single Endpoint", + analysisFamilies: [{ + familyId: "primary", + label: "Primary endpoint", + intent: "confirmatory", + declaredFamilySize: 1, + correctionMethod: "pre-registered-hierarchy", + endpointHierarchy: { primaryEndpoints: ["survival"] }, + tests: [{ + testId: "survival", + outcome: "Survival", + pValue: 0.012, + hierarchyDecision: "primary_endpoint_passed", + claimedSignificant: true, + }], + }], + resultStatements: [{ + familyId: "primary", + claimId: "claim-clean", + testId: "survival", + text: "The pre-registered primary endpoint passed the hierarchy.", + usesSignificanceLanguage: true, + }], + }], + policy, +}) +assert.equal(cleanSingle.summary.readyManuscripts, 1) 
+assert.equal(cleanSingle.summary.totalFindings, 0) + +const repeated = evaluateMultipleComparisonControls({ + asOf: "2026-05-23T03:00:00.000Z", + manuscripts, + policy, +}) +assert.equal(repeated.audit.digest, packet.audit.digest) + +console.log("multiple comparison control assistant tests passed")