diff --git a/.gitignore b/.gitignore index 0c90d52..2c4152f 100644 --- a/.gitignore +++ b/.gitignore @@ -84,6 +84,10 @@ google-cloud-* # Temporary and generated files ___* *___ + +# Node.js V8 compile cache (created by tsx / NODE_COMPILE_CACHE) +node-compile-cache/ + # Sentry .sentryclirc *.sentry-build-info diff --git a/playwright.config.ts b/playwright.config.ts index dfa8900..d34417e 100644 --- a/playwright.config.ts +++ b/playwright.config.ts @@ -15,6 +15,15 @@ const BASE_URL = process.env.E2E_BASE_URL ?? `http://localhost:${PORT}`; export default defineConfig({ testDir: './tests/e2e', + /** + * Seed `.results/` with local-storage fixtures before the dev server boots + * (setup), then restore it afterwards (teardown). This lets the data-driven + * pages render real content without S3 or live LLM calls. Skipped when + * E2E_BASE_URL points at an already-running/remote app, whose storage we + * neither control nor want to mutate. + */ + globalSetup: process.env.E2E_BASE_URL ? undefined : require.resolve('./tests/e2e/global-setup'), + globalTeardown: process.env.E2E_BASE_URL ? undefined : require.resolve('./tests/e2e/global-teardown'), fullyParallel: true, forbidOnly: !!process.env.CI, retries: process.env.CI ? 2 : 0, diff --git a/tests/e2e/README.md b/tests/e2e/README.md index bd7c117..6138ce6 100644 --- a/tests/e2e/README.md +++ b/tests/e2e/README.md @@ -30,17 +30,35 @@ E2E_BASE_URL=https://your-preview.example.com pnpm test:e2e ## What's safe to test here -Smoke tests deliberately target **statically rendered, dependency-free routes** -(`/about`, `/what-is-an-eval`, …) so they pass in CI without any secrets. +Two kinds of routes are covered: -Routes that read from storage (S3) or call external LLM APIs — the homepage, -`/analysis/*`, `/latest`, etc. — will be slow or error without env/network. To -cover those, either: +1. **Statically rendered, dependency-free routes** (`/about`, + `/what-is-an-eval`, …) — see `smoke.spec.ts`. These need no data at all. 
-- provide the relevant env vars (see `.env.template`), or -- intercept network calls with `page.route(...)` and serve fixtures. +2. **Data-driven routes** (the homepage `/`, `/latest`, `/analysis/*`) — see + `homepage.spec.ts`, `latest.spec.ts`, `analysis.spec.ts`. These read from + storage but do **not** call LLMs at render time, so they work against + seeded local fixtures without any secrets or network. -Keep flaky, data-dependent assertions out of the default suite. +### How the data-driven fixtures work + +In dev/test mode the app's `storageService` uses the `local` provider and reads +results from the `.results/` directory on disk (S3 is only used in production). +`playwright.config.ts` registers a `globalSetup` that seeds `.results/` from +`tests/e2e/fixtures/results/` **before** the dev server boots, and a +`globalTeardown` that restores it afterwards. Seeding is non-destructive: if you +already have a real local `.results/`, overwritten files are backed up and +restored and only the added files are removed. + +The fixtures describe one deterministic run (`test-eval` / `test-run`); the +identifiers live in `tests/e2e/fixtures/constants.ts`. To cover another page or +data shape, add JSON under `tests/e2e/fixtures/results/` mirroring the on-disk +layout the storage service expects (e.g. +`live/aggregates/…`, `live/blueprints/<configId>/<runLabel>_<timestamp>/core.json`). + +Routes that genuinely call external LLM APIs at request time (sandbox runs, +story generation, etc.) still need real env vars or `page.route(...)` mocks — +keep those out of the default suite. ## Conventions diff --git a/tests/e2e/analysis.spec.ts b/tests/e2e/analysis.spec.ts new file mode 100644 index 0000000..0c407ef --- /dev/null +++ b/tests/e2e/analysis.spec.ts @@ -0,0 +1,39 @@ +import { test, expect } from '@playwright/test'; +import { FIXTURE, ANALYSIS_PATH } from './fixtures/constants'; + +/** + * The analysis page is server-rendered from the run's `core.json` artefact + * (via getCoreResult). 
The seeded fixture lets the full analysis view render + * end-to-end without S3 or live model calls. + */ +test.describe('analysis run', () => { + test('renders the seeded run title, models and coverage', async ({ page }) => { + await page.goto(ANALYSIS_PATH); + + // Page title is derived from the run's configTitle. + await expect(page).toHaveTitle(new RegExp(FIXTURE.configTitle, 'i')); + + // Config title is shown in the page header. + await expect(page.getByText(FIXTURE.configTitle).first()).toBeVisible(); + + // Both fixture models are rendered (by their parsed display names) in the + // aggregate coverage view. + await expect(page.getByText(/GPT 4o Mini/i).first()).toBeVisible(); + await expect(page.getByText(/Claude 3 Haiku/i).first()).toBeVisible(); + + // The prompt selector is populated from the run's prompts. + await expect( + page.locator('option[value="prompt-math"]'), + ).toHaveCount(1); + }); + + test('shows a not-found state for a run that does not exist', async ({ page }) => { + await page.goto( + `/analysis/${FIXTURE.configId}/${FIXTURE.runLabel}/2099-01-01T00-00-00-000Z`, + ); + + // notFound() renders Next's not-found UI; the fixture content must be absent. + await expect(page.getByText(/this page could not be found/i)).toBeVisible(); + await expect(page.getByText(/Claude 3 Haiku/i)).toHaveCount(0); + }); +}); diff --git a/tests/e2e/fixtures/constants.ts b/tests/e2e/fixtures/constants.ts new file mode 100644 index 0000000..3b08191 --- /dev/null +++ b/tests/e2e/fixtures/constants.ts @@ -0,0 +1,12 @@ +/** + * Identifiers for the seeded fixture run. Kept in one place so specs and the + * fixture JSON stay in sync. 
+ */ +export const FIXTURE = { + configId: 'test-eval', + configTitle: 'Test Evaluation Blueprint', + runLabel: 'test-run', + timestamp: '2025-06-01T12-00-00-000Z', +} as const; + +export const ANALYSIS_PATH = `/analysis/${FIXTURE.configId}/${FIXTURE.runLabel}/${FIXTURE.timestamp}`; diff --git a/tests/e2e/fixtures/results/live/aggregates/homepage_summary.json b/tests/e2e/fixtures/results/live/aggregates/homepage_summary.json new file mode 100644 index 0000000..1fc7928 --- /dev/null +++ b/tests/e2e/fixtures/results/live/aggregates/homepage_summary.json @@ -0,0 +1,36 @@ +{ + "configs": [ + { + "configId": "test-eval", + "configTitle": "Test Evaluation Blueprint", + "id": "test-eval", + "title": "Test Evaluation Blueprint", + "description": "A fixture blueprint used by the end-to-end test suite.", + "tags": ["test-fixture", "_featured"], + "runs": [ + { + "runLabel": "test-run", + "timestamp": "2025-06-01T12-00-00-000Z", + "fileName": "test-run_2025-06-01T12-00-00-000Z_comparison.json", + "numPrompts": 2, + "numModels": 2, + "totalModelsAttempted": 2, + "hybridScoreStats": { "average": 0.9125, "stddev": 0.0125 }, + "perModelHybridScores": { + "openai:gpt-4o-mini": { "average": 0.925, "stddev": 0.025 }, + "anthropic:claude-3-haiku-20240307": { "average": 0.9, "stddev": 0.0 } + }, + "tags": ["test-fixture", "_featured"], + "models": ["openai:gpt-4o-mini", "anthropic:claude-3-haiku-20240307"], + "promptIds": ["prompt-greeting", "prompt-math"] + } + ], + "latestRunTimestamp": "2025-06-01T12-00-00-000Z", + "overallAverageHybridScore": 0.9125, + "hybridScoreStdDev": 0.0125 + } + ], + "headlineStats": null, + "driftDetectionResult": null, + "lastUpdated": "2025-06-01T12:00:00.000Z" +} diff --git a/tests/e2e/fixtures/results/live/aggregates/latest_runs_summary.json b/tests/e2e/fixtures/results/live/aggregates/latest_runs_summary.json new file mode 100644 index 0000000..6ac74c6 --- /dev/null +++ b/tests/e2e/fixtures/results/live/aggregates/latest_runs_summary.json @@ -0,0 
+1,23 @@ +{ + "runs": [ + { + "configId": "test-eval", + "configTitle": "Test Evaluation Blueprint", + "runLabel": "test-run", + "timestamp": "2025-06-01T12-00-00-000Z", + "fileName": "test-run_2025-06-01T12-00-00-000Z_comparison.json", + "numPrompts": 2, + "numModels": 2, + "totalModelsAttempted": 2, + "hybridScoreStats": { "average": 0.9125, "stddev": 0.0125 }, + "perModelHybridScores": { + "openai:gpt-4o-mini": { "average": 0.925, "stddev": 0.025 }, + "anthropic:claude-3-haiku-20240307": { "average": 0.9, "stddev": 0.0 } + }, + "tags": ["test-fixture", "_featured"], + "models": ["openai:gpt-4o-mini", "anthropic:claude-3-haiku-20240307"], + "promptIds": ["prompt-greeting", "prompt-math"] + } + ], + "lastUpdated": "2025-06-01T12:00:00.000Z" +} diff --git a/tests/e2e/fixtures/results/live/blueprints/test-eval/test-run_2025-06-01T12-00-00-000Z/core.json b/tests/e2e/fixtures/results/live/blueprints/test-eval/test-run_2025-06-01T12-00-00-000Z/core.json new file mode 100644 index 0000000..848d259 --- /dev/null +++ b/tests/e2e/fixtures/results/live/blueprints/test-eval/test-run_2025-06-01T12-00-00-000Z/core.json @@ -0,0 +1,89 @@ +{ + "configId": "test-eval", + "configTitle": "Test Evaluation Blueprint", + "runLabel": "test-run", + "timestamp": "2025-06-01T12-00-00-000Z", + "description": "A fixture blueprint used by the end-to-end test suite.", + "config": { + "id": "test-eval", + "title": "Test Evaluation Blueprint", + "description": "A fixture blueprint used by the end-to-end test suite.", + "tags": ["test-fixture", "_featured"], + "models": ["openai:gpt-4o-mini", "anthropic:claude-3-haiku-20240307"], + "prompts": [ + { + "id": "prompt-greeting", + "promptText": "Say hello to the user politely.", + "points": ["Contains a greeting", "Uses a polite tone"] + }, + { + "id": "prompt-math", + "promptText": "What is 2 + 2?", + "points": ["States that the answer is 4"] + } + ] + }, + "evalMethodsUsed": ["llm-coverage"], + "effectiveModels": ["openai:gpt-4o-mini", 
"anthropic:claude-3-haiku-20240307"], + "promptIds": ["prompt-greeting", "prompt-math"], + "promptContexts": { + "prompt-greeting": "Say hello to the user politely.", + "prompt-math": "What is 2 + 2?" + }, + "allFinalAssistantResponses": { + "prompt-greeting": { + "openai:gpt-4o-mini": "Hello! It's a pleasure to meet you. How can I help you today?", + "anthropic:claude-3-haiku-20240307": "Hi there! Lovely to meet you. What can I do for you?" + }, + "prompt-math": { + "openai:gpt-4o-mini": "2 + 2 equals 4.", + "anthropic:claude-3-haiku-20240307": "The answer is 4." + } + }, + "evaluationResults": { + "llmCoverageScores": { + "prompt-greeting": { + "openai:gpt-4o-mini": { + "keyPointsCount": 2, + "avgCoverageExtent": 0.95, + "pointAssessments": [ + { "keyPointText": "Contains a greeting", "coverageExtent": 1.0, "reflection": "The response opens with a clear greeting." }, + { "keyPointText": "Uses a polite tone", "coverageExtent": 0.9, "reflection": "The tone is warm and courteous." } + ] + }, + "anthropic:claude-3-haiku-20240307": { + "keyPointsCount": 2, + "avgCoverageExtent": 0.9, + "pointAssessments": [ + { "keyPointText": "Contains a greeting", "coverageExtent": 1.0, "reflection": "Begins with a friendly greeting." }, + { "keyPointText": "Uses a polite tone", "coverageExtent": 0.8, "reflection": "Polite, though slightly informal." } + ] + } + }, + "prompt-math": { + "openai:gpt-4o-mini": { + "keyPointsCount": 1, + "avgCoverageExtent": 0.9, + "pointAssessments": [ + { "keyPointText": "States that the answer is 4", "coverageExtent": 0.9, "reflection": "Correctly states the answer is 4." } + ] + }, + "anthropic:claude-3-haiku-20240307": { + "keyPointsCount": 1, + "avgCoverageExtent": 0.9, + "pointAssessments": [ + { "keyPointText": "States that the answer is 4", "coverageExtent": 0.9, "reflection": "Correctly states the answer is 4." 
import fs from 'node:fs';
import path from 'node:path';

/**
 * Seeds the local-filesystem storage backend (`.results/`) with deterministic
 * fixtures so the data-driven pages (homepage, /latest, /analysis/*) render
 * real content in e2e runs without S3 credentials or live LLM calls.
 *
 * In dev/test mode the app's storageService reads from `.results/` on disk
 * (STORAGE_PROVIDER defaults to `local` when NODE_ENV is development/test), so
 * dropping fixtures there is all that's needed.
 *
 * Seeding is non-destructive. The common case (CI, no local `.results/`) is
 * handled by simply removing the whole directory on teardown. When a developer
 * already has a real `.results/`, we back up any file we overwrite and restore
 * it, and remove only the files/dirs we added.
 *
 * Safety properties:
 * - A manifest left behind by an interrupted run is unwound before seeding
 *   again, so real files are never replaced by backups of fixture copies.
 * - A failure halfway through seeding is rolled back immediately.
 * - A malformed manifest never triggers deletion.
 */

/** Name of the file (inside the results directory) recording what a seed run changed. */
const MANIFEST_NAME = '.e2e-fixture-manifest.json';

/** Suffix given to the backup copy of a pre-existing file that a fixture overwrites. */
const BACKUP_SUFFIX = '.e2e-bak';

/** Optional overrides for the directories involved; defaults match the e2e setup. */
export interface SeedOptions {
  /** Tree copied into the results directory. Defaults to `results/` next to this module. */
  fixturesRoot?: string;
  /** Directory to seed. Defaults to `.results/` under the current working directory. */
  resultsDir?: string;
}

interface SeedManifest {
  // Whether the results directory already existed before seeding.
  resultsPreexisted: boolean;
  // Files that already existed and were overwritten: dest path -> backup path.
  backedUp: Record<string, string>;
  // Files we created (did not exist before).
  createdFiles: string[];
  // Directories we created, in creation order (parents before children).
  createdDirs: string[];
}

interface ResolvedPaths {
  fixturesRoot: string;
  resultsDir: string;
  manifestPath: string;
}

/** Applies defaults at call time, so explicit overrides never touch `__dirname` or the cwd. */
function resolvePaths(options: SeedOptions): ResolvedPaths {
  const fixturesRoot = options.fixturesRoot ?? path.join(__dirname, 'results');
  const resultsDir = options.resultsDir ?? path.resolve(process.cwd(), '.results');
  return { fixturesRoot, resultsDir, manifestPath: path.join(resultsDir, MANIFEST_NAME) };
}

function isStringArray(value: unknown): value is string[] {
  return Array.isArray(value) && value.every((item) => typeof item === 'string');
}

/** Runtime shape check for a parsed manifest; guards every destructive step in cleanup. */
function isSeedManifest(value: unknown): value is SeedManifest {
  if (typeof value !== 'object' || value === null) return false;
  const candidate = value as Record<string, unknown>;
  const { backedUp } = candidate;
  return (
    typeof candidate.resultsPreexisted === 'boolean' &&
    isStringArray(candidate.createdFiles) &&
    isStringArray(candidate.createdDirs) &&
    typeof backedUp === 'object' &&
    backedUp !== null &&
    !Array.isArray(backedUp) &&
    Object.values(backedUp).every((entry) => typeof entry === 'string')
  );
}

/** Reads and validates the manifest; returns `undefined` when unreadable or malformed. */
function readManifest(manifestPath: string): SeedManifest | undefined {
  try {
    const parsed: unknown = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
    return isSeedManifest(parsed) ? parsed : undefined;
  } catch {
    return undefined;
  }
}

/** True when `candidate` is strictly inside `root` (not `root` itself, not an escape via `..`). */
function isInside(root: string, candidate: string): boolean {
  const relative = path.relative(root, candidate);
  return relative !== '' && !path.isAbsolute(relative) && relative.split(path.sep)[0] !== '..';
}

/** Runs a cleanup step, ignoring failures so one bad entry cannot block the rest. */
function bestEffort(action: () => void): void {
  try {
    action();
  } catch {
    /* best effort */
  }
}

/** Recursively lists every non-directory entry under `dir` as a full path. */
function listFixtureFiles(dir: string): string[] {
  const out: string[] = [];
  for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
    const full = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      out.push(...listFixtureFiles(full));
    } else {
      out.push(full);
    }
  }
  return out;
}

/** Creates `target` and any missing ancestors one level at a time, recording each in the manifest. */
function createMissingDirs(target: string, manifest: SeedManifest): void {
  const missing: string[] = [];
  let cursor = target;
  while (!fs.existsSync(cursor)) {
    missing.unshift(cursor);
    const parent = path.dirname(cursor);
    if (parent === cursor) break; // reached a non-existent filesystem root
    cursor = parent;
  }
  for (const dir of missing) {
    fs.mkdirSync(dir);
    manifest.createdDirs.push(dir);
  }
}

/**
 * Copies every fixture file into the results directory and writes a manifest
 * describing what was backed up or created, so `cleanupFixtures` can undo it.
 *
 * @param options Optional directory overrides; omit to use the e2e defaults.
 * @throws Rethrows any filesystem error after rolling back the partial seed.
 */
export function seedFixtures(options: SeedOptions = {}): void {
  const { fixturesRoot, resultsDir, manifestPath } = resolvePaths(options);

  // A leftover manifest means an earlier run never tore down. Unwind it first:
  // otherwise the `.e2e-bak` files would be overwritten with fixture copies and
  // a directory we created ourselves would be recorded as pre-existing.
  if (fs.existsSync(manifestPath)) {
    cleanupFixtures(options);
  }

  const manifest: SeedManifest = {
    resultsPreexisted: fs.existsSync(resultsDir),
    backedUp: {},
    createdFiles: [],
    createdDirs: [],
  };

  let completed = false;
  try {
    for (const src of listFixtureFiles(fixturesRoot)) {
      const dest = path.join(resultsDir, path.relative(fixturesRoot, src));
      createMissingDirs(path.dirname(dest), manifest);

      if (fs.existsSync(dest)) {
        const backup = `${dest}${BACKUP_SUFFIX}`;
        fs.copyFileSync(dest, backup);
        manifest.backedUp[dest] = backup;
      } else {
        // Recorded before the copy so a failed copy is still cleaned up.
        manifest.createdFiles.push(dest);
      }
      fs.copyFileSync(src, dest);
    }
    completed = true;
  } finally {
    // Persist whatever was recorded, even on failure, so it can be undone.
    fs.mkdirSync(resultsDir, { recursive: true });
    fs.writeFileSync(manifestPath, JSON.stringify(manifest, null, 2), 'utf-8');
    if (!completed) {
      cleanupFixtures(options);
    }
  }
}

/**
 * Undoes the changes recorded by `seedFixtures`. Does nothing when no manifest
 * is present, and refuses to delete anything when the manifest is malformed.
 *
 * @param options Optional directory overrides; must match those used to seed.
 */
export function cleanupFixtures(options: SeedOptions = {}): void {
  const { resultsDir, manifestPath } = resolvePaths(options);
  if (!fs.existsSync(manifestPath)) return;

  const manifest = readManifest(manifestPath);
  if (manifest === undefined) {
    // Never guess: treating a corrupt manifest as "we created the directory"
    // would recursively delete a developer's real results.
    console.warn(`[e2e fixtures] Ignoring unreadable manifest at ${manifestPath}; nothing was removed.`);
    return;
  }

  // Simple path: we created the results directory from scratch, so remove it
  // wholesale (also cleans up anything the app wrote there during the run).
  if (!manifest.resultsPreexisted) {
    fs.rmSync(resultsDir, { recursive: true, force: true });
    return;
  }

  // Otherwise restore the pre-existing directory precisely. Paths outside the
  // results directory are skipped so a tampered manifest cannot reach elsewhere.
  const owned = (target: string): boolean => isInside(resultsDir, target);

  for (const [dest, backup] of Object.entries(manifest.backedUp)) {
    if (!owned(dest) || !owned(backup)) continue;
    bestEffort(() => {
      if (fs.existsSync(backup)) {
        fs.copyFileSync(backup, dest);
        fs.rmSync(backup);
      }
    });
  }

  for (const file of manifest.createdFiles) {
    if (!owned(file)) continue;
    bestEffort(() => {
      if (fs.existsSync(file)) fs.rmSync(file);
    });
  }

  // Parents are always recorded before their children, so the reverse order
  // removes the deepest directories first. Non-empty directories are kept.
  for (const dir of [...manifest.createdDirs].reverse()) {
    if (!owned(dir)) continue;
    bestEffort(() => {
      if (fs.existsSync(dir) && fs.readdirSync(dir).length === 0) {
        fs.rmdirSync(dir);
      }
    });
  }

  bestEffort(() => fs.rmSync(manifestPath));
}
+ * With the seeded fixture in `.results/`, it renders the featured fixture + * blueprint instead of erroring on empty storage. + */ +test.describe('homepage', () => { + test('renders the title and the seeded featured blueprint', async ({ page }) => { + await page.goto('/'); + + await expect(page).toHaveTitle(/Weval/i); + + // The seeded config should surface somewhere on the page. + await expect(page.getByText(FIXTURE.configTitle).first()).toBeVisible(); + }); + + test('links to the Collective Intelligence Project', async ({ page }) => { + await page.goto('/'); + + const cipLink = page.locator('a[href*="cip.org"]').first(); + await expect(cipLink).toBeVisible(); + }); +}); diff --git a/tests/e2e/latest.spec.ts b/tests/e2e/latest.spec.ts new file mode 100644 index 0000000..4b85509 --- /dev/null +++ b/tests/e2e/latest.spec.ts @@ -0,0 +1,29 @@ +import { test, expect } from '@playwright/test'; +import { FIXTURE } from './fixtures/constants'; + +/** + * /latest is a client component that fetches `/api/runs/latest`, which reads + * `latest_runs_summary.json` from storage. The seeded fixture makes the list + * render a real run. + */ +test.describe('latest runs', () => { + test('renders the seeded run from the latest-runs API', async ({ page }) => { + await page.goto('/latest'); + + // Fixture run's config title should appear once the fetch resolves. + await expect(page.getByText(FIXTURE.configTitle).first()).toBeVisible(); + + // A link into the analysis view for the fixture run should be present. 
+ await expect( + page.locator(`a[href*="/analysis/${FIXTURE.configId}"]`).first(), + ).toBeVisible(); + }); + + test('serves the latest-runs API payload', async ({ request }) => { + const res = await request.get('/api/runs/latest'); + expect(res.ok()).toBeTruthy(); + const body = await res.json(); + expect(Array.isArray(body.runs)).toBe(true); + expect(body.runs.some((r: any) => r.configId === FIXTURE.configId)).toBe(true); + }); +}); diff --git a/tests/e2e/smoke.spec.ts b/tests/e2e/smoke.spec.ts index 10da7e0..ec33c10 100644 --- a/tests/e2e/smoke.spec.ts +++ b/tests/e2e/smoke.spec.ts @@ -3,8 +3,8 @@ import { test, expect } from '@playwright/test'; /** * Smoke tests target dependency-free, statically rendered routes so they pass * in CI without storage/API secrets. Data-driven routes (the homepage, - * /analysis, etc.) hit storage and external LLM APIs — to test those, provide - * env vars or mock the network first. See tests/e2e/README.md. + * /latest, /analysis/*) are covered separately against seeded local fixtures — + * see homepage.spec.ts, latest.spec.ts, analysis.spec.ts and tests/e2e/README.md. */ test.describe('smoke', () => { test('about page renders its title and key content', async ({ page }) => {