weval-org · nojibe · Jul 2, 2026 · Jul 1, 2026 · Jul 1, 2026
diff --git a/.gitignore b/.gitignore
@@ -84,6 +84,10 @@ google-cloud-*
 # Temporary and generated files
 ___*
 *___
+
+# Node.js V8 compile cache (created by tsx / NODE_COMPILE_CACHE)
+node-compile-cache/
+
 # Sentry
 .sentryclirc
 *.sentry-build-info

diff --git a/playwright.config.ts b/playwright.config.ts
@@ -15,6 +15,15 @@ const BASE_URL = process.env.E2E_BASE_URL ?? `http://localhost:${PORT}`;
 
 export default defineConfig({
   testDir: './tests/e2e',
+  /**
+   * Seed `.results/` with local-storage fixtures before the dev server boots
+   * (setup), then restore it afterwards (teardown). This lets the data-driven
+   * pages render real content without S3 or live LLM calls. Skipped when
+   * E2E_BASE_URL points at an already-running/remote app, whose storage we
+   * neither control nor want to mutate.
+   */
+  globalSetup: process.env.E2E_BASE_URL ? undefined : require.resolve('./tests/e2e/global-setup'),
+  globalTeardown: process.env.E2E_BASE_URL ? undefined : require.resolve('./tests/e2e/global-teardown'),
   fullyParallel: true,
   forbidOnly: !!process.env.CI,
   retries: process.env.CI ? 2 : 0,

diff --git a/tests/e2e/README.md b/tests/e2e/README.md
@@ -30,17 +30,35 @@ E2E_BASE_URL=https://your-preview.example.com pnpm test:e2e
 
 ## What's safe to test here
 
-Smoke tests deliberately target **statically rendered, dependency-free routes**
-(`/about`, `/what-is-an-eval`, …) so they pass in CI without any secrets.
+Two kinds of routes are covered:
 
-Routes that read from storage (S3) or call external LLM APIs — the homepage,
-`/analysis/*`, `/latest`, etc. — will be slow or error without env/network. To
-cover those, either:
+1. **Statically rendered, dependency-free routes** (`/about`,
+   `/what-is-an-eval`, …) — see `smoke.spec.ts`. These need no data at all.
 
-- provide the relevant env vars (see `.env.template`), or
-- intercept network calls with `page.route(...)` and serve fixtures.
+2. **Data-driven routes** (the homepage `/`, `/latest`, `/analysis/*`) — see
+   `homepage.spec.ts`, `latest.spec.ts`, `analysis.spec.ts`. These read from
+   storage but do **not** call LLMs at render time, so they work against
+   seeded local fixtures without any secrets or network.
 
-Keep flaky, data-dependent assertions out of the default suite.
+### How the data-driven fixtures work
+
+In dev/test mode the app's `storageService` uses the `local` provider and reads
+results from the `.results/` directory on disk (S3 is only used in production).
+`playwright.config.ts` registers a `globalSetup` that seeds `.results/` from
+`tests/e2e/fixtures/results/` **before** the dev server boots, and a
+`globalTeardown` that restores it afterwards. Seeding is non-destructive: if you
+already have a real local `.results/`, files the fixtures overwrite are backed up
+and restored on teardown, and only the files the fixtures added are removed.
+
+The fixtures describe one deterministic run (`test-eval` / `test-run`); the
+identifiers live in `tests/e2e/fixtures/constants.ts`. To cover another page or
+data shape, add JSON under `tests/e2e/fixtures/results/` mirroring the on-disk
+layout the storage service expects (e.g.
+`live/aggregates/…`, `live/blueprints/<configId>/<runLabel>_<timestamp>/core.json`).
+
+Routes that genuinely call external LLM APIs at request time (sandbox runs,
+story generation, etc.) still need real env vars or `page.route(...)` mocks —
+keep those out of the default suite.
 
 ## Conventions
 

diff --git a/tests/e2e/analysis.spec.ts b/tests/e2e/analysis.spec.ts
@@ -0,0 +1,39 @@
+import { test, expect } from '@playwright/test';
+import { FIXTURE, ANALYSIS_PATH } from './fixtures/constants';
+
+/**
+ * The analysis page is server-rendered from the run's `core.json` artefact
+ * (via getCoreResult). The seeded fixture lets the full analysis view render
+ * end-to-end without S3 or live model calls.
+ */
+test.describe('analysis run', () => {
+  test('renders the seeded run title, models and coverage', async ({ page }) => {
+    await page.goto(ANALYSIS_PATH);
+
+    // Title derives from configTitle; escape it so regex metacharacters match literally.
+    const titlePattern = FIXTURE.configTitle.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+    await expect(page).toHaveTitle(new RegExp(titlePattern, 'i'));
+
+    // Config title is shown in the page header.
+    await expect(page.getByText(FIXTURE.configTitle).first()).toBeVisible();
+
+    // Both fixture models are rendered (by their parsed display names) in the
+    // aggregate coverage view.
+    await expect(page.getByText(/GPT 4o Mini/i).first()).toBeVisible();
+    await expect(page.getByText(/Claude 3 Haiku/i).first()).toBeVisible();
+
+    // The prompt selector is populated from the run's prompts.
+    await expect(page.locator('option[value="prompt-math"]')).toHaveCount(1);
+  });
+
+  test('shows a not-found state for a run that does not exist', async ({ page }) => {
+    // Same config and run label as the fixture, but a timestamp that was never seeded.
+    await page.goto(
+      `/analysis/${FIXTURE.configId}/${FIXTURE.runLabel}/2099-01-01T00-00-00-000Z`,
+    );
+
+    // notFound() renders Next's not-found UI; the fixture content must be absent.
+    await expect(page.getByText(/this page could not be found/i)).toBeVisible();
+    await expect(page.getByText(/Claude 3 Haiku/i)).toHaveCount(0);
+  });
+});
diff --git a/tests/e2e/fixtures/constants.ts b/tests/e2e/fixtures/constants.ts
@@ -0,0 +1,12 @@
+/**
+ * Identifiers for the seeded fixture run, shared by the specs. The same values
+ * are hard-coded in the JSON under `fixtures/results/`; change both together.
+ */
+export const FIXTURE = {
+  configId: 'test-eval', // also the directory name under live/blueprints/
+  configTitle: 'Test Evaluation Blueprint', // `configTitle` in the fixture JSON
+  runLabel: 'test-run', // first part of the `<runLabel>_<timestamp>` run directory
+  timestamp: '2025-06-01T12-00-00-000Z', // second part of the run directory name
+} as const;
+
+export const ANALYSIS_PATH = `/analysis/${FIXTURE.configId}/${FIXTURE.runLabel}/${FIXTURE.timestamp}`; // route of the seeded run
diff --git a/tests/e2e/fixtures/results/live/aggregates/homepage_summary.json b/tests/e2e/fixtures/results/live/aggregates/homepage_summary.json
@@ -0,0 +1,36 @@
+{
+  "configs": [
+    {
+      "configId": "test-eval",
+      "configTitle": "Test Evaluation Blueprint",
+      "id": "test-eval",
+      "title": "Test Evaluation Blueprint",
+      "description": "A fixture blueprint used by the end-to-end test suite.",
+      "tags": ["test-fixture", "_featured"],
+      "runs": [
+        {
+          "runLabel": "test-run",
+          "timestamp": "2025-06-01T12-00-00-000Z",
+          "fileName": "test-run_2025-06-01T12-00-00-000Z_comparison.json",
+          "numPrompts": 2,
+          "numModels": 2,
+          "totalModelsAttempted": 2,
+          "hybridScoreStats": { "average": 0.9125, "stddev": 0.0125 },
+          "perModelHybridScores": {
+            "openai:gpt-4o-mini": { "average": 0.925, "stddev": 0.025 },
+            "anthropic:claude-3-haiku-20240307": { "average": 0.9, "stddev": 0.0 }
+          },
+          "tags": ["test-fixture", "_featured"],
+          "models": ["openai:gpt-4o-mini", "anthropic:claude-3-haiku-20240307"],
+          "promptIds": ["prompt-greeting", "prompt-math"]
+        }
+      ],
+      "latestRunTimestamp": "2025-06-01T12-00-00-000Z",
+      "overallAverageHybridScore": 0.9125,
+      "hybridScoreStdDev": 0.0125
+    }
+  ],
+  "headlineStats": null,
+  "driftDetectionResult": null,
+  "lastUpdated": "2025-06-01T12:00:00.000Z"
+}
diff --git a/tests/e2e/fixtures/results/live/aggregates/latest_runs_summary.json b/tests/e2e/fixtures/results/live/aggregates/latest_runs_summary.json
@@ -0,0 +1,23 @@
+{
+  "runs": [
+    {
+      "configId": "test-eval",
+      "configTitle": "Test Evaluation Blueprint",
+      "runLabel": "test-run",
+      "timestamp": "2025-06-01T12-00-00-000Z",
+      "fileName": "test-run_2025-06-01T12-00-00-000Z_comparison.json",
+      "numPrompts": 2,
+      "numModels": 2,
+      "totalModelsAttempted": 2,
+      "hybridScoreStats": { "average": 0.9125, "stddev": 0.0125 },
+      "perModelHybridScores": {
+        "openai:gpt-4o-mini": { "average": 0.925, "stddev": 0.025 },
+        "anthropic:claude-3-haiku-20240307": { "average": 0.9, "stddev": 0.0 }
+      },
+      "tags": ["test-fixture", "_featured"],
+      "models": ["openai:gpt-4o-mini", "anthropic:claude-3-haiku-20240307"],
+      "promptIds": ["prompt-greeting", "prompt-math"]
+    }
+  ],
+  "lastUpdated": "2025-06-01T12:00:00.000Z"
+}
diff --git a/...2e/fixtures/results/live/blueprints/test-eval/test-run_2025-06-01T12-00-00-000Z/core.json b/...2e/fixtures/results/live/blueprints/test-eval/test-run_2025-06-01T12-00-00-000Z/core.json
@@ -0,0 +1,89 @@
+{
+  "configId": "test-eval",
+  "configTitle": "Test Evaluation Blueprint",
+  "runLabel": "test-run",
+  "timestamp": "2025-06-01T12-00-00-000Z",
+  "description": "A fixture blueprint used by the end-to-end test suite.",
+  "config": {
+    "id": "test-eval",
+    "title": "Test Evaluation Blueprint",
+    "description": "A fixture blueprint used by the end-to-end test suite.",
+    "tags": ["test-fixture", "_featured"],
+    "models": ["openai:gpt-4o-mini", "anthropic:claude-3-haiku-20240307"],
+    "prompts": [
+      {
+        "id": "prompt-greeting",
+        "promptText": "Say hello to the user politely.",
+        "points": ["Contains a greeting", "Uses a polite tone"]
+      },
+      {
+        "id": "prompt-math",
+        "promptText": "What is 2 + 2?",
+        "points": ["States that the answer is 4"]
+      }
+    ]
+  },
+  "evalMethodsUsed": ["llm-coverage"],
+  "effectiveModels": ["openai:gpt-4o-mini", "anthropic:claude-3-haiku-20240307"],
+  "promptIds": ["prompt-greeting", "prompt-math"],
+  "promptContexts": {
+    "prompt-greeting": "Say hello to the user politely.",
+    "prompt-math": "What is 2 + 2?"
+  },
+  "allFinalAssistantResponses": {
+    "prompt-greeting": {
+      "openai:gpt-4o-mini": "Hello! It's a pleasure to meet you. How can I help you today?",
+      "anthropic:claude-3-haiku-20240307": "Hi there! Lovely to meet you. What can I do for you?"
+    },
+    "prompt-math": {
+      "openai:gpt-4o-mini": "2 + 2 equals 4.",
+      "anthropic:claude-3-haiku-20240307": "The answer is 4."
+    }
+  },
+  "evaluationResults": {
+    "llmCoverageScores": {
+      "prompt-greeting": {
+        "openai:gpt-4o-mini": {
+          "keyPointsCount": 2,
+          "avgCoverageExtent": 0.95,
+          "pointAssessments": [
+            { "keyPointText": "Contains a greeting", "coverageExtent": 1.0, "reflection": "The response opens with a clear greeting." },
+            { "keyPointText": "Uses a polite tone", "coverageExtent": 0.9, "reflection": "The tone is warm and courteous." }
+          ]
+        },
+        "anthropic:claude-3-haiku-20240307": {
+          "keyPointsCount": 2,
+          "avgCoverageExtent": 0.9,
+          "pointAssessments": [
+            { "keyPointText": "Contains a greeting", "coverageExtent": 1.0, "reflection": "Begins with a friendly greeting." },
+            { "keyPointText": "Uses a polite tone", "coverageExtent": 0.8, "reflection": "Polite, though slightly informal." }
+          ]
+        }
+      },
+      "prompt-math": {
+        "openai:gpt-4o-mini": {
+          "keyPointsCount": 1,
+          "avgCoverageExtent": 0.9,
+          "pointAssessments": [
+            { "keyPointText": "States that the answer is 4", "coverageExtent": 0.9, "reflection": "Correctly states the answer is 4." }
+          ]
+        },
+        "anthropic:claude-3-haiku-20240307": {
+          "keyPointsCount": 1,
+          "avgCoverageExtent": 0.9,
+          "pointAssessments": [
+            { "keyPointText": "States that the answer is 4", "coverageExtent": 0.9, "reflection": "Correctly states the answer is 4." }
+          ]
+        }
+      }
+    },
+    "perModelHybridScores": {
+      "openai:gpt-4o-mini": { "average": 0.925, "stddev": 0.025 },
+      "anthropic:claude-3-haiku-20240307": { "average": 0.9, "stddev": 0.0 }
+    },
+    "perModelSemanticScores": {},
+    "similarityMatrix": {},
+    "perPromptSimilarities": {},
+    "promptStatistics": {}
+  }
+}