Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ google-cloud-*
# Temporary and generated files
___*
*___

# Node.js V8 compile cache (created by tsx / NODE_COMPILE_CACHE)
node-compile-cache/

# Sentry
.sentryclirc
*.sentry-build-info
Expand Down
9 changes: 9 additions & 0 deletions playwright.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,15 @@ const BASE_URL = process.env.E2E_BASE_URL ?? `http://localhost:${PORT}`;

export default defineConfig({
testDir: './tests/e2e',
/**
* Seed `.results/` with local-storage fixtures before the dev server boots
* (setup), then restore it afterwards (teardown). This lets the data-driven
* pages render real content without S3 or live LLM calls. Skipped when
* E2E_BASE_URL points at an already-running/remote app, whose storage we
* neither control nor want to mutate.
*/
globalSetup: process.env.E2E_BASE_URL ? undefined : require.resolve('./tests/e2e/global-setup'),
globalTeardown: process.env.E2E_BASE_URL ? undefined : require.resolve('./tests/e2e/global-teardown'),
fullyParallel: true,
forbidOnly: !!process.env.CI,
retries: process.env.CI ? 2 : 0,
Expand Down
34 changes: 26 additions & 8 deletions tests/e2e/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,35 @@ E2E_BASE_URL=https://your-preview.example.com pnpm test:e2e

## What's safe to test here

Smoke tests deliberately target **statically rendered, dependency-free routes**
(`/about`, `/what-is-an-eval`, …) so they pass in CI without any secrets.
Two kinds of routes are covered:

Routes that read from storage (S3) or call external LLM APIs — the homepage,
`/analysis/*`, `/latest`, etc. — will be slow or error without env/network. To
cover those, either:
1. **Statically rendered, dependency-free routes** (`/about`,
`/what-is-an-eval`, …) — see `smoke.spec.ts`. These need no data at all.

- provide the relevant env vars (see `.env.template`), or
- intercept network calls with `page.route(...)` and serve fixtures.
2. **Data-driven routes** (the homepage `/`, `/latest`, `/analysis/*`) — see
`homepage.spec.ts`, `latest.spec.ts`, `analysis.spec.ts`. These read from
storage but do **not** call LLMs at render time, so they work against
seeded local fixtures without any secrets or network.

Keep flaky, data-dependent assertions out of the default suite.
### How the data-driven fixtures work

In dev/test mode the app's `storageService` uses the `local` provider and reads
results from the `.results/` directory on disk (S3 is only used in production).
`playwright.config.ts` registers a `globalSetup` that seeds `.results/` from
`tests/e2e/fixtures/results/` **before** the dev server boots, and a
`globalTeardown` that restores it afterwards. Seeding is non-destructive: if you
already have a real local `.results/`, any files the fixtures overwrite are
backed up first and restored on teardown, and only the files the fixtures added
are removed.

The fixtures describe one deterministic run (`test-eval` / `test-run`); the
identifiers live in `tests/e2e/fixtures/constants.ts`. To cover another page or
data shape, add JSON under `tests/e2e/fixtures/results/` mirroring the on-disk
layout the storage service expects (e.g.
`live/aggregates/…`, `live/blueprints/<configId>/<runLabel>_<timestamp>/core.json`).

Routes that genuinely call external LLM APIs at request time (sandbox runs,
story generation, etc.) still need real env vars or `page.route(...)` mocks —
keep those out of the default suite.

## Conventions

Expand Down
39 changes: 39 additions & 0 deletions tests/e2e/analysis.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import { test, expect } from '@playwright/test';
import { FIXTURE, ANALYSIS_PATH } from './fixtures/constants';

/**
 * End-to-end coverage for the analysis run page. The page is rendered on the
 * server from the run's `core.json` artefact (via getCoreResult), so the seeded
 * fixture is enough to exercise the whole view with no S3 access and no live
 * model calls.
 */
test.describe('analysis run', () => {
  test('renders the seeded run title, models and coverage', async ({ page }) => {
    await page.goto(ANALYSIS_PATH);

    // The document title comes from the run's configTitle.
    const titlePattern = new RegExp(FIXTURE.configTitle, 'i');
    await expect(page).toHaveTitle(titlePattern);

    // The same config title also appears in the page header.
    const headerTitle = page.getByText(FIXTURE.configTitle).first();
    await expect(headerTitle).toBeVisible();

    // Each fixture model shows up under its parsed display name in the
    // aggregate coverage view.
    for (const displayName of [/GPT 4o Mini/i, /Claude 3 Haiku/i]) {
      await expect(page.getByText(displayName).first()).toBeVisible();
    }

    // The prompt selector is filled from the run's prompts.
    const mathPromptOption = page.locator('option[value="prompt-math"]');
    await expect(mathPromptOption).toHaveCount(1);
  });

  test('shows a not-found state for a run that does not exist', async ({ page }) => {
    // Same config and run label as the fixture, but a timestamp that was never seeded.
    const missingRunPath = `/analysis/${FIXTURE.configId}/${FIXTURE.runLabel}/2099-01-01T00-00-00-000Z`;
    await page.goto(missingRunPath);

    // notFound() renders Next's not-found UI, and none of the fixture content
    // may leak onto the page.
    const notFoundMessage = page.getByText(/this page could not be found/i);
    await expect(notFoundMessage).toBeVisible();

    const fixtureModelName = page.getByText(/Claude 3 Haiku/i);
    await expect(fixtureModelName).toHaveCount(0);
  });
});
12 changes: 12 additions & 0 deletions tests/e2e/fixtures/constants.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
/**
 * Single source of truth for the identifiers of the seeded fixture run, shared
 * by the specs so they stay aligned with the fixture JSON.
 */
export const FIXTURE = {
  configId: 'test-eval',
  configTitle: 'Test Evaluation Blueprint',
  runLabel: 'test-run',
  timestamp: '2025-06-01T12-00-00-000Z',
} as const;

/** Route of the analysis page for the seeded fixture run. */
export const ANALYSIS_PATH = [
  '', // leading empty segment yields the initial "/"
  'analysis',
  FIXTURE.configId,
  FIXTURE.runLabel,
  FIXTURE.timestamp,
].join('/');
36 changes: 36 additions & 0 deletions tests/e2e/fixtures/results/live/aggregates/homepage_summary.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"configs": [
{
"configId": "test-eval",
"configTitle": "Test Evaluation Blueprint",
"id": "test-eval",
"title": "Test Evaluation Blueprint",
"description": "A fixture blueprint used by the end-to-end test suite.",
"tags": ["test-fixture", "_featured"],
"runs": [
{
"runLabel": "test-run",
"timestamp": "2025-06-01T12-00-00-000Z",
"fileName": "test-run_2025-06-01T12-00-00-000Z_comparison.json",
"numPrompts": 2,
"numModels": 2,
"totalModelsAttempted": 2,
"hybridScoreStats": { "average": 0.9125, "stddev": 0.0125 },
"perModelHybridScores": {
"openai:gpt-4o-mini": { "average": 0.925, "stddev": 0.025 },
"anthropic:claude-3-haiku-20240307": { "average": 0.9, "stddev": 0.0 }
},
"tags": ["test-fixture", "_featured"],
"models": ["openai:gpt-4o-mini", "anthropic:claude-3-haiku-20240307"],
"promptIds": ["prompt-greeting", "prompt-math"]
}
],
"latestRunTimestamp": "2025-06-01T12-00-00-000Z",
"overallAverageHybridScore": 0.9125,
"hybridScoreStdDev": 0.0125
}
],
"headlineStats": null,
"driftDetectionResult": null,
"lastUpdated": "2025-06-01T12:00:00.000Z"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"runs": [
{
"configId": "test-eval",
"configTitle": "Test Evaluation Blueprint",
"runLabel": "test-run",
"timestamp": "2025-06-01T12-00-00-000Z",
"fileName": "test-run_2025-06-01T12-00-00-000Z_comparison.json",
"numPrompts": 2,
"numModels": 2,
"totalModelsAttempted": 2,
"hybridScoreStats": { "average": 0.9125, "stddev": 0.0125 },
"perModelHybridScores": {
"openai:gpt-4o-mini": { "average": 0.925, "stddev": 0.025 },
"anthropic:claude-3-haiku-20240307": { "average": 0.9, "stddev": 0.0 }
},
"tags": ["test-fixture", "_featured"],
"models": ["openai:gpt-4o-mini", "anthropic:claude-3-haiku-20240307"],
"promptIds": ["prompt-greeting", "prompt-math"]
}
],
"lastUpdated": "2025-06-01T12:00:00.000Z"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
{
"configId": "test-eval",
"configTitle": "Test Evaluation Blueprint",
"runLabel": "test-run",
"timestamp": "2025-06-01T12-00-00-000Z",
"description": "A fixture blueprint used by the end-to-end test suite.",
"config": {
"id": "test-eval",
"title": "Test Evaluation Blueprint",
"description": "A fixture blueprint used by the end-to-end test suite.",
"tags": ["test-fixture", "_featured"],
"models": ["openai:gpt-4o-mini", "anthropic:claude-3-haiku-20240307"],
"prompts": [
{
"id": "prompt-greeting",
"promptText": "Say hello to the user politely.",
"points": ["Contains a greeting", "Uses a polite tone"]
},
{
"id": "prompt-math",
"promptText": "What is 2 + 2?",
"points": ["States that the answer is 4"]
}
]
},
"evalMethodsUsed": ["llm-coverage"],
"effectiveModels": ["openai:gpt-4o-mini", "anthropic:claude-3-haiku-20240307"],
"promptIds": ["prompt-greeting", "prompt-math"],
"promptContexts": {
"prompt-greeting": "Say hello to the user politely.",
"prompt-math": "What is 2 + 2?"
},
"allFinalAssistantResponses": {
"prompt-greeting": {
"openai:gpt-4o-mini": "Hello! It's a pleasure to meet you. How can I help you today?",
"anthropic:claude-3-haiku-20240307": "Hi there! Lovely to meet you. What can I do for you?"
},
"prompt-math": {
"openai:gpt-4o-mini": "2 + 2 equals 4.",
"anthropic:claude-3-haiku-20240307": "The answer is 4."
}
},
"evaluationResults": {
"llmCoverageScores": {
"prompt-greeting": {
"openai:gpt-4o-mini": {
"keyPointsCount": 2,
"avgCoverageExtent": 0.95,
"pointAssessments": [
{ "keyPointText": "Contains a greeting", "coverageExtent": 1.0, "reflection": "The response opens with a clear greeting." },
{ "keyPointText": "Uses a polite tone", "coverageExtent": 0.9, "reflection": "The tone is warm and courteous." }
]
},
"anthropic:claude-3-haiku-20240307": {
"keyPointsCount": 2,
"avgCoverageExtent": 0.9,
"pointAssessments": [
{ "keyPointText": "Contains a greeting", "coverageExtent": 1.0, "reflection": "Begins with a friendly greeting." },
{ "keyPointText": "Uses a polite tone", "coverageExtent": 0.8, "reflection": "Polite, though slightly informal." }
]
}
},
"prompt-math": {
"openai:gpt-4o-mini": {
"keyPointsCount": 1,
"avgCoverageExtent": 0.9,
"pointAssessments": [
{ "keyPointText": "States that the answer is 4", "coverageExtent": 0.9, "reflection": "Correctly states the answer is 4." }
]
},
"anthropic:claude-3-haiku-20240307": {
"keyPointsCount": 1,
"avgCoverageExtent": 0.9,
"pointAssessments": [
{ "keyPointText": "States that the answer is 4", "coverageExtent": 0.9, "reflection": "Correctly states the answer is 4." }
]
}
}
},
"perModelHybridScores": {
"openai:gpt-4o-mini": { "average": 0.925, "stddev": 0.025 },
"anthropic:claude-3-haiku-20240307": { "average": 0.9, "stddev": 0.0 }
},
"perModelSemanticScores": {},
"similarityMatrix": {},
"perPromptSimilarities": {},
"promptStatistics": {}
}
}
Loading
Loading