From 035e1b94a36bd1f07039500165144b03581d0296 Mon Sep 17 00:00:00 2001
From: Nick DiMoro
Date: Thu, 21 May 2026 05:12:19 -0700
Subject: [PATCH] fix(core): scope graph pages to selected snapshot

---
 .changeset/fix-snapshot-query-node-scope.md   |  5 ++++
 .../src/db/repositories/PageRepository.ts     |  9 ++++---
 packages/core/tests/db.test.ts                | 26 +++++++++++++++++++
 3 files changed, 36 insertions(+), 4 deletions(-)
 create mode 100644 .changeset/fix-snapshot-query-node-scope.md

diff --git a/.changeset/fix-snapshot-query-node-scope.md b/.changeset/fix-snapshot-query-node-scope.md
new file mode 100644
index 0000000..1ce0dd4
--- /dev/null
+++ b/.changeset/fix-snapshot-query-node-scope.md
@@ -0,0 +1,5 @@
+---
+"@crawlith/core": patch
+---
+
+Scope snapshot page loading to URLs seen in the selected snapshot so rerunning crawls with different URL normalization policies, such as `--no-query`, does not retain stale query-URL nodes in graph exports. Fixes #103.
diff --git a/packages/core/src/db/repositories/PageRepository.ts b/packages/core/src/db/repositories/PageRepository.ts
index daf1d94..f9ca131 100644
--- a/packages/core/src/db/repositories/PageRepository.ts
+++ b/packages/core/src/db/repositories/PageRepository.ts
@@ -220,19 +220,20 @@ export class PageRepository {
     if (runType === 'single') {
       return this.db.prepare('SELECT p.* FROM pages p JOIN metrics m ON p.id = m.page_id WHERE m.snapshot_id = ?').all(snapshotId) as Page[];
     }
-    return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND COALESCE(p.first_seen_snapshot_id, p.last_seen_snapshot_id) <= ?').all(snapshotId, snapshotId) as Page[];
+    return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND p.last_seen_snapshot_id = ?').all(snapshotId, snapshotId) as Page[];
   }
 
   getPagesIdentityBySnapshot(snapshotId: number): { id: number; normalized_url: string }[] {
-    // For identities, always loading all up to this point is fine for the crawler to map URLs to IDs.
-    return this.db.prepare('SELECT p.id, p.normalized_url FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND COALESCE(p.first_seen_snapshot_id, p.last_seen_snapshot_id) <= ?').all(snapshotId, snapshotId) as { id: number; normalized_url: string }[];
+    // Use pages seen in this snapshot only so graph exports reflect the current crawl policy
+    // (for example, rerunning with stripQuery should not retain stale query-URL nodes).
+    return this.db.prepare('SELECT p.id, p.normalized_url FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND p.last_seen_snapshot_id = ?').all(snapshotId, snapshotId) as { id: number; normalized_url: string }[];
   }
 
   getPagesIteratorBySnapshot(snapshotId: number, runType: string = 'completed'): IterableIterator<Page> {
     if (runType === 'single') {
       return this.db.prepare('SELECT p.* FROM pages p JOIN metrics m ON p.id = m.page_id WHERE m.snapshot_id = ?').iterate(snapshotId) as IterableIterator<Page>;
     }
-    return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND COALESCE(p.first_seen_snapshot_id, p.last_seen_snapshot_id) <= ?').iterate(snapshotId, snapshotId) as IterableIterator<Page>;
+    return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND p.last_seen_snapshot_id = ?').iterate(snapshotId, snapshotId) as IterableIterator<Page>;
   }
 
   getIdByUrl(siteId: number, url: string): number | undefined {
diff --git a/packages/core/tests/db.test.ts b/packages/core/tests/db.test.ts
index c237ffb..5b85530 100644
--- a/packages/core/tests/db.test.ts
+++ b/packages/core/tests/db.test.ts
@@ -89,6 +89,32 @@ describe('Database Layer', () => {
     expect(page?.last_seen_snapshot_id).toBe(snapshotId2); // Should update to the second one
   });
 
+  it('should scope snapshot page lists to URLs seen in that snapshot', () => {
+    const siteId = siteRepo.createSite('example.com');
+    const snapshotId = snapshotRepo.createSnapshot(siteId, 'completed');
+    const snapshotId2 = snapshotRepo.createSnapshot(siteId, 'completed');
+
+    pageRepo.upsertPage({
+      site_id: siteId,
+      normalized_url: '/book-a-call?intent=old',
+      last_seen_snapshot_id: snapshotId,
+      http_status: 200,
+      depth: 1
+    });
+
+    pageRepo.upsertPage({
+      site_id: siteId,
+      normalized_url: '/book-a-call',
+      last_seen_snapshot_id: snapshotId2,
+      http_status: 200,
+      depth: 1
+    });
+
+    expect(pageRepo.getPagesBySnapshot(snapshotId2).map(p => p.normalized_url)).toEqual(['/book-a-call']);
+    expect(Array.from(pageRepo.getPagesIteratorBySnapshot(snapshotId2)).map(p => p.normalized_url)).toEqual(['/book-a-call']);
+    expect(pageRepo.getPagesIdentityBySnapshot(snapshotId2).map(p => p.normalized_url)).toEqual(['/book-a-call']);
+  });
+
   it('should persist new columns (nofollow, security_error, retries)', () => {
     const siteId = siteRepo.createSite('new-cols.com');
     const snapshotId = snapshotRepo.createSnapshot(siteId, 'completed');