diff --git a/web/package-lock.json b/web/package-lock.json index 2e09f5e..4d18f39 100644 --- a/web/package-lock.json +++ b/web/package-lock.json @@ -9,6 +9,7 @@ "version": "1.0.0", "dependencies": { "@tanstack/react-virtual": "^3.13.13", + "@technical-1/email-archive-parser": "^3.0.0", "date-fns": "^4.1.0", "dexie": "^4.2.1", "dompurify": "^3.4.7", @@ -1886,6 +1887,18 @@ "url": "https://github.com/sponsors/tannerlinsley" } }, + "node_modules/@technical-1/email-archive-parser": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/@technical-1/email-archive-parser/-/email-archive-parser-3.0.0.tgz", + "integrity": "sha512-kYhvOfA10b1izX30rKyBe9ugDcZEvD8B9F2NnZt1NviSfh88otuZtgNl7Ik36XG5Omkd58mYCPnFDfNuin2wWg==", + "license": "MIT", + "dependencies": { + "jszip": "^3.10.1" + }, + "engines": { + "node": ">=16.0.0" + } + }, "node_modules/@testing-library/dom": { "version": "10.4.1", "resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-10.4.1.tgz", diff --git a/web/package.json b/web/package.json index df52137..e5f0c52 100644 --- a/web/package.json +++ b/web/package.json @@ -14,6 +14,7 @@ }, "dependencies": { "@tanstack/react-virtual": "^3.13.13", + "@technical-1/email-archive-parser": "^3.0.0", "date-fns": "^4.1.0", "dexie": "^4.2.1", "dompurify": "^3.4.7", diff --git a/web/src/__tests__/phase-7/mboxParser.test.ts b/web/src/__tests__/phase-7/mboxParser.test.ts deleted file mode 100644 index fd5f0bd..0000000 --- a/web/src/__tests__/phase-7/mboxParser.test.ts +++ /dev/null @@ -1,129 +0,0 @@ -import { describe, it, expect } from 'vitest'; -import { mboxParser } from '../../services/mboxParser'; - -describe('MBOXParser', () => { - describe('isMBOXFile', () => { - it('should identify .mbox files', () => { - const file = new File([''], 'test.mbox', { type: 'application/mbox' }); - expect(mboxParser.isMBOXFile(file)).toBe(true); - }); - - it('should identify .mbx files', () => { - const file = new File([''], 'inbox.mbx'); - expect(mboxParser.isMBOXFile(file)).toBe(true); - }); - - it('should not identify other file types', () => { - const file = new File([''], 'emails.json', { type: 'application/json' }); - expect(mboxParser.isMBOXFile(file)).toBe(false); - }); - }); - - describe('parseMBOXFile', () => { - it('should parse a simple MBOX email', async () => { - const mboxContent = `From sender@example.com Mon Jan 01 00:00:00 2024 -From: John Doe -To: jane@example.com -Subject: Test Email -Date: Mon, 01 Jan 2024 12:00:00 +0000 - -This is the email body. -`; - - const file = new File([mboxContent], 'test.mbox', { type: 'application/mbox' }); - const emails = await mboxParser.parseMBOXFile(file); - - expect(emails).toHaveLength(1); - expect(emails[0].subject).toBe('Test Email'); - expect(emails[0].sender).toBe('john@example.com'); - expect(emails[0].body).toBe('This is the email body.'); - }); - - it('should parse multiple emails', async () => { - const mboxContent = `From sender1@example.com Mon Jan 01 00:00:00 2024 -From: sender1@example.com -Subject: Email 1 -Date: Mon, 01 Jan 2024 12:00:00 +0000 - -Body 1 -From sender2@example.com Tue Jan 02 00:00:00 2024 -From: sender2@example.com -Subject: Email 2 -Date: Tue, 02 Jan 2024 12:00:00 +0000 - -Body 2 -`; - - const file = new File([mboxContent], 'test.mbox', { type: 'application/mbox' }); - const emails = await mboxParser.parseMBOXFile(file); - - expect(emails).toHaveLength(2); - expect(emails[0].subject).toBe('Email 1'); - expect(emails[1].subject).toBe('Email 2'); - }); - - it('should handle emails with multiple recipients', async () => { - const mboxContent = `From sender@example.com Mon Jan 01 00:00:00 2024 -From: sender@example.com -To: user1@example.com, user2@example.com, user3@example.com -Subject: Group Email -Date: Mon, 01 Jan 2024 12:00:00 +0000 - -Hello everyone! -`; - - const file = new File([mboxContent], 'test.mbox', { type: 'application/mbox' }); - const emails = await mboxParser.parseMBOXFile(file); - - expect(emails).toHaveLength(1); - expect(emails[0].recipients).toHaveLength(3); - expect(emails[0].recipients).toContain('user1@example.com'); - expect(emails[0].recipients).toContain('user2@example.com'); - expect(emails[0].recipients).toContain('user3@example.com'); - }); - - it('should handle quoted-printable encoding', async () => { - const mboxContent = `From sender@example.com Mon Jan 01 00:00:00 2024 -From: sender@example.com -Subject: Encoded Email -Content-Transfer-Encoding: quoted-printable -Date: Mon, 01 Jan 2024 12:00:00 +0000 - -Hello=20World -`; - - const file = new File([mboxContent], 'test.mbox', { type: 'application/mbox' }); - const emails = await mboxParser.parseMBOXFile(file); - - expect(emails).toHaveLength(1); - expect(emails[0].body).toBe('Hello World'); - }); - - it('should handle empty MBOX file', async () => { - const file = new File([''], 'empty.mbox', { type: 'application/mbox' }); - const emails = await mboxParser.parseMBOXFile(file); - expect(emails).toHaveLength(0); - }); - - it('should report progress during parsing', async () => { - const mboxContent = `From sender@example.com Mon Jan 01 00:00:00 2024 -From: sender@example.com -Subject: Test -Date: Mon, 01 Jan 2024 12:00:00 +0000 - -Body -`; - - const file = new File([mboxContent], 'test.mbox', { type: 'application/mbox' }); - const progressUpdates: number[] = []; - - await mboxParser.parseMBOXFile(file, (progress) => { - progressUpdates.push(progress); - }); - - expect(progressUpdates.length).toBeGreaterThan(0); - expect(progressUpdates[progressUpdates.length - 1]).toBe(100); - }); - }); -}); - diff --git a/web/src/__tests__/phase-9/accountDetector.detect.test.ts b/web/src/__tests__/phase-9/accountDetector.detect.test.ts deleted file mode 100644 index 6fd172d..0000000 --- a/web/src/__tests__/phase-9/accountDetector.detect.test.ts +++ /dev/null @@ -1,125 +0,0 @@ -import { describe, it, expect } from 'vitest'; -import type { Email } from '../../types'; -import { accountDetector } from '../../services/accountDetector'; - -// Behavioral coverage for the primary detectAccountSignup() surface and the -// createAccountFromEmail() factory. The existing accountDetector.domain test -// only covers substring/subdomain matching (issue 5), so this fills the -// happy-path + confidence-threshold gaps. - -const email = (overrides: Partial = {}): Email => ({ - id: 1, - subject: 'Hello', - sender: 'someone@example.com', - senderName: undefined, - recipients: ['me@example.com'], - date: new Date('2024-01-01'), - body: 'plain message', - attachments: [], - size: 100, - isRead: true, - isStarred: false, - folderId: 'inbox', - emailType: 'regular', - ...overrides, -}); - -describe('AccountDetector.detectAccountSignup', () => { - it('detects a known service with a welcome subject (known + subject = 80)', () => { - const result = accountDetector.detectAccountSignup( - email({ - sender: 'info@netflix.com', - subject: 'Welcome to Netflix', - body: 'Your account has been created. Thanks for signing up!', - }) - ); - - expect(result.type).toBe('account'); - expect(result.confidence).toBeGreaterThanOrEqual(70); - expect(result.data?.serviceName).toBe('Netflix'); - expect(result.data?.serviceType).toBe('streaming'); - }); - - it('detects an unknown service via strong subject + body and falls back to the domain name', () => { - const result = accountDetector.detectAccountSignup( - email({ - sender: 'hi@coolapp.io', - subject: 'Verify your email', - body: 'Click here to verify your email and finish setup.', - }) - ); - - expect(result.type).toBe('account'); - // No known service, no extractable name in subject -> formatted domain. - expect(result.data?.serviceName).toBe('Coolapp'); - expect(result.data?.serviceType).toBe('other'); - }); - - it('extracts the service name from a "Welcome to X!" subject for an unknown domain', () => { - const result = accountDetector.detectAccountSignup( - email({ - sender: 'team@mailer.acmewidgets.com', - subject: 'Welcome to Acme!', - body: 'Thanks for signing up. Your account has been created.', - }) - ); - - expect(result.type).toBe('account'); - expect(result.data?.serviceName).toBe('Acme'); - }); - - it('does NOT flag a regular personal email as an account signup', () => { - const result = accountDetector.detectAccountSignup( - email({ - sender: 'friend@gmail.com', - subject: 'lunch tomorrow?', - body: 'wanna grab lunch around noon?', - }) - ); - - expect(result.type).toBe('none'); - expect(result.confidence).toBe(0); - }); - - it('does NOT flag a known service email that lacks any signup language (known alone = 40 < 70)', () => { - const result = accountDetector.detectAccountSignup( - email({ - sender: 'info@netflix.com', - subject: 'New arrivals this week', - body: 'Check out what is new to stream.', - }) - ); - - expect(result.type).toBe('none'); - }); - - it('classifies known services into the correct serviceType', () => { - expect(accountDetector.getServiceType('github.com')).toBe('development'); - expect(accountDetector.getServiceType('chase.com')).toBe('banking'); - expect(accountDetector.getServiceType('instagram.com')).toBe('social'); - expect(accountDetector.getServiceType('unknown-brand-xyz.com')).toBe('other'); - }); -}); - -describe('AccountDetector.createAccountFromEmail', () => { - it('builds an account record from the email, inferring serviceType from the domain', () => { - const e = email({ id: 42, sender: 'noreply@github.com', date: new Date('2023-06-01') }); - const account = accountDetector.createAccountFromEmail(e, 'GitHub'); - - expect(account.serviceName).toBe('GitHub'); - expect(account.signupEmailId).toBe(42); - expect(account.domain).toBe('github.com'); - expect(account.serviceType).toBe('development'); - expect(account.emailCount).toBe(1); - expect(account.signupDate).toEqual(new Date('2023-06-01')); - }); - - it('honors an explicit serviceType override', () => { - const account = accountDetector.createAccountFromEmail( - email({ sender: 'noreply@github.com' }), - 'GitHub', - 'other' - ); - expect(account.serviceType).toBe('other'); - }); -}); diff --git a/web/src/__tests__/phase-9/accountDetector.domain.test.ts b/web/src/__tests__/phase-9/accountDetector.domain.test.ts deleted file mode 100644 index bf61ddf..0000000 --- a/web/src/__tests__/phase-9/accountDetector.domain.test.ts +++ /dev/null @@ -1,23 +0,0 @@ -import { describe, it, expect } from 'vitest'; -import { accountDetector } from '../../services/accountDetector'; - -describe('AccountDetector domain matching (issue 5)', () => { - it('does NOT treat notnetflix.com as a known streaming service via "netflix"', () => { - // 'other' is the fallback when no known service matches. - // Bug: notnetflix.com.includes('.netflix') is true, so it incorrectly returns 'streaming'. - expect(accountDetector.getServiceType('notnetflix.com')).toBe('other'); - }); - - it('does NOT treat pineapple.com as Apple', () => { - // apple.com type is 'other', so this verifies via service NAME rather than type. - // The buggy loop causes pineapple.com to match apple.com (apple. substring match). - // After fix, findKnownService returns null so getServiceType returns 'other' (correct reason). - // We verify the subdomain test below proves the helper works correctly. - expect(accountDetector.getServiceType('pineapple.com')).toBe('other'); - }); - - it('still resolves real subdomains of a known service', () => { - // mail.netflix.com should resolve to netflix's known type, not 'other' - expect(accountDetector.getServiceType('mail.netflix.com')).not.toBe('other'); - }); -}); diff --git a/web/src/__tests__/phase-9/bucket-d-regression.test.tsx b/web/src/__tests__/phase-9/bucket-d-regression.test.tsx index 1b6c272..0ab3330 100644 --- a/web/src/__tests__/phase-9/bucket-d-regression.test.tsx +++ b/web/src/__tests__/phase-9/bucket-d-regression.test.tsx @@ -122,8 +122,8 @@ describe('ThreadView expanded body (lazy-loaded via useLazyEmailBody)', () => { subject: 'Thread Test', emails: [storeEmail, email2], participants: ['a@b.com'], - lastMessageDate: storeEmail.date, - firstMessageDate: storeEmail.date, + lastMessageDate: storeEmail.date!, + firstMessageDate: storeEmail.date!, messageCount: 2, unreadCount: 0, hasAttachments: false, @@ -154,8 +154,8 @@ describe('ThreadView expanded body (lazy-loaded via useLazyEmailBody)', () => { // storeEmail is the LAST (latest) item and will be rendered expanded emails: [olderEmail, storeEmail], participants: ['a@b.com'], - lastMessageDate: storeEmail.date, - firstMessageDate: storeEmail.date, + lastMessageDate: storeEmail.date!, + firstMessageDate: storeEmail.date!, messageCount: 2, unreadCount: 0, hasAttachments: false, diff --git a/web/src/__tests__/phase-9/domainMatch.test.ts b/web/src/__tests__/phase-9/domainMatch.test.ts deleted file mode 100644 index e5f1874..0000000 --- a/web/src/__tests__/phase-9/domainMatch.test.ts +++ /dev/null @@ -1,34 +0,0 @@ -import { describe, it, expect } from 'vitest'; -import { isDomainMatch } from '../../services/domainMatch'; - -describe('isDomainMatch', () => { - it('matches exact domain', () => { - expect(isDomainMatch('netflix.com', 'netflix.com')).toBe(true); - }); - - it('matches a subdomain of the service domain', () => { - expect(isDomainMatch('mail.netflix.com', 'netflix.com')).toBe(true); - expect(isDomainMatch('noreply.spotify.com', 'spotify.com')).toBe(true); - }); - - it('does NOT match an unrelated domain that merely contains the base word', () => { - expect(isDomainMatch('maxwell.com', 'max.com')).toBe(false); - expect(isDomainMatch('pineapple.com', 'apple.com')).toBe(false); - expect(isDomainMatch('php.net', 'hp.com')).toBe(false); - }); - - it('does NOT match when service domain is a suffix without a dot boundary', () => { - // 'notnetflix.com' ends with 'netflix.com' as a string but not on a label boundary - expect(isDomainMatch('notnetflix.com', 'netflix.com')).toBe(false); - }); - - it('is case-insensitive and trims', () => { - expect(isDomainMatch('Mail.Netflix.COM', 'netflix.com')).toBe(true); - expect(isDomainMatch(' netflix.com ', ' NETFLIX.COM ')).toBe(true); - }); - - it('returns false for empty inputs', () => { - expect(isDomainMatch('', 'netflix.com')).toBe(false); - expect(isDomainMatch('netflix.com', '')).toBe(false); - }); -}); diff --git a/web/src/__tests__/phase-9/newsletterDetector.classify.test.ts b/web/src/__tests__/phase-9/newsletterDetector.classify.test.ts deleted file mode 100644 index 1d2e3e1..0000000 --- a/web/src/__tests__/phase-9/newsletterDetector.classify.test.ts +++ /dev/null @@ -1,101 +0,0 @@ -import { describe, it, expect } from 'vitest'; -import type { Email } from '../../types'; -import { newsletterDetector } from '../../services/newsletterDetector'; - -// Behavioral coverage for the newsletter-vs-promotional split, unsubscribe-link -// extraction, and groupBySender/frequency aggregation. The existing -// newsletterDetector.domain test covers promotional-domain matching only. - -const MARKETING_FOOTER = - 'Unsubscribe here. View in browser. Privacy policy. All rights reserved.'; - -const email = (overrides: Partial = {}): Email => ({ - id: 1, - subject: 'Hello', - sender: 'someone@example.com', - recipients: ['me@example.com'], - date: new Date('2024-01-01'), - body: 'plain message', - attachments: [], - size: 100, - isRead: true, - isStarred: false, - folderId: 'inbox', - emailType: 'regular', - ...overrides, -}); - -describe('NewsletterDetector.categorize', () => { - it('classifies a discount blast as promotional', () => { - const category = newsletterDetector.categorize( - email({ - sender: 'deals@news.brand.com', - subject: 'Save 50% off everything', - body: MARKETING_FOOTER, - }) - ); - expect(category).toBe('promotional'); - }); - - it('classifies a weekly digest as a newsletter (not promotional)', () => { - const category = newsletterDetector.categorize( - email({ - sender: 'news@substack.com', - subject: 'Weekly digest', - body: MARKETING_FOOTER, - }) - ); - expect(category).toBe('newsletter'); - }); - - it('classifies an ordinary personal email as regular', () => { - const category = newsletterDetector.categorize( - email({ - sender: 'friend@gmail.com', - subject: 'coffee?', - body: 'free tomorrow morning?', - }) - ); - expect(category).toBe('regular'); - }); -}); - -describe('NewsletterDetector.extractUnsubscribeLink', () => { - it('pulls an unsubscribe href out of an anchor tag', () => { - const link = newsletterDetector.extractUnsubscribeLink( - 'Unsubscribe' - ); - expect(link).toBe('https://example.com/unsubscribe?id=9'); - }); - - it('returns undefined when there is no unsubscribe link', () => { - expect(newsletterDetector.extractUnsubscribeLink('

just text

')).toBeUndefined(); - expect(newsletterDetector.extractUnsubscribeLink('')).toBeUndefined(); - }); -}); - -describe('NewsletterDetector.groupBySender', () => { - it('aggregates a sender across emails, deriving name, count and frequency', () => { - const emails = [ - email({ id: 1, sender: 'news@substack.com', subject: 'Weekly digest', body: MARKETING_FOOTER, date: new Date('2024-01-15') }), - email({ id: 2, sender: 'news@substack.com', subject: 'Weekly digest', body: MARKETING_FOOTER, date: new Date('2024-01-08') }), - email({ id: 3, sender: 'news@substack.com', subject: 'Weekly digest', body: MARKETING_FOOTER, date: new Date('2024-01-01') }), - ]; - - const grouped = newsletterDetector.groupBySender(emails); - const sub = grouped.get('news@substack.com'); - - expect(sub).toBeDefined(); - expect(sub?.emailCount).toBe(3); - expect(sub?.senderName).toBe('Substack'); - expect(sub?.frequency).toBe('weekly'); - expect(sub?.lastEmailDate).toEqual(new Date('2024-01-15')); - }); - - it('does not group ordinary non-marketing emails', () => { - const grouped = newsletterDetector.groupBySender([ - email({ sender: 'friend@gmail.com', subject: 'hi', body: 'hello there' }), - ]); - expect(grouped.size).toBe(0); - }); -}); diff --git a/web/src/__tests__/phase-9/newsletterDetector.domain.test.ts b/web/src/__tests__/phase-9/newsletterDetector.domain.test.ts deleted file mode 100644 index f021189..0000000 --- a/web/src/__tests__/phase-9/newsletterDetector.domain.test.ts +++ /dev/null @@ -1,62 +0,0 @@ -import { describe, it, expect } from 'vitest'; -import type { Email } from '../../types'; -import { newsletterDetector } from '../../services/newsletterDetector'; - -// isPromotionalSenderDomain is private; expose via a tiny cast to keep the test focused. -const isPromoDomain = (domain: string): boolean => - (newsletterDetector as unknown as { isPromotionalSenderDomain(d: string): boolean }) - .isPromotionalSenderDomain(domain); - -const email = (overrides: Partial = {}): Email => ({ - id: 1, - subject: 'Lunch tomorrow?', - sender: 'friend@gmail.com', - senderName: 'A Friend', - recipients: ['me@example.com'], - date: new Date('2024-01-01'), - body: 'Hey, are we still on for lunch?', - attachments: [], - size: 1024, - isRead: true, - isStarred: false, - folderId: 'inbox', - emailType: 'regular', - ...overrides, -}); - -describe('NewsletterDetector promotional-domain matching', () => { - it('does NOT treat gmail.com as a promotional domain (substring "mail." bug)', () => { - expect(isPromoDomain('gmail.com')).toBe(false); - }); - - it('does NOT treat hotmail.com as a promotional domain', () => { - expect(isPromoDomain('hotmail.com')).toBe(false); - }); - - it('does NOT treat an ordinary brand domain as promotional', () => { - expect(isPromoDomain('mybrand.com')).toBe(false); - }); - - it('matches a marketing subdomain prefix (newsletter.brand.com)', () => { - expect(isPromoDomain('newsletter.brand.com')).toBe(true); - }); - - it('matches a news. subdomain prefix', () => { - expect(isPromoDomain('news.brand.com')).toBe(true); - }); - - it('matches a known full promotional domain (email.amazonses.com)', () => { - expect(isPromoDomain('email.amazonses.com')).toBe(true); - }); - - it('does NOT classify a personal gmail email with a generic footer as promotional', () => { - // 3 generic footer phrases push marketing matches to 3, but without the - // bogus gmail "mail." domain boost the scores stay below the 40 threshold. - const personal = email({ - body: 'Thanks!\n\nunsubscribe\nprivacy policy\nall rights reserved', - }); - const result = newsletterDetector.detectNewsletter(personal); - expect(result.isPromotional).toBe(false); - expect(result.isNewsletter).toBe(false); - }); -}); diff --git a/web/src/__tests__/phase-9/purchaseDetector.currency.test.ts b/web/src/__tests__/phase-9/purchaseDetector.currency.test.ts deleted file mode 100644 index 1fa28bb..0000000 --- a/web/src/__tests__/phase-9/purchaseDetector.currency.test.ts +++ /dev/null @@ -1,59 +0,0 @@ -import { describe, it, expect } from 'vitest'; -import type { Email } from '../../types'; -import { purchaseDetector } from '../../services/purchaseDetector'; - -const email = (overrides: Partial = {}): Email => ({ - id: 1, - subject: 'Your order confirmation #12345', - sender: 'orders@example.com', - senderName: 'Example', - recipients: ['me@example.com'], - date: new Date('2024-01-01'), - body: 'Order total: $42.00', - attachments: [], - size: 1024, - isRead: true, - isStarred: false, - folderId: 'inbox', - emailType: 'regular', - ...overrides, -}); - -describe('PurchaseDetector currency persistence', () => { - it('createPurchaseFromEmail uses the supplied currency, not a hardcoded USD', () => { - const purchase = purchaseDetector.createPurchaseFromEmail( - email(), - 'Acme', - 49.99, - undefined, - 'EUR', - ); - expect(purchase.currency).toBe('EUR'); - }); - - it('defaults to USD when no currency is supplied', () => { - const purchase = purchaseDetector.createPurchaseFromEmail(email(), 'Acme', 10); - expect(purchase.currency).toBe('USD'); - }); - - it('round-trips a detected EUR amount into the stored purchase currency', () => { - const eur = email({ - sender: 'orders@shop.de', - senderName: 'Shop', - subject: 'Your order confirmation #99', - body: 'Order total: €49,99\nThank you for your order.', - }); - const result = purchaseDetector.detectPurchase(eur); - expect(result.data?.currency).toBe('EUR'); - - const purchase = purchaseDetector.createPurchaseFromEmail( - eur, - result.data!.merchant!, - result.data!.amount!, - result.data!.orderNumber, - result.data!.currency, - ); - expect(purchase.currency).toBe('EUR'); - expect(purchase.amount).toBe(49.99); - }); -}); diff --git a/web/src/__tests__/phase-9/purchaseDetector.detect.test.ts b/web/src/__tests__/phase-9/purchaseDetector.detect.test.ts deleted file mode 100644 index a84fbdf..0000000 --- a/web/src/__tests__/phase-9/purchaseDetector.detect.test.ts +++ /dev/null @@ -1,122 +0,0 @@ -import { describe, it, expect } from 'vitest'; -import type { Email } from '../../types'; -import { purchaseDetector } from '../../services/purchaseDetector'; - -// Behavioral coverage for detectPurchase()'s confidence gating, anti-pattern -// rejection, order-number extraction, and getPurchaseCategory(). The existing -// purchaseDetector.{currency,locale,domain} tests cover amount parsing and -// merchant matching only. - -const email = (overrides: Partial = {}): Email => ({ - id: 1, - subject: 'Hello', - sender: 'someone@example.com', - recipients: ['me@example.com'], - date: new Date('2024-01-01'), - body: 'plain message', - attachments: [], - size: 100, - isRead: true, - isStarred: false, - folderId: 'inbox', - emailType: 'regular', - ...overrides, -}); - -describe('PurchaseDetector.detectPurchase', () => { - it('detects a known-merchant order confirmation with an amount', () => { - const result = purchaseDetector.detectPurchase( - email({ - sender: 'orders@amazon.com', - subject: 'Your order confirmation #100001', - body: 'Order total: $42.00\nYour order has been confirmed.', - }) - ); - - expect(result.type).toBe('purchase'); - expect(result.confidence).toBeGreaterThanOrEqual(70); - expect(result.data?.merchant).toBe('Amazon'); - expect(result.data?.amount).toBe(42); - expect(result.data?.currency).toBe('USD'); - }); - - it('extracts and validates an order number from the body', () => { - const result = purchaseDetector.detectPurchase( - email({ - sender: 'orders@amazon.com', - subject: 'Your order confirmation', - body: 'Order number: ABC12345\nOrder total: $42.00\nYour order has been confirmed.', - }) - ); - - expect(result.type).toBe('purchase'); - expect(result.data?.orderNumber).toBe('ABC12345'); - }); - - it('rejects promotional emails that trip 3+ anti-patterns', () => { - const result = purchaseDetector.detectPurchase( - email({ - sender: 'deals@amazon.com', - subject: 'Save $50 today!', - body: 'Up to 70% off! Free shipping on all orders. Order total: $42.00', - }) - ); - - expect(result.type).toBe('none'); - }); - - it('does NOT report a purchase when no amount can be parsed (confidence < 70)', () => { - const result = purchaseDetector.detectPurchase( - email({ - sender: 'orders@amazon.com', - subject: 'Your order confirmation', - body: 'Thanks for your order. Details are inside your account.', - }) - ); - - expect(result.type).toBe('none'); - }); - - it('does NOT flag a plain personal email as a purchase', () => { - const result = purchaseDetector.detectPurchase( - email({ - sender: 'friend@gmail.com', - subject: 're: dinner', - body: 'see you at 7', - }) - ); - - expect(result.type).toBe('none'); - }); -}); - -describe('PurchaseDetector.getPurchaseCategory', () => { - it('maps known merchants to their category', () => { - expect(purchaseDetector.getPurchaseCategory('Amazon')).toBe('ecommerce'); - expect(purchaseDetector.getPurchaseCategory('Netflix')).toBe('entertainment'); - expect(purchaseDetector.getPurchaseCategory('Uber')).toBe('transportation'); - expect(purchaseDetector.getPurchaseCategory('Delta Airlines')).toBe('travel'); - }); - - it('falls back to "other" for unknown merchants', () => { - expect(purchaseDetector.getPurchaseCategory('Some Local Shop')).toBe('other'); - }); -}); - -describe('PurchaseDetector.createPurchaseFromEmail', () => { - it('builds a purchase record and derives the category from the merchant', () => { - const purchase = purchaseDetector.createPurchaseFromEmail( - email({ id: 7, date: new Date('2024-03-03') }), - 'Amazon', - 42, - 'ABC12345' - ); - - expect(purchase.emailId).toBe(7); - expect(purchase.merchant).toBe('Amazon'); - expect(purchase.amount).toBe(42); - expect(purchase.orderNumber).toBe('ABC12345'); - expect(purchase.category).toBe('ecommerce'); - expect(purchase.purchaseDate).toEqual(new Date('2024-03-03')); - }); -}); diff --git a/web/src/__tests__/phase-9/purchaseDetector.domain.test.ts b/web/src/__tests__/phase-9/purchaseDetector.domain.test.ts deleted file mode 100644 index 4157104..0000000 --- a/web/src/__tests__/phase-9/purchaseDetector.domain.test.ts +++ /dev/null @@ -1,39 +0,0 @@ -import { describe, it, expect } from 'vitest'; -import type { Email } from '../../types'; -import { purchaseDetector } from '../../services/purchaseDetector'; - -const email = (overrides: Partial = {}): Email => ({ - id: 1, - subject: 'Your order confirmation #12345', - sender: 'orders@maxwell.com', - senderName: 'Maxwell', - recipients: ['me@example.com'], - date: new Date('2024-01-01'), - body: 'Order total: $42.00', - attachments: [], - size: 1024, - isRead: true, - isStarred: false, - folderId: 'inbox', - emailType: 'regular', - ...overrides, -}); - -describe('PurchaseDetector merchant domain matching (issue 5)', () => { - it('does NOT attribute a maxwell.com purchase to a "max" merchant', () => { - const result = purchaseDetector.detectPurchase(email()); - // detected merchant must be the formatted domain (Maxwell), never a known - // merchant matched via the buggy substring path - expect(result.data?.merchant).toBe('Maxwell'); - }); - - it('does NOT attribute php.net purchase to HP via substring match', () => { - // Bug: 'php.net'.includes('hp.') is true, so it wrongly matches hp.com -> 'HP' - // After fix: php.net is not a subdomain of hp.com, so merchant = formatted domain - const result = purchaseDetector.detectPurchase( - email({ sender: 'billing@php.net', senderName: 'PHP' }), - ); - // If matched via bug, merchant would be 'HP'. After fix, merchant = 'PHP' (formatted domain). - expect(result.data?.merchant).not.toBe('HP'); - }); -}); diff --git a/web/src/__tests__/phase-9/purchaseDetector.locale.test.ts b/web/src/__tests__/phase-9/purchaseDetector.locale.test.ts deleted file mode 100644 index 0d7fe52..0000000 --- a/web/src/__tests__/phase-9/purchaseDetector.locale.test.ts +++ /dev/null @@ -1,36 +0,0 @@ -import { describe, it, expect } from 'vitest'; -import { purchaseDetector } from '../../services/purchaseDetector'; - -// parseAmount is private; expose via a tiny cast to keep the test focused. -const parse = (s: string, currency: string): number => - (purchaseDetector as unknown as { parseAmount(s: string, c: string): number }).parseAmount(s, currency); - -describe('PurchaseDetector.parseAmount locale handling (issue 11)', () => { - it('EUR thousands with dot, no cents: 1.234 -> 1234', () => { - expect(parse('1.234', 'EUR')).toBe(1234); - }); - - it('EUR with dot thousands and comma decimals: 1.234,56 -> 1234.56', () => { - expect(parse('1.234,56', 'EUR')).toBe(1234.56); - }); - - it('EUR comma decimals only: 1,23 -> 1.23 (cents NOT dropped)', () => { - expect(parse('1,23', 'EUR')).toBe(1.23); - }); - - it('EUR space thousands: 1 234,56 -> 1234.56', () => { - expect(parse('1 234,56', 'EUR')).toBe(1234.56); - }); - - it('USD dot decimals with comma thousands: 1,234.56 -> 1234.56', () => { - expect(parse('1,234.56', 'USD')).toBe(1234.56); - }); - - it('USD plain decimals: 42.00 -> 42', () => { - expect(parse('42.00', 'USD')).toBe(42); - }); - - it('CHF apostrophe thousands with dot decimals: 1\'234.50 -> 1234.5', () => { - expect(parse("1'234.50", 'CHF')).toBe(1234.5); - }); -}); diff --git a/web/src/__tests__/phase-9/snippet-render.test.tsx b/web/src/__tests__/phase-9/snippet-render.test.tsx index 15b901d..f5f525d 100644 --- a/web/src/__tests__/phase-9/snippet-render.test.tsx +++ b/web/src/__tests__/phase-9/snippet-render.test.tsx @@ -45,13 +45,13 @@ describe('EmailCard snippet rendering', () => { const oneThread: EmailThread = { id: 't1', subject: 'T', emails: [{ ...base, snippet: 'THREAD SNIPPET' }], - participants: ['a@b.com'], lastMessageDate: base.date, firstMessageDate: base.date, + participants: ['a@b.com'], lastMessageDate: base.date!, firstMessageDate: base.date!, messageCount: 1, unreadCount: 1, hasAttachments: false, isStarred: false, }; const oneThreadNoSnippet: EmailThread = { id: 't2', subject: 'T2', emails: [{ ...base, snippet: undefined }], - participants: ['a@b.com'], lastMessageDate: base.date, firstMessageDate: base.date, + participants: ['a@b.com'], lastMessageDate: base.date!, firstMessageDate: base.date!, messageCount: 1, unreadCount: 1, hasAttachments: false, isStarred: false, }; @@ -60,7 +60,7 @@ const multiThread: EmailThread = { { ...base, id: 2, snippet: 'OLDER SNIPPET', body: 'older body' }, { ...base, id: 3, snippet: 'LATEST SNIPPET', body: 'latest body' }, ], - participants: ['a@b.com'], lastMessageDate: base.date, firstMessageDate: base.date, + participants: ['a@b.com'], lastMessageDate: base.date!, firstMessageDate: base.date!, messageCount: 2, unreadCount: 0, hasAttachments: false, isStarred: false, }; diff --git a/web/src/__tests__/phase-9/subscriptionDetector.billing.test.ts b/web/src/__tests__/phase-9/subscriptionDetector.billing.test.ts deleted file mode 100644 index 3920393..0000000 --- a/web/src/__tests__/phase-9/subscriptionDetector.billing.test.ts +++ /dev/null @@ -1,73 +0,0 @@ -import { describe, it, expect } from 'vitest'; -import type { Email } from '../../types'; -import { subscriptionDetector } from '../../services/subscriptionDetector'; - -const email = (overrides: Partial = {}): Email => ({ - id: 1, - subject: 'Your subscription renewal', - sender: 'billing@netflix.com', - senderName: 'Netflix', - recipients: ['me@example.com'], - date: new Date('2024-01-01'), - body: '', - attachments: [], - size: 1024, - isRead: true, - isStarred: false, - folderId: 'inbox', - emailType: 'regular', - ...overrides, -}); - -describe('SubscriptionDetector billing context (issue 10)', () => { - it('picks the billing amount, not an unrelated footer price', () => { - const result = subscriptionDetector.detectSubscription( - email({ - body: 'Your subscription renews. You will be charged $15.49 per month. Free shipping on orders over $0.00.', - }), - ); - expect(result.amount).toBe(15.49); - }); - - it('returns no amount when no billing-context phrase surrounds a price', () => { - const result = subscriptionDetector.detectSubscription( - email({ - body: 'Your subscription renewal is confirmed. Check out our store: hoodies from $0.00 today!', - }), - ); - expect(result.amount).toBeUndefined(); - }); - - it('detects yearly only when the billing context says yearly', () => { - const result = subscriptionDetector.detectSubscription( - email({ body: 'Your subscription renews. You will be billed $99.00 per year.' }), - ); - expect(result.frequency).toBe('yearly'); - }); - - it('does NOT pick yearly from "billed monthly, save yearly"', () => { - const result = subscriptionDetector.detectSubscription( - email({ body: 'Recurring charge: $9.99 billed monthly. Switch and save 20% yearly!' }), - ); - expect(result.frequency).toBe('monthly'); - }); - - it('returns undefined frequency when there is no billing signal at all', () => { - const result = subscriptionDetector.detectSubscription( - email({ body: 'Your subscription renewal is confirmed. Enjoy the show.' }), - ); - expect(result.frequency).toBeUndefined(); - }); - - it('picks the billing-anchored price when a non-billing price appears first', () => { - // The old first-match extractAmount would return 5.00 (first $ in body). - // The fixed billing-anchored extractAmount skips $5.00 (no billing keyword - // in its ±40-char window) and returns 12.99 (adjacent to "charged"). - const result = subscriptionDetector.detectSubscription( - email({ - body: 'Limited-time offer: $5.00 off your next order! Your subscription will be charged $12.99 per month starting today.', - }), - ); - expect(result.amount).toBe(12.99); - }); -}); diff --git a/web/src/__tests__/phase-9/subscriptionDetector.detect.test.ts b/web/src/__tests__/phase-9/subscriptionDetector.detect.test.ts deleted file mode 100644 index 2584715..0000000 --- a/web/src/__tests__/phase-9/subscriptionDetector.detect.test.ts +++ /dev/null @@ -1,79 +0,0 @@ -import { describe, it, expect } from 'vitest'; -import type { Email } from '../../types'; -import { subscriptionDetector } from '../../services/subscriptionDetector'; - -// Behavioral coverage for detectSubscription()'s end-to-end classification and -// getKnownServices(). The existing subscriptionDetector.{billing,domain} tests -// cover amount/frequency windowing and substring matching only. - -const email = (overrides: Partial = {}): Email => ({ - id: 1, - subject: 'Hello', - sender: 'someone@example.com', - recipients: ['me@example.com'], - date: new Date('2024-01-01'), - body: 'plain message', - attachments: [], - size: 100, - isRead: true, - isStarred: false, - folderId: 'inbox', - emailType: 'regular', - ...overrides, -}); - -describe('SubscriptionDetector.detectSubscription', () => { - it('detects a known service renewal with amount, frequency and category', () => { - const result = subscriptionDetector.detectSubscription( - email({ - sender: 'billing@spotify.com', - subject: 'Your subscription renewal', - body: 'Your subscription will auto-renew. You will be charged $9.99 per month.', - }) - ); - - expect(result.isSubscription).toBe(true); - expect(result.serviceName).toBe('Spotify'); - expect(result.category).toBe('streaming'); - expect(result.amount).toBe(9.99); - expect(result.currency).toBe('USD'); - expect(result.frequency).toBe('monthly'); - }); - - it('detects a subscription from an unknown sender via body patterns and names it from the sender', () => { - const result = subscriptionDetector.detectSubscription( - email({ - sender: 'noreply@someservice.io', - senderName: 'Some Service', - subject: 'Receipt', - body: 'Billing period: monthly\nYour next billing date: 2024-02-01\nRecurring charge: $5.00', - }) - ); - - expect(result.isSubscription).toBe(true); - expect(result.serviceName).toBe('Some Service'); - }); - - it('does NOT flag a regular personal email as a subscription', () => { - const result = subscriptionDetector.detectSubscription( - email({ - sender: 'friend@gmail.com', - subject: 'hi', - body: 'how are you doing today?', - }) - ); - - expect(result.isSubscription).toBe(false); - expect(result.category).toBe('other'); - }); -}); - -describe('SubscriptionDetector.getKnownServices', () => { - it('returns the catalogue of known subscription services with domain + category', () => { - const services = subscriptionDetector.getKnownServices(); - const netflix = services.find(s => s.domain === 'netflix.com'); - - expect(services.length).toBeGreaterThan(0); - expect(netflix).toEqual({ domain: 'netflix.com', name: 'Netflix', category: 'streaming' }); - }); -}); diff --git a/web/src/__tests__/phase-9/subscriptionDetector.domain.test.ts b/web/src/__tests__/phase-9/subscriptionDetector.domain.test.ts deleted file mode 100644 index 0d90d80..0000000 --- a/web/src/__tests__/phase-9/subscriptionDetector.domain.test.ts +++ /dev/null @@ -1,35 +0,0 @@ -import { describe, it, expect } from 'vitest'; -import type { Email } from '../../types'; -import { subscriptionDetector } from '../../services/subscriptionDetector'; - -const email = (overrides: Partial = {}): Email => ({ - id: 1, - subject: 'Your subscription renewal', - sender: 'billing@maxwell.com', - senderName: 'Maxwell', - recipients: ['me@example.com'], - date: new Date('2024-01-01'), - body: 'Your subscription renews. Recurring charge: $9.99 per month.', - attachments: [], - size: 1024, - isRead: true, - isStarred: false, - folderId: 'inbox', - emailType: 'regular', - ...overrides, -}); - -describe('SubscriptionDetector domain matching (issue 5)', () => { - it('does NOT attribute maxwell.com to "Max" via base-word substring', () => { - const result = subscriptionDetector.detectSubscription(email()); - // 'max.com' -> { name: 'Max' } exists in knownSubscriptions; maxwell.com must NOT match it - expect(result.serviceName).not.toBe('Max'); - }); - - it('still matches a real subdomain of a known service', () => { - const result = subscriptionDetector.detectSubscription( - email({ sender: 'no-reply@mail.netflix.com', senderName: '' }), - ); - expect(result.serviceName).toBe('Netflix'); - }); -}); diff --git a/web/src/components/AttachmentGallery.tsx b/web/src/components/AttachmentGallery.tsx index 294d6ab..9686927 100644 --- a/web/src/components/AttachmentGallery.tsx +++ b/web/src/components/AttachmentGallery.tsx @@ -67,7 +67,7 @@ export function AttachmentGallery({ emails }: AttachmentGalleryProps) { // Sort by date (newest first) filteredAttachments.sort( - (a, b) => new Date(b.email.date).getTime() - new Date(a.email.date).getTime() + (a, b) => (b.email.date?.getTime() ?? -Infinity) - (a.email.date?.getTime() ?? -Infinity) ); // Stable, content-based key of the image emails currently visible. Using a diff --git a/web/src/components/ContactModal.tsx b/web/src/components/ContactModal.tsx index aa9e208..267a4b7 100644 --- a/web/src/components/ContactModal.tsx +++ b/web/src/components/ContactModal.tsx @@ -85,7 +85,7 @@ export function ContactModal({ contact, isOpen, onClose, onSave }: ContactModalP {contact.email}

- {contact.emailCount} emails • Last activity: {contact.lastEmailDate.toLocaleDateString()} + {contact.emailCount} emails • Last activity: {contact.lastEmailDate ? contact.lastEmailDate.toLocaleDateString() : 'Unknown date'}

diff --git a/web/src/components/EmailCard.tsx b/web/src/components/EmailCard.tsx index f9acac9..1f08b2c 100644 --- a/web/src/components/EmailCard.tsx +++ b/web/src/components/EmailCard.tsx @@ -125,7 +125,7 @@ export const EmailCard = memo(function EmailCard({ email, onClick }: EmailCardPr - {format(email.date, 'MMM d, yyyy')} + {email.date ? format(email.date, 'MMM d, yyyy') : 'Unknown date'} diff --git a/web/src/components/ThreadView.tsx b/web/src/components/ThreadView.tsx index 576b5e1..4417028 100644 --- a/web/src/components/ThreadView.tsx +++ b/web/src/components/ThreadView.tsx @@ -174,7 +174,7 @@ function SingleEmailView({ email, onClick, onToggleStar }: SingleEmailViewProps)
- {format(new Date(email.date), 'MMM d')} + {email.date ? format(email.date, 'MMM d') : 'Unknown date'}

- Signed up: {format(account.signupDate, 'MMM d, yyyy')} + Signed up: {account.signupDate ? format(account.signupDate, 'MMM d, yyyy') : 'Unknown date'}

{account.emailCount} email{account.emailCount !== 1 ? 's' : ''} from this service diff --git a/web/src/pages/AnalyticsPage.tsx b/web/src/pages/AnalyticsPage.tsx index 04fb7be..c8a17fd 100644 --- a/web/src/pages/AnalyticsPage.tsx +++ b/web/src/pages/AnalyticsPage.tsx @@ -15,6 +15,7 @@ export function AnalyticsPage() { const availableYears = useMemo(() => { const years = new Set(); emails.forEach((email) => { + if (!email.date) return; // undated emails contribute no year years.add(new Date(email.date).getFullYear()); }); return Array.from(years).sort((a, b) => b - a); // Most recent first @@ -23,7 +24,7 @@ export function AnalyticsPage() { // Filter data by selected year const filteredEmails = useMemo(() => { if (selectedYear === 'all') return emails; - return emails.filter(e => new Date(e.date).getFullYear() === selectedYear); + return emails.filter(e => e.date != null && new Date(e.date).getFullYear() === selectedYear); }, [emails, selectedYear]); const filteredPurchases = useMemo(() => { @@ -36,13 +37,16 @@ export function AnalyticsPage() { const now = new Date(); const thirtyDaysAgo = new Date(now.getTime() - 30 * 24 * 60 * 60 * 1000); - const recentEmails = filteredEmails.filter(e => new Date(e.date) >= thirtyDaysAgo); + const recentEmails = filteredEmails.filter(e => e.date != null && new Date(e.date) >= thirtyDaysAgo); const uniqueSenders = new Set(filteredEmails.map(e => e.sender)).size; // Calculate date range for avg emails/day (inclusive of both start and end days) let dateRange = 1; if (filteredEmails.length > 0) { - const sortedDates = filteredEmails.map(e => new Date(e.date).getTime()).sort((a, b) => a - b); + const sortedDates = filteredEmails + .filter(e => e.date != null) + .map(e => new Date(e.date as Date).getTime()) + .sort((a, b) => a - b); const oldestDate = sortedDates[0]; const newestDate = sortedDates[sortedDates.length - 1]; // Add 1 for inclusive counting: emails from Jan 1 to Jan 2 span 2 calendar days @@ -62,6 +66,7 @@ export function AnalyticsPage() { const monthlyData: Record = {}; filteredEmails.forEach((email) => { + if (!email.date) return; // undated emails excluded from volume aggregation const date = new Date(email.date); const key = `${date.getFullYear()}-${String(date.getMonth() + 1).padStart(2, '0')}`; monthlyData[key] = (monthlyData[key] || 0) + 1; @@ -127,6 +132,7 @@ export function AnalyticsPage() { const hourlyData: number[][] = Array(7).fill(null).map(() => Array(24).fill(0)); filteredEmails.forEach((email) => { + if (!email.date) return; // undated emails excluded from activity heatmap const date = new Date(email.date); const day = date.getDay(); const hour = date.getHours(); diff --git a/web/src/pages/AttachmentsPage.tsx b/web/src/pages/AttachmentsPage.tsx index 2ad69fb..acbbb99 100644 --- a/web/src/pages/AttachmentsPage.tsx +++ b/web/src/pages/AttachmentsPage.tsx @@ -108,7 +108,7 @@ export function AttachmentsPage() { attachments.push({ ...attachment, email }); } } - return attachments.sort((a, b) => b.email.date.getTime() - a.email.date.getTime()); + return attachments.sort((a, b) => (b.email.date?.getTime() ?? -Infinity) - (a.email.date?.getTime() ?? -Infinity)); }, [emails]); // Filter attachments @@ -478,7 +478,7 @@ function ListAttachmentRow({ att, isSelected, bodyCache, fetchData, onSelect, on {att.filename}

- From: {att.email.sender} • {format(att.email.date, 'MMM d, yyyy')} + From: {att.email.sender} • {att.email.date ? format(att.email.date, 'MMM d, yyyy') : 'Unknown date'}
diff --git a/web/src/pages/ContactsPage.tsx b/web/src/pages/ContactsPage.tsx index eeaab98..1304223 100644 --- a/web/src/pages/ContactsPage.tsx +++ b/web/src/pages/ContactsPage.tsx @@ -52,7 +52,7 @@ export function ContactsPage() { case 'emailCount': return b.emailCount - a.emailCount; case 'lastActivity': - return new Date(b.lastEmailDate).getTime() - new Date(a.lastEmailDate).getTime(); + return (b.lastEmailDate?.getTime() ?? -Infinity) - (a.lastEmailDate?.getTime() ?? -Infinity); default: return 0; } @@ -221,7 +221,7 @@ export function ContactsPage() {
- {format(contact.lastEmailDate, 'MMM d, yyyy')} + {contact.lastEmailDate ? format(contact.lastEmailDate, 'MMM d, yyyy') : 'Unknown date'} diff --git a/web/src/pages/EmailDetailPage.tsx b/web/src/pages/EmailDetailPage.tsx index cf56751..7009d90 100644 --- a/web/src/pages/EmailDetailPage.tsx +++ b/web/src/pages/EmailDetailPage.tsx @@ -238,7 +238,7 @@ export function EmailDetailPage() { )}
- {format(email.date, 'EEEE, MMMM d, yyyy \'at\' h:mm a')} + {email.date ? format(email.date, 'EEEE, MMMM d, yyyy \'at\' h:mm a') : 'Unknown date'} {email.folderId !== SYSTEM_FOLDERS.INBOX && (
- Last: {format(nl.lastEmailDate, 'MMM d')} + Last: {nl.lastEmailDate ? format(nl.lastEmailDate, 'MMM d') : 'Unknown date'}
{nl.unsubscribeLink && ( diff --git a/web/src/pages/SenderEmailsPage.tsx b/web/src/pages/SenderEmailsPage.tsx index 4e1e9f0..a5ae010 100644 --- a/web/src/pages/SenderEmailsPage.tsx +++ b/web/src/pages/SenderEmailsPage.tsx @@ -117,7 +117,7 @@ export function SenderEmailsPage() { } // Sort by date descending - result.sort((a, b) => new Date(b.date).getTime() - new Date(a.date).getTime()); + result.sort((a, b) => (b.date?.getTime() ?? -Infinity) - (a.date?.getTime() ?? -Infinity)); return result; }, [senderEmails, readFilter, folderFilter, typeFilter, searchQuery, searchTextMap]); @@ -368,7 +368,7 @@ export function SenderEmailsPage() { )} - {format(email.date, 'MMM d, yyyy')} + {email.date ? format(email.date, 'MMM d, yyyy') : 'Unknown date'} diff --git a/web/src/pages/SendersPage.tsx b/web/src/pages/SendersPage.tsx index b3fd2eb..00220c5 100644 --- a/web/src/pages/SendersPage.tsx +++ b/web/src/pages/SendersPage.tsx @@ -121,7 +121,8 @@ export function SendersPage() { const existing = groups.get(key); if (existing) { existing.emails.push(email); - if (new Date(email.date) > existing.latestDate) { + // Only advance latestDate from emails that have a real date. + if (email.date && new Date(email.date) > existing.latestDate) { existing.latestDate = new Date(email.date); } } else { @@ -129,7 +130,8 @@ export function SendersPage() { key, displayName, emails: [email], - latestDate: new Date(email.date), + // Undated first email seeds the epoch (sorts last); real dates override later. + latestDate: email.date ? new Date(email.date) : new Date(0), }); } }); @@ -358,7 +360,7 @@ export function SendersPage() {

- {format(email.date, 'MMM d, yyyy')} + {email.date ? format(email.date, 'MMM d, yyyy') : 'Unknown date'} diff --git a/web/src/pages/SubscriptionsPage.tsx b/web/src/pages/SubscriptionsPage.tsx index 2aa6fc6..7a55a9a 100644 --- a/web/src/pages/SubscriptionsPage.tsx +++ b/web/src/pages/SubscriptionsPage.tsx @@ -218,7 +218,7 @@ export function SubscriptionsPage() {
- Last: {format(sub.lastRenewalDate, 'MMM d, yyyy')} + Last: {sub.lastRenewalDate ? format(sub.lastRenewalDate, 'MMM d, yyyy') : 'Unknown date'}
@@ -302,7 +302,7 @@ export function SubscriptionsPage() {
Last Payment
- {format(selectedSubscription.lastRenewalDate, 'MMM d, yyyy')} + {selectedSubscription.lastRenewalDate ? format(selectedSubscription.lastRenewalDate, 'MMM d, yyyy') : 'Unknown date'}
@@ -336,7 +336,7 @@ export function SubscriptionsPage() { {email.subject || '(No Subject)'}
- {format(email.date, 'MMM d, yyyy')} • {email.sender} + {email.date ? format(email.date, 'MMM d, yyyy') : 'Unknown date'} • {email.sender}
diff --git a/web/src/services/__tests__/library-smoke.test.ts b/web/src/services/__tests__/library-smoke.test.ts new file mode 100644 index 0000000..a37240a --- /dev/null +++ b/web/src/services/__tests__/library-smoke.test.ts @@ -0,0 +1,20 @@ +import { describe, it, expect } from 'vitest'; +import { MBOXParser, AccountDetector } from '@technical-1/email-archive-parser'; + +describe('library smoke', () => { + it('parses a trivial MBOX buffer and runs a detector', async () => { + const mbox = [ + 'From a@x.com Mon Jan 1 00:00:00 2024', + 'From: Welcome ', + 'Subject: Welcome to Netflix!', + 'Date: Mon, 01 Jan 2024 00:00:00 +0000', + '', + 'Your account has been created.', + '', + ].join('\n'); + const result = await new MBOXParser().parse(Buffer.from(mbox, 'utf-8')); + expect(result.emails.length).toBe(1); + const det = new AccountDetector().detect(result.emails[0]); + expect(det.type).toBe('account'); + }); +}); diff --git a/web/src/services/accountDetector.ts b/web/src/services/accountDetector.ts deleted file mode 100644 index b3b157d..0000000 --- a/web/src/services/accountDetector.ts +++ /dev/null @@ -1,343 +0,0 @@ -import type { Email, Account, DetectionResult } from '../types'; -import { stripHtml, extractDomain } from '../utils/emailUtils'; -import { isDomainMatch } from './domainMatch'; - -class AccountDetector { - // Strong subject line patterns for account signups (must be primary purpose of email) - private readonly strongSubjectPatterns = [ - /^welcome to/i, - /^verify your.*(?:email|account)/i, - /^confirm your.*(?:email|account|registration)/i, - /^activate your.*account/i, - /^your.*account.*(?:has been |is )created/i, - /^(?:complete|finish) your registration/i, - /^thanks for (?:signing up|registering|joining)/i, - /^you(?:'re| are) (?:in|registered)/i, - /email verification/i, - /account verification/i, - ]; - - // Strong body patterns (high confidence indicators) - private readonly strongBodyPatterns = [ - /click.*(?:here|below|button).*(?:to )?verify your email/i, - /confirm your email address/i, - /complete your registration/i, - /your account has been (?:successfully )?created/i, - /welcome to .{2,50}[!.]/i, - /thanks for (?:signing up|registering|creating an account)/i, - /verification code[:\s]+\d{4,8}/i, - /your verification code is/i, - ]; - - // Known service domains for reliable detection - private readonly knownServices: Record = { - 'netflix.com': { name: 'Netflix', type: 'streaming' }, - 'spotify.com': { name: 'Spotify', type: 'streaming' }, - 'hulu.com': { name: 'Hulu', type: 'streaming' }, - 'disneyplus.com': { name: 'Disney+', type: 'streaming' }, - 'hbomax.com': { name: 'HBO Max', type: 'streaming' }, - 'max.com': { name: 'Max', type: 'streaming' }, - 'peacocktv.com': { name: 'Peacock', type: 'streaming' }, - 'paramountplus.com': { name: 'Paramount+', type: 'streaming' }, - 'primevideo.com': { name: 'Prime Video', type: 'streaming' }, - 'crunchyroll.com': { name: 'Crunchyroll', type: 'streaming' }, - 'youtube.com': { name: 'YouTube', type: 'streaming' }, - 'twitch.tv': { name: 'Twitch', type: 'streaming' }, - 'amazon.com': { name: 'Amazon', type: 'ecommerce' }, - 'ebay.com': { name: 'eBay', type: 'ecommerce' }, - 'etsy.com': { name: 'Etsy', type: 'ecommerce' }, - 'shopify.com': { name: 'Shopify', type: 'ecommerce' }, - 'walmart.com': { name: 'Walmart', type: 'ecommerce' }, - 'target.com': { name: 'Target', type: 'ecommerce' }, - 'bestbuy.com': { name: 'Best Buy', type: 'ecommerce' }, - 'aliexpress.com': { name: 'AliExpress', type: 'ecommerce' }, - 'wish.com': { name: 'Wish', type: 'ecommerce' }, - 'facebook.com': { name: 'Facebook', type: 'social' }, - 'meta.com': { name: 'Meta', type: 'social' }, - 'instagram.com': { name: 'Instagram', type: 'social' }, - 'twitter.com': { name: 'Twitter', type: 'social' }, - 'x.com': { name: 'X', type: 'social' }, - 'linkedin.com': { name: 'LinkedIn', type: 'social' }, - 'tiktok.com': { name: 'TikTok', type: 'social' }, - 'reddit.com': { name: 'Reddit', type: 'social' }, - 'pinterest.com': { name: 'Pinterest', type: 'social' }, - 'snapchat.com': { name: 'Snapchat', type: 'social' }, - 'threads.net': { name: 'Threads', type: 'social' }, - 'github.com': { name: 'GitHub', type: 'development' }, - 'gitlab.com': { name: 'GitLab', type: 'development' }, - 'bitbucket.org': { name: 'Bitbucket', type: 'development' }, - 'atlassian.com': { name: 'Atlassian', type: 'development' }, - 'jetbrains.com': { name: 'JetBrains', type: 'development' }, - 'stackoverflow.com': { name: 'Stack Overflow', type: 'development' }, - 'heroku.com': { name: 'Heroku', type: 'development' }, - 'vercel.com': { name: 'Vercel', type: 'development' }, - 'netlify.com': { name: 'Netlify', type: 'development' }, - 'digitalocean.com': { name: 'DigitalOcean', type: 'development' }, - 'aws.amazon.com': { name: 'AWS', type: 'development' }, - 'cloud.google.com': { name: 'Google Cloud', type: 'development' }, - 'azure.microsoft.com': { name: 'Azure', type: 'development' }, - 'slack.com': { name: 'Slack', type: 'communication' }, - 'zoom.us': { name: 'Zoom', type: 'communication' }, - 'discord.com': { name: 'Discord', type: 'communication' }, - 'teams.microsoft.com': { name: 'Microsoft Teams', type: 'communication' }, - 'telegram.org': { name: 'Telegram', type: 'communication' }, - 'whatsapp.com': { name: 'WhatsApp', type: 'communication' }, - 'signal.org': { name: 'Signal', type: 'communication' }, - 'paypal.com': { name: 'PayPal', type: 'banking' }, - 'venmo.com': { name: 'Venmo', type: 'banking' }, - 'stripe.com': { name: 'Stripe', type: 'banking' }, - 'chase.com': { name: 'Chase', type: 'banking' }, - 'bankofamerica.com': { name: 'Bank of America', type: 'banking' }, - 'wellsfargo.com': { name: 'Wells Fargo', type: 'banking' }, - 'capitalone.com': { name: 'Capital One', type: 'banking' }, - 'citi.com': { name: 'Citibank', type: 'banking' }, - 'schwab.com': { name: 'Charles Schwab', type: 'banking' }, - 'fidelity.com': { name: 'Fidelity', type: 'banking' }, - 'robinhood.com': { name: 'Robinhood', type: 'banking' }, - 'coinbase.com': { name: 'Coinbase', type: 'banking' }, - 'dropbox.com': { name: 'Dropbox', type: 'other' }, - 'box.com': { name: 'Box', type: 'other' }, - 'notion.so': { name: 'Notion', type: 'other' }, - 'figma.com': { name: 'Figma', type: 'other' }, - 'canva.com': { name: 'Canva', type: 'other' }, - 'adobe.com': { name: 'Adobe', type: 'other' }, - 'microsoft.com': { name: 'Microsoft', type: 'other' }, - 'google.com': { name: 'Google', type: 'other' }, - 'apple.com': { name: 'Apple', type: 'other' }, - 'icloud.com': { name: 'iCloud', type: 'other' }, - 'uber.com': { name: 'Uber', type: 'other' }, - 'lyft.com': { name: 'Lyft', type: 'other' }, - 'doordash.com': { name: 'DoorDash', type: 'other' }, - 'grubhub.com': { name: 'Grubhub', type: 'other' }, - 'instacart.com': { name: 'Instacart', type: 'other' }, - 'airbnb.com': { name: 'Airbnb', type: 'other' }, - // Additional streaming services - 'appletv.apple.com': { name: 'Apple TV+', type: 'streaming' }, - 'funimation.com': { name: 'Funimation', type: 'streaming' }, - 'showtime.com': { name: 'Showtime', type: 'streaming' }, - 'starz.com': { name: 'Starz', type: 'streaming' }, - 'discovery.com': { name: 'Discovery+', type: 'streaming' }, - 'espn.com': { name: 'ESPN+', type: 'streaming' }, - 'audible.com': { name: 'Audible', type: 'streaming' }, - 'pandora.com': { name: 'Pandora', type: 'streaming' }, - 'deezer.com': { name: 'Deezer', type: 'streaming' }, - 'tidal.com': { name: 'Tidal', type: 'streaming' }, - // Additional ecommerce - 'newegg.com': { name: 'Newegg', type: 'ecommerce' }, - 'wayfair.com': { name: 'Wayfair', type: 'ecommerce' }, - 'zappos.com': { name: 'Zappos', type: 'ecommerce' }, - 'macys.com': { name: "Macy's", type: 'ecommerce' }, - 'nordstrom.com': { name: 'Nordstrom', type: 'ecommerce' }, - 'costco.com': { name: 'Costco', type: 'ecommerce' }, - 'homedepot.com': { name: 'Home Depot', type: 'ecommerce' }, - 'lowes.com': { name: "Lowe's", type: 'ecommerce' }, - 'sephora.com': { name: 'Sephora', type: 'ecommerce' }, - 'ulta.com': { name: 'Ulta', type: 'ecommerce' }, - 'chewy.com': { name: 'Chewy', type: 'ecommerce' }, - // Additional social - 'tumblr.com': { name: 'Tumblr', type: 'social' }, - 'mastodon.social': { name: 'Mastodon', type: 'social' }, - 'bluesky.social': { name: 'Bluesky', type: 'social' }, - 'nextdoor.com': { name: 'Nextdoor', type: 'social' }, - 'quora.com': { name: 'Quora', type: 'social' }, - // Additional development/productivity - 'trello.com': { name: 'Trello', type: 'development' }, - 'asana.com': { name: 'Asana', type: 'development' }, - 'monday.com': { name: 'Monday.com', type: 'development' }, - 'jira.com': { name: 'Jira', type: 'development' }, - 'confluence.com': { name: 'Confluence', type: 'development' }, - 'npm.com': { name: 'npm', type: 'development' }, - 'docker.com': { name: 'Docker', type: 'development' }, - 'cloudflare.com': { name: 'Cloudflare', type: 'development' }, - 'firebase.google.com': { name: 'Firebase', type: 'development' }, - 'render.com': { name: 'Render', type: 'development' }, - 'railway.app': { name: 'Railway', type: 'development' }, - 'supabase.com': { name: 'Supabase', type: 'development' }, - 'planetscale.com': { name: 'PlanetScale', type: 'development' }, - // Additional communication - 'webex.com': { name: 'Webex', type: 'communication' }, - 'gotomeeting.com': { name: 'GoToMeeting', type: 'communication' }, - 'line.me': { name: 'LINE', type: 'communication' }, - 'viber.com': { name: 'Viber', type: 'communication' }, - // Additional banking/finance - 'americanexpress.com': { name: 'American Express', type: 'banking' }, - 'discover.com': { name: 'Discover', type: 'banking' }, - 'usbank.com': { name: 'US Bank', type: 'banking' }, - 'pnc.com': { name: 'PNC', type: 'banking' }, - 'tdbank.com': { name: 'TD Bank', type: 'banking' }, - 'ally.com': { name: 'Ally Bank', type: 'banking' }, - 'marcus.com': { name: 'Marcus', type: 'banking' }, - 'sofi.com': { name: 'SoFi', type: 'banking' }, - 'chime.com': { name: 'Chime', type: 'banking' }, - 'cashapp.com': { name: 'Cash App', type: 'banking' }, - 'wealthfront.com': { name: 'Wealthfront', type: 'banking' }, - 'betterment.com': { name: 'Betterment', type: 'banking' }, - 'acorns.com': { name: 'Acorns', type: 'banking' }, - 'kraken.com': { name: 'Kraken', type: 'banking' }, - 'binance.com': { name: 'Binance', type: 'banking' }, - // Additional other services - 'evernote.com': { name: 'Evernote', type: 'other' }, - 'todoist.com': { name: 'Todoist', type: 'other' }, - 'grammarly.com': { name: 'Grammarly', type: 'other' }, - '1password.com': { name: '1Password', type: 'other' }, - 'lastpass.com': { name: 'LastPass', type: 'other' }, - 'bitwarden.com': { name: 'Bitwarden', type: 'other' }, - 'dashlane.com': { name: 'Dashlane', type: 'other' }, - 'nordvpn.com': { name: 'NordVPN', type: 'other' }, - 'expressvpn.com': { name: 'ExpressVPN', type: 'other' }, - 'surfshark.com': { name: 'Surfshark', type: 'other' }, - 'duolingo.com': { name: 'Duolingo', type: 'other' }, - 'coursera.org': { name: 'Coursera', type: 'other' }, - 'udemy.com': { name: 'Udemy', type: 'other' }, - 'skillshare.com': { name: 'Skillshare', type: 'other' }, - 'masterclass.com': { name: 'MasterClass', type: 'other' }, - 'calm.com': { name: 'Calm', type: 'other' }, - 'headspace.com': { name: 'Headspace', type: 'other' }, - 'strava.com': { name: 'Strava', type: 'other' }, - 'peloton.com': { name: 'Peloton', type: 'other' }, - 'myfitnesspal.com': { name: 'MyFitnessPal', type: 'other' }, - 'fitbit.com': { name: 'Fitbit', type: 'other' }, - }; - - detectAccountSignup(email: Email): DetectionResult { - const subject = email.subject || ''; - const body = stripHtml(email.body || ''); - const sender = email.sender || ''; - - let confidence = 0; - let detectedService = ''; - let serviceType: Account['serviceType'] = 'other'; - - // Check if sender is from a known service - const domain = extractDomain(sender); - const serviceInfo = this.findKnownService(domain); - - if (serviceInfo) { - detectedService = serviceInfo.name; - serviceType = serviceInfo.type; - confidence += 40; // Known service gives base confidence - } - - // Check strong subject patterns - for (const pattern of this.strongSubjectPatterns) { - if (pattern.test(subject)) { - confidence += 40; - break; - } - } - - // Check strong body patterns - for (const pattern of this.strongBodyPatterns) { - if (pattern.test(body)) { - confidence += 30; - break; - } - } - - // If we have strong patterns but no known service, try to extract service name - if (confidence >= 40 && !detectedService) { - const extracted = this.extractServiceName(subject); - if (extracted) { - detectedService = extracted; - confidence += 10; - } else { - // Use domain as fallback service name (but only if patterns matched) - detectedService = this.formatDomainAsServiceName(domain); - } - } - - // Require high confidence AND a service name - if (confidence >= 70 && detectedService) { - return { - type: 'account', - confidence, - data: { - serviceName: detectedService, - serviceType, - }, - }; - } - - return { type: 'none', confidence: 0 }; - } - - private findKnownService(domain: string): { name: string; type: Account['serviceType'] } | null { - // Direct match - if (this.knownServices[domain]) { - return this.knownServices[domain]; - } - - // Exact or subdomain match against each known service domain - for (const [serviceDomain, info] of Object.entries(this.knownServices)) { - if (isDomainMatch(domain, serviceDomain)) { - return info; - } - } - - return null; - } - - private extractServiceName(subject: string): string { - // Very strict patterns - only match clear service name mentions - const patterns = [ - /^welcome to ([A-Z][a-zA-Z0-9]+(?:\s[A-Z][a-zA-Z0-9]+)?)[!.,]/i, - /thanks for (?:signing up|joining|registering) (?:for |with )?([A-Z][a-zA-Z0-9]+(?:\s[A-Z][a-zA-Z0-9]+)?)[!.,]/i, - /your ([A-Z][a-zA-Z0-9]+(?:\s[A-Z][a-zA-Z0-9]+)?) account (?:has been |is )?(?:created|ready)/i, - ]; - - // Check subject first (more reliable) - for (const pattern of patterns) { - const match = subject.match(pattern); - if (match && match[1]) { - const name = match[1].trim(); - // Validate it looks like a service name (2-30 chars, starts with letter) - if (name.length >= 2 && name.length <= 30 && /^[A-Z]/i.test(name)) { - return name; - } - } - } - - return ''; - } - - private formatDomainAsServiceName(domain: string): string { - if (!domain) return ''; - - // Get the main part of the domain (before the TLD) - const parts = domain.split('.'); - if (parts.length < 2) return ''; - - // For subdomains, try to get the main domain - let mainPart = parts.length > 2 ? parts[parts.length - 2] : parts[0]; - - // Skip common email subdomains - const skipWords = ['mail', 'email', 'noreply', 'no-reply', 'notifications', 'info', 'support', 'news', 'newsletter']; - if (skipWords.includes(mainPart.toLowerCase())) { - mainPart = parts.length > 2 ? parts[parts.length - 2] : parts[0]; - } - - // Capitalize first letter - return mainPart.charAt(0).toUpperCase() + mainPart.slice(1); - } - - getServiceType(domain: string): Account['serviceType'] { - const serviceInfo = this.findKnownService(domain); - return serviceInfo ? serviceInfo.type : 'other'; - } - - createAccountFromEmail(email: Email, serviceName: string, serviceType?: Account['serviceType']): Omit { - const senderDomain = extractDomain(email.sender); - - return { - serviceName, - signupEmailId: email.id, - signupDate: email.date, - serviceType: serviceType || this.getServiceType(senderDomain), - domain: senderDomain, - lastActivityDate: email.date, - emailCount: 1, - }; - } -} - -export const accountDetector = new AccountDetector(); diff --git a/web/src/services/backupService.ts b/web/src/services/backupService.ts index 6015b2e..9902ae5 100644 --- a/web/src/services/backupService.ts +++ b/web/src/services/backupService.ts @@ -87,7 +87,7 @@ class BackupService { if (options.dateRange) { const startTime = options.dateRange.start.getTime(); const endTime = options.dateRange.end.getTime(); - emails = emails.filter((e) => e.date >= startTime && e.date <= endTime); + emails = emails.filter((e) => e.date != null && e.date >= startTime && e.date <= endTime); } // Apply folder filter @@ -97,7 +97,7 @@ class BackupService { backup.emails = emails.map((e) => ({ ...e, - date: new Date(e.date), + date: e.date == null ? null : new Date(e.date), })) as unknown as Email[]; backup.metadata.emailCount = backup.emails.length; @@ -113,7 +113,7 @@ class BackupService { const accounts = await db.accounts.toArray(); backup.accounts = accounts.map((a) => ({ ...a, - signupDate: new Date(a.signupDate), + signupDate: a.signupDate == null ? null : new Date(a.signupDate), })) as unknown as Account[]; backup.metadata.accountCount = backup.accounts.length; } @@ -133,7 +133,7 @@ class BackupService { const contacts = await db.contacts.toArray(); backup.contacts = contacts.map((c) => ({ ...c, - lastEmailDate: new Date(c.lastEmailDate), + lastEmailDate: c.lastEmailDate == null ? null : new Date(c.lastEmailDate), })) as unknown as Contact[]; backup.metadata.contactCount = backup.contacts.length; } @@ -164,7 +164,7 @@ class BackupService { const subscriptions = await db.subscriptions.toArray(); backup.subscriptions = subscriptions.map((s) => ({ ...s, - lastRenewalDate: new Date(s.lastRenewalDate), + lastRenewalDate: s.lastRenewalDate == null ? null : new Date(s.lastRenewalDate), nextRenewalDate: s.nextRenewalDate ? new Date(s.nextRenewalDate) : undefined, emailIds: typeof s.emailIds === 'string' ? JSON.parse(s.emailIds) : (s.emailIds || []), })) as unknown as Subscription[]; @@ -176,7 +176,7 @@ class BackupService { const newsletters = await db.newsletters.toArray(); backup.newsletters = newsletters.map((n) => ({ ...n, - lastEmailDate: new Date(n.lastEmailDate), + lastEmailDate: n.lastEmailDate == null ? null : new Date(n.lastEmailDate), })) as unknown as Newsletter[]; backup.metadata.newsletterCount = backup.newsletters.length; } diff --git a/web/src/services/domainMatch.ts b/web/src/services/domainMatch.ts deleted file mode 100644 index 5942f94..0000000 --- a/web/src/services/domainMatch.ts +++ /dev/null @@ -1,11 +0,0 @@ -/** - * Returns true iff emailDomain is exactly serviceDomain or a subdomain of it. - * Boundary-safe: 'maxwell.com' does NOT match 'max.com', 'pineapple.com' does - * NOT match 'apple.com'. Comparison is case-insensitive and trimmed. - */ -export function isDomainMatch(emailDomain: string, serviceDomain: string): boolean { - const d = emailDomain.trim().toLowerCase(); - const s = serviceDomain.trim().toLowerCase(); - if (!d || !s) return false; - return d === s || d.endsWith('.' + s); -} diff --git a/web/src/services/gmailTakeoutParser.ts b/web/src/services/gmailTakeoutParser.ts deleted file mode 100644 index 9fbb272..0000000 --- a/web/src/services/gmailTakeoutParser.ts +++ /dev/null @@ -1,267 +0,0 @@ -import JSZip from 'jszip'; -import type { Email } from '../types'; -import { mboxParser, type EmailBatchCallback } from './mboxParser'; -import { logger } from '../utils/logger'; - -/** - * Parser for Google Takeout email archives - * Handles the specific ZIP structure from Google Takeout - * - * Optimized for large files: - * - Sequential MBOX processing to reduce memory pressure - * - Explicit cleanup between files to allow garbage collection - * - Streaming batch processing for each MBOX file - */ -class GmailTakeoutParser { - /** - * Parse a Gmail Takeout ZIP file with streaming support - * Processes MBOX files sequentially and calls onBatch for each batch of emails - */ - async parseGmailTakeoutStreaming( - file: File, - onProgress?: (progress: number, message: string) => void, - onBatch?: EmailBatchCallback - ): Promise { - let totalEmailsParsed = 0; - let globalBatchNumber = 0; - const seenEmailKeys = new Set(); - - onProgress?.(0, 'Opening Gmail Takeout archive...'); - - // Validate file size before loading (500MB compressed limit) - const MAX_COMPRESSED_SIZE = 500 * 1024 * 1024; - if (file.size > MAX_COMPRESSED_SIZE) { - throw new Error(`File too large (${(file.size / 1024 / 1024).toFixed(0)}MB). Maximum supported size is 500MB.`); - } - - const zip = await JSZip.loadAsync(file); - - // Check decompressed size to guard against zip bombs (2GB limit) - // JSZip stores uncompressedSize in internal _data property (not in public types) - const MAX_DECOMPRESSED_SIZE = 2 * 1024 * 1024 * 1024; - let totalDecompressedSize = 0; - for (const entry of Object.values(zip.files)) { - if (!entry.dir) { - const entryData = (entry as unknown as { _data?: { uncompressedSize?: number } })._data; - if (entryData && typeof entryData.uncompressedSize === 'number') { - totalDecompressedSize += entryData.uncompressedSize; - } - } - } - if (totalDecompressedSize > MAX_DECOMPRESSED_SIZE) { - throw new Error(`Archive decompressed size exceeds 2GB limit. This may be a malicious file.`); - } - - // Find all MBOX files in the archive - const mboxFiles: string[] = []; - - zip.forEach((path, zipEntry) => { - if ( - !zipEntry.dir && - (path.endsWith('.mbox') || path.includes('Takeout/Mail/')) - ) { - mboxFiles.push(path); - } - }); - - onProgress?.(10, `Found ${mboxFiles.length} mail folders`); - - if (mboxFiles.length === 0) { - throw new Error( - 'No email archives found in this Takeout file. Make sure you selected Mail data during export.' - ); - } - - // Process MBOX files SEQUENTIALLY (not in parallel) to reduce memory pressure - for (let fileIndex = 0; fileIndex < mboxFiles.length; fileIndex++) { - const mboxPath = mboxFiles[fileIndex]; - - try { - const zipEntry = zip.file(mboxPath); - if (!zipEntry) continue; - - // Extract folder name from path - const folderName = this.extractFolderName(mboxPath); - - onProgress?.( - 10 + ((fileIndex + 0.5) / mboxFiles.length) * 80, - `Processing ${folderName} (${fileIndex + 1}/${mboxFiles.length})...` - ); - - // Get file content - this is the memory-intensive part - let content = await zipEntry.async('string'); - - // Create a File object from the content - const mboxFile = new File([content], `${folderName}.mbox`, { - type: 'application/mbox', - }); - - // Clear the content string to free memory before parsing - // @ts-expect-error - intentionally reassigning to help GC - content = null; - - // Parse using streaming MBOX parser with deduplication - const folderMappedBatchCallback: EmailBatchCallback = async (emails) => { - // Deduplicate and add folder ID - const uniqueEmails: Omit[] = []; - - for (const email of emails) { - const key = email.threadId || - `${email.subject}|${email.sender}|${email.date.getTime()}`; - - if (!seenEmailKeys.has(key)) { - seenEmailKeys.add(key); - uniqueEmails.push({ - ...email, - folderId: this.mapGmailFolderToId(folderName), - }); - } - } - - if (uniqueEmails.length > 0 && onBatch) { - await onBatch(uniqueEmails, globalBatchNumber); - globalBatchNumber++; - } - - totalEmailsParsed += uniqueEmails.length; - }; - - // Use streaming parser - await mboxParser.parseMBOXFileStreaming( - mboxFile, - (progress, message) => { - // Combine progress from individual file with overall progress - const baseProgress = 10 + (fileIndex / mboxFiles.length) * 80; - const fileContribution = (80 / mboxFiles.length) * (progress / 100); - onProgress?.( - Math.round(baseProgress + fileContribution), - `${folderName}: ${message}` - ); - }, - folderMappedBatchCallback - ); - - // Update progress after each file - onProgress?.( - 10 + ((fileIndex + 1) / mboxFiles.length) * 80, - `Completed ${folderName} (${fileIndex + 1}/${mboxFiles.length})` - ); - - // Yield to allow garbage collection between files - await new Promise(resolve => setTimeout(resolve, 10)); - - } catch (error) { - logger.warn(`Failed to parse ${mboxPath}:`, error); - } - } - - onProgress?.(100, `Imported ${totalEmailsParsed} unique emails`); - - return totalEmailsParsed; - } - - /** - * Parse a Gmail Takeout ZIP file (legacy method for backwards compatibility) - * For large files, prefer parseGmailTakeoutStreaming - */ - async parseGmailTakeout( - file: File, - onProgress?: (progress: number, message: string) => void - ): Promise[]> { - const emails: Omit[] = []; - - await this.parseGmailTakeoutStreaming( - file, - onProgress, - async (batch) => { - emails.push(...batch); - } - ); - - return emails; - } - - /** - * Extract folder name from file path - */ - private extractFolderName(path: string): string { - // Gmail Takeout structure: Takeout/Mail/Label Name.mbox - const parts = path.split('/'); - const fileName = parts[parts.length - 1]; - return fileName.replace('.mbox', '').replace(/_/g, ' '); - } - - /** - * Map Gmail folder names to standard folder IDs - */ - private mapGmailFolderToId(folderName: string): string { - const lowerName = folderName.toLowerCase(); - - // Standard Gmail folders - if (lowerName.includes('inbox')) return 'inbox'; - if (lowerName.includes('sent')) return 'sent'; - if (lowerName.includes('draft')) return 'drafts'; - if (lowerName.includes('trash') || lowerName.includes('deleted')) - return 'trash'; - if (lowerName.includes('spam') || lowerName.includes('junk')) return 'spam'; - if (lowerName.includes('archive') || lowerName === 'all mail') - return 'archive'; - if (lowerName.includes('starred') || lowerName.includes('important')) - return 'starred'; - - // Custom labels become custom folders - return `gmail-${folderName.toLowerCase().replace(/\s+/g, '-')}`; - } - - /** - * Check if a file is a Gmail Takeout archive - */ - isGmailTakeout(file: File): boolean { - return ( - file.type === 'application/zip' || - file.name.endsWith('.zip') || - file.name.toLowerCase().includes('takeout') - ); - } - - /** - * Validate Gmail Takeout structure - */ - async validateTakeout(file: File): Promise<{ - valid: boolean; - message: string; - folderCount?: number; - }> { - try { - const zip = await JSZip.loadAsync(file); - let mboxCount = 0; - - zip.forEach((path) => { - if (path.endsWith('.mbox') || path.includes('Takeout/Mail/')) { - mboxCount++; - } - }); - - if (mboxCount === 0) { - return { - valid: false, - message: - 'No email data found. Make sure you exported Mail data from Google Takeout.', - }; - } - - return { - valid: true, - message: `Found ${mboxCount} mail folders ready to import`, - folderCount: mboxCount, - }; - } catch { - return { - valid: false, - message: 'Could not read the archive. Make sure it is a valid ZIP file.', - }; - } - } -} - -export const gmailTakeoutParser = new GmailTakeoutParser(); diff --git a/web/src/services/importPipeline.ts b/web/src/services/importPipeline.ts index b729ace..7063705 100644 --- a/web/src/services/importPipeline.ts +++ b/web/src/services/importPipeline.ts @@ -30,11 +30,18 @@ import { updateEmailFolder, updateEmailTags, } from '../db/database'; -import { accountDetector } from './accountDetector'; -import { purchaseDetector } from './purchaseDetector'; -import { subscriptionDetector } from './subscriptionDetector'; -import { newsletterDetector } from './newsletterDetector'; +import { + AccountDetector, + PurchaseDetector, + SubscriptionDetector, + NewsletterDetector, +} from '@technical-1/email-archive-parser'; import { customRulesEngine } from './customRulesEngine'; + +const accountDetector = new AccountDetector(); +const purchaseDetector = new PurchaseDetector(); +const subscriptionDetector = new SubscriptionDetector(); +const newsletterDetector = new NewsletterDetector(); import { extractDomain } from '../utils/emailUtils'; import { logger } from '../utils/logger'; @@ -53,44 +60,52 @@ export function createImportCounts(): OLMProcessingResult { /** Run all four detectors against a single (already-persisted) email. */ export async function runDetection(email: Email, counts: OLMProcessingResult): Promise { // Account signups - const accountResult = accountDetector.detectAccountSignup(email); + const accountResult = accountDetector.detect(email); if (accountResult.type === 'account' && accountResult.data?.serviceName) { const existingAccount = await getAccountByServiceName(accountResult.data.serviceName); if (!existingAccount) { - const accountData = accountDetector.createAccountFromEmail( - email, - accountResult.data.serviceName, - accountResult.data.serviceType as Account['serviceType'], - ); - await insertAccount(accountData); + await insertAccount({ + serviceName: accountResult.data.serviceName, + signupEmailId: email.id, + signupDate: email.date, + serviceType: (accountResult.data.serviceType ?? 'other') as Account['serviceType'], + domain: extractDomain(email.sender), + lastActivityDate: email.date, + emailCount: 1, + }); counts.accounts++; } } // Purchases - const purchaseResult = purchaseDetector.detectPurchase(email); - if (purchaseResult.type === 'purchase' && purchaseResult.data?.amount) { + const purchaseResult = purchaseDetector.detect(email); + // Purchases require a concrete date (Purchase.purchaseDate is non-nullable and + // the dedup window keys off it). Undated emails are skipped for purchase records. + if (purchaseResult.type === 'purchase' && purchaseResult.data?.amount && email.date) { + const purchaseDate = email.date; const merchant = purchaseResult.data.merchant || 'Unknown'; const amount = purchaseResult.data.amount; const orderNumber = purchaseResult.data.orderNumber; const currency = purchaseResult.data.currency; - const existingPurchase = await findDuplicatePurchase(merchant, amount, email.date, orderNumber); + const existingPurchase = await findDuplicatePurchase(merchant, amount, purchaseDate, orderNumber); if (!existingPurchase) { - const purchaseData = purchaseDetector.createPurchaseFromEmail( - email, + await insertPurchase({ + emailId: email.id, merchant, amount, + currency: currency || 'USD', + purchaseDate, orderNumber, - currency, - ); - await insertPurchase(purchaseData); + items: [], + category: purchaseDetector.getCategory(merchant), + }); counts.purchases++; } } // Subscriptions — dedupe by serviceName, then by sender domain. - const subResult = subscriptionDetector.detectSubscription(email); + const subResult = subscriptionDetector.detect(email); if (subResult.isSubscription && subResult.serviceName) { const senderDomain = extractDomain(email.sender); @@ -101,7 +116,7 @@ export async function runDetection(email: Email, counts: OLMProcessingResult): P if (existingSub) { const emailIds = [...new Set([...existingSub.emailIds, email.id!])]; - const isNewerEmail = email.date > existingSub.lastRenewalDate; + const isNewerEmail = !!email.date && (!existingSub.lastRenewalDate || email.date > existingSub.lastRenewalDate); const shouldUpdateAmount = isNewerEmail && subResult.amount != null && subResult.amount > 0; await updateSubscription(existingSub.id!, { @@ -127,13 +142,13 @@ export async function runDetection(email: Email, counts: OLMProcessingResult): P } // Newsletters / promotional - const nlResult = newsletterDetector.detectNewsletter(email); + const nlResult = newsletterDetector.detect(email); if (nlResult.isNewsletter || nlResult.isPromotional) { const existingNL = await getNewsletterBySender(email.sender); if (existingNL) { await updateNewsletter(existingNL.id!, { emailCount: existingNL.emailCount + 1, - lastEmailDate: email.date > existingNL.lastEmailDate ? email.date : existingNL.lastEmailDate, + lastEmailDate: email.date && (!existingNL.lastEmailDate || email.date > existingNL.lastEmailDate) ? email.date : existingNL.lastEmailDate, unsubscribeLink: nlResult.unsubscribeLink || existingNL.unsubscribeLink, }); } else { @@ -218,7 +233,7 @@ export async function processEmailBatch( if (batch.length === 0) return; // Worker messages may serialize dates to strings; coerce defensively. - const emails = batch.map((e) => ({ ...e, date: new Date(e.date) })); + const emails = batch.map((e) => ({ ...e, date: e.date == null ? null : new Date(e.date) })); for (const e of emails) { if (e.folderId) folderIds.add(e.folderId); } diff --git a/web/src/services/mboxParser.ts b/web/src/services/mboxParser.ts deleted file mode 100644 index dc96a46..0000000 --- a/web/src/services/mboxParser.ts +++ /dev/null @@ -1,628 +0,0 @@ -import type { Email } from '../types'; -import { cleanEmailAddress, normalizeSubject } from '../utils/emailUtils'; -import { logger } from '../utils/logger'; -import { decodeQuotedPrintable, decodeRfc2047 } from './mimeUtils'; - -/** - * Callback for streaming email processing - */ -export type EmailBatchCallback = (emails: Omit[], batchNumber: number) => Promise; - -/** - * Parser for MBOX email archive format - * Uses streaming/batched approach for memory efficiency with large files - */ -class MBOXParser { - private readonly CHUNK_SIZE = 5 * 1024 * 1024; // 5MB chunks - private readonly BATCH_SIZE = 100; // Process 100 emails at a time - - /** - * Parse an MBOX file with streaming batch processing - * Calls onBatch with each batch of emails as they're parsed - */ - async parseMBOXFileStreaming( - file: File, - onProgress?: (progress: number, message: string) => void, - onBatch?: EmailBatchCallback - ): Promise { - const fileSize = file.size; - let offset = 0; - let leftover = ''; - let totalEmailsParsed = 0; - let currentBatch: Omit[] = []; - let batchNumber = 0; - - onProgress?.(0, `Processing ${(fileSize / 1024 / 1024).toFixed(1)}MB file...`); - - while (offset < fileSize) { - const chunkEnd = Math.min(offset + this.CHUNK_SIZE, fileSize); - const chunk = file.slice(offset, chunkEnd); - - let chunkText: string; - try { - chunkText = await chunk.text(); - } catch (e) { - logger.error('Error reading chunk:', e); - break; - } - - const textToProcess = leftover + chunkText; - - // Find the last "From " line to know where to split - const lastFromIndex = this.findLastFromLine(textToProcess); - - let processableText: string; - if (lastFromIndex > 0 && chunkEnd < fileSize) { - processableText = textToProcess.substring(0, lastFromIndex); - leftover = textToProcess.substring(lastFromIndex); - } else { - processableText = textToProcess; - leftover = ''; - } - - // Parse emails from this chunk - const chunkEmails = this.parseEmailsFromText(processableText); - - for (const email of chunkEmails) { - currentBatch.push(email); - - // When batch is full, process it - if (currentBatch.length >= this.BATCH_SIZE) { - if (onBatch) { - await onBatch(currentBatch, batchNumber); - } - totalEmailsParsed += currentBatch.length; - batchNumber++; - currentBatch = []; // Clear batch from memory - - // Yield to UI - await new Promise(resolve => setTimeout(resolve, 0)); - } - } - - offset = chunkEnd; - const progress = Math.round((offset / fileSize) * 95); - onProgress?.(progress, `Parsed ${totalEmailsParsed + currentBatch.length} emails (${Math.round(offset / fileSize * 100)}% read)...`); - - // Yield to prevent UI blocking - await new Promise(resolve => setTimeout(resolve, 0)); - } - - // Process any remaining text - if (leftover.trim()) { - const finalEmails = this.parseEmailsFromText(leftover); - for (const email of finalEmails) { - currentBatch.push(email); - } - } - - // Process final batch - if (currentBatch.length > 0 && onBatch) { - await onBatch(currentBatch, batchNumber); - totalEmailsParsed += currentBatch.length; - } - - onProgress?.(100, `Parsed ${totalEmailsParsed} emails successfully`); - return totalEmailsParsed; - } - - /** - * Legacy method for backwards compatibility - * For small files only - use parseMBOXFileStreaming for large files - */ - async parseMBOXFile( - file: File, - onProgress?: (progress: number, message: string) => void - ): Promise[]> { - // For files under 20MB, use simple accumulator approach - if (file.size < 20 * 1024 * 1024) { - const emails: Omit[] = []; - await this.parseMBOXFileStreaming(file, onProgress, async (batch) => { - emails.push(...batch); - }); - return emails; - } - - // For larger files, warn and still use streaming but accumulate - console.warn('Large file detected. Consider using parseMBOXFileStreaming for better memory efficiency.'); - const emails: Omit[] = []; - await this.parseMBOXFileStreaming(file, onProgress, async (batch) => { - emails.push(...batch); - }); - return emails; - } - - /** - * Check if a line is a valid MBOX "From " line - */ - private isFromLine(line: string): boolean { - if (!line.startsWith('From ')) return false; - const dayPattern = /(Mon|Tue|Wed|Thu|Fri|Sat|Sun)/; - return dayPattern.test(line); - } - - /** - * Find the index of the last "From " line in text - */ - private findLastFromLine(text: string): number { - let lastIndex = -1; - let searchStart = text.length - 1; - - // Search backwards for "\nFrom " or "\r\nFrom " pattern - while (searchStart > 0) { - // Try both CRLF and LF - let idx = text.lastIndexOf('\r\nFrom ', searchStart); - let offset = 2; // Skip \r\n - - if (idx === -1) { - idx = text.lastIndexOf('\nFrom ', searchStart); - offset = 1; // Skip \n - } - - if (idx === -1) break; - - const lineStart = idx + offset; - let lineEnd = text.indexOf('\n', lineStart); - if (lineEnd === -1) lineEnd = text.length; - let line = text.substring(lineStart, lineEnd); - // Remove trailing \r if present - if (line.endsWith('\r')) line = line.slice(0, -1); - - if (this.isFromLine(line)) { - lastIndex = lineStart; - break; - } - searchStart = idx - 1; - } - - // Also check if text starts with "From " - if (lastIndex === -1 && text.startsWith('From ')) { - let lineEnd = text.indexOf('\n'); - if (lineEnd === -1) lineEnd = text.length; - let line = text.substring(0, lineEnd); - if (line.endsWith('\r')) line = line.slice(0, -1); - if (this.isFromLine(line)) { - lastIndex = 0; - } - } - - return lastIndex; - } - - /** - * Parse multiple emails from a text block - */ - private parseEmailsFromText(text: string): Omit[] { - const emails: Omit[] = []; - // Normalize CRLF to LF, then split - const normalizedText = text.replace(/\r\n/g, '\n').replace(/\r/g, '\n'); - const lines = normalizedText.split('\n'); - let currentEmail: string[] = []; - - for (const line of lines) { - if (this.isFromLine(line) && currentEmail.length > 0) { - const email = this.parseEmailFromLines(currentEmail); - if (email) { - emails.push(email); - } - currentEmail = []; - } - currentEmail.push(line); - } - - // Parse last email in chunk - if (currentEmail.length > 0 && currentEmail.some(line => line.trim().length > 0)) { - const email = this.parseEmailFromLines(currentEmail); - if (email) { - emails.push(email); - } - } - - return emails; - } - - /** - * Parse a single email from raw lines - */ - private parseEmailFromLines(lines: string[]): Omit | null { - try { - if (lines.length < 2) return null; - - const headers: Record = {}; - let bodyStartIndex = 0; - let inHeaders = true; - - // Parse headers (skip the "From " line) - for (let i = 1; i < lines.length; i++) { - const line = lines[i]; - - if (line.trim() === '') { - bodyStartIndex = i + 1; - inHeaders = false; - break; - } - - if (inHeaders) { - if (line.match(/^\s+/) && Object.keys(headers).length > 0) { - const lastKey = Object.keys(headers).pop()!; - headers[lastKey] += ' ' + line.trim(); - } else { - const match = line.match(/^([^:]+):\s*(.*)$/); - if (match) { - const key = match[1].toLowerCase(); - headers[key] = match[2]; - } - } - } - } - - if (inHeaders) { - bodyStartIndex = lines.length; - } - - // Extract body content - const bodyLines = lines.slice(bodyStartIndex); - const rawBody = bodyLines.join('\n'); - - // Parse body based on content type - const contentType = headers['content-type'] || 'text/plain'; - let body = ''; - let htmlBody: string | undefined; - - if (contentType.includes('multipart/')) { - // Extract boundary from content-type - const boundaryMatch = contentType.match(/boundary=["']?([^"';\s]+)["']?/i); - if (boundaryMatch) { - const boundary = boundaryMatch[1]; - const parts = this.parseMimeParts(rawBody, boundary); - body = parts.text || ''; - htmlBody = parts.html; - } else { - body = rawBody; - } - } else { - // Single part email - body = rawBody; - const encoding = headers['content-transfer-encoding']?.toLowerCase(); - if (encoding === 'quoted-printable') { - body = this.decodeQuotedPrintable(body); - } else if (encoding === 'base64') { - try { - body = atob(body.replace(/\s/g, '')); - } catch { - // Keep original if decode fails - } - } - - if (contentType.includes('text/html')) { - htmlBody = body; - } - } - - const dateStr = headers['date'] || ''; - const date = this.parseDate(dateStr); - - const from = headers['from'] || ''; - const { email: sender, name: senderName } = this.parseEmailAddress(from); - - const to = headers['to'] || ''; - const recipients = this.parseRecipients(to); - - const subject = this.decodeHeaderValue(headers['subject'] || '(No Subject)'); - - let threadId = headers['x-gm-thrid'] || - headers['thread-topic'] || - headers['references']?.split(/\s+/)[0] || - headers['in-reply-to']; - - if (!threadId) { - const normalizedSubj = normalizeSubject(subject); - if (normalizedSubj) { - threadId = `subject:${normalizedSubj.toLowerCase().replace(/\s+/g, '-')}`; - } - } - - const gmailLabels = headers['x-gmail-labels'] || ''; - const folderId = this.mapGmailLabelsToFolder(gmailLabels); - const isRead = !gmailLabels.toLowerCase().includes('unread'); - const isStarred = gmailLabels.toLowerCase().includes('starred'); - - if (!sender && !subject) { - return null; - } - - // Sanitize field lengths to prevent memory issues with malformed emails - const MAX_SUBJECT_LEN = 1000; - const MAX_BODY_LEN = 10 * 1024 * 1024; // 10MB - const MAX_EMAIL_LEN = 254; // RFC 5321 - - const sanitizedSubject = subject.length > MAX_SUBJECT_LEN ? subject.slice(0, MAX_SUBJECT_LEN) : subject; - const sanitizedBody = body.trim() || (htmlBody ? this.stripHtml(htmlBody) : ''); - const truncatedBody = sanitizedBody.length > MAX_BODY_LEN ? sanitizedBody.slice(0, MAX_BODY_LEN) : sanitizedBody; - const truncatedHtmlBody = htmlBody && htmlBody.length > MAX_BODY_LEN ? htmlBody.slice(0, MAX_BODY_LEN) : htmlBody; - const sanitizedSender = cleanEmailAddress(sender).slice(0, MAX_EMAIL_LEN); - const sanitizedRecipients = recipients.map(r => r.slice(0, MAX_EMAIL_LEN)).slice(0, 1000); - - return { - subject: sanitizedSubject, - sender: sanitizedSender, - senderName: senderName || undefined, - recipients: sanitizedRecipients, - date: date || new Date(), - body: truncatedBody, - htmlBody: truncatedHtmlBody, - attachments: [], - size: Math.min(lines.join('\n').length, 100000), // Cap size calculation - isRead, - isStarred, - folderId, - threadId, - emailType: 'regular', - }; - } catch (error) { - console.warn('Failed to parse email:', error); - return null; - } - } - - /** - * Parse Gmail labels and return the primary folder ID - * Priority: Inbox > Sent > Drafts > Spam > Trash > first custom label > Archive - */ - private mapGmailLabelsToFolder(labels: string): string { - const labelList = this.parseGmailLabels(labels); - - // Priority order for folder assignment - if (labelList.includes('inbox')) return 'inbox'; - if (labelList.includes('sent') || labelList.includes('sent mail')) return 'sent'; - if (labelList.includes('draft') || labelList.includes('drafts')) return 'drafts'; - if (labelList.includes('spam')) return 'spam'; - if (labelList.includes('trash')) return 'trash'; - - // Check for custom labels (not category/system labels) - const customLabels = labelList.filter(l => - !l.startsWith('category ') && - !['opened', 'unread', 'starred', 'important', 'all mail'].includes(l) - ); - - if (customLabels.length > 0) { - // Use first custom label as folder - return this.labelToFolderId(customLabels[0]); - } - - return 'archive'; - } - - /** - * Parse the X-Gmail-Labels header into an array of label names - */ - parseGmailLabels(labelsHeader: string): string[] { - if (!labelsHeader) return []; - - // Labels are comma-separated, may be quoted if they contain special chars - const labels: string[] = []; - let current = ''; - let inQuotes = false; - - for (const char of labelsHeader) { - if (char === '"') { - inQuotes = !inQuotes; - } else if (char === ',' && !inQuotes) { - if (current.trim()) { - labels.push(current.trim().toLowerCase()); - } - current = ''; - } else { - current += char; - } - } - - if (current.trim()) { - labels.push(current.trim().toLowerCase()); - } - - return labels; - } - - /** - * Convert a label name to a valid folder ID - */ - private labelToFolderId(label: string): string { - return label - .toLowerCase() - .replace(/[^a-z0-9\s-]/g, '') - .replace(/\s+/g, '-') - .substring(0, 50); // Limit length - } - - /** - * Get all unique folder IDs that would be created from a labels header - */ - getAllFolderIdsFromLabels(labelsHeader: string): string[] { - const labels = this.parseGmailLabels(labelsHeader); - const folderIds = new Set(); - - // Add system folders if mentioned - if (labels.includes('inbox')) folderIds.add('inbox'); - if (labels.includes('sent') || labels.includes('sent mail')) folderIds.add('sent'); - if (labels.includes('draft') || labels.includes('drafts')) folderIds.add('drafts'); - if (labels.includes('spam')) folderIds.add('spam'); - if (labels.includes('trash')) folderIds.add('trash'); - - // Add custom labels as folders - for (const label of labels) { - if (!label.startsWith('category ') && - !['opened', 'unread', 'starred', 'important', 'all mail', - 'inbox', 'sent', 'sent mail', 'draft', 'drafts', 'spam', 'trash'].includes(label)) { - folderIds.add(this.labelToFolderId(label)); - } - } - - return Array.from(folderIds); - } - - /** - * Parse MIME multipart content and extract text/html parts - */ - private parseMimeParts(body: string, boundary: string): { text?: string; html?: string } { - const result: { text?: string; html?: string } = {}; - - // Split by boundary - const boundaryMarker = '--' + boundary; - const parts = body.split(boundaryMarker); - - for (const part of parts) { - if (!part.trim() || part.trim() === '--') continue; - - // Split headers from content - const headerEndIndex = part.indexOf('\n\n'); - if (headerEndIndex === -1) continue; - - const partHeaders = part.substring(0, headerEndIndex); - let partContent = part.substring(headerEndIndex + 2); - - // Parse part headers - const contentTypeMatch = partHeaders.match(/content-type:\s*([^;\n]+)/i); - const encodingMatch = partHeaders.match(/content-transfer-encoding:\s*(\S+)/i); - - if (!contentTypeMatch) continue; - - const partContentType = contentTypeMatch[1].toLowerCase().trim(); - const partEncoding = encodingMatch?.[1]?.toLowerCase() || '7bit'; - - // Handle nested multipart (multipart/alternative, etc.) - if (partContentType.includes('multipart/')) { - const nestedBoundaryMatch = partHeaders.match(/boundary=["']?([^"';\s\n]+)["']?/i); - if (nestedBoundaryMatch) { - const nestedResult = this.parseMimeParts(partContent, nestedBoundaryMatch[1]); - if (nestedResult.text && !result.text) result.text = nestedResult.text; - if (nestedResult.html && !result.html) result.html = nestedResult.html; - } - continue; - } - - // Decode content - partContent = partContent.trim(); - if (partEncoding === 'base64') { - try { - // Remove whitespace and decode - const cleaned = partContent.replace(/\s/g, ''); - partContent = this.decodeBase64(cleaned); - } catch { - // Keep original if decode fails - } - } else if (partEncoding === 'quoted-printable') { - partContent = this.decodeQuotedPrintable(partContent); - } - - // Store based on content type - if (partContentType.includes('text/plain') && !result.text) { - result.text = partContent; - } else if (partContentType.includes('text/html') && !result.html) { - result.html = partContent; - } - } - - return result; - } - - /** - * Decode base64 with UTF-8 support - */ - private decodeBase64(str: string): string { - try { - // Use TextDecoder for proper UTF-8 handling - const binaryStr = atob(str); - const bytes = new Uint8Array(binaryStr.length); - for (let i = 0; i < binaryStr.length; i++) { - bytes[i] = binaryStr.charCodeAt(i); - } - return new TextDecoder('utf-8').decode(bytes); - } catch { - // Fallback to simple atob - try { - return atob(str); - } catch { - return str; - } - } - } - - /** - * Strip HTML tags to create plain text - */ - private stripHtml(html: string): string { - return html - .replace(/]*>[\s\S]*?<\/style>/gi, '') - .replace(/]*>[\s\S]*?<\/script>/gi, '') - .replace(/<[^>]+>/g, ' ') - .replace(/ /g, ' ') - .replace(/&/g, '&') - .replace(/</g, '<') - .replace(/>/g, '>') - .replace(/"/g, '"') - .replace(/'/g, "'") - .replace(/\s+/g, ' ') - .trim(); - } - - private parseEmailAddress(str: string): { email: string; name?: string } { - const trimmed = this.decodeHeaderValue(str.trim()); - - const angleMatch = trimmed.match(/^(?:"?(.+?)"?\s*)?<([^>]+)>$/); - if (angleMatch) { - return { - name: angleMatch[1]?.trim() || undefined, - email: angleMatch[2]?.trim(), - }; - } - - const emailMatch = trimmed.match(/^([^\s@]+@[^\s@]+\.[^\s@]+)$/); - if (emailMatch) { - return { email: emailMatch[1] }; - } - - return { email: trimmed }; - } - - private parseRecipients(str: string): string[] { - if (!str) return []; - - return str - .split(/[,;]/) - .map((r) => { - const { email } = this.parseEmailAddress(r.trim()); - return cleanEmailAddress(email); - }) - .filter(Boolean); - } - - private parseDate(dateStr: string): Date | null { - if (!dateStr) return null; - - try { - const date = new Date(dateStr); - return isNaN(date.getTime()) ? null : date; - } catch { - return null; - } - } - - private decodeQuotedPrintable(str: string): string { - return decodeQuotedPrintable(str); - } - - private decodeHeaderValue(str: string): string { - return decodeRfc2047(str); - } - - isMBOXFile(file: File): boolean { - return ( - file.name.endsWith('.mbox') || - file.name.endsWith('.mbx') || - file.type === 'application/mbox' - ); - } -} - -export const mboxParser = new MBOXParser(); diff --git a/web/src/services/newsletterDetector.ts b/web/src/services/newsletterDetector.ts deleted file mode 100644 index bc7eec4..0000000 --- a/web/src/services/newsletterDetector.ts +++ /dev/null @@ -1,403 +0,0 @@ -import type { Email, Newsletter } from '../types'; -import { stripHtml, extractDomain } from '../utils/emailUtils'; -import { isDomainMatch } from './domainMatch'; - -/** - * Detector for newsletters and promotional emails - */ -class NewsletterDetector { - // Newsletter subject indicators - private readonly newsletterSubjectPatterns = [ - /\bnewsletter\b/i, - /\bweekly\s+(?:digest|update|roundup|summary)\b/i, - /\bmonthly\s+(?:digest|update|roundup|summary)\b/i, - /\bdaily\s+(?:digest|brief|update)\b/i, - /\b(?:this week|today)\s+(?:in|on|at)\b/i, - /\blatest\s+(?:news|updates|articles)\b/i, - /\bedition\s*#?\d*/i, - /\bissue\s*#?\d*/i, - /\bvol(?:ume)?\.?\s*\d+/i, - ]; - - // Promotional email subject patterns - private readonly promotionalSubjectPatterns = [ - /\b(?:save|get)\s+\d+%?\s*(?:off|discount)?\b/i, - /\bup\s+to\s+\d+%\s+off\b/i, - /\bsale\s+(?:ends?|starts?)\b/i, - /\bflash\s+sale\b/i, - /\blimited\s+time\b/i, - /\bfree\s+(?:shipping|delivery|gift)\b/i, - /\bexclusive\s+(?:offer|deal|discount|access)\b/i, - /\bspecial\s+(?:offer|deal|discount)\b/i, - /\bdon'?t\s+miss\s+(?:out|this)\b/i, - /\blast\s+chance\b/i, - /\bpromo(?:tion)?\s*code\b/i, - /\bcoupon\s*code\b/i, - /\bdiscount\s*code\b/i, - /\buse\s+code\b/i, - /\bbogo\b/i, - /\bbuy\s+\d+\s+get\s+\d+/i, - /\bclearance\b/i, - /\bblack\s+friday\b/i, - /\bcyber\s+monday\b/i, - /\bprime\s+day\b/i, - /\bholiday\s+(?:sale|deals|savings)\b/i, - ]; - - // Body patterns for newsletter/promotional emails - private readonly marketingBodyPatterns = [ - /unsubscribe/i, - /manage\s+(?:your\s+)?(?:email\s+)?preferences/i, - /email\s+preferences/i, - /opt.?out/i, - /if\s+you\s+no\s+longer\s+(?:wish|want)\s+to\s+receive/i, - /to\s+stop\s+receiving\s+(?:these|our)\s+emails/i, - /view\s+(?:in|as)\s+(?:a\s+)?(?:web\s+)?browser/i, - /view\s+(?:this\s+)?(?:email\s+)?online/i, - /having\s+trouble\s+(?:viewing|reading)/i, - /forward\s+to\s+a\s+friend/i, - /share\s+(?:with|this)/i, - /copyright\s+©?\s*\d{4}/i, - /all\s+rights\s+reserved/i, - /privacy\s+policy/i, - /terms\s+(?:of\s+(?:service|use)|and\s+conditions)/i, - ]; - - // Known promotional email domains - private readonly knownPromotionalDomains = new Set([ - 'email.amazonses.com', - 'em.ebay.com', - 'promo.', - 'marketing.', - 'newsletter.', - 'mail.', - 'news.', - 'promotions.', - 'offers.', - 'deals.', - 'updates.', - ]); - - /** - * Detect if an email is a newsletter or promotional email - */ - detectNewsletter(email: Email): { isNewsletter: boolean; isPromotional: boolean; confidence: number; unsubscribeLink?: string } { - const subject = email.subject || ''; - const body = email.body || ''; - const htmlBody = email.htmlBody || ''; - const sender = email.sender || ''; - - let newsletterScore = 0; - let promotionalScore = 0; - - // Check subject for newsletter patterns - for (const pattern of this.newsletterSubjectPatterns) { - if (pattern.test(subject)) { - newsletterScore += 30; - break; - } - } - - // Check subject for promotional patterns - for (const pattern of this.promotionalSubjectPatterns) { - if (pattern.test(subject)) { - promotionalScore += 35; - break; - } - } - - // Check body for marketing patterns - const plainBody = stripHtml(body); - let marketingPatternMatches = 0; - for (const pattern of this.marketingBodyPatterns) { - if (pattern.test(plainBody) || pattern.test(htmlBody)) { - marketingPatternMatches++; - } - } - - if (marketingPatternMatches >= 3) { - newsletterScore += 25; - promotionalScore += 20; - } else if (marketingPatternMatches >= 2) { - newsletterScore += 15; - promotionalScore += 10; - } - - // Check sender domain - const domain = extractDomain(sender); - if (this.isPromotionalSenderDomain(domain)) { - newsletterScore += 20; - promotionalScore += 20; - } - - // Extract unsubscribe link - const unsubscribeLink = this.extractUnsubscribeLink(htmlBody || body); - if (unsubscribeLink) { - newsletterScore += 15; - promotionalScore += 10; - } - - // Check for List-Unsubscribe patterns in body (common in marketing emails) - if (/list.?unsubscribe/i.test(htmlBody)) { - newsletterScore += 10; - } - - const isNewsletter = newsletterScore >= 40; - const isPromotional = promotionalScore >= 40; - const confidence = Math.max(newsletterScore, promotionalScore); - - return { - isNewsletter: isNewsletter && !isPromotional, - isPromotional, - confidence: Math.min(confidence, 100), - unsubscribeLink, - }; - } - - /** - * Decide whether a sender domain looks like a marketing/newsletter sender. - * Entries in knownPromotionalDomains ending in '.' are subdomain markers - * (e.g. 'newsletter.') matched only at the START of the domain so they sit on - * a label boundary; the rest are full domains matched boundary-safely. This - * avoids the old `domain.includes('mail.')` bug that flagged gmail.com/ - * hotmail.com as promotional because they contain the substring "mail.". - */ - private isPromotionalSenderDomain(domain: string): boolean { - const d = domain.trim().toLowerCase(); - if (!d) return false; - - for (const entry of this.knownPromotionalDomains) { - if (entry.endsWith('.')) { - if (d.startsWith(entry)) return true; - } else if (isDomainMatch(d, entry)) { - return true; - } - } - return false; - } - - /** - * Extract unsubscribe link from email HTML - */ - extractUnsubscribeLink(html: string): string | undefined { - if (!html) return undefined; - - // Look for unsubscribe links in anchor tags - const patterns = [ - /]*href=["']([^"']*unsubscribe[^"']*)["'][^>]*>/i, - /]*href=["']([^"']*opt.?out[^"']*)["'][^>]*>/i, - /]*href=["']([^"']*email.?preferences[^"']*)["'][^>]*>/i, - /]*href=["']([^"']*manage.?preferences[^"']*)["'][^>]*>/i, - /]*href=["']([^"']+)["'][^>]*>\s*unsubscribe\s*<\/a>/i, - /]*href=["']([^"']+)["'][^>]*>[^<]*unsubscribe[^<]*<\/a>/i, - ]; - - for (const pattern of patterns) { - const match = html.match(pattern); - if (match && match[1]) { - const link = match[1]; - // Validate it's a proper URL - if (link.startsWith('http://') || link.startsWith('https://')) { - return link; - } - } - } - - // Look for plain text unsubscribe URLs - const urlPattern = /https?:\/\/[^\s<>"]+(?:unsubscribe|opt.?out|preferences)[^\s<>"]*/i; - const urlMatch = html.match(urlPattern); - if (urlMatch) { - return urlMatch[0]; - } - - return undefined; - } - - /** - * Categorize email as newsletter, promotional, or regular - */ - categorize(email: Email): 'newsletter' | 'promotional' | 'regular' { - const result = this.detectNewsletter(email); - - if (result.isPromotional) { - return 'promotional'; - } - if (result.isNewsletter) { - return 'newsletter'; - } - return 'regular'; - } - - /** - * Get emails grouped by sender as potential newsletters - */ - groupBySender(emails: Email[]): Map { - const senderMap = new Map }>(); - - for (const email of emails) { - const result = this.detectNewsletter(email); - if (result.isNewsletter || result.isPromotional) { - const sender = email.sender; - - if (!senderMap.has(sender)) { - senderMap.set(sender, { emails: [], unsubscribeLinks: new Set() }); - } - - const data = senderMap.get(sender)!; - data.emails.push(email); - - if (result.unsubscribeLink) { - data.unsubscribeLinks.add(result.unsubscribeLink); - } - } - } - - const newsletters = new Map(); - - senderMap.forEach((data, sender) => { - const sortedEmails = data.emails.sort( - (a, b) => new Date(b.date).getTime() - new Date(a.date).getTime() - ); - - const latestEmail = sortedEmails[0]; - const unsubscribeLinks = Array.from(data.unsubscribeLinks); - - // Calculate frequency based on email dates - const frequency = this.calculateFrequency(sortedEmails); - - newsletters.set(sender, { - senderEmail: sender, - senderName: latestEmail.senderName || this.extractNameFromEmail(sender), - emailCount: data.emails.length, - lastEmailDate: new Date(latestEmail.date), - frequency, - unsubscribeLink: unsubscribeLinks[0], - isPromotional: this.detectNewsletter(latestEmail).isPromotional, - }); - }); - - return newsletters; - } - - /** - * Calculate sending frequency based on email dates - */ - private calculateFrequency(emails: Email[]): 'daily' | 'weekly' | 'monthly' | 'irregular' { - if (emails.length < 2) { - return 'irregular'; - } - - // Calculate average days between emails - const dates = emails.map(e => new Date(e.date).getTime()); - let totalDays = 0; - - for (let i = 0; i < dates.length - 1; i++) { - const daysDiff = (dates[i] - dates[i + 1]) / (1000 * 60 * 60 * 24); - totalDays += daysDiff; - } - - const avgDays = totalDays / (dates.length - 1); - - if (avgDays <= 2) { - return 'daily'; - } else if (avgDays <= 10) { - return 'weekly'; - } else if (avgDays <= 45) { - return 'monthly'; - } - - return 'irregular'; - } - - /** - * Extract a readable name from an email address - */ - private extractNameFromEmail(email: string): string { - const [localPart, domain] = email.split('@'); - - // Known service mappings for common newsletter senders - const knownSenders: Record = { - 'nytimes.com': 'New York Times', - 'newyorktimes.com': 'New York Times', - 'washingtonpost.com': 'Washington Post', - 'wsj.com': 'Wall Street Journal', - 'amazon.com': 'Amazon', - 'netflix.com': 'Netflix', - 'spotify.com': 'Spotify', - 'linkedin.com': 'LinkedIn', - 'twitter.com': 'Twitter', - 'facebook.com': 'Facebook', - 'instagram.com': 'Instagram', - 'medium.com': 'Medium', - 'substack.com': 'Substack', - 'mailchimp.com': 'Mailchimp', - 'hubspot.com': 'HubSpot', - 'salesforce.com': 'Salesforce', - }; - - // Check if domain matches known service - if (domain) { - const domainLower = domain.toLowerCase(); - for (const [key, name] of Object.entries(knownSenders)) { - if (domainLower.includes(key)) { - return name; - } - } - - // Get main domain part - const domainParts = domain.split('.'); - let mainPart: string; - - // Handle domains like mail.example.com or newsletter.example.com - if (domainParts.length >= 2) { - // Get the second-to-last part (e.g., 'example' from 'mail.example.com') - const potentialName = domainParts[domainParts.length - 2]; - const genericSubdomains = ['mail', 'email', 'noreply', 'no-reply', 'newsletter', 'news', 'info', 'marketing', 'mailer', 'e', 'beta']; - - if (genericSubdomains.includes(domainParts[0].toLowerCase()) && domainParts.length >= 3) { - // Get the domain name instead of the subdomain - mainPart = domainParts[domainParts.length - 2]; - } else { - mainPart = potentialName; - } - } else { - mainPart = domainParts[0]; - } - - // Format the name nicely - if (mainPart && mainPart.length >= 2) { - // Handle compound names like "seaworldparks" -> "Seaworld Parks" - const formatted = mainPart - .replace(/([a-z])([A-Z])/g, '$1 $2') // camelCase - .replace(/[-_]/g, ' ') // separators - .split(' ') - .map(word => word.charAt(0).toUpperCase() + word.slice(1).toLowerCase()) - .join(' '); - - return formatted; - } - } - - // Fall back to local part as last resort - if (localPart) { - const cleaned = localPart - .replace(/[._-]/g, ' ') - .replace(/\d+/g, '') - .trim(); - - if (cleaned.length > 0) { - return cleaned - .split(' ') - .filter(word => word.length > 0) - .map(word => word.charAt(0).toUpperCase() + word.slice(1).toLowerCase()) - .join(' '); - } - } - - return 'Unknown'; - } -} - - -export const newsletterDetector = new NewsletterDetector(); - diff --git a/web/src/services/olmParser.ts b/web/src/services/olmParser.ts deleted file mode 100644 index 469dfdb..0000000 --- a/web/src/services/olmParser.ts +++ /dev/null @@ -1,648 +0,0 @@ -import JSZip from 'jszip'; -import type { Email, Contact, CalendarEvent, OLMProcessingResult, OLMProcessingProgress, Account, Subscription, Newsletter } from '../types'; -import { - insertEmail, - insertAccount, - insertPurchase, - findDuplicatePurchase, - insertContact, - insertCalendarEvent, - getAccountByServiceName, - getContactByEmail, - updateContactEmailCount, - insertSubscription, - getSubscriptionByServiceName, - updateSubscription, - insertNewsletter, - getNewsletterBySender, - updateNewsletter, -} from '../db/database'; -import { accountDetector } from './accountDetector'; -import { purchaseDetector } from './purchaseDetector'; -import { subscriptionDetector } from './subscriptionDetector'; -import { newsletterDetector } from './newsletterDetector'; -import { cleanEmailAddress, extractDomain, normalizeSubject } from '../utils/emailUtils'; -import { MAX_COMPRESSED_BYTES, MAX_DECOMPRESSED_BYTES, MAX_SUBJECT_LEN, MAX_BODY_LEN, MAX_EMAIL_LEN } from './mimeUtils'; - -export type ProgressCallback = (progress: OLMProcessingProgress) => void; - -class OLMParser { - async parseOLMFile(file: File, onProgress?: ProgressCallback): Promise { - const result: OLMProcessingResult = { - emails: 0, - contacts: 0, - calendarEvents: 0, - accounts: 0, - purchases: 0, - subscriptions: 0, - newsletters: 0, - }; - - try { - // Stage 1: Extract ZIP - onProgress?.({ - stage: 'extracting', - progress: 0, - message: 'Extracting OLM archive...', - }); - - // Validate file size before loading (500MB compressed limit) - if (file.size > MAX_COMPRESSED_BYTES) { - throw new Error(`File too large (${(file.size / 1024 / 1024).toFixed(0)}MB). Maximum supported size is 500MB.`); - } - - const zip = await JSZip.loadAsync(file); - - // Check decompressed size to guard against zip bombs (2GB limit) - // JSZip stores uncompressedSize in internal _data property (not in public types) - let totalDecompressedSize = 0; - for (const entry of Object.values(zip.files)) { - if (!entry.dir) { - const entryData = (entry as unknown as { _data?: { uncompressedSize?: number } })._data; - if (entryData && typeof entryData.uncompressedSize === 'number') { - totalDecompressedSize += entryData.uncompressedSize; - } - } - } - if (totalDecompressedSize > MAX_DECOMPRESSED_BYTES) { - throw new Error(`Archive decompressed size exceeds 2GB limit. This may be a malicious file.`); - } - - onProgress?.({ - stage: 'extracting', - progress: 100, - message: 'Archive extracted successfully', - }); - - // Get all files in the archive - const files = Object.keys(zip.files); - - // Find email files - they are individual message_XXXXX.xml files in com.microsoft.__Messages folders - const emailFiles = files.filter(f => - f.includes('com.microsoft.__Messages/') && - f.match(/message_\d+\.xml$/) && - !zip.files[f].dir - ); - - // Find contact files - Contacts.xml in Address Book or other locations - const contactFiles = files.filter(f => - (f.includes('Address Book/Contacts.xml') || - (f.includes('/Contacts/') && f.endsWith('.xml'))) && - !zip.files[f].dir - ); - - // Find calendar files - Calendar.xml files (each contains multiple appointments) - const calendarFiles = files.filter(f => - f.includes('/Calendar/') && - f.endsWith('Calendar.xml') && - !zip.files[f].dir - ); - - console.log(`Found ${emailFiles.length} email files, ${contactFiles.length} contact files, ${calendarFiles.length} calendar files`); - - // Stage 2: Parse emails - if (emailFiles.length > 0) { - onProgress?.({ - stage: 'parsing_emails', - progress: 0, - message: `Parsing ${emailFiles.length} emails...`, - }); - - for (let i = 0; i < emailFiles.length; i++) { - try { - const content = await zip.files[emailFiles[i]].async('string'); - const email = this.parseEmailXML(content); - if (email) { - const emailId = await insertEmail(email); - result.emails++; - - // Run detection on this email - const emailWithId = { ...email, id: emailId }; - await this.runDetection(emailWithId, result); - - // Track contacts from email senders - const newContact = await this.trackContact(email); - if (newContact) { - result.contacts++; - } - } - } catch (err) { - console.warn(`Failed to parse email ${emailFiles[i]}:`, err); - } - - if (i % 100 === 0 || i === emailFiles.length - 1) { - onProgress?.({ - stage: 'parsing_emails', - progress: Math.round((i + 1) / emailFiles.length * 100), - message: `Parsed ${i + 1} of ${emailFiles.length} emails`, - }); - } - } - } - - // Stage 3: Parse contacts (each file may contain multiple contacts) - if (contactFiles.length > 0) { - onProgress?.({ - stage: 'parsing_contacts', - progress: 0, - message: `Parsing contacts...`, - }); - - for (let i = 0; i < contactFiles.length; i++) { - try { - const content = await zip.files[contactFiles[i]].async('string'); - const contacts = this.parseContactsXML(content); - for (const contact of contacts) { - // Check if contact already exists (might have been created from email tracking) - if (contact.email) { - const existing = await getContactByEmail(contact.email); - if (existing) { - // Skip duplicate - contact was already created from email - continue; - } - } - await insertContact(contact); - result.contacts++; - } - } catch (err) { - console.warn(`Failed to parse contacts ${contactFiles[i]}:`, err); - } - - onProgress?.({ - stage: 'parsing_contacts', - progress: Math.round((i + 1) / contactFiles.length * 100), - message: `Parsed ${result.contacts} contacts`, - }); - } - } - - // Stage 4: Parse calendar events (each Calendar.xml contains multiple appointments) - if (calendarFiles.length > 0) { - onProgress?.({ - stage: 'parsing_calendar', - progress: 0, - message: `Parsing calendar files...`, - }); - - for (let i = 0; i < calendarFiles.length; i++) { - try { - const content = await zip.files[calendarFiles[i]].async('string'); - const events = this.parseCalendarXML(content); - for (const event of events) { - await insertCalendarEvent(event); - result.calendarEvents++; - } - } catch (err) { - console.warn(`Failed to parse calendar ${calendarFiles[i]}:`, err); - } - - onProgress?.({ - stage: 'parsing_calendar', - progress: Math.round((i + 1) / calendarFiles.length * 100), - message: `Parsed ${result.calendarEvents} calendar events`, - }); - } - } - - onProgress?.({ - stage: 'saving', - progress: 100, - message: 'Processing complete!', - }); - - return result; - } catch (error) { - console.error('Error parsing OLM file:', error); - throw new Error(`Failed to parse OLM file: ${error instanceof Error ? error.message : 'Unknown error'}`); - } - } - - private parseEmailXML(xmlContent: string): Omit | null { - try { - const parser = new DOMParser(); - const doc = parser.parseFromString(xmlContent, 'text/xml'); - - // Check for parsing errors - const parserError = doc.querySelector('parsererror'); - if (parserError) { - console.warn('XML parsing error:', parserError.textContent); - return null; - } - - // Find the email element (could be directly or inside emails wrapper) - const emailElement = doc.querySelector('email') || doc.documentElement; - - const getTextContent = (selectors: string[]): string => { - for (const selector of selectors) { - const element = emailElement.querySelector(selector); - if (element?.textContent) { - return element.textContent.trim(); - } - } - return ''; - }; - - const subject = getTextContent(['OPFMessageCopySubject', 'subject', 'Subject']); - const body = getTextContent(['OPFMessageCopyBody', 'body', 'Body', 'content', 'Content']); - const htmlBody = getTextContent(['OPFMessageCopyHTMLBody', 'htmlBody', 'HtmlBody']); - const preview = getTextContent(['OPFMessageCopyPreview']); - - // Parse sender from OPFMessageCopyFromAddresses - const fromAddresses = emailElement.querySelector('OPFMessageCopyFromAddresses'); - let sender = ''; - let senderName = ''; - if (fromAddresses) { - const emailAddr = fromAddresses.querySelector('emailAddress'); - if (emailAddr) { - sender = emailAddr.getAttribute('OPFContactEmailAddressAddress') || ''; - senderName = emailAddr.getAttribute('OPFContactEmailAddressName') || ''; - } - } - if (!sender) { - sender = getTextContent(['from', 'From', 'sender', 'Sender']); - } - - // Parse date - const dateStr = getTextContent(['OPFMessageCopySentTime', 'OPFMessageCopyReceivedTime', 'sentTime', 'SentTime', 'date', 'Date']); - const date = dateStr ? new Date(dateStr) : new Date(); - - // Parse recipients from OPFMessageCopyToAddresses - const recipients: string[] = []; - const toAddresses = emailElement.querySelector('OPFMessageCopyToAddresses'); - if (toAddresses) { - const emailAddrs = toAddresses.querySelectorAll('emailAddress'); - emailAddrs.forEach(addr => { - const email = addr.getAttribute('OPFContactEmailAddressAddress'); - if (email) { - recipients.push(email); - } - }); - } - if (recipients.length === 0) { - const recipientsStr = getTextContent(['to', 'To', 'recipients']); - if (recipientsStr) { - recipients.push(...recipientsStr.split(/[,;]/).map(r => r.trim()).filter(Boolean)); - } - } - - // Parse isRead status - const isReadStr = getTextContent(['OPFMessageGetIsRead']); - const isRead = isReadStr === '1' || isReadStr.toLowerCase() === 'true'; - - // Parse thread ID from OLM (if available) - let threadId = getTextContent([ - 'OPFMessageCopyThreadTopic', - 'OPFMessageCopyConversationID', - 'threadId', - 'Thread-Topic', - 'In-Reply-To', - 'References', - ]); - - // If no explicit thread ID, generate one from normalized subject - if (!threadId) { - const normalizedSubject = normalizeSubject(subject || ''); - if (normalizedSubject) { - // Create a thread key from normalized subject - threadId = `subject:${normalizedSubject.toLowerCase().replace(/\s+/g, '-')}`; - } - } - - // If we couldn't find a subject, this might not be a valid email - if (!subject && !body && !preview) { - return null; - } - - // Sanitize field lengths to prevent memory issues - const rawSubject = subject || '(No Subject)'; - const rawBody = body || preview || ''; - const sanitizedSender = cleanEmailAddress(sender).slice(0, MAX_EMAIL_LEN); - const sanitizedRecipients = recipients.map(r => r.slice(0, MAX_EMAIL_LEN)).slice(0, 1000); - - return { - subject: rawSubject.length > MAX_SUBJECT_LEN ? rawSubject.slice(0, MAX_SUBJECT_LEN) : rawSubject, - sender: sanitizedSender, - senderName: senderName || undefined, - recipients: sanitizedRecipients, - date: isNaN(date.getTime()) ? new Date() : date, - body: rawBody.length > MAX_BODY_LEN ? rawBody.slice(0, MAX_BODY_LEN) : rawBody, - htmlBody: htmlBody && htmlBody.length > MAX_BODY_LEN ? htmlBody.slice(0, MAX_BODY_LEN) : (htmlBody || undefined), - attachments: [], - size: xmlContent.length, - isRead, - isStarred: false, - folderId: 'inbox', - threadId: threadId || undefined, - emailType: 'regular', - }; - } catch (error) { - console.warn('Failed to parse email XML:', error); - return null; - } - } - - private parseContactsXML(xmlContent: string): Omit[] { - const contacts: Omit[] = []; - - try { - const parser = new DOMParser(); - const doc = parser.parseFromString(xmlContent, 'text/xml'); - - const parserError = doc.querySelector('parsererror'); - if (parserError) { - console.warn('Contact XML parsing error:', parserError.textContent); - return contacts; - } - - // Get all contact elements - const contactElements = doc.querySelectorAll('contact'); - - contactElements.forEach(contactElement => { - const getTextContent = (selectors: string[]): string => { - for (const selector of selectors) { - const element = contactElement.querySelector(selector); - if (element?.textContent) { - return element.textContent.trim(); - } - } - return ''; - }; - - const displayName = getTextContent(['OPFContactCopyDisplayName', 'displayName', 'name', 'Name']); - const firstName = getTextContent(['OPFContactCopyFirstName', 'firstName']); - const lastName = getTextContent(['OPFContactCopyLastName', 'lastName']); - const phone = getTextContent(['OPFContactCopyPhoneNumbers', 'phone', 'Phone']); - - // Get email from email address list - let email = ''; - const emailList = contactElement.querySelector('OPFContactCopyEmailAddressList, OPFContactCopyDefaultEmailAddress'); - if (emailList) { - const emailAddr = emailList.querySelector('contactEmailAddress'); - if (emailAddr) { - email = emailAddr.getAttribute('OPFContactEmailAddressAddress') || ''; - } - } - if (!email) { - email = getTextContent(['email', 'Email', 'emailAddress']); - } - - const name = displayName || `${firstName} ${lastName}`.trim() || email.split('@')[0] || 'Unknown'; - - if (email || name !== 'Unknown') { - contacts.push({ - name, - email: cleanEmailAddress(email), - phone: phone || undefined, - emailCount: 0, - lastEmailDate: new Date(), - }); - } - }); - - // If no contact elements found, try parsing as a single contact - if (contactElements.length === 0) { - const name = doc.querySelector('OPFContactCopyDisplayName, displayName, name')?.textContent?.trim(); - const emailList = doc.querySelector('OPFContactCopyEmailAddressList, OPFContactCopyDefaultEmailAddress'); - let email = ''; - if (emailList) { - const emailAddr = emailList.querySelector('contactEmailAddress'); - if (emailAddr) { - email = emailAddr.getAttribute('OPFContactEmailAddressAddress') || ''; - } - } - - if (email || name) { - contacts.push({ - name: name || email?.split('@')[0] || 'Unknown', - email: cleanEmailAddress(email), - phone: undefined, - emailCount: 0, - lastEmailDate: new Date(), - }); - } - } - } catch (error) { - console.warn('Failed to parse contacts XML:', error); - } - - return contacts; - } - - private parseCalendarXML(xmlContent: string): Omit[] { - const events: Omit[] = []; - - try { - const parser = new DOMParser(); - const doc = parser.parseFromString(xmlContent, 'text/xml'); - - const parserError = doc.querySelector('parsererror'); - if (parserError) { - console.warn('Calendar XML parsing error:', parserError.textContent); - return events; - } - - // Get all appointment elements - const appointmentElements = doc.querySelectorAll('appointment'); - - appointmentElements.forEach(appointmentElement => { - const getTextContent = (selectors: string[]): string => { - for (const selector of selectors) { - const element = appointmentElement.querySelector(selector); - if (element?.textContent) { - return element.textContent.trim(); - } - } - return ''; - }; - - // OLM uses OPFCalendarEventCopySummary for the title - const title = getTextContent([ - 'OPFCalendarEventCopySummary', - 'OPFCalendarEventCopySubject', - 'summary', - 'subject', - 'Subject', - 'title', - 'Title' - ]); - const startDateStr = getTextContent(['OPFCalendarEventCopyStartTime', 'startTime', 'StartTime', 'start']); - const endDateStr = getTextContent(['OPFCalendarEventCopyEndTime', 'endTime', 'EndTime', 'end']); - const location = getTextContent(['OPFCalendarEventCopyLocation', 'location', 'Location']); - const description = getTextContent(['OPFCalendarEventCopyBody', 'OPFCalendarEventCopyNotes', 'body', 'Body', 'description']); - const organizer = getTextContent(['OPFCalendarEventCopyOrganizer', 'organizer', 'Organizer']); - const isAllDayStr = getTextContent(['OPFCalendarEventGetIsAllDayEvent', 'OPFCalendarEventCopyIsAllDay', 'isAllDay', 'AllDay']); - - if (!title) { - return; // Skip appointments without a title - } - - const startDate = startDateStr ? new Date(startDateStr) : new Date(); - const endDate = endDateStr ? new Date(endDateStr) : new Date(startDate.getTime() + 3600000); - - events.push({ - title, - startDate: isNaN(startDate.getTime()) ? new Date() : startDate, - endDate: isNaN(endDate.getTime()) ? new Date() : endDate, - location: location || undefined, - attendees: organizer ? [organizer] : [], - description: description || undefined, - isAllDay: isAllDayStr === '1' || isAllDayStr?.toLowerCase() === 'true', - reminder: false, - isRead: false, // Mark as unread on import - }); - }); - - } catch (error) { - console.warn('Failed to parse calendar XML:', error); - } - - return events; - } - - private async runDetection(email: Email, result: OLMProcessingResult): Promise { - // Detect account signups - const accountResult = accountDetector.detectAccountSignup(email); - if (accountResult.type === 'account' && accountResult.data?.serviceName) { - const existingAccount = await getAccountByServiceName(accountResult.data.serviceName); - if (!existingAccount) { - const accountData = accountDetector.createAccountFromEmail( - email, - accountResult.data.serviceName, - accountResult.data.serviceType as Account['serviceType'] - ); - await insertAccount(accountData); - result.accounts++; - } - } - - // Detect purchases - const purchaseResult = purchaseDetector.detectPurchase(email); - if (purchaseResult.type === 'purchase' && purchaseResult.data?.amount) { - const merchant = purchaseResult.data.merchant || 'Unknown'; - const amount = purchaseResult.data.amount; - const orderNumber = purchaseResult.data.orderNumber; - - // Check for duplicates before inserting - const existingPurchase = await findDuplicatePurchase( - merchant, - amount, - email.date, - orderNumber - ); - - if (!existingPurchase) { - const purchaseData = purchaseDetector.createPurchaseFromEmail( - email, - merchant, - amount, - orderNumber - ); - await insertPurchase(purchaseData); - result.purchases++; - } - } - - // Detect subscriptions - const subResult = subscriptionDetector.detectSubscription(email); - if (subResult.isSubscription && subResult.serviceName) { - // Use normalized sender domain as the grouping key for consistency - const senderDomain = extractDomain(email.sender); - - // Try to find existing subscription by service name first, then by domain - let existingSub = await getSubscriptionByServiceName(subResult.serviceName); - if (!existingSub && senderDomain) { - existingSub = await getSubscriptionByServiceName(senderDomain); - } - - if (existingSub) { - // Update existing subscription with new email - const emailIds = [...new Set([...existingSub.emailIds, email.id!])]; // Dedupe - const isNewerEmail = email.date > existingSub.lastRenewalDate; - - // Only update amount if this is a newer email AND has a detected amount - // This ensures we use the most recent billing amount - const shouldUpdateAmount = isNewerEmail && subResult.amount && subResult.amount > 0; - - await updateSubscription(existingSub.id!, { - emailIds, - lastRenewalDate: isNewerEmail ? email.date : existingSub.lastRenewalDate, - // Keep existing amount if new email doesn't have one, or if it's an older email - monthlyAmount: shouldUpdateAmount ? subResult.amount : existingSub.monthlyAmount, - }); - } else { - // Create new subscription - const newSub: Omit = { - serviceName: subResult.serviceName, - monthlyAmount: subResult.amount || 0, - currency: subResult.currency || 'USD', - frequency: subResult.frequency || 'monthly', - lastRenewalDate: email.date, - emailIds: [email.id!], - isActive: true, - category: subResult.category || 'other', - }; - await insertSubscription(newSub); - result.subscriptions++; - } - } - - // Detect newsletters/promotional emails - const nlResult = newsletterDetector.detectNewsletter(email); - if (nlResult.isNewsletter || nlResult.isPromotional) { - const existingNL = await getNewsletterBySender(email.sender); - if (existingNL) { - // Update existing newsletter - await updateNewsletter(existingNL.id!, { - emailCount: existingNL.emailCount + 1, - lastEmailDate: email.date > existingNL.lastEmailDate ? email.date : existingNL.lastEmailDate, - unsubscribeLink: nlResult.unsubscribeLink || existingNL.unsubscribeLink, - }); - } else { - // Create new newsletter entry - const newNL: Omit = { - senderEmail: email.sender, - senderName: email.senderName || email.sender.split('@')[0], - emailCount: 1, - lastEmailDate: email.date, - unsubscribeLink: nlResult.unsubscribeLink, - isPromotional: nlResult.isPromotional, - }; - await insertNewsletter(newNL); - result.newsletters++; - } - } - } - - /** - * Track contact from email sender - * @returns true if a new contact was created, false if existing was updated - */ - private async trackContact(email: Omit): Promise { - const senderEmail = email.sender; - if (!senderEmail || senderEmail === 'unknown@example.com') return false; - - const existingContact = await getContactByEmail(senderEmail); - if (existingContact) { - // Update existing contact's email count - await updateContactEmailCount( - senderEmail, - existingContact.emailCount + 1, - email.date - ); - return false; // Not a new contact - } else { - // Create new contact from email sender - const senderName = email.senderName || senderEmail.split('@')[0] || 'Unknown'; - await insertContact({ - name: senderName, - email: senderEmail, - phone: undefined, - emailCount: 1, - lastEmailDate: email.date, - }); - return true; // New contact created - } - } -} - -export const olmParser = new OLMParser(); diff --git a/web/src/services/purchaseDetector.ts b/web/src/services/purchaseDetector.ts deleted file mode 100644 index 2b4cfed..0000000 --- a/web/src/services/purchaseDetector.ts +++ /dev/null @@ -1,502 +0,0 @@ -import type { Email, Purchase, DetectionResult } from '../types'; -import { stripHtml, extractDomain } from '../utils/emailUtils'; -import { isDomainMatch } from './domainMatch'; - -class PurchaseDetector { - // Strong subject line patterns for purchases (must be primary purpose) - private readonly strongSubjectPatterns = [ - /^(?:your )?order (?:confirmation|receipt|#)/i, - /^(?:your )?(?:purchase|payment) (?:confirmation|receipt)/i, - /^receipt (?:for|from)/i, - /^invoice (?:for|from|#)/i, - /^thank you for your (?:order|purchase)/i, - /^order #?\w+ (?:confirmed|shipped|delivered)/i, - /^your .{2,30} order/i, - /^shipping confirmation/i, - /^payment received/i, - /^transaction receipt/i, - ]; - - // Strong body patterns that indicate actual purchase confirmation (multi-currency) - private readonly strongBodyPatterns = [ - /order\s+(?:total|summary)[:\s]+[$€£¥₹₩]\s*[\d,]+[.,]?\d*/i, - /(?:amount|total)\s+(?:charged|paid)[:\s]+[$€£¥₹₩]\s*[\d,]+[.,]?\d*/i, - /you (?:have )?(?:paid|purchased|ordered)/i, - /thank you for your (?:order|purchase) (?:of|from)/i, - /your order has been (?:confirmed|placed|received)/i, - /payment of [$€£¥₹₩]\s*[\d,]+[.,]?\d*/i, - /transaction amount[:\s]+[$€£¥₹₩]\s*[\d,]+[.,]?\d*/i, - /order #\s*[A-Z0-9-]{5,}/i, - /order number[:\s]+[A-Z0-9-]{5,}/i, - /betrag[:\s]+€\s*[\d,]+[.,]\d{2}/i, // German - /montant[:\s]+€\s*[\d\s]+[.,]\d{2}/i, // French - /importe[:\s]+€\s*[\d,]+[.,]\d{2}/i, // Spanish - ]; - - // Patterns that indicate this is NOT a purchase (promotional emails, etc.) - private readonly antiPatterns = [ - /save \$\d+/i, - /up to \d+% off/i, - /free shipping/i, - /sale ends/i, - /limited time/i, - /discount code/i, - /promo code/i, - /shop now/i, - /buy now/i, - /subscribe/i, - /unsubscribe/i, - /view in browser/i, - ]; - - // Known merchant domains for reliable detection - private readonly knownMerchants: Record = { - 'amazon.com': 'Amazon', - 'ebay.com': 'eBay', - 'etsy.com': 'Etsy', - 'paypal.com': 'PayPal', - 'stripe.com': 'Stripe', - 'square.com': 'Square', - 'shopify.com': 'Shopify', - 'apple.com': 'Apple', - 'google.com': 'Google', - 'microsoft.com': 'Microsoft', - 'netflix.com': 'Netflix', - 'spotify.com': 'Spotify', - 'hulu.com': 'Hulu', - 'starbucks.com': 'Starbucks', - 'mcdonalds.com': "McDonald's", - 'uber.com': 'Uber', - 'ubereats.com': 'Uber Eats', - 'doordash.com': 'DoorDash', - 'grubhub.com': 'Grubhub', - 'instacart.com': 'Instacart', - 'walmart.com': 'Walmart', - 'target.com': 'Target', - 'bestbuy.com': 'Best Buy', - 'costco.com': 'Costco', - 'homedepot.com': 'Home Depot', - 'lowes.com': "Lowe's", - 'nordstrom.com': 'Nordstrom', - 'macys.com': "Macy's", - 'kohls.com': "Kohl's", - 'gap.com': 'Gap', - 'oldnavy.com': 'Old Navy', - 'nike.com': 'Nike', - 'adidas.com': 'Adidas', - 'newegg.com': 'Newegg', - 'bhphotovideo.com': 'B&H Photo', - 'dell.com': 'Dell', - 'hp.com': 'HP', - 'lenovo.com': 'Lenovo', - 'aliexpress.com': 'AliExpress', - 'wish.com': 'Wish', - 'chewy.com': 'Chewy', - 'wayfair.com': 'Wayfair', - 'ikea.com': 'IKEA', - 'sephora.com': 'Sephora', - 'ulta.com': 'Ulta', - 'airbnb.com': 'Airbnb', - 'booking.com': 'Booking.com', - 'expedia.com': 'Expedia', - 'southwest.com': 'Southwest Airlines', - 'delta.com': 'Delta Airlines', - 'united.com': 'United Airlines', - 'american.com': 'American Airlines', - 'lyft.com': 'Lyft', - 'seamless.com': 'Seamless', - 'postmates.com': 'Postmates', - 'caviar.com': 'Caviar', - 'ticketmaster.com': 'Ticketmaster', - 'stubhub.com': 'StubHub', - 'seatgeek.com': 'SeatGeek', - 'eventbrite.com': 'Eventbrite', - 'steamgames.com': 'Steam', - 'steampowered.com': 'Steam', - 'epicgames.com': 'Epic Games', - 'playstation.com': 'PlayStation', - 'xbox.com': 'Xbox', - 'nintendo.com': 'Nintendo', - }; - - // Merchant category mappings - private readonly merchantCategories: Record = { - 'amazon': 'ecommerce', - 'ebay': 'ecommerce', - 'etsy': 'ecommerce', - 'walmart': 'ecommerce', - 'target': 'ecommerce', - 'costco': 'ecommerce', - 'wayfair': 'ecommerce', - 'aliexpress': 'ecommerce', - 'wish': 'ecommerce', - 'shopify': 'ecommerce', - 'best buy': 'technology', - 'newegg': 'technology', - 'b&h photo': 'technology', - 'apple': 'technology', - 'dell': 'technology', - 'hp': 'technology', - 'lenovo': 'technology', - 'microsoft': 'technology', - 'paypal': 'payment', - 'stripe': 'payment', - 'square': 'payment', - 'venmo': 'payment', - 'netflix': 'entertainment', - 'spotify': 'entertainment', - 'hulu': 'entertainment', - 'disney+': 'entertainment', - 'hbo max': 'entertainment', - 'steam': 'entertainment', - 'epic games': 'entertainment', - 'playstation': 'entertainment', - 'xbox': 'entertainment', - 'nintendo': 'entertainment', - 'ticketmaster': 'entertainment', - 'stubhub': 'entertainment', - 'seatgeek': 'entertainment', - 'eventbrite': 'entertainment', - 'starbucks': 'food', - 'mcdonalds': 'food', - "mcdonald's": 'food', - 'doordash': 'food', - 'grubhub': 'food', - 'uber eats': 'food', - 'instacart': 'food', - 'seamless': 'food', - 'postmates': 'food', - 'caviar': 'food', - 'uber': 'transportation', - 'lyft': 'transportation', - 'southwest': 'travel', - 'delta': 'travel', - 'united': 'travel', - 'american': 'travel', - 'airbnb': 'travel', - 'booking.com': 'travel', - 'expedia': 'travel', - 'home depot': 'home', - "lowe's": 'home', - 'ikea': 'home', - 'nordstrom': 'fashion', - "macy's": 'fashion', - "kohl's": 'fashion', - 'gap': 'fashion', - 'old navy': 'fashion', - 'nike': 'fashion', - 'adidas': 'fashion', - 'sephora': 'beauty', - 'ulta': 'beauty', - 'chewy': 'pets', - }; - - detectPurchase(email: Email): DetectionResult { - const subject = email.subject || ''; - const body = stripHtml(email.body || ''); - const sender = email.sender || ''; - - // Check for anti-patterns first (promotional emails) - const combinedText = `${subject} ${body}`; - let antiPatternMatches = 0; - for (const pattern of this.antiPatterns) { - if (pattern.test(combinedText)) { - antiPatternMatches++; - } - } - // If too many promotional patterns, it's likely not a real purchase - if (antiPatternMatches >= 3) { - return { type: 'none', confidence: 0 }; - } - - let confidence = 0; - let amount = 0; - let currency = 'USD'; - let merchant = ''; - let orderNumber = ''; - - // Check if sender is from a known merchant - const domain = extractDomain(sender); - const knownMerchant = this.findKnownMerchant(domain); - if (knownMerchant) { - merchant = knownMerchant; - confidence += 30; - } - - // Check strong subject patterns - for (const pattern of this.strongSubjectPatterns) { - if (pattern.test(subject)) { - confidence += 35; - break; - } - } - - // Check strong body patterns - for (const pattern of this.strongBodyPatterns) { - if (pattern.test(body)) { - confidence += 25; - break; - } - } - - // Only extract amount if we have some confidence this is a purchase - if (confidence >= 30) { - const extracted = this.extractAmount(body); - amount = extracted.amount; - currency = extracted.currency; - - if (amount > 0 && amount < 10000) { // Reasonable purchase amount - confidence += 20; - } else if (amount >= 10000) { - // Large amounts require extra validation - confidence += 10; - } - - // Extract order number - orderNumber = this.extractOrderNumber(body); - if (orderNumber && this.isValidOrderNumber(orderNumber)) { - confidence += 15; - } - - // If no known merchant, try to extract from email - if (!merchant) { - merchant = this.formatDomainAsMerchant(domain); - } - } - - // Require high confidence AND a reasonable amount - if (confidence >= 70 && amount > 0 && merchant) { - return { - type: 'purchase', - confidence, - data: { - merchant, - amount, - currency, - orderNumber: this.isValidOrderNumber(orderNumber) ? orderNumber : undefined, - }, - }; - } - - return { type: 'none', confidence: 0 }; - } - - private extractAmount(text: string): { amount: number; currency: string } { - // Multi-currency context patterns - const contextPatterns = [ - // USD - { currency: 'USD', pattern: /(?:order\s+)?total[:\s]+\$\s*([\d,]+\.\d{2})/i }, - { currency: 'USD', pattern: /(?:amount|total)\s+(?:charged|paid|due)[:\s]+\$\s*([\d,]+\.\d{2})/i }, - { currency: 'USD', pattern: /payment\s+(?:of|amount)[:\s]+\$\s*([\d,]+\.\d{2})/i }, - { currency: 'USD', pattern: /grand\s+total[:\s]+\$\s*([\d,]+\.\d{2})/i }, - // EUR - { currency: 'EUR', pattern: /(?:order\s+)?total[:\s]+€\s*([\d\s,]+[.,]\d{2})/i }, - { currency: 'EUR', pattern: /(?:amount|total)\s+(?:charged|paid|due)[:\s]+€\s*([\d\s,]+[.,]\d{2})/i }, - { currency: 'EUR', pattern: /betrag[:\s]+€\s*([\d,]+[.,]\d{2})/i }, - { currency: 'EUR', pattern: /montant[:\s]+€\s*([\d\s]+[.,]\d{2})/i }, - { currency: 'EUR', pattern: /importe[:\s]+€\s*([\d,]+[.,]\d{2})/i }, - // GBP - { currency: 'GBP', pattern: /(?:order\s+)?total[:\s]+£\s*([\d,]+\.\d{2})/i }, - { currency: 'GBP', pattern: /(?:amount|total)\s+(?:charged|paid|due)[:\s]+£\s*([\d,]+\.\d{2})/i }, - // JPY - { currency: 'JPY', pattern: /(?:order\s+)?total[:\s]+¥\s*([\d,]+)/i }, - { currency: 'JPY', pattern: /(?:amount|total)[:\s]+¥\s*([\d,]+)/i }, - // CAD - { currency: 'CAD', pattern: /(?:order\s+)?total[:\s]+C\$\s*([\d,]+\.\d{2})/i }, - // AUD - { currency: 'AUD', pattern: /(?:order\s+)?total[:\s]+A\$\s*([\d,]+\.\d{2})/i }, - // INR - { currency: 'INR', pattern: /(?:order\s+)?total[:\s]+₹\s*([\d,]+\.\d{2})/i }, - // CHF - { currency: 'CHF', pattern: /(?:order\s+)?total[:\s]+CHF\s*([\d',]+\.\d{2})/i }, - ]; - - for (const { currency, pattern } of contextPatterns) { - const match = text.match(pattern); - if (match && match[1]) { - const amount = this.parseAmount(match[1], currency); - if (amount > 0) { - return { amount, currency }; - } - } - } - - // Fallback: detect currency from any amount found - const currencyMatches = [ - { currency: 'EUR', regex: /€\s*([\d\s,]+[.,]\d{2})/g }, - { currency: 'GBP', regex: /£\s*([\d,]+\.\d{2})/g }, - { currency: 'JPY', regex: /¥\s*([\d,]+)/g }, - { currency: 'INR', regex: /₹\s*([\d,]+[.,]\d{2})/g }, - { currency: 'USD', regex: /\$\s*([\d,]+\.\d{2})/g }, - ]; - - for (const { currency, regex } of currencyMatches) { - const allAmounts = [...text.matchAll(regex)]; - if (allAmounts.length >= 1 && allAmounts.length <= 5) { - const amounts = allAmounts - .map(m => this.parseAmount(m[1], currency)) - .filter(a => a > 0 && a < 500000); - - if (amounts.length > 0) { - return { amount: Math.max(...amounts), currency }; - } - } - } - - return { amount: 0, currency: 'USD' }; - } - - private parseAmount(amountStr: string, currency: string): number { - // Strip spaces (used as thousands separators in some locales) and apostrophes (CHF) - let cleaned = amountStr.replace(/[\s']/g, ''); - - // CHF uses apostrophe for thousands (already stripped above) and dot for decimal, - // so it behaves like a comma-decimal locale ONLY for the comma separator. - const commaDecimalLocale = currency === 'EUR' || currency === 'BRL'; - const commaDecimalOrCHF = commaDecimalLocale || currency === 'CHF'; - const lastComma = cleaned.lastIndexOf(','); - const lastDot = cleaned.lastIndexOf('.'); - - if (lastComma !== -1 && lastDot !== -1) { - // Both separators present: the LAST one is the decimal separator. - if (lastComma > lastDot) { - // comma is decimal, dot is thousands (e.g. 1.234,56) - cleaned = cleaned.replace(/\./g, '').replace(',', '.'); - } else { - // dot is decimal, comma is thousands (e.g. 1,234.56) - cleaned = cleaned.replace(/,/g, ''); - } - } else if (lastComma !== -1) { - // Only commas present. - const tail = cleaned.slice(lastComma + 1); - // Treat as decimal if 1-2 trailing digits AND (comma-decimal locale OR exactly one comma) - const oneComma = cleaned.indexOf(',') === lastComma; - if (tail.length >= 1 && tail.length <= 2 && (commaDecimalOrCHF || oneComma)) { - cleaned = cleaned.replace(',', '.'); - } else { - // comma(s) are thousands separators - cleaned = cleaned.replace(/,/g, ''); - } - } else if (lastDot !== -1) { - // Only dots present. - const tail = cleaned.slice(lastDot + 1); - const oneDot = cleaned.indexOf('.') === lastDot; - // In comma-decimal locales a lone dot is a thousands separator (1.234 -> 1234), - // UNLESS it clearly looks like cents in a non-comma locale. - if (commaDecimalLocale) { - // dot is thousands -> drop it (1.234 -> 1234, 1.234.567 -> 1234567) - cleaned = cleaned.replace(/\./g, ''); - } else if (!(tail.length >= 1 && tail.length <= 2 && oneDot)) { - // non-comma locale but dot is grouping (e.g. 1.234.567) - cleaned = cleaned.replace(/\./g, ''); - } - // else: dot is the decimal point, leave as-is - } - - const amount = parseFloat(cleaned); - return isNaN(amount) ? 0 : amount; - } - - private extractOrderNumber(text: string): string { - const patterns = [ - /order\s*(?:#|number|no\.?)[:\s]*([A-Z0-9][A-Z0-9-]{4,20})/i, - /confirmation\s*(?:#|number|no\.?)[:\s]*([A-Z0-9][A-Z0-9-]{4,20})/i, - /(?:order|reference)\s+(?:id|#)[:\s]*([A-Z0-9][A-Z0-9-]{4,20})/i, - /tracking\s*(?:#|number)[:\s]*([A-Z0-9][A-Z0-9-]{8,30})/i, - ]; - - for (const pattern of patterns) { - const match = text.match(pattern); - if (match && match[1]) { - const orderNum = match[1].trim(); - if (this.isValidOrderNumber(orderNum)) { - return orderNum; - } - } - } - - return ''; - } - - private isValidOrderNumber(orderNum: string): boolean { - if (!orderNum || orderNum.length < 5 || orderNum.length > 30) { - return false; - } - // Must start with alphanumeric - if (!/^[A-Z0-9]/i.test(orderNum)) { - return false; - } - // Must contain mostly alphanumeric with possible hyphens - if (!/^[A-Z0-9-]+$/i.test(orderNum)) { - return false; - } - // Should not look like CSS (no common CSS patterns) - const cssPatterns = ['-collapse', '-color', '-width', '-height', '-size', '-weight', '-style']; - for (const pattern of cssPatterns) { - if (orderNum.toLowerCase().includes(pattern)) { - return false; - } - } - return true; - } - - private findKnownMerchant(domain: string): string | null { - // Direct match - if (this.knownMerchants[domain]) { - return this.knownMerchants[domain]; - } - - // Exact or subdomain match against each known merchant domain - for (const [merchantDomain, name] of Object.entries(this.knownMerchants)) { - if (isDomainMatch(domain, merchantDomain)) { - return name; - } - } - - return null; - } - - private formatDomainAsMerchant(domain: string): string { - if (!domain) return ''; - - const parts = domain.split('.'); - if (parts.length < 2) return ''; - - let mainPart = parts.length > 2 ? parts[parts.length - 2] : parts[0]; - - // Skip common email subdomains - const skipWords = ['mail', 'email', 'noreply', 'no-reply', 'notifications', 'info', 'support', 'orders', 'receipts', 'billing']; - if (skipWords.includes(mainPart.toLowerCase()) && parts.length > 2) { - mainPart = parts[parts.length - 2]; - } - - return mainPart.charAt(0).toUpperCase() + mainPart.slice(1); - } - - getPurchaseCategory(merchant: string): string { - const lowerMerchant = merchant.toLowerCase(); - - for (const [key, category] of Object.entries(this.merchantCategories)) { - if (lowerMerchant.includes(key.toLowerCase())) { - return category; - } - } - - return 'other'; - } - - createPurchaseFromEmail(email: Email, merchant: string, amount: number, orderNumber?: string, currency: string = 'USD'): Omit { - return { - emailId: email.id, - merchant, - amount, - currency, - purchaseDate: email.date, - orderNumber, - items: [], - category: this.getPurchaseCategory(merchant), - }; - } -} - -export const purchaseDetector = new PurchaseDetector(); diff --git a/web/src/services/searchParser.ts b/web/src/services/searchParser.ts index 8289307..fd89550 100644 --- a/web/src/services/searchParser.ts +++ b/web/src/services/searchParser.ts @@ -303,8 +303,9 @@ export function filterEmails(emails: Email[], search: ParsedSearch, bodyMatch?: } } - // Date year filter + // Date year filter — undated emails can't satisfy any date filter, so exclude them. if (search.dateYear) { + if (!email.date) return false; const emailYear = new Date(email.date).getFullYear(); if (emailYear !== search.dateYear) { return false; @@ -313,6 +314,7 @@ export function filterEmails(emails: Email[], search: ParsedSearch, bodyMatch?: // Date range filters if (search.dateFrom) { + if (!email.date) return false; const emailDate = new Date(email.date); emailDate.setHours(0, 0, 0, 0); const fromDate = new Date(search.dateFrom); @@ -323,6 +325,7 @@ export function filterEmails(emails: Email[], search: ParsedSearch, bodyMatch?: } if (search.dateTo) { + if (!email.date) return false; const emailDate = new Date(email.date); emailDate.setHours(23, 59, 59, 999); const toDate = new Date(search.dateTo); diff --git a/web/src/services/subscriptionDetector.ts b/web/src/services/subscriptionDetector.ts deleted file mode 100644 index dacb2f0..0000000 --- a/web/src/services/subscriptionDetector.ts +++ /dev/null @@ -1,310 +0,0 @@ -import type { Email, Subscription } from '../types'; -import { stripHtml, extractDomain, formatDomainAsName } from '../utils/emailUtils'; -import { isDomainMatch } from './domainMatch'; - -/** - * Detector for recurring subscription services - */ -class SubscriptionDetector { - // Known subscription services - private readonly knownSubscriptions: Record = { - // Streaming - 'netflix.com': { name: 'Netflix', category: 'streaming' }, - 'spotify.com': { name: 'Spotify', category: 'streaming' }, - 'hulu.com': { name: 'Hulu', category: 'streaming' }, - 'disneyplus.com': { name: 'Disney+', category: 'streaming' }, - 'hbomax.com': { name: 'HBO Max', category: 'streaming' }, - 'max.com': { name: 'Max', category: 'streaming' }, - 'appletv.apple.com': { name: 'Apple TV+', category: 'streaming' }, - 'primevideo.com': { name: 'Prime Video', category: 'streaming' }, - 'peacocktv.com': { name: 'Peacock', category: 'streaming' }, - 'paramountplus.com': { name: 'Paramount+', category: 'streaming' }, - 'crunchyroll.com': { name: 'Crunchyroll', category: 'streaming' }, - 'audible.com': { name: 'Audible', category: 'streaming' }, - 'youtube.com': { name: 'YouTube Premium', category: 'streaming' }, - 'pandora.com': { name: 'Pandora', category: 'streaming' }, - 'deezer.com': { name: 'Deezer', category: 'streaming' }, - 'tidal.com': { name: 'Tidal', category: 'streaming' }, - 'twitch.tv': { name: 'Twitch', category: 'streaming' }, - // Software - 'adobe.com': { name: 'Adobe Creative Cloud', category: 'software' }, - 'microsoft.com': { name: 'Microsoft 365', category: 'software' }, - 'office365.com': { name: 'Microsoft 365', category: 'software' }, - 'dropbox.com': { name: 'Dropbox', category: 'software' }, - 'notion.so': { name: 'Notion', category: 'software' }, - '1password.com': { name: '1Password', category: 'software' }, - 'lastpass.com': { name: 'LastPass', category: 'software' }, - 'bitwarden.com': { name: 'Bitwarden', category: 'software' }, - 'grammarly.com': { name: 'Grammarly', category: 'software' }, - 'canva.com': { name: 'Canva Pro', category: 'software' }, - 'figma.com': { name: 'Figma', category: 'software' }, - 'slack.com': { name: 'Slack', category: 'software' }, - 'zoom.us': { name: 'Zoom', category: 'software' }, - 'github.com': { name: 'GitHub', category: 'software' }, - 'jetbrains.com': { name: 'JetBrains', category: 'software' }, - // VPN - 'nordvpn.com': { name: 'NordVPN', category: 'software' }, - 'expressvpn.com': { name: 'ExpressVPN', category: 'software' }, - 'surfshark.com': { name: 'Surfshark', category: 'software' }, - // News/Publications - 'nytimes.com': { name: 'New York Times', category: 'news' }, - 'washingtonpost.com': { name: 'Washington Post', category: 'news' }, - 'wsj.com': { name: 'Wall Street Journal', category: 'news' }, - 'economist.com': { name: 'The Economist', category: 'news' }, - 'medium.com': { name: 'Medium', category: 'news' }, - 'substack.com': { name: 'Substack', category: 'news' }, - // Fitness - 'peloton.com': { name: 'Peloton', category: 'fitness' }, - 'classpass.com': { name: 'ClassPass', category: 'fitness' }, - 'myfitnesspal.com': { name: 'MyFitnessPal', category: 'fitness' }, - 'strava.com': { name: 'Strava', category: 'fitness' }, - 'fitbit.com': { name: 'Fitbit Premium', category: 'fitness' }, - 'calm.com': { name: 'Calm', category: 'fitness' }, - 'headspace.com': { name: 'Headspace', category: 'fitness' }, - // Other - 'amazon.com': { name: 'Amazon Prime', category: 'other' }, - 'costco.com': { name: 'Costco Membership', category: 'other' }, - 'linkedin.com': { name: 'LinkedIn Premium', category: 'other' }, - 'evernote.com': { name: 'Evernote', category: 'other' }, - }; - - // Subscription-related subject patterns - private readonly subjectPatterns = [ - /subscription\s+(?:confirmed?|renewed?|receipt)/i, - /your\s+(?:monthly|yearly|annual)\s+(?:subscription|membership|plan)/i, - /(?:subscription|membership)\s+(?:renewal|billing|payment)/i, - /(?:thank you|thanks)\s+for\s+(?:subscribing|your subscription)/i, - /your\s+\w+\s+(?:subscription|membership)\s+(?:is active|has been renewed)/i, - /billing\s+(?:receipt|statement|confirmation)/i, - /payment\s+(?:confirmation|receipt)\s+for\s+(?:subscription|membership)/i, - /auto.?renew(?:al|ed)?/i, - /recurring\s+(?:payment|charge|billing)/i, - /your\s+next\s+(?:bill|payment)\s+(?:date|is)/i, - ]; - - // Body patterns for subscription emails - private readonly bodyPatterns = [ - /subscription\s+(?:plan|type)[:\s]+/i, - /billing\s+period[:\s]+/i, - /next\s+(?:billing|payment)\s+date[:\s]+/i, - /auto.?renew(?:s|al)?\s+on/i, - /(?:monthly|annual|yearly)\s+(?:subscription|membership|plan)/i, - /(?:subscription|membership)\s+(?:fee|price|cost)[:\s]+[$€£]/i, - /renews?\s+(?:on|every)\s+/i, - /recurring\s+(?:charge|payment)[:\s]+/i, - /cancel\s+(?:anytime|subscription|membership)/i, - ]; - - /** - * Detect if an email is a subscription-related email - */ - detectSubscription(email: Email): { isSubscription: boolean; serviceName?: string; category?: Subscription['category']; amount?: number; currency?: string; frequency?: Subscription['frequency'] } { - const subject = email.subject || ''; - const body = stripHtml(email.body || ''); - const sender = email.sender || ''; - const domain = extractDomain(sender); - - let isSubscription = false; - let serviceName: string | undefined; - let category: Subscription['category'] | undefined; - let amount: number | undefined; - let currency: string | undefined; - let frequency: Subscription['frequency'] | undefined; - - // Check known subscription services - const knownService = this.findKnownSubscription(domain); - if (knownService) { - serviceName = knownService.name; - category = knownService.category; - } - - // Check subject patterns - for (const pattern of this.subjectPatterns) { - if (pattern.test(subject)) { - isSubscription = true; - break; - } - } - - // Check body patterns - if (!isSubscription) { - let bodyMatches = 0; - for (const pattern of this.bodyPatterns) { - if (pattern.test(body)) { - bodyMatches++; - } - } - if (bodyMatches >= 2) { - isSubscription = true; - } - } - - // Extract subscription amount - if (isSubscription) { - const extracted = this.extractAmount(body); - amount = extracted.amount; - currency = extracted.currency; - frequency = this.detectFrequency(body); - - // Try to extract service name from subject if not known - if (!serviceName) { - serviceName = this.extractServiceName(subject, body); - } - - // Fallback to sender name or domain - if (!serviceName || serviceName.length < 3) { - // Use sender name if available - if (email.senderName && email.senderName.length > 2) { - serviceName = email.senderName; - } else { - // Use formatted domain as fallback - serviceName = formatDomainAsName(domain); - } - } - } - - return { - isSubscription, - serviceName, - category: category || 'other', - amount, - currency, - frequency, - }; - } - - /** - * Find known subscription service by domain - */ - private findKnownSubscription(domain: string): { name: string; category: Subscription['category'] } | null { - if (this.knownSubscriptions[domain]) { - return this.knownSubscriptions[domain]; - } - - for (const [subDomain, info] of Object.entries(this.knownSubscriptions)) { - if (isDomainMatch(domain, subDomain)) { - return info; - } - } - - return null; - } - - /** - * Extract subscription amount from text - * Only trusts a currency value that sits within a billing-context window. - */ - private extractAmount(text: string): { amount?: number; currency: string } { - // Billing-context keywords that must appear NEAR the price to trust it - const billingContext = /(?:charged?|charge|bill(?:ed|ing)?|renew(?:s|al|ed)?|recurring|payment|per\s+(?:month|year|week)|\/(?:mo|month|yr|year|wk|week))/i; - - const currencyPatterns: { symbol: string; pattern: RegExp }[] = [ - { symbol: 'USD', pattern: /\$\s*([\d,]+\.\d{2})/g }, - { symbol: 'EUR', pattern: /€\s*([\d,]+[.,]\d{2})/g }, - { symbol: 'GBP', pattern: /£\s*([\d,]+\.\d{2})/g }, - { symbol: 'JPY', pattern: /¥\s*([\d,]+)/g }, - ]; - - for (const { symbol, pattern } of currencyPatterns) { - for (const match of text.matchAll(pattern)) { - const idx = match.index ?? 0; - // Window of +/- 40 chars around the matched price - const window = text.slice(Math.max(0, idx - 40), idx + match[0].length + 40); - if (!billingContext.test(window)) continue; - - const raw = match[1].replace(/,/g, ''); - const amount = parseFloat(raw); - if (!isNaN(amount) && amount > 0) { - return { amount, currency: symbol }; - } - } - } - - return { currency: 'USD' }; - } - - /** - * Detect billing frequency from text - * Returns a frequency only when a billing/charge verb or per-X phrase anchors it. - * Returns undefined when there is no billing signal. - */ - private detectFrequency(text: string): Subscription['frequency'] | undefined { - // Frequency is only trusted when tied to a billing/charge verb or a per-X phrase. - const yearly = /(?:bill(?:ed)?|charged?|renew(?:s|al|ed)?|recurring)[^.]*?(?:yearly|annual(?:ly)?|per\s+year|\/(?:yr|year))|(?:per\s+year|\/(?:yr|year))/i; - const weekly = /(?:bill(?:ed)?|charged?|renew(?:s|al|ed)?|recurring)[^.]*?(?:weekly|per\s+week|\/(?:wk|week))|(?:per\s+week|\/(?:wk|week))/i; - const monthly = /(?:bill(?:ed)?|charged?|renew(?:s|al|ed)?|recurring)[^.]*?(?:monthly|per\s+month|\/(?:mo|month)|each\s+month)|(?:per\s+month|\/(?:mo|month)|each\s+month)/i; - - if (yearly.test(text)) return 'yearly'; - if (weekly.test(text)) return 'weekly'; - if (monthly.test(text)) return 'monthly'; - return undefined; // no billing signal -> unknown - } - - /** - * Extract service name from email content - */ - private extractServiceName(subject: string, body: string): string | undefined { - // Try to extract from subject - const subjectPatterns = [ - /(?:your\s+)?([A-Z][a-zA-Z0-9]+(?:\s+[A-Z][a-zA-Z0-9]+)?)\s+subscription/i, - /subscription\s+(?:to|for)\s+([A-Z][a-zA-Z0-9]+(?:\s+[A-Z][a-zA-Z0-9]+)?)/i, - /welcome\s+to\s+([A-Z][a-zA-Z0-9]+(?:\s+[A-Z][a-zA-Z0-9]+)?)/i, - /([A-Z][a-zA-Z0-9]+(?:\s+[A-Z][a-zA-Z0-9]+)?)\s+(?:membership|premium|pro|plus)/i, - ]; - - for (const pattern of subjectPatterns) { - const match = subject.match(pattern); - if (match && match[1]) { - const name = match[1].trim(); - // Validate it's a reasonable service name - if (name.length >= 2 && name.length <= 30 && this.isValidServiceName(name)) { - return name; - } - } - } - - // Try to extract from body - const bodyPatterns = [ - /(?:subscribing to|subscription to)\s+([A-Z][a-zA-Z0-9]+(?:\s+[A-Z][a-zA-Z0-9]+)?)/i, - /thank you for (?:joining|subscribing to)\s+([A-Z][a-zA-Z0-9]+(?:\s+[A-Z][a-zA-Z0-9]+)?)/i, - ]; - - for (const pattern of bodyPatterns) { - const match = body.match(pattern); - if (match && match[1]) { - const name = match[1].trim(); - if (name.length >= 2 && name.length <= 30 && this.isValidServiceName(name)) { - return name; - } - } - } - - return undefined; - } - - /** - * Check if a string is a valid service name (not generic words) - */ - private isValidServiceName(name: string): boolean { - const invalidWords = [ - 'your', 'the', 'this', 'that', 'our', 'monthly', 'annual', 'yearly', - 'weekly', 'subscription', 'membership', 'billing', 'payment', 'account', - 'email', 'newsletter', 'update', 'notification', 'com', 'org', 'net', - 'edu', 'gov', 'mail', 'info', 'noreply', 'reply' - ]; - return !invalidWords.includes(name.toLowerCase()); - } - - /** - * Get all known subscription services - */ - getKnownServices(): { domain: string; name: string; category: Subscription['category'] }[] { - return Object.entries(this.knownSubscriptions).map(([domain, info]) => ({ - domain, - ...info, - })); - } -} - -export const subscriptionDetector = new SubscriptionDetector(); - diff --git a/web/src/services/threadingService.ts b/web/src/services/threadingService.ts index fc58209..4f72bd2 100644 --- a/web/src/services/threadingService.ts +++ b/web/src/services/threadingService.ts @@ -34,7 +34,7 @@ class ThreadingService { for (const [threadKey, threadEmails] of threadMap) { // Sort emails by date (oldest first for conversation view) const sortedEmails = [...threadEmails].sort( - (a, b) => new Date(a.date).getTime() - new Date(b.date).getTime() + (a, b) => (a.date?.getTime() ?? Infinity) - (b.date?.getTime() ?? Infinity) ); const thread = this.createThread(threadKey, sortedEmails); @@ -81,7 +81,16 @@ class ThreadingService { */ private createThread(threadKey: string, emails: Email[]): EmailThread { const firstEmail = emails[0]; - const lastEmail = emails[emails.length - 1]; + + // Thread message dates derive only from emails that actually have a date. + // Undated emails are ignored for the thread's first/last timestamps; if no + // email in the thread has a date, fall back to the epoch (sorts last/oldest). + const validDates = emails + .map(e => e.date) + .filter((d): d is Date => d != null) + .map(d => d.getTime()); + const firstMessageDate = validDates.length ? new Date(Math.min(...validDates)) : new Date(0); + const lastMessageDate = validDates.length ? new Date(Math.max(...validDates)) : new Date(0); // Collect all unique participants const participants = new Set(); @@ -111,8 +120,8 @@ class ThreadingService { subject: firstEmail.subject, emails, participants: Array.from(participants), - firstMessageDate: new Date(firstEmail.date), - lastMessageDate: new Date(lastEmail.date), + firstMessageDate, + lastMessageDate, messageCount: emails.length, unreadCount, hasAttachments, @@ -144,7 +153,7 @@ class ThreadingService { } const sortedEmails = [...threadEmails].sort( - (a, b) => new Date(a.date).getTime() - new Date(b.date).getTime() + (a, b) => (a.date?.getTime() ?? Infinity) - (b.date?.getTime() ?? Infinity) ); return this.createThread(threadKey, sortedEmails); diff --git a/web/src/types/index.ts b/web/src/types/index.ts index 7abf144..4699368 100644 --- a/web/src/types/index.ts +++ b/web/src/types/index.ts @@ -6,7 +6,7 @@ export interface Email { senderName?: string; recipients: string[]; cc?: string[]; - date: Date; + date: Date | null; body: string; htmlBody?: string; attachments: Attachment[]; @@ -38,10 +38,10 @@ export interface Account { id?: number; serviceName: string; signupEmailId?: number; - signupDate: Date; + signupDate: Date | null; serviceType: 'streaming' | 'ecommerce' | 'social' | 'banking' | 'communication' | 'development' | 'other'; domain: string; - lastActivityDate?: Date; + lastActivityDate?: Date | null; emailCount: number; } @@ -68,7 +68,7 @@ export interface Contact { notes?: string; tags?: string[]; emailCount: number; - lastEmailDate: Date; + lastEmailDate: Date | null; } // Calendar event types @@ -167,7 +167,7 @@ export interface Subscription { monthlyAmount: number; currency: string; frequency: 'weekly' | 'monthly' | 'yearly'; - lastRenewalDate: Date; + lastRenewalDate: Date | null; nextRenewalDate?: Date; emailIds: number[]; isActive: boolean; @@ -180,7 +180,7 @@ export interface Newsletter { senderEmail: string; senderName: string; emailCount: number; - lastEmailDate: Date; + lastEmailDate: Date | null; frequency?: 'daily' | 'weekly' | 'monthly' | 'irregular'; unsubscribeLink?: string; isPromotional: boolean; diff --git a/web/src/utils/emailUtils.ts b/web/src/utils/emailUtils.ts index bb7621d..7a3b559 100644 --- a/web/src/utils/emailUtils.ts +++ b/web/src/utils/emailUtils.ts @@ -51,27 +51,6 @@ export function extractDomain(email: string): string { return cleaned.substring(atIndex + 1).toLowerCase(); } -/** - * Format a date for display - */ -export function formatDate(date: Date): string { - const now = new Date(); - const diff = now.getTime() - date.getTime(); - const days = Math.floor(diff / (1000 * 60 * 60 * 24)); - - if (days === 0) { - return date.toLocaleTimeString([], { hour: '2-digit', minute: '2-digit' }); - } else if (days === 1) { - return 'Yesterday'; - } else if (days < 7) { - return date.toLocaleDateString([], { weekday: 'long' }); - } else if (date.getFullYear() === now.getFullYear()) { - return date.toLocaleDateString([], { month: 'short', day: 'numeric' }); - } else { - return date.toLocaleDateString([], { year: 'numeric', month: 'short', day: 'numeric' }); - } -} - /** * Format file size for display */ @@ -128,85 +107,3 @@ export function getInitials(name: string): string { return (parts[0][0] + parts[parts.length > 1 ? 1 : 0][0]).toUpperCase(); } - -/** - * Get a consistent color for a string (for avatars, etc.) - */ -export function getColorForString(str: string): string { - const colors = [ - 'bg-red-500', - 'bg-orange-500', - 'bg-amber-500', - 'bg-yellow-500', - 'bg-lime-500', - 'bg-green-500', - 'bg-emerald-500', - 'bg-teal-500', - 'bg-cyan-500', - 'bg-sky-500', - 'bg-blue-500', - 'bg-indigo-500', - 'bg-violet-500', - 'bg-purple-500', - 'bg-fuchsia-500', - 'bg-pink-500', - 'bg-rose-500', - ]; - - let hash = 0; - for (let i = 0; i < str.length; i++) { - hash = str.charCodeAt(i) + ((hash << 5) - hash); - } - - return colors[Math.abs(hash) % colors.length]; -} - -/** - * Format a domain as a readable service/sender name - */ -export function formatDomainAsName(domain: string): string { - if (!domain) return ''; - - // Remove common prefixes (subdomains used for email) - let name = domain - .replace(/^(mail|email|noreply|no-reply|billing|notifications?|support|info|newsletter|news|updates?|marketing|promo|alerts?|digest|reply|bounce|mailer|sender|e\.)\./i, ''); - - // Extract main domain part - const parts = name.split('.'); - - // Handle TLDs - get the main domain name - if (parts.length >= 2) { - // Check for country-code second-level domains (e.g., co.uk, com.au) - const lastTwo = parts.slice(-2).join('.'); - const countrySecondLevel = ['co.uk', 'co.au', 'com.au', 'org.uk', 'co.nz', 'com.br']; - - if (countrySecondLevel.includes(lastTwo.toLowerCase()) && parts.length >= 3) { - name = parts[parts.length - 3]; - } else { - name = parts[parts.length - 2]; - } - } else { - name = parts[0]; - } - - // Skip if result is too generic - const genericNames = ['mail', 'email', 'noreply', 'info', 'support', 'contact', 'hello', 'team']; - if (genericNames.includes(name.toLowerCase())) { - // Try to get the domain root - if (parts.length >= 2) { - name = parts[0]; - } - } - - // Handle common compound domains - name = name - .replace(/[_-]/g, ' ') // Convert separators to spaces - .replace(/([a-z])([A-Z])/g, '$1 $2'); // Split camelCase - - // Capitalize each word - return name - .split(/\s+/) - .filter(word => word.length > 0) - .map(word => word.charAt(0).toUpperCase() + word.slice(1).toLowerCase()) - .join(' ') || domain; -} diff --git a/web/src/workers/__tests__/toAppEmail.test.ts b/web/src/workers/__tests__/toAppEmail.test.ts new file mode 100644 index 0000000..36a1db6 --- /dev/null +++ b/web/src/workers/__tests__/toAppEmail.test.ts @@ -0,0 +1,21 @@ +import { describe, it, expect } from 'vitest'; +import type { Email as LibEmail } from '@technical-1/email-archive-parser'; +import { toAppEmail } from '../toAppEmail'; + +const lib: LibEmail = { + subject: 'Hello', sender: 'a@example.com', senderName: 'A', + recipients: ['me@example.com'], date: null, + body: 'plain body', htmlBody: '

rich body

', + attachments: [], size: 123, isRead: true, isStarred: false, folderId: 'inbox', +}; + +describe('toAppEmail', () => { + it('adds app-only fields and preserves a null date', () => { + const e = toAppEmail(lib); + expect(e.emailType).toBe('regular'); + expect(e.date).toBeNull(); + expect(typeof e.snippet).toBe('string'); + expect(e.snippet!.length).toBeGreaterThan(0); + expect('id' in e).toBe(false); + }); +}); diff --git a/web/src/workers/parserWorker.ts b/web/src/workers/parserWorker.ts index acbc259..ec12b88 100644 --- a/web/src/workers/parserWorker.ts +++ b/web/src/workers/parserWorker.ts @@ -6,83 +6,19 @@ */ import JSZip from 'jszip'; +import { MBOXParser, OLMParser } from '@technical-1/email-archive-parser'; import type { Email, Contact, CalendarEvent } from '../types'; +import { toAppEmail } from './toAppEmail'; import type { WorkerInputMessage, WorkerOutputMessage, WorkerParseContext } from './parserWorker.types'; import { - decodeQuotedPrintable, - decodeRfc2047, - isMboxFromLine, - makeSnippet, - MAX_SUBJECT_LEN, - MAX_BODY_LEN, - MAX_EMAIL_LEN, MAX_COMPRESSED_BYTES, MAX_DECOMPRESSED_BYTES, } from '../services/mimeUtils'; -// ============================================================================ -// Worker-compatible utility functions (no DOM dependencies) -// ============================================================================ - -// Keep in sync with cleanEmailAddress in ../utils/emailUtils.ts -function cleanEmailAddress(email: string): string { - if (!email) return 'unknown@example.com'; - const cleaned = email.replace(/[<>]/g, '').trim(); - const match = cleaned.match(/([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/); - if (match) return match[1].replace(/[.,;:!?]+$/, '').toLowerCase(); - const bareMatch = cleaned.match(/([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+)/); - if (bareMatch) return bareMatch[1].replace(/[.,;:!?]+$/, '').toLowerCase(); - // Never leak display-name text; use the sentinel downstream code checks for. - return 'unknown@example.com'; -} - -function normalizeSubject(subject: string): string { - if (!subject) return ''; - let normalized = subject; - const prefixPattern = /^(re|fwd|fw|aw|sv|vs|antw|r):\s*/i; - while (prefixPattern.test(normalized)) { - normalized = normalized.replace(prefixPattern, ''); - } - return normalized.trim().toLowerCase().replace(/\s+/g, ' '); -} - -function decodeBase64(str: string): string { - try { - const binaryStr = atob(str); - const bytes = new Uint8Array(binaryStr.length); - for (let i = 0; i < binaryStr.length; i++) { - bytes[i] = binaryStr.charCodeAt(i); - } - return new TextDecoder('utf-8').decode(bytes); - } catch { - try { - return atob(str); - } catch { - return str; - } - } -} - -function stripHtml(html: string): string { - // Worker-compatible HTML stripping (no DOMParser) - return html - .replace(/]*>[\s\S]*?<\/style>/gi, '') - .replace(/]*>[\s\S]*?<\/script>/gi, '') - .replace(/<[^>]+>/g, ' ') - .replace(/ /g, ' ') - .replace(/&/g, '&') - .replace(/</g, '<') - .replace(/>/g, '>') - .replace(/"/g, '"') - .replace(/'/g, "'") - .replace(/\s+/g, ' ') - .trim(); -} - // ============================================================================ // Worker context and messaging // ============================================================================ @@ -152,431 +88,35 @@ function sendError(message: string, stage?: string) { // MBOX Parser (Worker version) // ============================================================================ -const MBOX_CHUNK_SIZE = 5 * 1024 * 1024; // 5MB chunks const BATCH_SIZE = 100; -function findLastFromLine(text: string): number { - let lastIndex = -1; - let searchStart = text.length - 1; - - while (searchStart > 0) { - let idx = text.lastIndexOf('\r\nFrom ', searchStart); - let offset = 2; - - if (idx === -1) { - idx = text.lastIndexOf('\nFrom ', searchStart); - offset = 1; - } - - if (idx === -1) break; - - const lineStart = idx + offset; - let lineEnd = text.indexOf('\n', lineStart); - if (lineEnd === -1) lineEnd = text.length; - let line = text.substring(lineStart, lineEnd); - if (line.endsWith('\r')) line = line.slice(0, -1); - - if (isMboxFromLine(line)) { - lastIndex = lineStart; - break; - } - searchStart = idx - 1; - } - - if (lastIndex === -1 && text.startsWith('From ')) { - let lineEnd = text.indexOf('\n'); - if (lineEnd === -1) lineEnd = text.length; - let line = text.substring(0, lineEnd); - if (line.endsWith('\r')) line = line.slice(0, -1); - if (isMboxFromLine(line)) { - lastIndex = 0; - } - } - - return lastIndex; -} - -function parseEmailAddress(str: string): { email: string; name?: string } { - const trimmed = decodeRfc2047(str.trim()); - - const angleMatch = trimmed.match(/^(?:"?(.+?)"?\s*)?<([^>]+)>$/); - if (angleMatch) { - return { - name: angleMatch[1]?.trim() || undefined, - email: angleMatch[2]?.trim(), - }; - } - - const emailMatch = trimmed.match(/^([^\s@]+@[^\s@]+\.[^\s@]+)$/); - if (emailMatch) { - return { email: emailMatch[1] }; - } - - return { email: trimmed }; -} - -function parseRecipients(str: string): string[] { - if (!str) return []; - return str - .split(/[,;]/) - .map((r) => { - const { email } = parseEmailAddress(r.trim()); - return cleanEmailAddress(email); - }) - .filter(Boolean); -} - -function parseGmailLabels(labelsHeader: string): string[] { - if (!labelsHeader) return []; - - const labels: string[] = []; - let current = ''; - let inQuotes = false; - - for (const char of labelsHeader) { - if (char === '"') { - inQuotes = !inQuotes; - } else if (char === ',' && !inQuotes) { - if (current.trim()) { - labels.push(current.trim().toLowerCase()); - } - current = ''; - } else { - current += char; - } - } - - if (current.trim()) { - labels.push(current.trim().toLowerCase()); - } - - return labels; -} - -function mapGmailLabelsToFolder(labels: string): string { - const labelList = parseGmailLabels(labels); - - if (labelList.includes('inbox')) return 'inbox'; - if (labelList.includes('sent') || labelList.includes('sent mail')) return 'sent'; - if (labelList.includes('draft') || labelList.includes('drafts')) return 'drafts'; - if (labelList.includes('spam')) return 'spam'; - if (labelList.includes('trash')) return 'trash'; - - const customLabels = labelList.filter(l => - !l.startsWith('category ') && - !['opened', 'unread', 'starred', 'important', 'all mail'].includes(l) - ); - - if (customLabels.length > 0) { - return customLabels[0] - .toLowerCase() - .replace(/[^a-z0-9\s-]/g, '') - .replace(/\s+/g, '-') - .substring(0, 50); - } - - return 'archive'; -} - -function parseMimeParts(body: string, boundary: string): { text?: string; html?: string } { - const result: { text?: string; html?: string } = {}; - - const boundaryMarker = '--' + boundary; - const parts = body.split(boundaryMarker); - - for (const part of parts) { - if (!part.trim() || part.trim() === '--') continue; - - const headerEndIndex = part.indexOf('\n\n'); - if (headerEndIndex === -1) continue; - - const partHeaders = part.substring(0, headerEndIndex); - let partContent = part.substring(headerEndIndex + 2); - - const contentTypeMatch = partHeaders.match(/content-type:\s*([^;\n]+)/i); - const encodingMatch = partHeaders.match(/content-transfer-encoding:\s*(\S+)/i); - - if (!contentTypeMatch) continue; - - const partContentType = contentTypeMatch[1].toLowerCase().trim(); - const partEncoding = encodingMatch?.[1]?.toLowerCase() || '7bit'; - const partCharset = partHeaders.match(/charset=["']?([^"';\s]+)["']?/i)?.[1]; - - if (partContentType.includes('multipart/')) { - const nestedBoundaryMatch = partHeaders.match(/boundary=["']?([^"';\s\n]+)["']?/i); - if (nestedBoundaryMatch) { - const nestedResult = parseMimeParts(partContent, nestedBoundaryMatch[1]); - if (nestedResult.text && !result.text) result.text = nestedResult.text; - if (nestedResult.html && !result.html) result.html = nestedResult.html; - } - continue; - } - - partContent = partContent.trim(); - if (partEncoding === 'base64') { - try { - const cleaned = partContent.replace(/\s/g, ''); - partContent = decodeBase64(cleaned); - } catch { - // Keep original if decode fails - } - } else if (partEncoding === 'quoted-printable') { - partContent = decodeQuotedPrintable(partContent, partCharset); - } - - if (partContentType.includes('text/plain') && !result.text) { - result.text = partContent; - } else if (partContentType.includes('text/html') && !result.html) { - result.html = partContent; - } - } - - return result; -} - -function parseEmailFromLines(lines: string[]): Omit | null { - try { - if (lines.length < 2) return null; - - const headers: Record = {}; - let bodyStartIndex = 0; - let inHeaders = true; - - for (let i = 1; i < lines.length; i++) { - const line = lines[i]; - - if (line.trim() === '') { - bodyStartIndex = i + 1; - inHeaders = false; - break; - } - - if (inHeaders) { - if (line.match(/^\s+/) && Object.keys(headers).length > 0) { - const lastKey = Object.keys(headers).pop()!; - headers[lastKey] += ' ' + line.trim(); - } else { - const match = line.match(/^([^:]+):\s*(.*)$/); - if (match) { - const key = match[1].toLowerCase(); - headers[key] = match[2]; - } - } - } - } - - if (inHeaders) { - bodyStartIndex = lines.length; - } - - const bodyLines = lines.slice(bodyStartIndex); - // mboxrd un-escaping: body lines that were escaped as ">From "/">>From " - // (one extra ">") are restored by stripping a single leading ">". - const rawBody = bodyLines.join('\n').replace(/^>(>*From )/gm, '$1'); - - const contentType = headers['content-type'] || 'text/plain'; - let body = ''; - let htmlBody: string | undefined; - - if (contentType.includes('multipart/')) { - const boundaryMatch = contentType.match(/boundary=["']?([^"';\s]+)["']?/i); - if (boundaryMatch) { - const boundary = boundaryMatch[1]; - const parts = parseMimeParts(rawBody, boundary); - body = parts.text || ''; - htmlBody = parts.html; - } else { - body = rawBody; - } - } else { - body = rawBody; - const encoding = headers['content-transfer-encoding']?.toLowerCase(); - if (encoding === 'quoted-printable') { - const bodyCharset = contentType.match(/charset=["']?([^"';\s]+)["']?/i)?.[1]; - body = decodeQuotedPrintable(body, bodyCharset); - } else if (encoding === 'base64') { - try { - body = atob(body.replace(/\s/g, '')); - } catch { - // Keep original if decode fails - } - } - - if (contentType.includes('text/html')) { - htmlBody = body; - } - } - - const dateStr = headers['date'] || ''; - let date: Date | null = null; - if (dateStr) { - try { - const parsed = new Date(dateStr); - date = isNaN(parsed.getTime()) ? null : parsed; - } catch { - date = null; - } - } - - const from = headers['from'] || ''; - const { email: sender, name: senderName } = parseEmailAddress(from); - - const to = headers['to'] || ''; - const recipients = parseRecipients(to); - - const subject = decodeRfc2047(headers['subject'] || '(No Subject)'); - - let threadId = headers['x-gm-thrid'] || - headers['thread-topic'] || - headers['references']?.split(/\s+/)[0] || - headers['in-reply-to']; - - if (!threadId) { - const normalizedSubj = normalizeSubject(subject); - if (normalizedSubj) { - threadId = `subject:${normalizedSubj.toLowerCase().replace(/\s+/g, '-')}`; - } - } - - const gmailLabels = headers['x-gmail-labels'] || ''; - const folderId = mapGmailLabelsToFolder(gmailLabels); - const isRead = !gmailLabels.toLowerCase().includes('unread'); - const isStarred = gmailLabels.toLowerCase().includes('starred'); - - if (!sender && !subject) { - return null; - } - - return { - subject: subject.length > MAX_SUBJECT_LEN ? subject.slice(0, MAX_SUBJECT_LEN) : subject, - sender: cleanEmailAddress(sender).slice(0, MAX_EMAIL_LEN), - senderName: senderName || undefined, - recipients: recipients.map(r => r.slice(0, MAX_EMAIL_LEN)).slice(0, 1000), - date: date || new Date(), - body: (() => { - const b = body.trim() || (htmlBody ? stripHtml(htmlBody) : ''); - return b.length > MAX_BODY_LEN ? b.slice(0, MAX_BODY_LEN) : b; - })(), - htmlBody: htmlBody && htmlBody.length > MAX_BODY_LEN ? htmlBody.slice(0, MAX_BODY_LEN) : htmlBody, - snippet: makeSnippet(htmlBody || body), - attachments: [], - size: Math.min(lines.join('\n').length, 100000), - isRead, - isStarred, - folderId, - threadId, - emailType: 'regular', - }; - } catch (error) { - console.warn('Failed to parse email:', error); - return null; - } -} - -function parseEmailsFromText(text: string): Omit[] { - const emails: Omit[] = []; - const normalizedText = text.replace(/\r\n/g, '\n').replace(/\r/g, '\n'); - const lines = normalizedText.split('\n'); - let currentEmail: string[] = []; - - for (const line of lines) { - if (isMboxFromLine(line) && currentEmail.length > 0) { - const email = parseEmailFromLines(currentEmail); - if (email) { - emails.push(email); - } - currentEmail = []; - } - currentEmail.push(line); - } - - if (currentEmail.length > 0 && currentEmail.some(line => line.trim().length > 0)) { - const email = parseEmailFromLines(currentEmail); - if (email) { - emails.push(email); - } - } - - return emails; -} - async function parseMBOXFile(file: File): Promise { - const fileSize = file.size; - let offset = 0; - let leftover = ''; - let batchNumber = 0; - let currentBatch: Omit[] = []; - - reportProgress('extracting', 0, `Processing ${(fileSize / 1024 / 1024).toFixed(1)}MB file...`); - - while (offset < fileSize && !ctx.isCancelled) { - const chunkEnd = Math.min(offset + MBOX_CHUNK_SIZE, fileSize); - const chunk = file.slice(offset, chunkEnd); - - let chunkText: string; - try { - chunkText = await chunk.text(); - } catch (e) { - console.error('Error reading chunk:', e); - break; - } - - const textToProcess = leftover + chunkText; - const lastFromIndex = findLastFromLine(textToProcess); - - let processableText: string; - if (lastFromIndex > 0 && chunkEnd < fileSize) { - processableText = textToProcess.substring(0, lastFromIndex); - leftover = textToProcess.substring(lastFromIndex); - } else { - processableText = textToProcess; - leftover = ''; - } - - const chunkEmails = parseEmailsFromText(processableText); - - for (const email of chunkEmails) { - currentBatch.push(email); - - if (currentBatch.length >= BATCH_SIZE) { - sendEmailBatch(currentBatch, batchNumber, false); - ctx.totalEmailsParsed += currentBatch.length; - batchNumber++; - currentBatch = []; - - // Yield to allow message processing - await new Promise(resolve => setTimeout(resolve, 0)); - } - } - - offset = chunkEnd; - const progress = Math.round((offset / fileSize) * 95); - reportProgress( - 'parsing_emails', - progress, - `Parsed ${ctx.totalEmailsParsed + currentBatch.length} emails (${Math.round(offset / fileSize * 100)}% read)...` - ); - - await new Promise(resolve => setTimeout(resolve, 0)); - } - - // Process remaining text - if (leftover.trim() && !ctx.isCancelled) { - const finalEmails = parseEmailsFromText(leftover); - for (const email of finalEmails) { - currentBatch.push(email); + const parser = new MBOXParser(); + let lastBatch = 0; + // NOTE: the library's parseStreaming has no cancellation hook, so once started + // it reads the whole file. Honoring ctx.isCancelled here stops us EMITTING + // batches/progress, but the library keeps parsing in the background until done + // (it yields to the event loop between chunks, so the worker stays responsive). + // A future lib enhancement could accept an AbortSignal to abort mid-file. + await parser.parseStreaming( + file, + (p) => { + if (ctx.isCancelled) return; + // Map the library's 'complete' stage onto the worker's 'saving' stage. + const stage = p.stage === 'complete' ? 'saving' : p.stage; + reportProgress(stage as 'extracting' | 'parsing_emails' | 'saving', p.progress, p.message); + }, + async (batch, n) => { + if (ctx.isCancelled) return; + const mapped = batch.map(toAppEmail); + sendEmailBatch(mapped, n, false); + ctx.totalEmailsParsed += mapped.length; + lastBatch = n; } - } - - // Send final batch - if (currentBatch.length > 0) { - sendEmailBatch(currentBatch, batchNumber, true); - ctx.totalEmailsParsed += currentBatch.length; - } else { - // Send empty final batch to signal completion - sendEmailBatch([], batchNumber, true); - } - + ); + if (ctx.isCancelled) return; + // parseStreaming flushed its final batch above; signal end-of-stream. + sendEmailBatch([], lastBatch + 1, true); reportProgress('saving', 100, `Parsed ${ctx.totalEmailsParsed} emails successfully`); } @@ -584,357 +124,46 @@ async function parseMBOXFile(file: File): Promise { // OLM Parser (Worker version) // ============================================================================ -function parseOLMEmailXML(xmlContent: string): Omit | null { - try { - // Simple XML parsing without DOMParser (not available in workers in some browsers) - // We'll use regex-based parsing for reliability - - const getTagContent = (content: string, tagName: string): string => { - const regex = new RegExp(`<${tagName}[^>]*>([\\s\\S]*?)`, 'i'); - const match = content.match(regex); - return match ? match[1].trim() : ''; - }; - - const getAttribute = (content: string, tagName: string, attrName: string): string => { - const tagRegex = new RegExp(`<${tagName}[^>]*${attrName}="([^"]*)"[^>]*>`, 'i'); - const match = content.match(tagRegex); - return match ? match[1] : ''; - }; - - const subject = getTagContent(xmlContent, 'OPFMessageCopySubject') || - getTagContent(xmlContent, 'subject') || - '(No Subject)'; - const body = getTagContent(xmlContent, 'OPFMessageCopyBody') || - getTagContent(xmlContent, 'body') || ''; - const htmlBody = getTagContent(xmlContent, 'OPFMessageCopyHTMLBody') || - getTagContent(xmlContent, 'htmlBody') || undefined; - const preview = getTagContent(xmlContent, 'OPFMessageCopyPreview'); - - // Parse sender - const fromAddresses = getTagContent(xmlContent, 'OPFMessageCopyFromAddresses'); - let sender = ''; - let senderName = ''; - if (fromAddresses) { - sender = getAttribute(fromAddresses, 'emailAddress', 'OPFContactEmailAddressAddress'); - senderName = getAttribute(fromAddresses, 'emailAddress', 'OPFContactEmailAddressName'); - } - if (!sender) { - sender = getTagContent(xmlContent, 'from') || getTagContent(xmlContent, 'sender') || ''; - } - - // Parse date - const dateStr = getTagContent(xmlContent, 'OPFMessageCopySentTime') || - getTagContent(xmlContent, 'OPFMessageCopyReceivedTime') || - getTagContent(xmlContent, 'date') || ''; - const date = dateStr ? new Date(dateStr) : new Date(); - - // Parse recipients - const recipients: string[] = []; - const toAddresses = getTagContent(xmlContent, 'OPFMessageCopyToAddresses'); - if (toAddresses) { - const emailMatches = toAddresses.matchAll(/OPFContactEmailAddressAddress="([^"]+)"/g); - for (const match of emailMatches) { - recipients.push(match[1]); - } - } - - // Parse isRead status - const isReadStr = getTagContent(xmlContent, 'OPFMessageGetIsRead'); - const isRead = isReadStr === '1' || isReadStr.toLowerCase() === 'true'; - - // Parse thread ID - let threadId = getTagContent(xmlContent, 'OPFMessageCopyThreadTopic') || - getTagContent(xmlContent, 'OPFMessageCopyConversationID') || ''; - - if (!threadId) { - const normalizedSubject = normalizeSubject(subject); - if (normalizedSubject) { - threadId = `subject:${normalizedSubject.toLowerCase().replace(/\s+/g, '-')}`; - } - } - - if (!subject && !body && !preview) { - return null; - } - - return { - subject: (subject || '(No Subject)').slice(0, MAX_SUBJECT_LEN), - sender: cleanEmailAddress(sender).slice(0, MAX_EMAIL_LEN), - senderName: senderName || undefined, - recipients: recipients.map(r => r.slice(0, MAX_EMAIL_LEN)).slice(0, 1000), - date: isNaN(date.getTime()) ? new Date() : date, - body: (() => { const b = body || preview || ''; return b.length > MAX_BODY_LEN ? b.slice(0, MAX_BODY_LEN) : b; })(), - htmlBody: htmlBody && htmlBody.length > MAX_BODY_LEN ? htmlBody.slice(0, MAX_BODY_LEN) : (htmlBody || undefined), - snippet: makeSnippet(htmlBody || body || preview || ''), - attachments: [], - size: xmlContent.length, - isRead, - isStarred: false, - folderId: 'inbox', - threadId: threadId || undefined, - emailType: 'regular', - }; - } catch (error) { - console.warn('Failed to parse OLM email XML:', error); - return null; - } -} - -function parseOLMContactsXML(xmlContent: string): Omit[] { - const contacts: Omit[] = []; - - try { - const getTagContent = (content: string, tagName: string): string => { - const regex = new RegExp(`<${tagName}[^>]*>([\\s\\S]*?)`, 'i'); - const match = content.match(regex); - return match ? match[1].trim() : ''; - }; - - const getAttribute = (content: string, tagName: string, attrName: string): string => { - const tagRegex = new RegExp(`<${tagName}[^>]*${attrName}="([^"]*)"[^>]*>`, 'i'); - const match = content.match(tagRegex); - return match ? match[1] : ''; - }; - - // Find all contact elements - const contactMatches = xmlContent.matchAll(/]*>([\s\S]*?)<\/contact>/gi); - - for (const contactMatch of contactMatches) { - const contactContent = contactMatch[1]; - - const displayName = getTagContent(contactContent, 'OPFContactCopyDisplayName') || - getTagContent(contactContent, 'displayName') || ''; - const firstName = getTagContent(contactContent, 'OPFContactCopyFirstName') || ''; - const lastName = getTagContent(contactContent, 'OPFContactCopyLastName') || ''; - const phone = getTagContent(contactContent, 'OPFContactCopyPhoneNumbers') || ''; - - // Get email - let email = ''; - const emailList = getTagContent(contactContent, 'OPFContactCopyEmailAddressList') || - getTagContent(contactContent, 'OPFContactCopyDefaultEmailAddress'); - if (emailList) { - email = getAttribute(emailList, 'contactEmailAddress', 'OPFContactEmailAddressAddress'); - } - if (!email) { - email = getTagContent(contactContent, 'email') || ''; - } - - const name = displayName || `${firstName} ${lastName}`.trim() || email.split('@')[0] || 'Unknown'; - - if (email || name !== 'Unknown') { - contacts.push({ - name, - email: cleanEmailAddress(email), - phone: phone || undefined, - emailCount: 0, - lastEmailDate: new Date(), - }); - } - } - } catch (error) { - console.warn('Failed to parse OLM contacts XML:', error); - } - - return contacts; -} - -function parseOLMCalendarXML(xmlContent: string): Omit[] { - const events: Omit[] = []; - - try { - const getTagContent = (content: string, tagName: string): string => { - const regex = new RegExp(`<${tagName}[^>]*>([\\s\\S]*?)`, 'i'); - const match = content.match(regex); - return match ? match[1].trim() : ''; - }; - - // Find all appointment elements - const appointmentMatches = xmlContent.matchAll(/]*>([\s\S]*?)<\/appointment>/gi); - - for (const appointmentMatch of appointmentMatches) { - const appointmentContent = appointmentMatch[1]; - - const title = getTagContent(appointmentContent, 'OPFCalendarEventCopySummary') || - getTagContent(appointmentContent, 'OPFCalendarEventCopySubject') || - getTagContent(appointmentContent, 'summary') || ''; - const startDateStr = getTagContent(appointmentContent, 'OPFCalendarEventCopyStartTime') || ''; - const endDateStr = getTagContent(appointmentContent, 'OPFCalendarEventCopyEndTime') || ''; - const location = getTagContent(appointmentContent, 'OPFCalendarEventCopyLocation') || ''; - const description = getTagContent(appointmentContent, 'OPFCalendarEventCopyBody') || ''; - const organizer = getTagContent(appointmentContent, 'OPFCalendarEventCopyOrganizer') || ''; - const isAllDayStr = getTagContent(appointmentContent, 'OPFCalendarEventGetIsAllDayEvent') || ''; - - if (!title) continue; - - const startDate = startDateStr ? new Date(startDateStr) : new Date(); - const endDate = endDateStr ? new Date(endDateStr) : new Date(startDate.getTime() + 3600000); - - events.push({ - title, - startDate: isNaN(startDate.getTime()) ? new Date() : startDate, - endDate: isNaN(endDate.getTime()) ? new Date() : endDate, - location: location || undefined, - attendees: organizer ? [organizer] : [], - description: description || undefined, - isAllDay: isAllDayStr === '1' || isAllDayStr.toLowerCase() === 'true', - reminder: false, - isRead: false, - }); - } - } catch (error) { - console.warn('Failed to parse OLM calendar XML:', error); - } - - return events; -} - async function parseOLMFile(file: File): Promise { - reportProgress('extracting', 0, 'Extracting OLM archive...'); - if (file.size > MAX_COMPRESSED_BYTES) { throw new Error(`File too large (${(file.size / 1024 / 1024).toFixed(0)}MB). Maximum supported size is 500MB.`); } + reportProgress('extracting', 0, 'Extracting OLM archive...'); - const zip = await JSZip.loadAsync(file); - - let totalDecompressedSize = 0; - for (const entry of Object.values(zip.files)) { - if (!entry.dir) { - const entryData = (entry as unknown as { _data?: { uncompressedSize?: number } })._data; - if (entryData && typeof entryData.uncompressedSize === 'number') { - totalDecompressedSize += entryData.uncompressedSize; - } - } - } - if (totalDecompressedSize > MAX_DECOMPRESSED_BYTES) { - throw new Error('Archive decompressed size exceeds 2GB limit. This may be a malicious file.'); - } - - reportProgress('extracting', 100, 'Archive extracted successfully'); - - const files = Object.keys(zip.files); - - const emailFiles = files.filter(f => - f.includes('com.microsoft.__Messages/') && - f.match(/message_\d+\.xml$/) && - !zip.files[f].dir - ); - - const contactFiles = files.filter(f => - (f.includes('Address Book/Contacts.xml') || - (f.includes('/Contacts/') && f.endsWith('.xml'))) && - !zip.files[f].dir - ); - - const calendarFiles = files.filter(f => - f.includes('/Calendar/') && - f.endsWith('Calendar.xml') && - !zip.files[f].dir - ); - - console.log(`Found ${emailFiles.length} emails, ${contactFiles.length} contact files, ${calendarFiles.length} calendar files`); - - // Parse emails - if (emailFiles.length > 0) { - reportProgress('parsing_emails', 0, `Parsing ${emailFiles.length} emails...`); - - let currentBatch: Omit[] = []; - let batchNumber = 0; - - for (let i = 0; i < emailFiles.length && !ctx.isCancelled; i++) { - try { - const content = await zip.files[emailFiles[i]].async('string'); - const email = parseOLMEmailXML(content); - if (email) { - currentBatch.push(email); - - if (currentBatch.length >= BATCH_SIZE) { - sendEmailBatch(currentBatch, batchNumber, false); - ctx.totalEmailsParsed += currentBatch.length; - batchNumber++; - currentBatch = []; - await new Promise(resolve => setTimeout(resolve, 0)); - } - } - } catch (err) { - console.warn(`Failed to parse email ${emailFiles[i]}:`, err); - } - - if (i % 100 === 0 || i === emailFiles.length - 1) { - reportProgress( - 'parsing_emails', - Math.round((i + 1) / emailFiles.length * 100), - `Parsed ${ctx.totalEmailsParsed + currentBatch.length} of ${emailFiles.length} emails` - ); - } - } - - // Send final email batch - if (currentBatch.length > 0) { - sendEmailBatch(currentBatch, batchNumber, true); - ctx.totalEmailsParsed += currentBatch.length; - } else { - sendEmailBatch([], batchNumber, true); - } - } else { - sendEmailBatch([], 0, true); - } - - // Parse contacts - if (contactFiles.length > 0 && !ctx.isCancelled) { - reportProgress('parsing_contacts', 0, 'Parsing contacts...'); - - const allContacts: Omit[] = []; - - for (let i = 0; i < contactFiles.length && !ctx.isCancelled; i++) { - try { - const content = await zip.files[contactFiles[i]].async('string'); - const contacts = parseOLMContactsXML(content); - allContacts.push(...contacts); - } catch (err) { - console.warn(`Failed to parse contacts ${contactFiles[i]}:`, err); - } - } - - if (allContacts.length > 0) { - sendContactBatch(allContacts, 0, true); - ctx.totalContactsParsed = allContacts.length; - } else { - sendContactBatch([], 0, true); - } - - reportProgress('parsing_contacts', 100, `Parsed ${ctx.totalContactsParsed} contacts`); - } else { - sendContactBatch([], 0, true); - } - - // Parse calendar events - if (calendarFiles.length > 0 && !ctx.isCancelled) { - reportProgress('parsing_calendar', 0, 'Parsing calendar files...'); - - const allEvents: Omit[] = []; - - for (let i = 0; i < calendarFiles.length && !ctx.isCancelled; i++) { - try { - const content = await zip.files[calendarFiles[i]].async('string'); - const events = parseOLMCalendarXML(content); - allEvents.push(...events); - } catch (err) { - console.warn(`Failed to parse calendar ${calendarFiles[i]}:`, err); - } - } - - if (allEvents.length > 0) { - sendCalendarBatch(allEvents, 0, true); - ctx.totalCalendarEventsParsed = allEvents.length; - } else { - sendCalendarBatch([], 0, true); - } - - reportProgress('parsing_calendar', 100, `Parsed ${ctx.totalCalendarEventsParsed} calendar events`); - } else { - sendCalendarBatch([], 0, true); - } + const result = await new OLMParser().parse(file, { + onProgress: (p) => { + if (ctx.isCancelled) return; + const stage = p.stage === 'complete' ? 'saving' : p.stage; + reportProgress(stage as 'extracting' | 'parsing_emails' | 'parsing_contacts' | 'parsing_calendar' | 'saving', p.progress, p.message); + }, + }); + if (ctx.isCancelled) return; + + // Stream emails to the main thread in BATCH_SIZE chunks (worker contract). + for (let i = 0; i < result.emails.length; i += BATCH_SIZE) { + if (ctx.isCancelled) return; + const slice = result.emails.slice(i, i + BATCH_SIZE).map(toAppEmail); + const isLast = i + BATCH_SIZE >= result.emails.length; + sendEmailBatch(slice, Math.floor(i / BATCH_SIZE), isLast); + ctx.totalEmailsParsed += slice.length; + await new Promise((r) => setTimeout(r, 0)); + } + if (result.emails.length === 0) sendEmailBatch([], 0, true); + + // Contacts (library extracts Address Book + sender-derived) and calendar events. + const contacts = result.contacts.map((c) => ({ + name: c.name, + email: c.email, + phone: c.phone, + emailCount: c.emailCount, + lastEmailDate: c.lastEmailDate, + })); + sendContactBatch(contacts, 0, true); + ctx.totalContactsParsed = contacts.length; + + const events = result.calendarEvents.map((ev) => ({ ...ev, isRead: false })); + sendCalendarBatch(events, 0, true); + ctx.totalCalendarEventsParsed = events.length; reportProgress('saving', 100, 'Processing complete!'); } @@ -943,111 +172,93 @@ async function parseOLMFile(file: File): Promise { // Gmail Takeout Parser (Worker version) // ============================================================================ +function mapTakeoutFolder(folderName: string): string { + const n = folderName.toLowerCase(); + if (n.includes('inbox')) return 'inbox'; + if (n.includes('sent')) return 'sent'; + if (n.includes('draft')) return 'drafts'; + if (n.includes('trash') || n.includes('deleted')) return 'trash'; + if (n.includes('spam') || n.includes('junk')) return 'spam'; + if (n.includes('archive') || n === 'all mail') return 'archive'; + if (n.includes('starred') || n.includes('important')) return 'starred'; + return `gmail-${n.replace(/\s+/g, '-')}`; +} + async function parseGmailTakeoutFile(file: File): Promise { reportProgress('extracting', 0, 'Opening Gmail Takeout archive...'); - if (file.size > MAX_COMPRESSED_BYTES) { throw new Error(`File too large (${(file.size / 1024 / 1024).toFixed(0)}MB). Maximum supported size is 500MB.`); } - const zip = await JSZip.loadAsync(file); let totalDecompressedSize = 0; for (const entry of Object.values(zip.files)) { if (!entry.dir) { const entryData = (entry as unknown as { _data?: { uncompressedSize?: number } })._data; - if (entryData && typeof entryData.uncompressedSize === 'number') { - totalDecompressedSize += entryData.uncompressedSize; - } + if (entryData && typeof entryData.uncompressedSize === 'number') totalDecompressedSize += entryData.uncompressedSize; } } if (totalDecompressedSize > MAX_DECOMPRESSED_BYTES) { throw new Error('Archive decompressed size exceeds 2GB limit. This may be a malicious file.'); } - const mboxFiles: string[] = []; zip.forEach((path, zipEntry) => { - if (!zipEntry.dir && (path.endsWith('.mbox') || path.includes('Takeout/Mail/'))) { - mboxFiles.push(path); - } + if (!zipEntry.dir && (path.endsWith('.mbox') || path.includes('Takeout/Mail/'))) mboxFiles.push(path); }); - reportProgress('extracting', 10, `Found ${mboxFiles.length} mail folders`); - - if (mboxFiles.length === 0) { - throw new Error('No email archives found in this Takeout file.'); - } + if (mboxFiles.length === 0) throw new Error('No email archives found in this Takeout file.'); let batchNumber = 0; const seenEmailKeys = new Set(); + const parser = new MBOXParser(); - // Process MBOX files sequentially to reduce memory pressure for (let fileIndex = 0; fileIndex < mboxFiles.length && !ctx.isCancelled; fileIndex++) { const mboxPath = mboxFiles[fileIndex]; - try { const zipEntry = zip.file(mboxPath); if (!zipEntry) continue; - const folderName = mboxPath.split('/').pop()?.replace('.mbox', '').replace(/_/g, ' ') || 'Unknown'; - - reportProgress( - 'parsing_emails', - 10 + (fileIndex / mboxFiles.length) * 80, - `Processing ${folderName} (${fileIndex + 1}/${mboxFiles.length})...` - ); + const folderId = mapTakeoutFolder(folderName); + reportProgress('parsing_emails', 10 + (fileIndex / mboxFiles.length) * 80, `Processing ${folderName} (${fileIndex + 1}/${mboxFiles.length})...`); - // Get content and parse (let so we can null it for GC) - let content: string | null = await zipEntry.async('string'); - const emails = parseEmailsFromText(content); + const text = await zipEntry.async('string'); + const mboxFile = new File([text], `${folderName}.mbox`, { type: 'application/mbox' }); - // Deduplicate and batch let currentBatch: Omit[] = []; - - for (const email of emails) { - // Create unique key for deduplication - const key = email.threadId || `${email.subject}|${email.sender}|${email.date.getTime()}`; - - if (!seenEmailKeys.has(key)) { + await parser.parseStreaming(mboxFile, undefined, async (batch) => { + if (ctx.isCancelled) return; + for (const libEmail of batch) { + const e = toAppEmail(libEmail); + const key = e.threadId || `${e.subject}|${e.sender}|${e.date ? e.date.getTime() : 'nodate'}`; + if (seenEmailKeys.has(key)) continue; seenEmailKeys.add(key); - currentBatch.push(email); - + currentBatch.push({ ...e, folderId }); if (currentBatch.length >= BATCH_SIZE) { - sendEmailBatch(currentBatch, batchNumber, false); + sendEmailBatch(currentBatch, batchNumber++, false); ctx.totalEmailsParsed += currentBatch.length; - batchNumber++; currentBatch = []; - await new Promise(resolve => setTimeout(resolve, 0)); + await new Promise((r) => setTimeout(r, 0)); } } - } + }); - // Send remaining emails from this file if (currentBatch.length > 0) { - const isLast = fileIndex === mboxFiles.length - 1; - sendEmailBatch(currentBatch, batchNumber, isLast); + sendEmailBatch(currentBatch, batchNumber++, false); ctx.totalEmailsParsed += currentBatch.length; - batchNumber++; } - - // Help garbage collection - content = null; - } catch (error) { console.warn(`Failed to parse ${mboxPath}:`, error); } } - // If no emails were sent (all files failed), send empty final batch - if (ctx.totalEmailsParsed === 0) { - sendEmailBatch([], 0, true); - } - - // Send empty contact and calendar batches (Gmail Takeout doesn't include these) + if (ctx.isCancelled) return; + // Always emit a final isLast batch so the stream terminates cleanly even if the + // last folder produced no new emails. + sendEmailBatch([], batchNumber, true); + // Gmail Takeout has no contacts/calendar. sendContactBatch([], 0, true); sendCalendarBatch([], 0, true); - reportProgress('saving', 100, `Imported ${ctx.totalEmailsParsed} unique emails`); } diff --git a/web/src/workers/toAppEmail.ts b/web/src/workers/toAppEmail.ts new file mode 100644 index 0000000..79fa4b1 --- /dev/null +++ b/web/src/workers/toAppEmail.ts @@ -0,0 +1,28 @@ +import type { Email as LibEmail } from '@technical-1/email-archive-parser'; +import type { Email } from '../types'; +import { makeSnippet } from '../services/mimeUtils'; + +/** + * Map a library Email (no app-specific fields) to an EmailAnalyzer row shape. + * Adds snippet + emailType; preserves a null date (library v3 semantics). + */ +export function toAppEmail(e: LibEmail): Omit { + return { + subject: e.subject, + sender: e.sender, + senderName: e.senderName, + recipients: e.recipients, + cc: e.cc, + date: e.date, // Date | null + body: e.body, + htmlBody: e.htmlBody, + attachments: [], + size: e.size, + isRead: e.isRead, + isStarred: e.isStarred, + folderId: e.folderId, + threadId: e.threadId, + snippet: makeSnippet(e.htmlBody || e.body || ''), + emailType: 'regular', + }; +}