diff --git a/tests/unsubmitted_forms/README.md b/tests/unsubmitted_forms/README.md new file mode 100644 index 0000000..159345c --- /dev/null +++ b/tests/unsubmitted_forms/README.md @@ -0,0 +1,70 @@ +# Cleanup Unsubmitted Forms - Solution + +## Video Explanation + +[Watch the Loom video](https://www.loom.com/share/3d0413ae4c2845cab8a3e3c93b62f9e3) + +## Issues Found & Fixes + +### 1. Date Calculation Bug +**Problem:** Missing `* 1000` for milliseconds conversion - was only subtracting ~10 minutes instead of 7 days. + +**Fix:** Added `* 1000` to convert seconds to milliseconds. + +### 2. Query Logic Error +**Problem:** Original query only found tokens from exactly 7 days ago (a 24-hour window), missing tokens that were 8, 9, 10+ days old. + +**Fix:** Changed to `lt: cutoffDate` to find all tokens older than 7 days. + +### 3. Missing Submission Status Check +**Problem:** Query fetched all old tokens regardless of whether the form was submitted, risking deletion of valid submitted data. + +**Fix:** Added `submittedAt: null` filter to only fetch unsubmitted tokens. + +### 4. Wrong Relationship Deletion +**Problem:** Relationship query only filtered by `product_id` and `status`, which could accidentally delete relationships belonging to other valid tokens that share the same product. + +**Fix:** Added `entity_id: { in: entityIds }` to the relationship query to ensure we only delete relationships tied to the specific expired entities. + +### 5. Foreign Key Constraint Order +**Problem:** Deletion order could fail if child records reference parent tables. + +**Fix:** Reordered deletions: relationships → corpus items → tokens → entities (children before parents). + +### 6. Null EntityId Handling +**Problem:** Passing null/undefined values to delete queries would cause errors. + +**Fix:** Filter out null values using `filter((id): id is string => Boolean(id))` before batch operations. + +### 7. N+1 Query Problem (Performance) +**Problem:** Loop with individual `findFirst` and `$transaction` per token would cause thousands of DB calls at scale. + +**Fix:** Refactored to batch approach using `deleteMany` with `{ in: [...] }` in a single transaction. + +### 8. Memory Issues with Large Datasets + +**Problem:** Loading all expired tokens at once could crash the process with Out of Memory errors. + +**Fix:** Implemented batching with `take: BATCH_SIZE` (1000 records) in a while loop to process in chunks. + +### 9. Duplicate IDs in Queries + +**Problem:** Multiple tokens could share the same entityId/productId, causing unnecessary parameter bloat in IN clauses. + +**Fix:** Deduplicate IDs using `[...new Set(...)]` before batch operations. + +## Final Solution Features + +- **Configurable constants:** `BATCH_SIZE` and `RETENTION_DAYS` at the top for easy adjustment +- **Batched processing:** Handles millions of records without memory issues +- **Safe relationship deletion:** Only deletes relationships tied to expired entities +- **Atomic transactions:** Each batch is processed in a single transaction +- **Observability:** Logs total counts of deleted records for monitoring + +## Trade-offs Considered + +- **Batching:** Chose 1000 records per batch as a balance between performance and memory. Could be tuned based on server resources. + +- **Extra query for relationships:** Added a `findMany` to get relationship IDs before deletion. This adds one query per batch but ensures we never delete the wrong relationships. + +- **Atomicity per batch:** Each batch is atomic, not the entire job. If the job fails mid-way, some batches will have been committed. This is acceptable for a cleanup job - it can safely be re-run. diff --git a/tests/unsubmitted_forms/cleanup_unsubmitted_forms.ts b/tests/unsubmitted_forms/cleanup_unsubmitted_forms.ts index e7bd3e2..4125723 100644 --- a/tests/unsubmitted_forms/cleanup_unsubmitted_forms.ts +++ b/tests/unsubmitted_forms/cleanup_unsubmitted_forms.ts @@ -7,74 +7,114 @@ This is to prevent the database from being cluttered with unused tokens and entities. */ -/* Task Instructions: - * 1. Read and understand the code below - * 2. Identify ALL issues in the code (there are multiple) - * 3. Fix the issues and create a working solution - * 4. Create a PR with clear commit messages - * 5. Record a 3-5 minute Loom video explaining: - * - What issues you found - * - How you fixed them - * - Any trade-offs you considered - * - * Focus on: correctness, performance, error handling, and code clarity - * Expected time: 45-60 minutes - */ - // For the purpose of this test you can ignore that the imports are not working. import type { JobScheduleQueue } from "@prisma/client"; import { prisma } from "../endpoints/middleware/prisma"; import { update_job_status } from "./generic_scheduler"; +const BATCH_SIZE = 1000; +const RETENTION_DAYS = 7; + export const cleanup_unsubmitted_forms = async (job: JobScheduleQueue) => { try { - //Find forms that were created 7 days ago and have not been submitted - const sevenDaysAgo = new Date(Date.now() - 7 * 24 * 60 * 60); - const sevenDaysAgoPlusOneDay = new Date( - sevenDaysAgo.getTime() + 24 * 60 * 60 * 1000 + const cutoffDate = new Date( + Date.now() - RETENTION_DAYS * 24 * 60 * 60 * 1000, ); - const expiredTokens = await prisma.publicFormsTokens.findMany({ - where: { - createdAt: { - gte: sevenDaysAgo, // greater than or equal to 7 days ago - lt: sevenDaysAgoPlusOneDay, // but less than 7 days ago + 1 day + let totalTokensDeleted = 0; + let totalEntitiesDeleted = 0; + let totalRelationshipsDeleted = 0; + let totalCorpusDeleted = 0; + + // Process in batches to handle large datasets without memory issues + while (true) { + const expiredTokens = await prisma.publicFormsTokens.findMany({ + where: { + createdAt: { lt: cutoffDate }, + submittedAt: null, }, - }, - }); + select: { + token: true, + entityId: true, + productId: true, + }, + take: BATCH_SIZE, + }); + + if (expiredTokens.length === 0) { + break; + } + + // Deduplicate identifiers for batch operations + const tokenStrings = [...new Set(expiredTokens.map((t) => t.token))]; - for (const token of expiredTokens) { - const relationship = await prisma.relationship.findFirst({ + const entityIds = [ + ...new Set( + expiredTokens + .map((t) => t.entityId) + .filter((id): id is string => Boolean(id)), + ), + ]; + + const productIds = [ + ...new Set( + expiredTokens + .map((t) => t.productId) + .filter((id): id is string => Boolean(id)), + ), + ]; + + // Find only relationships that belong to these specific expired tokens + // This prevents accidentally deleting relationships for non-expired tokens + // that may share the same product_id + const relationshipsToDelete = await prisma.relationship.findMany({ where: { - product_id: token.productId, + product_id: { in: productIds }, status: "new", + // Ensure we only delete relationships tied to expired entities + entity_id: { in: entityIds }, }, + select: { id: true }, }); - if (relationship) { - await prisma.$transaction([ - // Delete relationship - prisma.relationship.delete({ - where: { id: relationship.id }, - }), - // // Delete the token - prisma.publicFormsTokens.delete({ - where: { token: token.token }, - }), - // Delete all corpus items associated with the entity - prisma.new_corpus.deleteMany({ - where: { - entity_id: token.entityId || "", - }, - }), - // Delete the entity (company) - prisma.entity.delete({ - where: { id: token.entityId || "" }, - }), - ]); + const relationshipIds = relationshipsToDelete.map((r) => r.id); + + // Execute all deletions in a single atomic transaction + // Order respects foreign key constraints: children before parents + const results = await prisma.$transaction([ + prisma.relationship.deleteMany({ + where: { id: { in: relationshipIds } }, + }), + + prisma.new_corpus.deleteMany({ + where: { entity_id: { in: entityIds } }, + }), + + prisma.publicFormsTokens.deleteMany({ + where: { token: { in: tokenStrings } }, + }), + + prisma.entity.deleteMany({ + where: { id: { in: entityIds } }, + }), + ]); + + totalRelationshipsDeleted += results[0].count; + totalCorpusDeleted += results[1].count; + totalTokensDeleted += results[2].count; + totalEntitiesDeleted += results[3].count; + + // If we got fewer than BATCH_SIZE, we've processed everything + if (expiredTokens.length < BATCH_SIZE) { + break; } } + console.log( + `Cleanup complete: ${totalTokensDeleted} tokens, ${totalEntitiesDeleted} entities, ` + + `${totalRelationshipsDeleted} relationships, ${totalCorpusDeleted} corpus items deleted`, + ); + await update_job_status(job.id, "completed"); } catch (error) { console.error("Error cleaning up unsubmitted forms:", error);