vula-africa · Ochiengsteven · Jan 24, 2026 · Jan 24, 2026 · Jan 24, 2026 · Jan 24, 2026
diff --git a/tests/unsubmitted_forms/README.md b/tests/unsubmitted_forms/README.md
@@ -0,0 +1,70 @@
+# Cleanup Unsubmitted Forms - Solution
+
+## Video Explanation
+
+[Watch the Loom video](https://www.loom.com/share/3d0413ae4c2845cab8a3e3c93b62f9e3)
+
+## Issues Found & Fixes
+
+### 1. Date Calculation Bug
+**Problem:** Missing `* 1000` for milliseconds conversion - was only subtracting ~10 minutes instead of 7 days.
+
+**Fix:** Added `* 1000` to convert seconds to milliseconds.
+
+### 2. Query Logic Error
+**Problem:** Original query only found tokens from exactly 7 days ago (a 24-hour window), missing tokens that were 8, 9, 10+ days old.
+
+**Fix:** Changed to `lt: cutoffDate` to find all tokens older than 7 days.
+
+### 3. Missing Submission Status Check
+**Problem:** Query fetched all old tokens regardless of whether the form was submitted, risking deletion of valid submitted data.
+
+**Fix:** Added `submittedAt: null` filter to only fetch unsubmitted tokens.
+
+### 4. Wrong Relationship Deletion
+**Problem:** Relationship query only filtered by `product_id` and `status`, which could accidentally delete relationships belonging to other valid tokens that share the same product.
+
+**Fix:** Added `entity_id: { in: entityIds }` to the relationship query to ensure we only delete relationships tied to the specific expired entities.
+
+### 5. Foreign Key Constraint Order
+**Problem:** Deletion order could fail if child records reference parent tables.
+
+**Fix:** Reordered deletions: relationships → corpus items → tokens → entities (children before parents).
+
+### 6. Null EntityId Handling
+**Problem:** Passing null/undefined values to delete queries would cause errors.
+
+**Fix:** Filter out null values using `filter((id): id is string => Boolean(id))` before batch operations.
+
+### 7. N+1 Query Problem (Performance)
+**Problem:** Loop with individual `findFirst` and `$transaction` per token would cause thousands of DB calls at scale.
+
+**Fix:** Refactored to batch approach using `deleteMany` with `{ in: [...] }` in a single transaction.
+
+### 8. Memory Issues with Large Datasets
+
+**Problem:** Loading all expired tokens at once could crash the process with Out of Memory errors.
+
+**Fix:** Implemented batching with `take: BATCH_SIZE` (1000 records) in a while loop to process in chunks.
+
+### 9. Duplicate IDs in Queries
+
+**Problem:** Multiple tokens could share the same entityId/productId, causing unnecessary parameter bloat in IN clauses.
+
+**Fix:** Deduplicate IDs using `[...new Set(...)]` before batch operations.
+
+## Final Solution Features
+
+- **Configurable constants:** `BATCH_SIZE` and `RETENTION_DAYS` at the top for easy adjustment
+- **Batched processing:** Handles millions of records without memory issues
+- **Safe relationship deletion:** Only deletes relationships tied to expired entities
+- **Atomic transactions:** Each batch is processed in a single transaction
+- **Observability:** Logs total counts of deleted records for monitoring
+
+## Trade-offs Considered
+
+- **Batching:** Chose 1000 records per batch as a balance between performance and memory. Could be tuned based on server resources.
+
+- **Extra query for relationships:** Added a `findMany` to get relationship IDs before deletion. This adds one query per batch but ensures we never delete the wrong relationships.
+
+- **Atomicity per batch:** Each batch is atomic, not the entire job. If the job fails mid-way, some batches will have been committed. This is acceptable for a cleanup job - it can safely be re-run.
diff --git a/tests/unsubmitted_forms/cleanup_unsubmitted_forms.ts b/tests/unsubmitted_forms/cleanup_unsubmitted_forms.ts
@@ -7,74 +7,114 @@
  This is to prevent the database from being cluttered with unused tokens and entities.
  */
 
-/* Task Instructions:
- * 1. Read and understand the code below
- * 2. Identify ALL issues in the code (there are multiple)
- * 3. Fix the issues and create a working solution
- * 4. Create a PR with clear commit messages
- * 5. Record a 3-5 minute Loom video explaining:
- *    - What issues you found
- *    - How you fixed them
- *    - Any trade-offs you considered
- *
- * Focus on: correctness, performance, error handling, and code clarity
- * Expected time: 45-60 minutes
- */
-
 // For the purpose of this test you can ignore that the imports are not working.
 import type { JobScheduleQueue } from "@prisma/client";
 import { prisma } from "../endpoints/middleware/prisma";
 import { update_job_status } from "./generic_scheduler";
 
+const BATCH_SIZE = 1000;
+const RETENTION_DAYS = 7;
+
 export const cleanup_unsubmitted_forms = async (job: JobScheduleQueue) => {
   try {
-    //Find forms that were created 7 days ago and have not been submitted
-    const sevenDaysAgo = new Date(Date.now() - 7 * 24 * 60 * 60);
-    const sevenDaysAgoPlusOneDay = new Date(
-      sevenDaysAgo.getTime() + 24 * 60 * 60 * 1000
+    const cutoffDate = new Date(
+      Date.now() - RETENTION_DAYS * 24 * 60 * 60 * 1000,
     );
 
-    const expiredTokens = await prisma.publicFormsTokens.findMany({
-      where: {
-        createdAt: {
-          gte: sevenDaysAgo, // greater than or equal to 7 days ago
-          lt: sevenDaysAgoPlusOneDay, // but less than 7 days ago + 1 day
+    let totalTokensDeleted = 0;
+    let totalEntitiesDeleted = 0;
+    let totalRelationshipsDeleted = 0;
+    let totalCorpusDeleted = 0;
+
+    // Process in batches to handle large datasets without memory issues
+    while (true) {
+      const expiredTokens = await prisma.publicFormsTokens.findMany({
+        where: {
+          createdAt: { lt: cutoffDate },
+          submittedAt: null,
         },
-      },
-    });
+        select: {
+          token: true,
+          entityId: true,
+          productId: true,
+        },
+        take: BATCH_SIZE,
+      });
+
+      if (expiredTokens.length === 0) {
+        break;
+      }
+
+      // Deduplicate identifiers for batch operations
+      const tokenStrings = [...new Set(expiredTokens.map((t) => t.token))];
 
-    for (const token of expiredTokens) {
-      const relationship = await prisma.relationship.findFirst({
+      const entityIds = [
+        ...new Set(
+          expiredTokens
+            .map((t) => t.entityId)
+            .filter((id): id is string => Boolean(id)),
+        ),
+      ];
+
+      const productIds = [
+        ...new Set(
+          expiredTokens
+            .map((t) => t.productId)
+            .filter((id): id is string => Boolean(id)),
+        ),
+      ];
+
+      // Find only relationships that belong to these specific expired tokens
+      // This prevents accidentally deleting relationships for non-expired tokens
+      // that may share the same product_id
+      const relationshipsToDelete = await prisma.relationship.findMany({
         where: {
-          product_id: token.productId,
+          product_id: { in: productIds },
           status: "new",
+          // Ensure we only delete relationships tied to expired entities
+          entity_id: { in: entityIds },
         },
+        select: { id: true },
       });
 
-      if (relationship) {
-        await prisma.$transaction([
-          // Delete relationship
-          prisma.relationship.delete({
-            where: { id: relationship.id },
-          }),
-          // // Delete the token
-          prisma.publicFormsTokens.delete({
-            where: { token: token.token },
-          }),
-          // Delete all corpus items associated with the entity
-          prisma.new_corpus.deleteMany({
-            where: {
-              entity_id: token.entityId || "",
-            },
-          }),
-          // Delete the entity (company)
-          prisma.entity.delete({
-            where: { id: token.entityId || "" },
-          }),
-        ]);
+      const relationshipIds = relationshipsToDelete.map((r) => r.id);
+
+      // Execute all deletions in a single atomic transaction
+      // Order respects foreign key constraints: children before parents
+      const results = await prisma.$transaction([
+        prisma.relationship.deleteMany({
+          where: { id: { in: relationshipIds } },
+        }),
+
+        prisma.new_corpus.deleteMany({
+          where: { entity_id: { in: entityIds } },
+        }),
+
+        prisma.publicFormsTokens.deleteMany({
+          where: { token: { in: tokenStrings } },
+        }),
+
+        prisma.entity.deleteMany({
+          where: { id: { in: entityIds } },
+        }),
+      ]);
+
+      totalRelationshipsDeleted += results[0].count;
+      totalCorpusDeleted += results[1].count;
+      totalTokensDeleted += results[2].count;
+      totalEntitiesDeleted += results[3].count;
+
+      // If we got fewer than BATCH_SIZE, we've processed everything
+      if (expiredTokens.length < BATCH_SIZE) {
+        break;
       }
     }
 
+    console.log(
+      `Cleanup complete: ${totalTokensDeleted} tokens, ${totalEntitiesDeleted} entities, ` +
+        `${totalRelationshipsDeleted} relationships, ${totalCorpusDeleted} corpus items deleted`,
+    );
+
     await update_job_status(job.id, "completed");
   } catch (error) {
     console.error("Error cleaning up unsubmitted forms:", error);